ghash-c64xplus.pl 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246
  1. #! /usr/bin/env perl
  2. # Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # December 2011
  17. #
  18. # The module implements the GCM GHASH function and the underlying single
  19. # multiplication operation in GF(2^128). Even though the subroutines
  20. # have a _4bit suffix, they do not use any tables, but rely on
  21. # hardware Galois Field Multiply support. Streamed GHASH processes
  22. # a byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
  23. # code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
  24. # comparing apples vs. oranges, but the compiler surely could have done
  25. # better, because the theoretical [though not necessarily achievable]
  26. # estimate for a "4-bit" table-driven implementation is ~12 cycles.
  # Optional trailing command-line argument: the file that receives the
  # generated assembly.  When it is present STDOUT is redirected to it;
  # when it is absent the output goes to the inherited STDOUT.  A failed
  # open is fatal: otherwise the script would keep writing to the original
  # STDOUT and still exit successfully, leaving the requested file missing.
  # The two-argument open form is kept so the argument is interpreted
  # exactly as before.
  $output = pop;
  if ($output) {
      open STDOUT,">$output" or die "can't open $output: $!";
  }

  # Register allocation.  Each variable holds a register name that is
  # interpolated into the assembly templates appended to $code below.

  # Incoming arguments: Xi pointer, H table pointer, input pointer, length.
  ($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6");

  # A16..A27: accumulator Z0..Z3, hash key words H0..H3, and H0x..H3x,
  # which receive both the input words and the A-side XORMPY products.
  ($Z0,$Z1,$Z2,$Z3, $H0, $H1, $H2, $H3,
   $H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27));

  # B16..B27: upper-byte operands (..u), the B-side values derived from
  # them (..y) and their FF000000-masked forms (..z).
  ($H01u,$H01y,$H2u,$H3u, $H0y,$H1y,$H2y,$H3y,
   $H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27));

  # Constants built at run time: 0xFF<<24 (byte mask) and 0xE1<<16
  # (pre-shifted reduction polynomial).
  ($FF000000,$E10000)=("B30","B31");

  # Working Xi pointer and Xi byte registers.  $xip shares B6 with $len,
  # so writing $xip zaps $len.
  ($xip,$x0,$x1,$xib)=map("B$_",(6..9));
  $xia="A9";

  # Reduction temporaries.  $rem shares B4 with $Htable, so writing $rem
  # zaps $Htable.
  ($rem,$res)=("B4","B5");
  # First assembly template: module preamble, both entry points, and the
  # instruction packet that opens the shared software-pipelined loop.
  #
  # Preamble
  #   * Assemblers reporting .ASSEMBLER_VERSION below 7000000 get
  #     __TI_EABI__ defined as 0.
  #   * When __TI_EABI__ is non-zero, the .asg lines substitute the
  #     un-prefixed names for the _-prefixed labels used below.
  #   * RA is an alias for B3; the second template branches to it to return.
  #   * The gcm_gmult_1bit stub sits inside ".if 0" and is never assembled.
  #
  # _gcm_gmult_4bit
  #   Loads H from the two doublewords just below $Htable, copies $Xip into
  #   $xip, fetches Xi[15] into $x1 and Xi[14] into $x0, builds the 0xE1<<16
  #   and 0xFF<<24 constants, clears Z, sets B0 to 1 ("a single spin") and
  #   branches to ghash_loop?.  It has no return of its own; the exit is the
  #   [!B0] branch to RA in the second template.  The five execute packets
  #   after BNOP presumably fill its delay slots.
  #
  # _gcm_ghash_4bit
  #   B0 = $len >> 4, i.e. the number of 16-byte blocks.  Xi is loaded into
  #   Z; for a non-zero count the first input block is fetched with LDNDW
  #   and XORed in.  The result is stored back through $xip, because the
  #   loop kernel re-reads it one byte at a time with LDBU.  Xi[15] and
  #   Xi[14] are taken from $Z1 when .LITTLE_ENDIAN and from $Z0 otherwise,
  #   then Z is cleared and $xip is advanced by 14.  Execution falls through
  #   into ghash_loop?.
  #   NOTE(review): when $len < 16, B0 is 0, so every [B0]-predicated setup
  #   instruction is skipped while SPLOOPD itself is not predicated.
  #   Callers presumably always pass a non-zero multiple of 16 -- confirm
  #   against the C caller.
  #
  # Both entry points prepare the B-side multiplicands the same way:
  # $H2u/$H3u are the top bytes of H2/H3 shifted right by 8, and $H01u is
  # built from H0 and H1 with PACKH2 followed by SHRU2.
  #
  # ghash_loop?
  #   SPLOOPD 6 with ILC loaded from B1 (15).  The same packet decrements
  #   B0 when it is non-zero, clears A0 and forms $xia and $xib as $x1 << 1.
  #   The loop body itself is emitted by the second template.
  37. $code.=<<___;
  38. .text
  39. .if .ASSEMBLER_VERSION<7000000
  40. .asg 0,__TI_EABI__
  41. .endif
  42. .if __TI_EABI__
  43. .asg gcm_gmult_1bit,_gcm_gmult_1bit
  44. .asg gcm_gmult_4bit,_gcm_gmult_4bit
  45. .asg gcm_ghash_4bit,_gcm_ghash_4bit
  46. .endif
  47. .asg B3,RA
  48. .if 0
  49. .global _gcm_gmult_1bit
  50. _gcm_gmult_1bit:
  51. ADDAD $Htable,2,$Htable
  52. .endif
  53. .global _gcm_gmult_4bit
  54. _gcm_gmult_4bit:
  55. .asmfunc
  56. LDDW *${Htable}[-1],$H1:$H0 ; H.lo
  57. LDDW *${Htable}[-2],$H3:$H2 ; H.hi
  58. || MV $Xip,${xip} ; reassign Xi
  59. || MVK 15,B1 ; SPLOOPD constant
  60. MVK 0xE1,$E10000
  61. || LDBU *++${xip}[15],$x1 ; Xi[15]
  62. MVK 0xFF,$FF000000
  63. || LDBU *--${xip},$x0 ; Xi[14]
  64. SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
  65. SHL $FF000000,24,$FF000000 ; upper byte mask
  66. || BNOP ghash_loop?
  67. || MVK 1,B0 ; take a single spin
  68. PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
  69. AND $H2,$FF000000,$H2u ; H2's upper byte
  70. AND $H3,$FF000000,$H3u ; H3's upper byte
  71. || SHRU $H2u,8,$H2u
  72. SHRU $H3u,8,$H3u
  73. || ZERO $Z1:$Z0
  74. SHRU2 $xia,8,$H01u
  75. || ZERO $Z3:$Z2
  76. .endasmfunc
  77. .global _gcm_ghash_4bit
  78. _gcm_ghash_4bit:
  79. .asmfunc
  80. LDDW *${Htable}[-1],$H1:$H0 ; H.lo
  81. || SHRU $len,4,B0 ; reassign len
  82. LDDW *${Htable}[-2],$H3:$H2 ; H.hi
  83. || MV $Xip,${xip} ; reassign Xi
  84. || MVK 15,B1 ; SPLOOPD constant
  85. MVK 0xE1,$E10000
  86. || [B0] LDNDW *${inp}[1],$H1x:$H0x
  87. MVK 0xFF,$FF000000
  88. || [B0] LDNDW *${inp}++[2],$H3x:$H2x
  89. SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
  90. || LDDW *${xip}[1],$Z1:$Z0
  91. SHL $FF000000,24,$FF000000 ; upper byte mask
  92. || LDDW *${xip}[0],$Z3:$Z2
  93. PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
  94. AND $H2,$FF000000,$H2u ; H2's upper byte
  95. AND $H3,$FF000000,$H3u ; H3's upper byte
  96. || SHRU $H2u,8,$H2u
  97. SHRU $H3u,8,$H3u
  98. SHRU2 $xia,8,$H01u
  99. || [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
  100. || [B0] XOR $H1x,$Z1,$Z1
  101. .if .LITTLE_ENDIAN
  102. [B0] XOR $H2x,$Z2,$Z2
  103. || [B0] XOR $H3x,$Z3,$Z3
  104. || [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
  105. STDW $Z1:$Z0,*${xip}[1]
  106. || [B0] SHRU $Z1,16,$x0 ; Xi[14]
  107. || [B0] ZERO $Z1:$Z0
  108. .else
  109. [B0] XOR $H2x,$Z2,$Z2
  110. || [B0] XOR $H3x,$Z3,$Z3
  111. || [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
  112. STDW $Z1:$Z0,*${xip}[1]
  113. || [B0] SHRU $Z0,8,$x0 ; Xi[14]
  114. || [B0] ZERO $Z1:$Z0
  115. .endif
  116. STDW $Z3:$Z2,*${xip}[0]
  117. || [B0] ZERO $Z3:$Z2
  118. || [B0] MV $xia,$x1
  119. [B0] ADDK 14,${xip}
  120. ghash_loop?:
  121. SPLOOPD 6 ; 6*16+7
  122. || MVC B1,ILC
  123. || [B0] SUB B0,1,B0
  124. || ZERO A0
  125. || ADD $x1,$x1,$xib ; SHL $x1,1,$xib
  126. || SHL $x1,1,$xia
  127. ___
  128. ########____________________________
  129. # 0 D2. M1 M2 |
  130. # 1 M1 |
  131. # 2 M1 M2 |
  132. # 3 D1. M1 M2 |
  133. # 4 S1. L1 |
  134. # 5 S2 S1x L1 D2 L2 |____________________________
  135. # 6/0 L1 S1 L2 S2x |D2. M1 M2 |
  136. # 7/1 L1 S1 D1x S2 M2 | M1 |
  137. # 8/2 S1 L1x S2 | M1 M2 |
  138. # 9/3 S1 L1x | D1. M1 M2 |
  139. # 10/4 D1x | S1. L1 |
  140. # 11/5 |S2 S1x L1 D2 L2 |____________
  141. # 12/6/0 D1x __| L1 S1 L2 S2x |D2. ....
  142. # 7/1 L1 S1 D1x S2 M2 | ....
  143. # 8/2 S1 L1x S2 | ....
  144. #####... ................|............
  # Second assembly template: the body of the SPLOOPD loop opened at
  # ghash_loop? in the first template, followed by the code that runs once
  # the loop has drained.
  #
  # Loop body (the numbers in the trailing ";" comments are the cycle rows
  # of the schedule chart above):
  #   0-3    XORMPY the H words with $xia (A side) and the upper-byte
  #          operands $H01u/$H2u/$H3u with $xib (B side).  A0 gates the
  #          pre-decrementing LDBU that fetches the next Xi byte into $x0:
  #          it is cleared in the loop-opening packet, set to 15 by the
  #          [!A0] MVK and counted down by the [A0] SUB.
  #   4-5    XOR the A-side products into Z0/Z1; mask $H01y into $H0z and
  #          SWAP2 it into $H1y; form $xia/$xib = $x0 << 1 for the next byte.
  #   6-9    XOR the remaining A-side products into Z2/Z3; $rem = Z0 << 1;
  #          shift Z right one byte through the SHRMB/SHRU chain; XOR in the
  #          FF000000-masked upper-byte products; $res = XORMPY of the
  #          pre-shifted polynomial with $rem.
  #   10-12  XOR $H3z and then $res into Z3; SPKERNEL ends the loop body.
  #
  # After the loop
  #   * [B0] LDNDW pre-fetches the next input block while blocks remain.
  #   * Under .LITTLE_ENDIAN the SWAP2/SWAP4 pairs reorder the bytes of Z
  #     before it is stored; the other branch stores Z as is.
  #   * [!B0] BNOP RA returns when no blocks remain; [B0] BNOP ghash_loop?
  #     starts another pass.  The instructions after the branches presumably
  #     execute in their delay slots: Z (XORed with the pre-fetched input
  #     when B0 is non-zero) is stored back to Xi through $xip, and for a
  #     further pass Xi[15]/Xi[14] are extracted, Z is cleared and $xip is
  #     advanced by 14, the same sequence as in the _gcm_ghash_4bit prologue.
  #   * NOTE(review): the stores use $xip[0] and $xip[1], so the
  #     pre-decrementing LDBUs are presumably expected to have walked $xip
  #     back to the start of Xi by this point -- confirm.
  #
  # The template ends with an identification string placed in .const.
  145. $code.=<<___;
  146. XORMPY $H0,$xia,$H0x ; 0 ; H·(Xi[i]<<1)
  147. || XORMPY $H01u,$xib,$H01y
  148. || [A0] LDBU *--${xip},$x0
  149. XORMPY $H1,$xia,$H1x ; 1
  150. XORMPY $H2,$xia,$H2x ; 2
  151. || XORMPY $H2u,$xib,$H2y
  152. XORMPY $H3,$xia,$H3x ; 3
  153. || XORMPY $H3u,$xib,$H3y
  154. ||[!A0] MVK.D 15,A0 ; *--${xip} counter
  155. XOR.L $H0x,$Z0,$Z0 ; 4 ; Z^=H·(Xi[i]<<1)
  156. || [A0] SUB.S A0,1,A0
  157. XOR.L $H1x,$Z1,$Z1 ; 5
  158. || AND.D $H01y,$FF000000,$H0z
  159. || SWAP2.L $H01y,$H1y ; ; SHL $H01y,16,$H1y
  160. || SHL $x0,1,$xib
  161. || SHL $x0,1,$xia
  162. XOR.L $H2x,$Z2,$Z2 ; 6/0 ; [0,0] in epilogue
  163. || SHL $Z0,1,$rem ; ; rem=Z<<1
  164. || SHRMB.S $Z1,$Z0,$Z0 ; ; Z>>=8
  165. || AND.L $H1y,$FF000000,$H1z
  166. XOR.L $H3x,$Z3,$Z3 ; 7/1
  167. || SHRMB.S $Z2,$Z1,$Z1
  168. || XOR.D $H0z,$Z0,$Z0 ; merge upper byte products
  169. || AND.S $H2y,$FF000000,$H2z
  170. || XORMPY $E10000,$rem,$res ; ; implicit rem&0x1FE
  171. XOR.L $H1z,$Z1,$Z1 ; 8/2
  172. || SHRMB.S $Z3,$Z2,$Z2
  173. || AND.S $H3y,$FF000000,$H3z
  174. XOR.L $H2z,$Z2,$Z2 ; 9/3
  175. || SHRU $Z3,8,$Z3
  176. XOR.D $H3z,$Z3,$Z3 ; 10/4
  177. NOP ; 11/5
  178. SPKERNEL 0,2
  179. || XOR.D $res,$Z3,$Z3 ; 12/6/0; Z^=res
  180. ; input pre-fetch is possible where D1 slot is available...
  181. [B0] LDNDW *${inp}[1],$H1x:$H0x ; 8/-
  182. [B0] LDNDW *${inp}++[2],$H3x:$H2x ; 9/-
  183. NOP ; 10/-
  184. .if .LITTLE_ENDIAN
  185. SWAP2 $Z0,$Z1 ; 11/-
  186. || SWAP4 $Z1,$Z0
  187. SWAP4 $Z1,$Z1 ; 12/-
  188. || SWAP2 $Z0,$Z0
  189. SWAP2 $Z2,$Z3
  190. || SWAP4 $Z3,$Z2
  191. ||[!B0] BNOP RA
  192. SWAP4 $Z3,$Z3
  193. || SWAP2 $Z2,$Z2
  194. || [B0] BNOP ghash_loop?
  195. [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
  196. || [B0] XOR $H1x,$Z1,$Z1
  197. [B0] XOR $H2x,$Z2,$Z2
  198. || [B0] XOR $H3x,$Z3,$Z3
  199. || [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
  200. STDW $Z1:$Z0,*${xip}[1]
  201. || [B0] SHRU $Z1,16,$x0 ; Xi[14]
  202. || [B0] ZERO $Z1:$Z0
  203. .else
  204. [!B0] BNOP RA ; 11/-
  205. [B0] BNOP ghash_loop? ; 12/-
  206. [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
  207. || [B0] XOR $H1x,$Z1,$Z1
  208. [B0] XOR $H2x,$Z2,$Z2
  209. || [B0] XOR $H3x,$Z3,$Z3
  210. || [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
  211. STDW $Z1:$Z0,*${xip}[1]
  212. || [B0] SHRU $Z0,8,$x0 ; Xi[14]
  213. || [B0] ZERO $Z1:$Z0
  214. .endif
  215. STDW $Z3:$Z2,*${xip}[0]
  216. || [B0] ZERO $Z3:$Z2
  217. || [B0] MV $xia,$x1
  218. [B0] ADDK 14,${xip}
  219. .endasmfunc
  220. .sect .const
  221. .cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
  222. .align 4
  223. ___
  # Write the accumulated assembly text to STDOUT (the output file when one
  # was named on the command line) and abort if the final close reports an
  # error, so a truncated output file is not mistaken for a good one.
  print STDOUT $code;
  close(STDOUT) || die "error closing STDOUT: $!";