2
0

ghash-c64xplus.pl 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. #! /usr/bin/env perl
  2. # Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # December 2011
  17. #
  18. # The module implements GCM GHASH function and underlying single
  19. # multiplication operation in GF(2^128). Even though subroutines
  20. # have _4bit suffix, they are not using any tables, but rely on
  21. # hardware Galois Field Multiply support. Streamed GHASH processes
  22. # a byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
  23. # code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
  24. # comparing apples vs. oranges, but compiler surely could have done
  25. # better, because theoretical [though not necessarily achievable]
  26. # estimate for "4-bit" table-driven implementation is ~12 cycles.
  # Locate the output file name on the command line: arguments are consumed
  # until one looks like "name.ext" (or until an empty/absent argument ends
  # the scan); everything before it is skipped.
  while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

  # Redirect STDOUT only when a file name was actually found.  A failed
  # open() leaves STDOUT closed, so opening unconditionally (as before)
  # silently threw the generated assembly away when no name was given;
  # now the script falls back to the inherited STDOUT in that case and
  # dies with a diagnostic if the named file cannot be created.
  if (defined($output) && length($output)) {
      open(STDOUT, ">", $output) or die "can't open $output: $!";
  }
  # Register allocation for the generated code.  Every name below is a
  # package global that gets interpolated into the assembly text emitted
  # further down, so the names themselves must not change.

  # Incoming arguments.
  $Xip    = "A4";
  $Htable = "B4";
  $inp    = "A6";
  $len    = "B6";

  # A16..A27: the Z accumulator, the four words of H, and the H*x
  # temporaries (A-side products / freshly loaded input words).
  ($Z0,  $Z1,  $Z2,  $Z3)  = qw(A16 A17 A18 A19);
  ($H0,  $H1,  $H2,  $H3)  = qw(A20 A21 A22 A23);
  ($H0x, $H1x, $H2x, $H3x) = qw(A24 A25 A26 A27);

  # B16..B27: upper-byte copies of H (*u), their products (*y) and the
  # masked products (*z).
  ($H01u, $H01y, $H2u, $H3u) = qw(B16 B17 B18 B19);
  ($H0y,  $H1y,  $H2y, $H3y) = qw(B20 B21 B22 B23);
  ($H0z,  $H1z,  $H2z, $H3z) = qw(B24 B25 B26 B27);

  # Constant holders: upper-byte mask and pre-shifted reduction polynomial.
  $FF000000 = "B30";
  $E10000   = "B31";

  # Working Xi pointer and byte temporaries.  $xip lives in B6 and
  # therefore overwrites $len.
  ($xip, $x0, $x1, $xib) = qw(B6 B7 B8 B9);
  $xia = "A9";

  # Reduction temporaries.  $rem lives in B4 and therefore overwrites
  # $Htable.
  ($rem, $res) = qw(B4 B5);
  # ---------------------------------------------------------------------
  # Assembly chunk 1 of 2: assembler prologue, both entry points, and the
  # head of the loop they share.
  #
  # Prologue:
  #   * assemblers whose .ASSEMBLER_VERSION is below 7000000 get
  #     __TI_EABI__ defined as 0;
  #   * when __TI_EABI__ is non-zero the three _gcm_* symbols are
  #     .asg-mapped to their names without the leading underscore;
  #   * RA is defined as an alias for B3;
  #   * the _gcm_gmult_1bit stub sits inside ".if 0" and is never
  #     assembled.
  #
  # _gcm_gmult_4bit (uses $Xip and $Htable):
  #   loads H from the two doublewords at $Htable[-1] and $Htable[-2],
  #   copies $Xip into $xip, fetches Xi[15] into $x1 and Xi[14] into $x0,
  #   builds the constants 0xE1<<16 and 0xFF<<24, pre-extracts the upper
  #   bytes of H into $H01u/$H2u/$H3u, clears Z, and branches to
  #   ghash_loop? with B0=1 ("take a single spin").
  #   The five execute packets written after BNOP still belong to this
  #   function.  NOTE(review): they appear to be scheduled into the
  #   branch delay slots - confirm against the C64x+ instruction set
  #   guide before reordering anything here.
  #
  # _gcm_ghash_4bit (uses $Xip, $Htable, $inp and $len):
  #   B0 = $len>>4 counts 16-byte blocks; any remainder of $len below 16
  #   is not looked at.  Instructions predicated on [B0] load the first
  #   input block with LDNDW, XOR it into the Xi value loaded into Z,
  #   extract Xi[15] and Xi[14] (differently in the .LITTLE_ENDIAN and
  #   big-endian variants), clear Z and advance $xip by 14.  The two STDW
  #   stores that write Z back to Xi are not predicated.  Execution then
  #   falls through into ghash_loop?.
  #   NOTE(review): nothing on this path skips the loop when B0 is 0, so
  #   it looks like callers are expected to pass $len >= 16 - confirm.
  #
  # ghash_loop?:
  #   the SPLOOPD 6 packet opens the software-pipelined loop and, in
  #   parallel, copies B1 (15) to ILC, decrements B0 when it is non-zero,
  #   clears A0, and forms $x1<<1 on both sides ($xia by SHL, $xib by ADD).
  # ---------------------------------------------------------------------
  38. $code.=<<___;
  39. .text
  40. .if .ASSEMBLER_VERSION<7000000
  41. .asg 0,__TI_EABI__
  42. .endif
  43. .if __TI_EABI__
  44. .asg gcm_gmult_1bit,_gcm_gmult_1bit
  45. .asg gcm_gmult_4bit,_gcm_gmult_4bit
  46. .asg gcm_ghash_4bit,_gcm_ghash_4bit
  47. .endif
  48. .asg B3,RA
  49. .if 0
  50. .global _gcm_gmult_1bit
  51. _gcm_gmult_1bit:
  52. ADDAD $Htable,2,$Htable
  53. .endif
  54. .global _gcm_gmult_4bit
  55. _gcm_gmult_4bit:
  56. .asmfunc
  57. LDDW *${Htable}[-1],$H1:$H0 ; H.lo
  58. LDDW *${Htable}[-2],$H3:$H2 ; H.hi
  59. || MV $Xip,${xip} ; reassign Xi
  60. || MVK 15,B1 ; SPLOOPD constant
  61. MVK 0xE1,$E10000
  62. || LDBU *++${xip}[15],$x1 ; Xi[15]
  63. MVK 0xFF,$FF000000
  64. || LDBU *--${xip},$x0 ; Xi[14]
  65. SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
  66. SHL $FF000000,24,$FF000000 ; upper byte mask
  67. || BNOP ghash_loop?
  68. || MVK 1,B0 ; take a single spin
  69. PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
  70. AND $H2,$FF000000,$H2u ; H2's upper byte
  71. AND $H3,$FF000000,$H3u ; H3's upper byte
  72. || SHRU $H2u,8,$H2u
  73. SHRU $H3u,8,$H3u
  74. || ZERO $Z1:$Z0
  75. SHRU2 $xia,8,$H01u
  76. || ZERO $Z3:$Z2
  77. .endasmfunc
  78. .global _gcm_ghash_4bit
  79. _gcm_ghash_4bit:
  80. .asmfunc
  81. LDDW *${Htable}[-1],$H1:$H0 ; H.lo
  82. || SHRU $len,4,B0 ; reassign len
  83. LDDW *${Htable}[-2],$H3:$H2 ; H.hi
  84. || MV $Xip,${xip} ; reassign Xi
  85. || MVK 15,B1 ; SPLOOPD constant
  86. MVK 0xE1,$E10000
  87. || [B0] LDNDW *${inp}[1],$H1x:$H0x
  88. MVK 0xFF,$FF000000
  89. || [B0] LDNDW *${inp}++[2],$H3x:$H2x
  90. SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
  91. || LDDW *${xip}[1],$Z1:$Z0
  92. SHL $FF000000,24,$FF000000 ; upper byte mask
  93. || LDDW *${xip}[0],$Z3:$Z2
  94. PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
  95. AND $H2,$FF000000,$H2u ; H2's upper byte
  96. AND $H3,$FF000000,$H3u ; H3's upper byte
  97. || SHRU $H2u,8,$H2u
  98. SHRU $H3u,8,$H3u
  99. SHRU2 $xia,8,$H01u
  100. || [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
  101. || [B0] XOR $H1x,$Z1,$Z1
  102. .if .LITTLE_ENDIAN
  103. [B0] XOR $H2x,$Z2,$Z2
  104. || [B0] XOR $H3x,$Z3,$Z3
  105. || [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
  106. STDW $Z1:$Z0,*${xip}[1]
  107. || [B0] SHRU $Z1,16,$x0 ; Xi[14]
  108. || [B0] ZERO $Z1:$Z0
  109. .else
  110. [B0] XOR $H2x,$Z2,$Z2
  111. || [B0] XOR $H3x,$Z3,$Z3
  112. || [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
  113. STDW $Z1:$Z0,*${xip}[1]
  114. || [B0] SHRU $Z0,8,$x0 ; Xi[14]
  115. || [B0] ZERO $Z1:$Z0
  116. .endif
  117. STDW $Z3:$Z2,*${xip}[0]
  118. || [B0] ZERO $Z3:$Z2
  119. || [B0] MV $xia,$x1
  120. [B0] ADDK 14,${xip}
  121. ghash_loop?:
  122. SPLOOPD 6 ; 6*16+7
  123. || MVC B1,ILC
  124. || [B0] SUB B0,1,B0
  125. || ZERO A0
  126. || ADD $x1,$x1,$xib ; SHL $x1,1,$xib
  127. || SHL $x1,1,$xia
  128. ___
  129. ########____________________________
  130. # 0 D2. M1 M2 |
  131. # 1 M1 |
  132. # 2 M1 M2 |
  133. # 3 D1. M1 M2 |
  134. # 4 S1. L1 |
  135. # 5 S2 S1x L1 D2 L2 |____________________________
  136. # 6/0 L1 S1 L2 S2x |D2. M1 M2 |
  137. # 7/1 L1 S1 D1x S2 M2 | M1 |
  138. # 8/2 S1 L1x S2 | M1 M2 |
  139. # 9/3 S1 L1x | D1. M1 M2 |
  140. # 10/4 D1x | S1. L1 |
  141. # 11/5 |S2 S1x L1 D2 L2 |____________
  142. # 12/6/0 D1x __| L1 S1 L2 S2x |D2. ....
  143. # 7/1 L1 S1 D1x S2 M2 | ....
  144. # 8/2 S1 L1x S2 | ....
  145. #####... ................|............
  # ---------------------------------------------------------------------
  # Assembly chunk 2 of 2: body of the ghash_loop? software pipeline (its
  # schedule is the diagram in the comment block preceding this chunk),
  # the per-block epilogue, and the identification string.
  #
  # Loop body, once per byte of Xi:
  #   * XORMPY multiplies the words of H (A side, by $xia) and the
  #     pre-extracted upper bytes of H (B side, by $xib) with the current
  #     byte shifted left by one;
  #   * the A-side products are XORed into Z0..Z3;
  #   * rem = Z0<<1 is captured, then Z is shifted right by 8 bits across
  #     all four words (three SHRMB plus a final SHRU on Z3);
  #   * the B-side products, masked with the 0xFF000000 constant, are
  #     XORed in as the upper bytes;
  #   * XORMPY of the pre-shifted 0xE1 constant with rem yields res, which
  #     is XORed into Z3 in the SPKERNEL packet;
  #   * [A0] LDBU *--$xip fetches the next lower byte of Xi into $x0;
  #     A0 is set to 15 while it is zero and decremented while non-zero.
  #   The order of instructions and the ||-packets is the schedule itself;
  #   do not reorder.
  #
  # After SPKERNEL:
  #   * [B0] LDNDW pre-fetches the next 16 input bytes and advances $inp
  #     when blocks remain;
  #   * under .LITTLE_ENDIAN the SWAP2/SWAP4 pairs byte-reverse each word
  #     of Z and exchange the two words within each pair (Z0/Z1, Z2/Z3);
  #   * [!B0] BNOP RA leaves the function when no blocks remain, otherwise
  #     [B0] BNOP ghash_loop? starts the next block;
  #   * the instructions written after the branches XOR the pre-fetched
  #     input into Z, extract Xi[15]/Xi[14], store Z to Xi (the STDW
  #     stores are not predicated), clear Z and advance $xip by 14.
  #     NOTE(review): these appear to rely on executing in the branch
  #     delay slots - confirm against the C64x+ instruction set guide.
  #
  # The chunk ends by placing an identification string in .const.
  # ---------------------------------------------------------------------
  146. $code.=<<___;
  147. XORMPY $H0,$xia,$H0x ; 0 ; H·(Xi[i]<<1)
  148. || XORMPY $H01u,$xib,$H01y
  149. || [A0] LDBU *--${xip},$x0
  150. XORMPY $H1,$xia,$H1x ; 1
  151. XORMPY $H2,$xia,$H2x ; 2
  152. || XORMPY $H2u,$xib,$H2y
  153. XORMPY $H3,$xia,$H3x ; 3
  154. || XORMPY $H3u,$xib,$H3y
  155. ||[!A0] MVK.D 15,A0 ; *--${xip} counter
  156. XOR.L $H0x,$Z0,$Z0 ; 4 ; Z^=H·(Xi[i]<<1)
  157. || [A0] SUB.S A0,1,A0
  158. XOR.L $H1x,$Z1,$Z1 ; 5
  159. || AND.D $H01y,$FF000000,$H0z
  160. || SWAP2.L $H01y,$H1y ; ; SHL $H01y,16,$H1y
  161. || SHL $x0,1,$xib
  162. || SHL $x0,1,$xia
  163. XOR.L $H2x,$Z2,$Z2 ; 6/0 ; [0,0] in epilogue
  164. || SHL $Z0,1,$rem ; ; rem=Z<<1
  165. || SHRMB.S $Z1,$Z0,$Z0 ; ; Z>>=8
  166. || AND.L $H1y,$FF000000,$H1z
  167. XOR.L $H3x,$Z3,$Z3 ; 7/1
  168. || SHRMB.S $Z2,$Z1,$Z1
  169. || XOR.D $H0z,$Z0,$Z0 ; merge upper byte products
  170. || AND.S $H2y,$FF000000,$H2z
  171. || XORMPY $E10000,$rem,$res ; ; implicit rem&0x1FE
  172. XOR.L $H1z,$Z1,$Z1 ; 8/2
  173. || SHRMB.S $Z3,$Z2,$Z2
  174. || AND.S $H3y,$FF000000,$H3z
  175. XOR.L $H2z,$Z2,$Z2 ; 9/3
  176. || SHRU $Z3,8,$Z3
  177. XOR.D $H3z,$Z3,$Z3 ; 10/4
  178. NOP ; 11/5
  179. SPKERNEL 0,2
  180. || XOR.D $res,$Z3,$Z3 ; 12/6/0; Z^=res
  181. ; input pre-fetch is possible where D1 slot is available...
  182. [B0] LDNDW *${inp}[1],$H1x:$H0x ; 8/-
  183. [B0] LDNDW *${inp}++[2],$H3x:$H2x ; 9/-
  184. NOP ; 10/-
  185. .if .LITTLE_ENDIAN
  186. SWAP2 $Z0,$Z1 ; 11/-
  187. || SWAP4 $Z1,$Z0
  188. SWAP4 $Z1,$Z1 ; 12/-
  189. || SWAP2 $Z0,$Z0
  190. SWAP2 $Z2,$Z3
  191. || SWAP4 $Z3,$Z2
  192. ||[!B0] BNOP RA
  193. SWAP4 $Z3,$Z3
  194. || SWAP2 $Z2,$Z2
  195. || [B0] BNOP ghash_loop?
  196. [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
  197. || [B0] XOR $H1x,$Z1,$Z1
  198. [B0] XOR $H2x,$Z2,$Z2
  199. || [B0] XOR $H3x,$Z3,$Z3
  200. || [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
  201. STDW $Z1:$Z0,*${xip}[1]
  202. || [B0] SHRU $Z1,16,$x0 ; Xi[14]
  203. || [B0] ZERO $Z1:$Z0
  204. .else
  205. [!B0] BNOP RA ; 11/-
  206. [B0] BNOP ghash_loop? ; 12/-
  207. [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
  208. || [B0] XOR $H1x,$Z1,$Z1
  209. [B0] XOR $H2x,$Z2,$Z2
  210. || [B0] XOR $H3x,$Z3,$Z3
  211. || [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
  212. STDW $Z1:$Z0,*${xip}[1]
  213. || [B0] SHRU $Z0,8,$x0 ; Xi[14]
  214. || [B0] ZERO $Z1:$Z0
  215. .endif
  216. STDW $Z3:$Z2,*${xip}[0]
  217. || [B0] ZERO $Z3:$Z2
  218. || [B0] MV $xia,$x1
  219. [B0] ADDK 14,${xip}
  220. .endasmfunc
  221. .sect .const
  222. .cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
  223. .align 4
  224. ___
  # Emit the accumulated assembly text.
  print $code;
  # close() is where buffered output is actually flushed, so a write error
  # (full disk, I/O error) only shows up here.  Fail the script instead of
  # exiting 0 with a truncated file.
  close STDOUT or die "error closing STDOUT: $!";