keccak1600-avx2.pl 16 KB


  1. #!/usr/bin/env perl
  2. # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # Keccak-1600 for AVX2.
  17. #
  18. # July 2017.
  19. #
  20. # To paraphrase Gilles Van Assche, if you contemplate Fig. 2.3 on page
  21. # 20 of The Keccak reference [or Fig. 5 of FIPS PUB 202], and load data
  22. # other than A[0][0] in magic order into 6 [256-bit] registers, *each
  23. # dedicated to one axis*, Pi permutation is reduced to intra-register
  24. # shuffles...
  25. #
  26. # It makes other steps more intricate, but overall, is it a win? To be
  27. # more specific index permutations organized by quadruples are:
  28. #
  29. # [4][4] [3][3] [2][2] [1][1]<-+
  30. # [0][4] [0][3] [0][2] [0][1]<-+
  31. # [3][0] [1][0] [4][0] [2][0] |
  32. # [4][3] [3][1] [2][4] [1][2] |
  33. # [3][4] [1][3] [4][2] [2][1] |
  34. # [2][3] [4][1] [1][4] [3][2] |
  35. # [2][2] [4][4] [1][1] [3][3] -+
  36. #
  37. # This however is highly impractical for Theta and Chi. What would help
  38. # Theta is if x indices were aligned column-wise, or in other words:
  39. #
  40. # [0][4] [0][3] [0][2] [0][1]
  41. # [3][0] [1][0] [4][0] [2][0]
  42. #vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
  43. # [2][4] [4][3] [1][2] [3][1]
  44. #vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
  45. # [3][4] [1][3] [4][2] [2][1]
  46. #vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
  47. # [1][4] [2][3] [3][2] [4][1]
  48. #vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
  49. # [4][4] [3][3] [2][2] [1][1]
  50. #
  51. # So here we have it, lines not marked with vpermq() represent the magic
  52. # order in which data is to be loaded and maintained. [And lines marked
  53. # with vpermq() represent Pi circular permutation in chosen layout. Note
  54. # that first step is permutation-free.] A[0][0] is loaded to register of
  55. # its own, to all lanes. [A[0][0] is not part of Pi permutation or Rho.]
  56. # Digits in variables' names denote right-most coordinates:
  # Names of the seven state registers.  The digits in each name are the
  # [y][x] coordinate of the register's right-most (lowest) lane; the
  # bracketed lists give the lanes from highest to lowest.  $A00 carries
  # A[0][0] replicated in all four lanes.
  my ($A00,	# [0][0] [0][0] [0][0] [0][0]		# %ymm0
      $A01,	# [0][4] [0][3] [0][2] [0][1]		# %ymm1
      $A20,	# [3][0] [1][0] [4][0] [2][0]		# %ymm2
      $A31,	# [2][4] [4][3] [1][2] [3][1]		# %ymm3
      $A21,	# [3][4] [1][3] [4][2] [2][1]		# %ymm4
      $A41,	# [1][4] [2][3] [3][2] [4][1]		# %ymm5
      $A11) =	# [4][4] [3][3] [2][2] [1][1]		# %ymm6
      map("%ymm$_", 0 .. 6);
  65. # We also need to map the magic order into offsets within structure:
  # @A_jagged is indexed by the flat lane number 5*y+x of A[y][x].  Each
  # entry starts out as [register, lane-within-register] in the magic
  # order and is then flattened to a byte offset, counting 32 bytes per
  # register and 8 bytes per lane.
  my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],	# [0][0..4]
		  [2,2], [6,0], [3,1], [4,2], [5,3],	# [1][0..4]
		  [2,0], [4,0], [6,1], [5,2], [3,3],	# [2][0..4]
		  [2,3], [3,0], [5,1], [6,2], [4,3],	# [3][0..4]
		  [2,1], [5,0], [4,1], [3,2], [6,3]);	# [4][0..4]
  @A_jagged = map(8 * (4 * $_->[0] + $_->[1]), @A_jagged);	# ... and now linear
  # On the other hand, Chi is much better off if y indices are aligned
  # column-wise, not x. For this reason we have to shuffle the data prior
  # to Chi and revert the shuffle afterwards. The prior shuffle is
  # naturally merged with Pi itself:
  76. #
  77. # [0][4] [0][3] [0][2] [0][1]
  78. # [3][0] [1][0] [4][0] [2][0]
  79. #vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
  80. #vpermq([2][4] [4][3] [1][2] [3][1], 0b00011011) = 0b10001101
  81. # [3][1] [1][2] [4][3] [2][4]
  82. #vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
  83. #vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = 0b10001101
  84. # [3][4] [1][3] [4][2] [2][1]
  85. #vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
  86. #vpermq([1][4] [2][3] [3][2] [4][1], 0b01110010) = 0b00011011
  87. # [3][2] [1][4] [4][1] [2][3]
  88. #vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
  89. #vpermq([4][4] [3][3] [2][2] [1][1], 0b10001101) = 0b01110010
  90. # [3][3] [1][1] [4][4] [2][2]
  91. #
  92. # And reverse post-Chi permutation:
  93. #
  94. # [0][4] [0][3] [0][2] [0][1]
  95. # [3][0] [1][0] [4][0] [2][0]
  96. #vpermq([3][1] [1][2] [4][3] [2][4], 0b00011011)
  97. # [2][4] [4][3] [1][2] [3][1]
  98. #vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = nop :-)
  99. # [3][4] [1][3] [4][2] [2][1]
  100. #vpermq([3][2] [1][4] [4][1] [2][3], 0b10001101)
  101. # [1][4] [2][3] [3][2] [4][1]
  102. #vpermq([3][3] [1][1] [4][4] [2][2], 0b01110010)
  103. # [4][4] [3][3] [2][2] [1][1]
  104. #
  105. ########################################################################
  106. # Numbers are cycles per processed byte out of large message.
  107. #
  108. # r=1088(*)
  109. #
  110. # Haswell 8.7/+10%
  111. # Skylake 7.8/+20%
  112. # Ryzen 17(**)
  113. #
  114. # (*) Corresponds to SHA3-256. Percentage after slash is improvement
  115. # coefficient in comparison to scalar keccak1600-x86_64.pl.
  116. # (**) It's expected that Ryzen performs poorly, because instruction
  117. # issue rate is limited to two AVX2 instructions per cycle and
  118. # in addition vpblendd is reportedly bound to specific port.
  119. # Obviously this code path should not be executed on Ryzen.
  # Nine scratch registers, %ymm7..%ymm15.  The Theta intermediates are
  # aliases of @T[5..8], not extra registers: the Rho and Chi code reuses
  # @T[5..8] once Theta's values are no longer needed.
  my @T = map("%ymm$_", 7 .. 15);
  my ($C14, $C00, $D00, $D14) = @T[5 .. 8];
  # __KeccakF1600: emits the internal 24-round permutation loop.
  #
  # In/out:   the state held in $A00 (A[0][0] in every lane) and in
  #           $A01, $A20, $A31, $A21, $A41, $A11 in the magic lane order.
  # Clobbers: %eax (round counter), %r8, %r9, %r10 and all nine @T
  #           temporaries.
  #
  # %r8 and %r9 point 96 bytes into rhotates_left/rhotates_right, so the
  # row for register k is addressed as k*32-96 (presumably to keep the
  # displacements small -- confirm).  %r10 walks iotas, 32 bytes per round.
  # The instruction order below is the original schedule and is kept
  # unchanged.
  $code.=<<___;
.text

.type	__KeccakF1600,\@function
.align	32
__KeccakF1600:
	lea		rhotates_left+96(%rip),%r8
	lea		rhotates_right+96(%rip),%r9
	lea		iotas(%rip),%r10
	mov		\$24,%eax
	jmp		.Loop_avx2

.align	32
.Loop_avx2:
	######################################### Theta
	vpshufd		\$0b01001110,$A20,$C00
	vpxor		$A31,$A41,$C14
	vpxor		$A11,$A21,@T[2]
	vpxor		$A01,$C14,$C14
	vpxor		@T[2],$C14,$C14		# C[1..4]

	vpermq		\$0b10010011,$C14,@T[4]
	vpxor		$A20,$C00,$C00
	vpermq		\$0b01001110,$C00,@T[0]

	vpsrlq		\$63,$C14,@T[1]
	vpaddq		$C14,$C14,@T[2]
	vpor		@T[2],@T[1],@T[1]	# ROL64(C[1..4],1)

	vpermq		\$0b00111001,@T[1],$D14
	vpxor		@T[4],@T[1],$D00
	vpermq		\$0b00000000,$D00,$D00	# D[0..0] = ROL64(C[1],1) ^ C[4]

	vpxor		$A00,$C00,$C00
	vpxor		@T[0],$C00,$C00		# C[0..0]

	vpsrlq		\$63,$C00,@T[0]
	vpaddq		$C00,$C00,@T[1]
	vpor		@T[0],@T[1],@T[1]	# ROL64(C[0..0],1)

	vpxor		$D00,$A20,$A20		# ^= D[0..0]
	vpxor		$D00,$A00,$A00		# ^= D[0..0]

	vpblendd	\$0b11000000,@T[1],$D14,$D14
	vpblendd	\$0b00000011,$C00,@T[4],@T[4]
	vpxor		@T[4],$D14,$D14		# D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3]

	######################################### Rho + Pi + pre-Chi shuffle
	vpsllvq		0*32-96(%r8),$A20,@T[3]
	vpsrlvq		0*32-96(%r9),$A20,$A20
	vpor		@T[3],$A20,$A20

	vpxor		$D14,$A31,$A31		# ^= D[1..4] from Theta
	vpsllvq		2*32-96(%r8),$A31,@T[4]
	vpsrlvq		2*32-96(%r9),$A31,$A31
	vpor		@T[4],$A31,$A31

	vpxor		$D14,$A21,$A21		# ^= D[1..4] from Theta
	vpsllvq		3*32-96(%r8),$A21,@T[5]
	vpsrlvq		3*32-96(%r9),$A21,$A21
	vpor		@T[5],$A21,$A21

	vpxor		$D14,$A41,$A41		# ^= D[1..4] from Theta
	vpsllvq		4*32-96(%r8),$A41,@T[6]
	vpsrlvq		4*32-96(%r9),$A41,$A41
	vpor		@T[6],$A41,$A41

	vpxor		$D14,$A11,$A11		# ^= D[1..4] from Theta
	vpermq		\$0b10001101,$A20,@T[3]	# $A20 -> future $A31
	vpermq		\$0b10001101,$A31,@T[4]	# $A31 -> future $A21
	vpsllvq		5*32-96(%r8),$A11,@T[7]
	vpsrlvq		5*32-96(%r9),$A11,@T[1]
	vpor		@T[7],@T[1],@T[1]	# $A11 -> future $A01

	vpxor		$D14,$A01,$A01		# ^= D[1..4] from Theta
	vpermq		\$0b00011011,$A21,@T[5]	# $A21 -> future $A41
	vpermq		\$0b01110010,$A41,@T[6]	# $A41 -> future $A11
	vpsllvq		1*32-96(%r8),$A01,@T[8]
	vpsrlvq		1*32-96(%r9),$A01,@T[2]
	vpor		@T[8],@T[2],@T[2]	# $A01 -> future $A20

	######################################### Chi
	vpsrldq		\$8,@T[1],@T[7]
	vpandn		@T[7],@T[1],@T[0]	# tgting  [0][0] [0][0] [0][0] [0][0]

	vpblendd	\$0b00001100,@T[6],@T[2],$A31	#               [4][4] [2][0]
	vpblendd	\$0b00001100,@T[2],@T[4],@T[8]	#               [4][0] [2][1]
	vpblendd	\$0b00001100,@T[4],@T[3],$A41	#               [4][2] [2][4]
	vpblendd	\$0b00001100,@T[3],@T[2],@T[7]	#               [4][3] [2][0]
	vpblendd	\$0b00110000,@T[4],$A31,$A31	#        [1][3] [4][4] [2][0]
	vpblendd	\$0b00110000,@T[5],@T[8],@T[8]	#        [1][4] [4][0] [2][1]
	vpblendd	\$0b00110000,@T[2],$A41,$A41	#        [1][0] [4][2] [2][4]
	vpblendd	\$0b00110000,@T[6],@T[7],@T[7]	#        [1][1] [4][3] [2][0]
	vpblendd	\$0b11000000,@T[5],$A31,$A31	# [3][2] [1][3] [4][4] [2][0]
	vpblendd	\$0b11000000,@T[6],@T[8],@T[8]	# [3][3] [1][4] [4][0] [2][1]
	vpblendd	\$0b11000000,@T[6],$A41,$A41	# [3][3] [1][0] [4][2] [2][4]
	vpblendd	\$0b11000000,@T[4],@T[7],@T[7]	# [3][4] [1][1] [4][3] [2][0]
	vpandn		@T[8],$A31,$A31		# tgting  [3][1] [1][2] [4][3] [2][4]
	vpandn		@T[7],$A41,$A41		# tgting  [3][2] [1][4] [4][1] [2][3]

	vpblendd	\$0b00001100,@T[2],@T[5],$A11	#               [4][0] [2][3]
	vpblendd	\$0b00001100,@T[5],@T[3],@T[8]	#               [4][1] [2][4]
	vpxor		@T[3],$A31,$A31
	vpblendd	\$0b00110000,@T[3],$A11,$A11	#        [1][2] [4][0] [2][3]
	vpblendd	\$0b00110000,@T[4],@T[8],@T[8]	#        [1][3] [4][1] [2][4]
	vpxor		@T[5],$A41,$A41
	vpblendd	\$0b11000000,@T[4],$A11,$A11	# [3][4] [1][2] [4][0] [2][3]
	vpblendd	\$0b11000000,@T[2],@T[8],@T[8]	# [3][0] [1][3] [4][1] [2][4]
	vpandn		@T[8],$A11,$A11		# tgting  [3][3] [1][1] [4][4] [2][2]
	vpxor		@T[6],$A11,$A11

	vpermq		\$0b00011110,@T[1],$A21		# [0][1] [0][2] [0][4] [0][3]
	vpblendd	\$0b00110000,$A00,$A21,@T[8]	# [0][1] [0][0] [0][4] [0][3]
	vpermq		\$0b00111001,@T[1],$A01		# [0][1] [0][4] [0][3] [0][2]
	vpblendd	\$0b11000000,$A00,$A01,$A01	# [0][0] [0][4] [0][3] [0][2]
	vpandn		@T[8],$A01,$A01		# tgting  [0][4] [0][3] [0][2] [0][1]

	vpblendd	\$0b00001100,@T[5],@T[4],$A20	#               [4][1] [2][1]
	vpblendd	\$0b00001100,@T[4],@T[6],@T[7]	#               [4][2] [2][2]
	vpblendd	\$0b00110000,@T[6],$A20,$A20	#        [1][1] [4][1] [2][1]
	vpblendd	\$0b00110000,@T[3],@T[7],@T[7]	#        [1][2] [4][2] [2][2]
	vpblendd	\$0b11000000,@T[3],$A20,$A20	# [3][1] [1][1] [4][1] [2][1]
	vpblendd	\$0b11000000,@T[5],@T[7],@T[7]	# [3][2] [1][2] [4][2] [2][2]
	vpandn		@T[7],$A20,$A20		# tgting  [3][0] [1][0] [4][0] [2][0]
	vpxor		@T[2],$A20,$A20

	vpermq		\$0b00000000,@T[0],@T[0]	# [0][0] [0][0] [0][0] [0][0]
	vpermq		\$0b00011011,$A31,$A31	# post-Chi shuffle
	vpermq		\$0b10001101,$A41,$A41
	vpermq		\$0b01110010,$A11,$A11

	vpblendd	\$0b00001100,@T[3],@T[6],$A21	#               [4][3] [2][2]
	vpblendd	\$0b00001100,@T[6],@T[5],@T[7]	#               [4][4] [2][3]
	vpblendd	\$0b00110000,@T[5],$A21,$A21	#        [1][4] [4][3] [2][2]
	vpblendd	\$0b00110000,@T[2],@T[7],@T[7]	#        [1][0] [4][4] [2][3]
	vpblendd	\$0b11000000,@T[2],$A21,$A21	# [3][0] [1][4] [4][3] [2][2]
	vpblendd	\$0b11000000,@T[3],@T[7],@T[7]	# [3][1] [1][0] [4][4] [2][3]
	vpandn		@T[7],$A21,$A21		# tgting  [3][4] [1][3] [4][2] [2][1]

	vpxor		@T[0],$A00,$A00
	vpxor		@T[1],$A01,$A01
	vpxor		@T[4],$A21,$A21

	######################################### Iota
	vpxor		(%r10),$A00,$A00
	lea		32(%r10),%r10

	dec		%eax
	jnz		.Loop_avx2

	ret
.size	__KeccakF1600,.-__KeccakF1600
___
  # Registers carrying the four arguments of SHA3_absorb and SHA3_squeeze:
  # state pointer, data pointer, byte length and block size in bytes.
  # These are the first four integer argument registers of the System V
  # AMD64 calling convention.
  my ($A_flat, $inp, $len, $bsz) = ("%rdi", "%rsi", "%rdx", "%rcx");
  # SHA3_squeeze takes its output pointer in the second argument register.
  my $out = $inp;
  # SHA3_absorb, part 1: prologue and the start of the per-block loop.
  #
  # The prologue saves %rsp in %r11, reserves 240 bytes, aligns the stack
  # to 32 and biases $A_flat, $inp and %r10 (stack) by 96.  It then loads
  # the state: A[0][0] is broadcast into $A00 and the six vectors are read
  # from offset 8 upwards.  Stack slots 32*2..32*6 form a zeroed transfer
  # area, one 32-byte slot per register $A20..$A11, zeroed once.
  #
  # Per block, %eax becomes the lane count ($bsz/8).  Lanes 0..4 are
  # fetched with two vector loads into @T[0] (broadcast) and @T[1].  The
  # "sub 4" leaves %eax primed for the dec/jz ladder emitted next, which
  # handles lanes 5 and up.
  $code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11

	lea	-240(%rsp),%rsp
	and	\$-32,%rsp

	lea	96($A_flat),$A_flat
	lea	96($inp),$inp
	lea	96(%rsp),%r10

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00	# load A[5][5]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	vpxor		@T[0],@T[0],@T[0]
	vmovdqa		@T[0],32*2-96(%r10)	# zero transfer area on stack
	vmovdqa		@T[0],32*3-96(%r10)
	vmovdqa		@T[0],32*4-96(%r10)
	vmovdqa		@T[0],32*5-96(%r10)
	vmovdqa		@T[0],32*6-96(%r10)

.Loop_absorb_avx2:
	mov		$bsz,%rax
	sub		$bsz,$len
	jc		.Ldone_absorb_avx2

	shr		\$3,%eax
	vpbroadcastq	0-96($inp),@T[0]
	vmovdqu		8-96($inp),@T[1]
	sub		\$4,%eax
___
  # Unrolled copy of message lanes 5..24 into the stack transfer area, one
  # 64-bit lane per step via %r8.  Lanes 0..4 are fetched with vector loads
  # by the code emitted just before this loop.  %eax holds the number of
  # lanes still to copy, so the ladder exits to .Labsorved_avx2 as soon as
  # the block is exhausted.  $A_jagged[$i] puts each lane into the slot
  # and position its register expects.
  for my $i (5 .. 24) {
      $code.=<<___;
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*$i-96($inp),%r8
	mov	%r8,$A_jagged[$i]-96(%r10)
___
  }
  # SHA3_absorb, part 2, followed by the head of SHA3_squeeze.
  #
  # .Labsorved_avx2 advances $inp by one block, XORs the block into the
  # state (lanes 0..4 from @T[0]/@T[1], the rest from the stack transfer
  # area) and runs the permutation.  %r10 is re-derived from %rsp after
  # the call because __KeccakF1600 uses %r10 as its iotas pointer.
  # .Ldone_absorb_avx2 stores the state back, restores %rsp from %r11 and
  # returns $len+$bsz, i.e. the bytes left over after the last whole block.
  #
  # SHA3_squeeze converts $bsz to a lane count, loads the state and enters
  # its loop with lane 0 in %r8.  That first load names $A_jagged[0]
  # explicitly: no loop variable is in scope at this point, so the
  # previous "\@A_jagged[\$i]" only worked through an undefined package
  # variable.
  $code.=<<___;
.Labsorved_avx2:
	lea	($inp,$bsz),$inp

	vpxor		@T[0],$A00,$A00
	vpxor		@T[1],$A01,$A01
	vpxor		32*2-96(%r10),$A20,$A20
	vpxor		32*3-96(%r10),$A31,$A31
	vpxor		32*4-96(%r10),$A21,$A21
	vpxor		32*5-96(%r10),$A41,$A41
	vpxor		32*6-96(%r10),$A11,$A11

	call	__KeccakF1600

	lea	96(%rsp),%r10
	jmp	.Loop_absorb_avx2

.Ldone_absorb_avx2:
	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	vzeroupper

	lea	(%r11),%rsp
	lea	($len,$bsz),%rax		# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb

.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	lea	96($A_flat),$A_flat
	shr	\$3,$bsz

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00
	vpxor		@T[0],@T[0],@T[0]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	mov	$bsz,%rax

.Loop_squeeze_avx2:
	mov	$A_jagged[0]-96($A_flat),%r8
___
  # Unrolled squeeze ladder, one step per state lane.  On entry to each
  # step %r8 already holds the lane to emit (lane 0 is loaded at
  # .Loop_squeeze_avx2).  A step
  #   - diverts to the byte-wise tail when fewer than 8 bytes are wanted,
  #   - stores the lane and finishes if that was exactly the last 8 bytes
  #     ("je" still sees the flags of "sub": mov and lea leave them alone),
  #   - runs another permutation once the block's lanes are used up,
  #   - otherwise preloads the next lane.
  #
  # The preload displacement is -120, not -96.  \@A_jagged offsets count
  # 32 bytes per register, but in the flat state registers 1..6 begin at
  # byte 8, so every lane other than A[0][0] lives 24 bytes lower.
  #
  # There is no lane 25, so the final step emits no preload and falls
  # through to .Lextend_output_avx2.  Previously that step indexed past
  # the end of \@A_jagged and emitted a load from below the state buffer.
  for my $i (0 .. 24) {
      $code.=<<___;
	sub	\$8,$len
	jc	.Ltail_squeeze_avx2
	mov	%r8,($out)
	lea	8($out),$out
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
___
      $code.=<<___ if ($i < 24);
	mov	$A_jagged[$i+1]-120($A_flat),%r8
___
  }
  # SHA3_squeeze, final part, followed by the constant tables.
  #
  # .Lextend_output_avx2 permutes the state, writes it back to memory
  # (the ladder reads lanes from memory, not from registers) and reloads
  # the lane counter, because __KeccakF1600 clobbers %rax and %r8.
  # .Ltail_squeeze_avx2 undoes the failed "sub 8" and emits the remaining
  # bytes of %r8 one at a time, low byte first.
  #
  # rhotates_left/rhotates_right hold one row of four rotate counts per
  # state register, in the order $A20, $A01, $A31, $A21, $A41, $A11 (row k
  # is fetched as k*32).  Each row lists its lanes lowest first, i.e.
  # reversed relative to the register comments.  Every iotas row repeats
  # one round constant four times, matching $A00, which holds A[0][0] in
  # all four lanes.
  $code.=<<___;
.Lextend_output_avx2:
	call	__KeccakF1600

	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx2

.Ltail_squeeze_avx2:
	add	\$8,$len
.Loop_tail_avx2:
	mov	%r8b,($out)
	lea	1($out),$out
	shr	\$8,%r8
	dec	$len
	jnz	.Loop_tail_avx2

.Ldone_squeeze_avx2:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.align	64
rhotates_left:
	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]
rhotates_right:
	.quad	64-3,	64-18,	64-36,	64-41
	.quad	64-1,	64-62,	64-28,	64-27
	.quad	64-45,	64-6,	64-56,	64-39
	.quad	64-10,	64-61,	64-55,	64-8
	.quad	64-2,	64-15,	64-25,	64-20
	.quad	64-44,	64-43,	64-21,	64-14
iotas:
	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro\@openssl.org>"
___
  # Write the generated assembly to the file named by the last
  # command-line argument.  With no argument the inherited STDOUT is left
  # in place; previously this fell into a failed, unchecked open and the
  # output was lost.  Three-argument open keeps characters in the file
  # name from being read as a mode.  close is checked because buffered
  # write errors only surface there.
  my $output = pop;
  if (defined $output) {
      open STDOUT, '>', $output or die "can't open $output: $!";
  }
  print $code;
  close STDOUT or die "error closing STDOUT: $!";