keccak1600-avx512.pl 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550
  1. #!/usr/bin/env perl
  2. # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # Keccak-1600 for AVX-512F.
  17. #
  18. # July 2017.
  19. #
  20. # Below code is KECCAK_1X_ALT implementation (see sha/keccak1600.c).
  21. # Pretty straightforward, the only "magic" is data layout in registers.
  22. # It's impossible to have one that is optimal for every step, hence
  23. # it's changing as algorithm progresses. Data is saved in linear order,
  24. # but in-register order morphs between rounds. Even rounds take in
  25. # linear layout, and odd rounds - transposed, or "vertically-shaped"...
  26. #
  27. ########################################################################
  28. # Numbers are cycles per processed byte out of large message.
  29. #
  30. # r=1088(*)
  31. #
  32. # Knights Landing 7.6
  33. # Skylake-X 5.7
  34. #
  35. # (*) Corresponds to SHA3-256.
  36. ########################################################################
  37. # Below code is combination of two ideas. One is taken from Keccak Code
  38. # Package, hereafter KCP, and another one from initial version of this
  39. # module. What is common is observation that Pi's input and output are
  40. # "mostly transposed", i.e. if input is aligned by x coordinate, then
  41. # output is [mostly] aligned by y. Both versions, KCP and predecessor,
  42. # were trying to use one of them from round to round, which resulted in
  43. # some kind of transposition in each round. This version still does
  44. # transpose data, but only every second round. Another essential factor
  45. # is that KCP transposition has to be performed with instructions that
  46. # turned out to be rather expensive on Knights Landing, both latency- and
  47. # throughput-wise. Not to mention that some of them have to depend on
  48. # each other. On the other hand initial version of this module was
  49. # relying heavily on blend instructions. There were lots of them,
  50. # resulting in higher instruction count, yet it performed better on
  51. # Knights Landing, because processor can execute pair of them each
  52. # cycle and they have minimal latency. This module is an attempt to
  53. # bring best parts together:-)
  54. #
  55. # Coordinates below correspond to those in sha/keccak1600.c. Input
  56. # layout is straight linear:
  57. #
  58. # [0][4] [0][3] [0][2] [0][1] [0][0]
  59. # [1][4] [1][3] [1][2] [1][1] [1][0]
  60. # [2][4] [2][3] [2][2] [2][1] [2][0]
  61. # [3][4] [3][3] [3][2] [3][1] [3][0]
  62. # [4][4] [4][3] [4][2] [4][1] [4][0]
  63. #
  64. # It's perfect for Theta, while Pi is reduced to intra-register
  65. # permutations which yield layout perfect for Chi:
  66. #
  67. # [4][0] [3][0] [2][0] [1][0] [0][0]
  68. # [4][1] [3][1] [2][1] [1][1] [0][1]
  69. # [4][2] [3][2] [2][2] [1][2] [0][2]
  70. # [4][3] [3][3] [2][3] [1][3] [0][3]
  71. # [4][4] [3][4] [2][4] [1][4] [0][4]
  72. #
  73. # Now instead of performing full transposition and feeding it to next
  74. # identical round, we perform kind of diagonal transposition to layout
  75. # from initial version of this module, and make it suitable for Theta:
  76. #
  77. # [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0]
  78. # [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0]
  79. # [4][1] [3][0] [2][4] [1][3] [0][2]>2.1.0.4.3>[2][4] [1][3] [0][2] [4][1] [3][0]
  80. # [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0]
  81. # [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0]
  82. #
  83. # Now intra-register permutations yield initial [almost] straight
  84. # linear layout:
  85. #
  86. # [4][4] [3][3] [2][2] [1][1] [0][0]
  87. ##[0][4] [0][3] [0][2] [0][1] [0][0]
  88. # [3][4] [2][3] [1][2] [0][1] [4][0]
  89. ##[2][3] [2][2] [2][1] [2][0] [2][4]
  90. # [2][4] [1][3] [0][2] [4][1] [3][0]
  91. ##[4][2] [4][1] [4][0] [4][4] [4][3]
  92. # [1][4] [0][3] [4][2] [3][1] [2][0]
  93. ##[1][1] [1][0] [1][4] [1][3] [1][2]
  94. # [0][4] [4][3] [3][2] [2][1] [1][0]
  95. ##[3][0] [3][4] [3][3] [3][2] [3][1]
  96. #
  97. # This means that odd round Chi is performed in less suitable layout,
  98. # with a number of additional permutations. But overall it turned out to be
  99. # a win. Permutations are fastest possible on Knights Landing and they
  100. # are laid out to be independent of each other. In essence I traded
  101. # 20 blend instructions for 3 permutations. The result is 13% faster
  102. # than KCP on Skylake-X, and >40% on Knights Landing.
  103. #
  # The state is loaded in plain linear order, one row of five 64-bit lanes
  # per register.  The two digits in each name are the coordinates of the
  # right-most element of the row held by that register:
  #
  #   $A00: [0][4] [0][3] [0][2] [0][1] [0][0]
  #   $A10: [1][4] [1][3] [1][2] [1][1] [1][0]
  #   $A20: [2][4] [2][3] [2][2] [2][1] [2][0]
  #   $A30: [3][4] [3][3] [3][2] [3][1] [3][0]
  #   $A40: [4][4] [4][3] [4][2] [4][1] [4][0]
  my ($A00, $A10, $A20, $A30, $A40) = map { "%zmm$_" } 0 .. 4;

  # Byte offsets of the 25 lanes, in row-major order, inside a structure
  # whose rows are 8 qwords (64 bytes) apart: 8 * (8*row + column).
  my @A_jagged = map { my $row = $_; map { 8 * (8 * $row + $_) } 0 .. 4 } 0 .. 4;

  # Scratch registers.
  my @T = map { "%zmm$_" } 5 .. 12;

  # Constant-holding registers.  Element 0 of @Theta is deliberately an
  # invalid register name (not a typo): it must never reach the output.
  my @Theta    = map { "%zmm$_" } 33, 13 .. 16;
  my @Pi0      = map { "%zmm$_" } 17 .. 21;
  my @Rhotate0 = map { "%zmm$_" } 22 .. 26;
  my @Rhotate1 = map { "%zmm$_" } 27 .. 31;

  # $C00 and $D00 are aliases of the first two scratch registers.
  my ($C00, $D00) = @T[0, 1];

  # Opmask registers, named after the lane bit(s) each one selects.
  my ($k00001, $k00010, $k00100, $k01000, $k10000, $k11111) =
      map { "%k$_" } 1 .. 6;
  ########################################################################
  # __KeccakF1600 - the permutation itself (internal, no .globl).
  #
  # In/out:  the state lives in $A00..$A40 (%zmm0..%zmm4) and is updated
  #          in place.
  # Expects: @Theta[1..4], @Rhotate0[0..4], @Rhotate1[0..4], @Pi0[0..4] and
  #          the single-lane masks $k00001..$k10000 to be loaded already.
  #          This routine only reads them; SHA3_absorb and SHA3_squeeze set
  #          them up before calling.
  # Uses:    %r10 (pointer into the iotas table), %eax (loop counter) and
  #          the scratch registers @T[0..7]; none of them is preserved.
  #
  # %eax starts at 12 and each pass through .Loop_avx512 contains an "even
  # round" followed by an "odd round".  The even round XORs the constant at
  # (%r10) into the lane selected by $k00001 and then advances %r10 by 16;
  # the odd round uses -8(%r10), so one pass consumes two 8-byte constants.
  #
  # vpternlogq immediates: 0x96 is a three-way XOR and 0xD2 computes
  # dst ^ (~src_a & src_b); they are used for the Theta and Chi steps
  # respectively.
  #
  # "Harmonize rounds" uses masked blends plus @Theta permutations to move
  # the data from the layout the even round leaves behind into the layout
  # the odd round takes in (see the layout diagrams in the file header).
  #
  # Lines that would use @Theta[0] are left commented out: that element is
  # an intentionally invalid register name.
  ########################################################################
  $code.=<<___;
.text
.type __KeccakF1600,\@function
.align 32
__KeccakF1600:
lea iotas(%rip),%r10
mov \$12,%eax
jmp .Loop_avx512
.align 32
.Loop_avx512:
######################################### Theta, even round
vmovdqa64 $A00,@T[0] # put aside original A00
vpternlogq \$0x96,$A20,$A10,$A00 # and use it as "C00"
vpternlogq \$0x96,$A40,$A30,$A00
vprolq \$1,$A00,$D00
vpermq $A00,@Theta[1],$A00
vpermq $D00,@Theta[4],$D00
vpternlogq \$0x96,$A00,$D00,@T[0] # T[0] is original A00
vpternlogq \$0x96,$A00,$D00,$A10
vpternlogq \$0x96,$A00,$D00,$A20
vpternlogq \$0x96,$A00,$D00,$A30
vpternlogq \$0x96,$A00,$D00,$A40
######################################### Rho
vprolvq @Rhotate0[0],@T[0],$A00 # T[0] is original A00
vprolvq @Rhotate0[1],$A10,$A10
vprolvq @Rhotate0[2],$A20,$A20
vprolvq @Rhotate0[3],$A30,$A30
vprolvq @Rhotate0[4],$A40,$A40
######################################### Pi
vpermq $A00,@Pi0[0],$A00
vpermq $A10,@Pi0[1],$A10
vpermq $A20,@Pi0[2],$A20
vpermq $A30,@Pi0[3],$A30
vpermq $A40,@Pi0[4],$A40
######################################### Chi
vmovdqa64 $A00,@T[0]
vmovdqa64 $A10,@T[1]
vpternlogq \$0xD2,$A20,$A10,$A00
vpternlogq \$0xD2,$A30,$A20,$A10
vpternlogq \$0xD2,$A40,$A30,$A20
vpternlogq \$0xD2,@T[0],$A40,$A30
vpternlogq \$0xD2,@T[1],@T[0],$A40
######################################### Iota
vpxorq (%r10),$A00,${A00}{$k00001}
lea 16(%r10),%r10
######################################### Harmonize rounds
vpblendmq $A20,$A10,@{T[1]}{$k00010}
vpblendmq $A30,$A20,@{T[2]}{$k00010}
vpblendmq $A40,$A30,@{T[3]}{$k00010}
vpblendmq $A10,$A00,@{T[0]}{$k00010}
vpblendmq $A00,$A40,@{T[4]}{$k00010}
vpblendmq $A30,@T[1],@{T[1]}{$k00100}
vpblendmq $A40,@T[2],@{T[2]}{$k00100}
vpblendmq $A20,@T[0],@{T[0]}{$k00100}
vpblendmq $A00,@T[3],@{T[3]}{$k00100}
vpblendmq $A10,@T[4],@{T[4]}{$k00100}
vpblendmq $A40,@T[1],@{T[1]}{$k01000}
vpblendmq $A30,@T[0],@{T[0]}{$k01000}
vpblendmq $A00,@T[2],@{T[2]}{$k01000}
vpblendmq $A10,@T[3],@{T[3]}{$k01000}
vpblendmq $A20,@T[4],@{T[4]}{$k01000}
vpblendmq $A40,@T[0],@{T[0]}{$k10000}
vpblendmq $A00,@T[1],@{T[1]}{$k10000}
vpblendmq $A10,@T[2],@{T[2]}{$k10000}
vpblendmq $A20,@T[3],@{T[3]}{$k10000}
vpblendmq $A30,@T[4],@{T[4]}{$k10000}
#vpermq @T[0],@Theta[0],$A00 # doesn't actually change order
vpermq @T[1],@Theta[1],$A10
vpermq @T[2],@Theta[2],$A20
vpermq @T[3],@Theta[3],$A30
vpermq @T[4],@Theta[4],$A40
######################################### Theta, odd round
vmovdqa64 $T[0],$A00 # real A00
vpternlogq \$0x96,$A20,$A10,$C00 # C00 is @T[0]'s alias
vpternlogq \$0x96,$A40,$A30,$C00
vprolq \$1,$C00,$D00
vpermq $C00,@Theta[1],$C00
vpermq $D00,@Theta[4],$D00
vpternlogq \$0x96,$C00,$D00,$A00
vpternlogq \$0x96,$C00,$D00,$A30
vpternlogq \$0x96,$C00,$D00,$A10
vpternlogq \$0x96,$C00,$D00,$A40
vpternlogq \$0x96,$C00,$D00,$A20
######################################### Rho
vprolvq @Rhotate1[0],$A00,$A00
vprolvq @Rhotate1[3],$A30,@T[1]
vprolvq @Rhotate1[1],$A10,@T[2]
vprolvq @Rhotate1[4],$A40,@T[3]
vprolvq @Rhotate1[2],$A20,@T[4]
vpermq $A00,@Theta[4],@T[5]
vpermq $A00,@Theta[3],@T[6]
######################################### Iota
vpxorq -8(%r10),$A00,${A00}{$k00001}
######################################### Pi
vpermq @T[1],@Theta[2],$A10
vpermq @T[2],@Theta[4],$A20
vpermq @T[3],@Theta[1],$A30
vpermq @T[4],@Theta[3],$A40
######################################### Chi
vpternlogq \$0xD2,@T[6],@T[5],$A00
vpermq @T[1],@Theta[1],@T[7]
#vpermq @T[1],@Theta[0],@T[1]
vpternlogq \$0xD2,@T[1],@T[7],$A10
vpermq @T[2],@Theta[3],@T[0]
vpermq @T[2],@Theta[2],@T[2]
vpternlogq \$0xD2,@T[2],@T[0],$A20
#vpermq @T[3],@Theta[0],@T[3]
vpermq @T[3],@Theta[4],@T[1]
vpternlogq \$0xD2,@T[1],@T[3],$A30
vpermq @T[4],@Theta[2],@T[0]
vpermq @T[4],@Theta[1],@T[4]
vpternlogq \$0xD2,@T[4],@T[0],$A40
dec %eax
jnz .Loop_avx512
ret
.size __KeccakF1600,.-__KeccakF1600
___
  ########################################################################
  # Argument registers shared by SHA3_absorb and SHA3_squeeze.
  # NOTE(review): these are the first four integer-argument registers of
  # the System V AMD64 calling convention; presumably they line up with the
  # C prototypes in sha/keccak1600.c - confirm against the callers.
  #   $A_flat - pointer to the 25-lane state in memory (rows 40 bytes apart)
  #   $inp    - input pointer (absorb) / output pointer (squeeze, as $out)
  #   $len    - number of bytes to absorb / squeeze
  #   $bsz    - block size in bytes
  ########################################################################
  my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
  my $out = $inp; # in squeeze
  # SHA3_absorb, part 1: prologue and top of the per-block loop.
  #  - Saves %rsp in %r11, reserves 320 bytes and aligns %rsp down to 64;
  #    %r9 = %rsp+128 addresses the 5x64-byte transfer area through -128
  #    displacements.
  #  - Biases $A_flat and $inp by +96; later accesses subtract 96 again
  #    (presumably to keep the displacements small - confirm).
  #  - Builds the opmasks: kxnorw yields all ones, a right shift by 15
  #    leaves only bit 0 ($k00001), a right shift by 11 leaves the low five
  #    bits ($k11111), and left shifts of $k00001 give the other
  #    single-lane masks.
  #  - Loads the constant registers from the table at theta_perm in 64-byte
  #    rows: rows 1-4 -> @Theta[1..4], rows 5-9 -> @Rhotate1, rows 10-14 ->
  #    @Rhotate0, rows 15-19 -> @Pi0.  Row 0 is skipped because @Theta[0]
  #    is not a real register.
  #  - Loads the five state rows with zero-masked loads of five lanes each
  #    and clears the transfer area.
  #  - Per block: %rax = $bsz, $len -= $bsz (a borrow means no full block
  #    is left, so jump to .Ldone_absorb_avx512), then %eax >>= 3 turns the
  #    block size into a count of 64-bit words.
  $code.=<<___;
.globl SHA3_absorb
.type SHA3_absorb,\@function
.align 32
SHA3_absorb:
mov %rsp,%r11
lea -320(%rsp),%rsp
and \$-64,%rsp
lea 96($A_flat),$A_flat
lea 96($inp),$inp
lea 128(%rsp),%r9
lea theta_perm(%rip),%r8
kxnorw $k11111,$k11111,$k11111
kshiftrw \$15,$k11111,$k00001
kshiftrw \$11,$k11111,$k11111
kshiftlw \$1,$k00001,$k00010
kshiftlw \$2,$k00001,$k00100
kshiftlw \$3,$k00001,$k01000
kshiftlw \$4,$k00001,$k10000
#vmovdqa64 64*0(%r8),@Theta[0]
vmovdqa64 64*1(%r8),@Theta[1]
vmovdqa64 64*2(%r8),@Theta[2]
vmovdqa64 64*3(%r8),@Theta[3]
vmovdqa64 64*4(%r8),@Theta[4]
vmovdqa64 64*5(%r8),@Rhotate1[0]
vmovdqa64 64*6(%r8),@Rhotate1[1]
vmovdqa64 64*7(%r8),@Rhotate1[2]
vmovdqa64 64*8(%r8),@Rhotate1[3]
vmovdqa64 64*9(%r8),@Rhotate1[4]
vmovdqa64 64*10(%r8),@Rhotate0[0]
vmovdqa64 64*11(%r8),@Rhotate0[1]
vmovdqa64 64*12(%r8),@Rhotate0[2]
vmovdqa64 64*13(%r8),@Rhotate0[3]
vmovdqa64 64*14(%r8),@Rhotate0[4]
vmovdqa64 64*15(%r8),@Pi0[0]
vmovdqa64 64*16(%r8),@Pi0[1]
vmovdqa64 64*17(%r8),@Pi0[2]
vmovdqa64 64*18(%r8),@Pi0[3]
vmovdqa64 64*19(%r8),@Pi0[4]
vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z}
vpxorq @T[0],@T[0],@T[0]
vmovdqu64 40*1-96($A_flat),${A10}{$k11111}{z}
vmovdqu64 40*2-96($A_flat),${A20}{$k11111}{z}
vmovdqu64 40*3-96($A_flat),${A30}{$k11111}{z}
vmovdqu64 40*4-96($A_flat),${A40}{$k11111}{z}
vmovdqa64 @T[0],0*64-128(%r9) # zero transfer area on stack
vmovdqa64 @T[0],1*64-128(%r9)
vmovdqa64 @T[0],2*64-128(%r9)
vmovdqa64 @T[0],3*64-128(%r9)
vmovdqa64 @T[0],4*64-128(%r9)
jmp .Loop_absorb_avx512
.align 32
.Loop_absorb_avx512:
mov $bsz,%rax
sub $bsz,$len
jc .Ldone_absorb_avx512
shr \$3,%eax
___
  # Emit a fully unrolled copy of up to 25 input words.  Word $i is read
  # from the input and stored at $A_jagged[$i] in the transfer area, which
  # spreads the packed input over rows that are 64 bytes apart.  %eax
  # counts the words of the current block; once it reaches zero the copy
  # leaves through .Labsorved_avx512 (label spelling as in the source).
  for(my $i=0; $i<25; $i++) {
  $code.=<<___
mov 8*$i-96($inp),%r8
mov %r8,$A_jagged[$i]-128(%r9)
dec %eax
jz .Labsorved_avx512
___
  }
  ########################################################################
  # This heredoc emits three things.
  #
  # 1. SHA3_absorb, part 2.  After the unrolled copy: advance $inp by
  #    $bsz, XOR the five 64-byte rows of the transfer area into
  #    $A00..$A40, call __KeccakF1600 and loop for the next block.  At
  #    .Ldone_absorb_avx512 the state is written back to $A_flat with
  #    masked five-lane stores, %rsp is restored from %r11 and
  #    %rax = $len + $bsz is returned.  That sum undoes the subtraction
  #    that borrowed, so it is the number of input bytes left unabsorbed.
  #
  # 2. SHA3_squeeze.  If $len <= $bsz the register setup is skipped
  #    (.Lno_output_extension_avx512); in that case the copy loop reaches
  #    its end before a permutation would be needed.  Otherwise masks,
  #    constants and state are loaded exactly as in SHA3_absorb.  $bsz is
  #    then converted to a count of 64-bit words and %r9 points at the
  #    start of the state in memory.  The loop copies 8 bytes at a time
  #    from the in-memory state to $out.  When the per-block counter in
  #    %rax reaches zero it calls __KeccakF1600, stores the registers back
  #    to $A_flat (output is always read from memory) and restarts from the
  #    beginning of the state.  Fewer than 8 remaining bytes are copied by
  #    "rep movsb", emitted as raw bytes.  %rsp is saved and restored via
  #    %r11, but nothing is allocated on the stack here.
  #
  # 3. Constant tables, 64-byte aligned.  The loaders index them from
  #    theta_perm in 64-byte rows, so their order and row size must not
  #    change: theta_perm (rows 0-4, row 0 unused), rhotates1 (5-9),
  #    rhotates0 (10-14), pi0_perm (15-19).  iotas holds the 24 round
  #    constants and is addressed through its own label by __KeccakF1600.
  ########################################################################
  $code.=<<___;
.Labsorved_avx512:
lea ($inp,$bsz),$inp
vpxorq 64*0-128(%r9),$A00,$A00
vpxorq 64*1-128(%r9),$A10,$A10
vpxorq 64*2-128(%r9),$A20,$A20
vpxorq 64*3-128(%r9),$A30,$A30
vpxorq 64*4-128(%r9),$A40,$A40
call __KeccakF1600
jmp .Loop_absorb_avx512
.align 32
.Ldone_absorb_avx512:
vmovdqu64 $A00,40*0-96($A_flat){$k11111}
vmovdqu64 $A10,40*1-96($A_flat){$k11111}
vmovdqu64 $A20,40*2-96($A_flat){$k11111}
vmovdqu64 $A30,40*3-96($A_flat){$k11111}
vmovdqu64 $A40,40*4-96($A_flat){$k11111}
vzeroupper
lea (%r11),%rsp
lea ($len,$bsz),%rax # return value
ret
.size SHA3_absorb,.-SHA3_absorb
.globl SHA3_squeeze
.type SHA3_squeeze,\@function
.align 32
SHA3_squeeze:
mov %rsp,%r11
lea 96($A_flat),$A_flat
cmp $bsz,$len
jbe .Lno_output_extension_avx512
lea theta_perm(%rip),%r8
kxnorw $k11111,$k11111,$k11111
kshiftrw \$15,$k11111,$k00001
kshiftrw \$11,$k11111,$k11111
kshiftlw \$1,$k00001,$k00010
kshiftlw \$2,$k00001,$k00100
kshiftlw \$3,$k00001,$k01000
kshiftlw \$4,$k00001,$k10000
#vmovdqa64 64*0(%r8),@Theta[0]
vmovdqa64 64*1(%r8),@Theta[1]
vmovdqa64 64*2(%r8),@Theta[2]
vmovdqa64 64*3(%r8),@Theta[3]
vmovdqa64 64*4(%r8),@Theta[4]
vmovdqa64 64*5(%r8),@Rhotate1[0]
vmovdqa64 64*6(%r8),@Rhotate1[1]
vmovdqa64 64*7(%r8),@Rhotate1[2]
vmovdqa64 64*8(%r8),@Rhotate1[3]
vmovdqa64 64*9(%r8),@Rhotate1[4]
vmovdqa64 64*10(%r8),@Rhotate0[0]
vmovdqa64 64*11(%r8),@Rhotate0[1]
vmovdqa64 64*12(%r8),@Rhotate0[2]
vmovdqa64 64*13(%r8),@Rhotate0[3]
vmovdqa64 64*14(%r8),@Rhotate0[4]
vmovdqa64 64*15(%r8),@Pi0[0]
vmovdqa64 64*16(%r8),@Pi0[1]
vmovdqa64 64*17(%r8),@Pi0[2]
vmovdqa64 64*18(%r8),@Pi0[3]
vmovdqa64 64*19(%r8),@Pi0[4]
vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z}
vmovdqu64 40*1-96($A_flat),${A10}{$k11111}{z}
vmovdqu64 40*2-96($A_flat),${A20}{$k11111}{z}
vmovdqu64 40*3-96($A_flat),${A30}{$k11111}{z}
vmovdqu64 40*4-96($A_flat),${A40}{$k11111}{z}
.Lno_output_extension_avx512:
shr \$3,$bsz
lea -96($A_flat),%r9
mov $bsz,%rax
jmp .Loop_squeeze_avx512
.align 32
.Loop_squeeze_avx512:
cmp \$8,$len
jb .Ltail_squeeze_avx512
mov (%r9),%r8
lea 8(%r9),%r9
mov %r8,($out)
lea 8($out),$out
sub \$8,$len # len -= 8
jz .Ldone_squeeze_avx512
sub \$1,%rax # bsz--
jnz .Loop_squeeze_avx512
#vpermq @Theta[4],@Theta[4],@Theta[3]
#vpermq @Theta[3],@Theta[4],@Theta[2]
#vpermq @Theta[3],@Theta[3],@Theta[1]
call __KeccakF1600
vmovdqu64 $A00,40*0-96($A_flat){$k11111}
vmovdqu64 $A10,40*1-96($A_flat){$k11111}
vmovdqu64 $A20,40*2-96($A_flat){$k11111}
vmovdqu64 $A30,40*3-96($A_flat){$k11111}
vmovdqu64 $A40,40*4-96($A_flat){$k11111}
lea -96($A_flat),%r9
mov $bsz,%rax
jmp .Loop_squeeze_avx512
.Ltail_squeeze_avx512:
mov $out,%rdi
mov %r9,%rsi
mov $len,%rcx
.byte 0xf3,0xa4 # rep movsb
.Ldone_squeeze_avx512:
vzeroupper
lea (%r11),%rsp
ret
.size SHA3_squeeze,.-SHA3_squeeze
.align 64
theta_perm:
.quad 0, 1, 2, 3, 4, 5, 6, 7 # [not used]
.quad 4, 0, 1, 2, 3, 5, 6, 7
.quad 3, 4, 0, 1, 2, 5, 6, 7
.quad 2, 3, 4, 0, 1, 5, 6, 7
.quad 1, 2, 3, 4, 0, 5, 6, 7
rhotates1:
.quad 0, 44, 43, 21, 14, 0, 0, 0 # [0][0] [1][1] [2][2] [3][3] [4][4]
.quad 18, 1, 6, 25, 8, 0, 0, 0 # [4][0] [0][1] [1][2] [2][3] [3][4]
.quad 41, 2, 62, 55, 39, 0, 0, 0 # [3][0] [4][1] [0][2] [1][3] [2][4]
.quad 3, 45, 61, 28, 20, 0, 0, 0 # [2][0] [3][1] [4][2] [0][3] [1][4]
.quad 36, 10, 15, 56, 27, 0, 0, 0 # [1][0] [2][1] [3][2] [4][3] [0][4]
rhotates0:
.quad 0, 1, 62, 28, 27, 0, 0, 0
.quad 36, 44, 6, 55, 20, 0, 0, 0
.quad 3, 10, 43, 25, 39, 0, 0, 0
.quad 41, 45, 15, 21, 8, 0, 0, 0
.quad 18, 2, 61, 56, 14, 0, 0, 0
pi0_perm:
.quad 0, 3, 1, 4, 2, 5, 6, 7
.quad 1, 4, 2, 0, 3, 5, 6, 7
.quad 2, 0, 3, 1, 4, 5, 6, 7
.quad 3, 1, 4, 2, 0, 5, 6, 7
.quad 4, 2, 0, 3, 1, 5, 6, 7
iotas:
.quad 0x0000000000000001
.quad 0x0000000000008082
.quad 0x800000000000808a
.quad 0x8000000080008000
.quad 0x000000000000808b
.quad 0x0000000080000001
.quad 0x8000000080008081
.quad 0x8000000000008009
.quad 0x000000000000008a
.quad 0x0000000000000088
.quad 0x0000000080008009
.quad 0x000000008000000a
.quad 0x000000008000808b
.quad 0x800000000000008b
.quad 0x8000000000008089
.quad 0x8000000000008003
.quad 0x8000000000008002
.quad 0x8000000000000080
.quad 0x000000000000800a
.quad 0x800000008000000a
.quad 0x8000000080008081
.quad 0x8000000000008080
.quad 0x0000000080000001
.quad 0x8000000080008008
.asciz "Keccak-1600 absorb and squeeze for AVX-512F, CRYPTOGAMS by <appro\@openssl.org>"
___
  # Write the generated assembly out.  The last command-line argument, when
  # present and true, names the output file; without it the text goes to
  # the inherited STDOUT.  Every step is checked so that an unwritable path
  # or a full disk stops the build instead of leaving a missing or
  # truncated file behind a successful exit status.
  $output = pop;
  if ($output) {
      # Two-argument open is kept deliberately: it interprets the argument
      # exactly as before, so existing invocations behave the same.
      open(STDOUT, ">$output") or die "can't open $output: $!";
  }
  print STDOUT $code or die "error writing output: $!";
  # Buffered write errors only surface when the handle is closed.
  close(STDOUT) or die "error closing STDOUT: $!";