keccak1600-avx512vl.pl

#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX512VL.
#
# December 2017.
#
# This is an adaptation of the AVX2 module that reuses the register data
# layout but utilizes the new 256-bit AVX512VL instructions. See the AVX2
# module for further information on the layout.
#
########################################################################
# Numbers are cycles per processed byte out of a large message.
#
#               r=1088(*)
#
# Skylake-X     6.4/+47%
#
# (*)  Corresponds to SHA3-256. The percentage after the slash is the
#      improvement over the scalar keccak1600-x86_64.pl module.

# Digits in variables' names denote right-most coordinates:

my ($A00,       # [0][0] [0][0] [0][0] [0][0]      # %ymm0
    $A01,       # [0][4] [0][3] [0][2] [0][1]      # %ymm1
    $A20,       # [3][0] [1][0] [4][0] [2][0]      # %ymm2
    $A31,       # [2][4] [4][3] [1][2] [3][1]      # %ymm3
    $A21,       # [3][4] [1][3] [4][2] [2][1]      # %ymm4
    $A41,       # [1][4] [2][3] [3][2] [4][1]      # %ymm5
    $A11) =     # [4][4] [3][3] [2][2] [1][1]      # %ymm6
    map("%ymm$_",(0..6));
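# Lanes in the comments above are listed from most- to least-significant;
# e.g. $A31 packs A[2][4]:A[4][3]:A[1][2]:A[3][1], and its name refers to
# the right-most (least-significant) lane, A[3][1].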
# We also need to map the magic order into offsets within structure:

my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],      # [0][0..4]
                [2,2], [6,0], [3,1], [4,2], [5,3],      # [1][0..4]
                [2,0], [4,0], [6,1], [5,2], [3,3],      # [2][0..4]
                [2,3], [3,0], [5,1], [6,2], [4,3],      # [3][0..4]
                [2,1], [5,0], [4,1], [3,2], [6,3]);     # [4][0..4]
   @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged);     # ... and now linear
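# Each pair above is [%ymm index, lane], so the linear offset is
# 8*(ymm*4 + lane) bytes into a 7-register x 32-byte flat image of the
# state; e.g. A[1][2] maps to [3,1], i.e. lane 1 of $A31, at byte offset
# 8*(3*4+1) = 104.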
my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];
my ($R20,$R01,$R31,$R21,$R41,$R11) = map("%ymm$_",(16..21));
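# $R20..$R11 receive the per-lane rotation counts for the Rho step; they
# are loaded from the rhotates_left table further down and consumed by the
# variable-rotate vprolvq instructions in __KeccakF1600.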
$code.=<<___;
.text

.type   __KeccakF1600,\@function
.align  32
__KeccakF1600:
        lea             iotas(%rip),%r10
        mov             \$24,%eax
        jmp             .Loop_avx512vl

.align  32
.Loop_avx512vl:
        ######################################### Theta
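        # Theta: C[x] = A[0][x]^A[1][x]^A[2][x]^A[3][x]^A[4][x],
        # D[x] = C[x-1] ^ ROL64(C[x+1],1), then every A[y][x] ^= D[x].
        # vpternlogq with imm8 0x96 is a three-way XOR of its operands.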
        vpshufd         \$0b01001110,$A20,$C00
        vpxor           $A31,$A41,$C14
        vpxor           $A11,$A21,@T[2]
        vpternlogq      \$0x96,$A01,@T[2],$C14          # C[1..4]

        vpxor           $A20,$C00,$C00
        vpermq          \$0b01001110,$C00,@T[0]

        vpermq          \$0b10010011,$C14,@T[4]
        vprolq          \$1,$C14,@T[1]                  # ROL64(C[1..4],1)

        vpermq          \$0b00111001,@T[1],$D14
        vpxor           @T[4],@T[1],$D00
        vpermq          \$0b00000000,$D00,$D00          # D[0..0] = ROL64(C[1],1) ^ C[4]

        vpternlogq      \$0x96,@T[0],$A00,$C00          # C[0..0]
        vprolq          \$1,$C00,@T[1]                  # ROL64(C[0..0],1)

        vpxor           $D00,$A00,$A00                  # ^= D[0..0]

        vpblendd        \$0b11000000,@T[1],$D14,$D14
        vpblendd        \$0b00000011,$C00,@T[4],@T[0]

        ######################################### Rho + Pi + pre-Chi shuffle
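        # Rho rotates each lane by its fixed offset (vprolvq with the
        # rotation vectors loaded from rhotates_left); Pi is folded into
        # the vpermq shuffles that stage lanes for the Chi step below.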
        vpxor           $D00,$A20,$A20                  # ^= D[0..0] from Theta
        vprolvq         $R20,$A20,$A20

        vpternlogq      \$0x96,@T[0],$D14,$A31          # ^= D[1..4] from Theta
        vprolvq         $R31,$A31,$A31

        vpternlogq      \$0x96,@T[0],$D14,$A21          # ^= D[1..4] from Theta
        vprolvq         $R21,$A21,$A21

        vpternlogq      \$0x96,@T[0],$D14,$A41          # ^= D[1..4] from Theta
        vprolvq         $R41,$A41,$A41

        vpermq          \$0b10001101,$A20,@T[3]         # $A20 -> future $A31
        vpermq          \$0b10001101,$A31,@T[4]         # $A31 -> future $A21
        vpternlogq      \$0x96,@T[0],$D14,$A11          # ^= D[1..4] from Theta
        vprolvq         $R11,$A11,@T[1]                 # $A11 -> future $A01

        vpermq          \$0b00011011,$A21,@T[5]         # $A21 -> future $A41
        vpermq          \$0b01110010,$A41,@T[6]         # $A41 -> future $A11
        vpternlogq      \$0x96,@T[0],$D14,$A01          # ^= D[1..4] from Theta
        vprolvq         $R01,$A01,@T[2]                 # $A01 -> future $A20

        ######################################### Chi
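        # Chi: A[y][x] ^= ~A[y][x+1] & A[y][x+2]. The vpblendd chains
        # gather the [x+1]/[x+2] lanes, and vpternlogq with imm8 0xC6,
        # i.e. b ^ (c & ~a) with a being the destination's prior value,
        # fuses the NOT/AND/XOR into a single instruction.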
        vpblendd        \$0b00001100,@T[6],@T[2],$A31   # [4][4] [2][0]
        vpblendd        \$0b00001100,@T[2],@T[4],@T[8]  # [4][0] [2][1]
        vpblendd        \$0b00001100,@T[4],@T[3],$A41   # [4][2] [2][4]
        vpblendd        \$0b00001100,@T[3],@T[2],@T[7]  # [4][3] [2][0]
        vpblendd        \$0b00110000,@T[4],$A31,$A31    # [1][3] [4][4] [2][0]
        vpblendd        \$0b00110000,@T[5],@T[8],@T[8]  # [1][4] [4][0] [2][1]
        vpblendd        \$0b00110000,@T[2],$A41,$A41    # [1][0] [4][2] [2][4]
        vpblendd        \$0b00110000,@T[6],@T[7],@T[7]  # [1][1] [4][3] [2][0]
        vpblendd        \$0b11000000,@T[5],$A31,$A31    # [3][2] [1][3] [4][4] [2][0]
        vpblendd        \$0b11000000,@T[6],@T[8],@T[8]  # [3][3] [1][4] [4][0] [2][1]
        vpblendd        \$0b11000000,@T[6],$A41,$A41    # [3][3] [1][0] [4][2] [2][4]
        vpblendd        \$0b11000000,@T[4],@T[7],@T[7]  # [3][4] [1][1] [4][3] [2][0]
        vpternlogq      \$0xC6,@T[8],@T[3],$A31         # [3][1] [1][2] [4][3] [2][4]
        vpternlogq      \$0xC6,@T[7],@T[5],$A41         # [3][2] [1][4] [4][1] [2][3]

        vpsrldq         \$8,@T[1],@T[0]
        vpandn          @T[0],@T[1],@T[0]               # targeting [0][0] [0][0] [0][0] [0][0]

        vpblendd        \$0b00001100,@T[2],@T[5],$A11   # [4][0] [2][3]
        vpblendd        \$0b00001100,@T[5],@T[3],@T[8]  # [4][1] [2][4]
        vpblendd        \$0b00110000,@T[3],$A11,$A11    # [1][2] [4][0] [2][3]
        vpblendd        \$0b00110000,@T[4],@T[8],@T[8]  # [1][3] [4][1] [2][4]
        vpblendd        \$0b11000000,@T[4],$A11,$A11    # [3][4] [1][2] [4][0] [2][3]
        vpblendd        \$0b11000000,@T[2],@T[8],@T[8]  # [3][0] [1][3] [4][1] [2][4]
        vpternlogq      \$0xC6,@T[8],@T[6],$A11         # [3][3] [1][1] [4][4] [2][2]

        vpermq          \$0b00011110,@T[1],$A21         # [0][1] [0][2] [0][4] [0][3]
        vpblendd        \$0b00110000,$A00,$A21,@T[8]    # [0][1] [0][0] [0][4] [0][3]
        vpermq          \$0b00111001,@T[1],$A01         # [0][1] [0][4] [0][3] [0][2]
        vpblendd        \$0b11000000,$A00,$A01,$A01     # [0][0] [0][4] [0][3] [0][2]

        vpblendd        \$0b00001100,@T[5],@T[4],$A20   # [4][1] [2][1]
        vpblendd        \$0b00001100,@T[4],@T[6],@T[7]  # [4][2] [2][2]
        vpblendd        \$0b00110000,@T[6],$A20,$A20    # [1][1] [4][1] [2][1]
        vpblendd        \$0b00110000,@T[3],@T[7],@T[7]  # [1][2] [4][2] [2][2]
        vpblendd        \$0b11000000,@T[3],$A20,$A20    # [3][1] [1][1] [4][1] [2][1]
        vpblendd        \$0b11000000,@T[5],@T[7],@T[7]  # [3][2] [1][2] [4][2] [2][2]
        vpternlogq      \$0xC6,@T[7],@T[2],$A20         # [3][0] [1][0] [4][0] [2][0]

        vpermq          \$0b00000000,@T[0],@T[0]        # [0][0] [0][0] [0][0] [0][0]
        vpermq          \$0b00011011,$A31,$A31          # post-Chi shuffle
        vpermq          \$0b10001101,$A41,$A41
        vpermq          \$0b01110010,$A11,$A11

        vpblendd        \$0b00001100,@T[3],@T[6],$A21   # [4][3] [2][2]
        vpblendd        \$0b00001100,@T[6],@T[5],@T[7]  # [4][4] [2][3]
        vpblendd        \$0b00110000,@T[5],$A21,$A21    # [1][4] [4][3] [2][2]
        vpblendd        \$0b00110000,@T[2],@T[7],@T[7]  # [1][0] [4][4] [2][3]
        vpblendd        \$0b11000000,@T[2],$A21,$A21    # [3][0] [1][4] [4][3] [2][2]
        vpblendd        \$0b11000000,@T[3],@T[7],@T[7]  # [3][1] [1][0] [4][4] [2][3]

        vpternlogq      \$0xC6,@T[8],@T[1],$A01         # [0][4] [0][3] [0][2] [0][1]
        vpternlogq      \$0xC6,@T[7],@T[4],$A21         # [3][4] [1][3] [4][2] [2][1]

        ######################################### Iota
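        # Iota: a final three-way XOR folds the pending Chi term for
        # A[0][0] together with the round constant at (%r10); constants
        # are kept 4x-broadcast in the iotas table below.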
        vpternlogq      \$0x96,(%r10),@T[0],$A00
        lea             32(%r10),%r10

        dec             %eax
        jnz             .Loop_avx512vl

        ret
.size   __KeccakF1600,.-__KeccakF1600
___

my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my  $out = $inp;        # in squeeze
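# Arguments: $A_flat points at the flattened A[5][5] state, $inp/$len
# describe the input, and $bsz is the rate ("block size") in bytes;
# SHA3_squeeze reuses the same registers, with the input pointer doubling
# as the output pointer ($out).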
$code.=<<___;
.globl  SHA3_absorb
.type   SHA3_absorb,\@function
.align  32
SHA3_absorb:
        mov     %rsp,%r11

        lea     -240(%rsp),%rsp
        and     \$-32,%rsp

        lea     96($A_flat),$A_flat
        lea     96($inp),$inp
        lea     96(%rsp),%r10
        lea     rhotates_left(%rip),%r8

        vzeroupper

        vpbroadcastq    -96($A_flat),$A00       # load A[5][5]
        vmovdqu         8+32*0-96($A_flat),$A01
        vmovdqu         8+32*1-96($A_flat),$A20
        vmovdqu         8+32*2-96($A_flat),$A31
        vmovdqu         8+32*3-96($A_flat),$A21
        vmovdqu         8+32*4-96($A_flat),$A41
        vmovdqu         8+32*5-96($A_flat),$A11

        vmovdqa64       0*32(%r8),$R20          # load "rhotate" indices
        vmovdqa64       1*32(%r8),$R01
        vmovdqa64       2*32(%r8),$R31
        vmovdqa64       3*32(%r8),$R21
        vmovdqa64       4*32(%r8),$R41
        vmovdqa64       5*32(%r8),$R11

        vpxor           @T[0],@T[0],@T[0]
        vmovdqa         @T[0],32*2-96(%r10)     # zero transfer area on stack
        vmovdqa         @T[0],32*3-96(%r10)
        vmovdqa         @T[0],32*4-96(%r10)
        vmovdqa         @T[0],32*5-96(%r10)
        vmovdqa         @T[0],32*6-96(%r10)

.Loop_absorb_avx512vl:
        mov     $bsz,%rax
        sub     $bsz,$len
        jc      .Ldone_absorb_avx512vl

        shr     \$3,%eax
        vpbroadcastq    0-96($inp),@T[0]
        vmovdqu         8-96($inp),@T[1]
        sub     \$4,%eax
___
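# The loop below emits up to 20 unrolled load/store pairs, one for each of
# words 5..24 of the block: each word is scattered to its @A_jagged offset
# in the 32-byte-per-register transfer area at %r10, and the chain bails
# out to .Labsorved_avx512vl once %eax (quadwords left in the block) hits
# zero.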
for(my $i=5; $i<25; $i++) {
$code.=<<___
        dec     %eax
        jz      .Labsorved_avx512vl
        mov     8*$i-96($inp),%r8
        mov     %r8,$A_jagged[$i]-96(%r10)
___
}
$code.=<<___;
.Labsorved_avx512vl:
        lea     ($inp,$bsz),$inp

        vpxor   @T[0],$A00,$A00
        vpxor   @T[1],$A01,$A01
        vpxor   32*2-96(%r10),$A20,$A20
        vpxor   32*3-96(%r10),$A31,$A31
        vpxor   32*4-96(%r10),$A21,$A21
        vpxor   32*5-96(%r10),$A41,$A41
        vpxor   32*6-96(%r10),$A11,$A11

        call    __KeccakF1600

        lea     96(%rsp),%r10
        jmp     .Loop_absorb_avx512vl

.Ldone_absorb_avx512vl:
        vmovq   %xmm0,-96($A_flat)
        vmovdqu $A01,8+32*0-96($A_flat)
        vmovdqu $A20,8+32*1-96($A_flat)
        vmovdqu $A31,8+32*2-96($A_flat)
        vmovdqu $A21,8+32*3-96($A_flat)
        vmovdqu $A41,8+32*4-96($A_flat)
        vmovdqu $A11,8+32*5-96($A_flat)

        vzeroupper

        lea     (%r11),%rsp
        lea     ($len,$bsz),%rax                # return value
        ret
.size   SHA3_absorb,.-SHA3_absorb

.globl  SHA3_squeeze
.type   SHA3_squeeze,\@function
.align  32
SHA3_squeeze:
        mov     %rsp,%r11

        lea     96($A_flat),$A_flat
        lea     rhotates_left(%rip),%r8
        shr     \$3,$bsz

        vzeroupper

        vpbroadcastq    -96($A_flat),$A00
        vpxor           @T[0],@T[0],@T[0]
        vmovdqu         8+32*0-96($A_flat),$A01
        vmovdqu         8+32*1-96($A_flat),$A20
        vmovdqu         8+32*2-96($A_flat),$A31
        vmovdqu         8+32*3-96($A_flat),$A21
        vmovdqu         8+32*4-96($A_flat),$A41
        vmovdqu         8+32*5-96($A_flat),$A11

        vmovdqa64       0*32(%r8),$R20          # load "rhotate" indices
        vmovdqa64       1*32(%r8),$R01
        vmovdqa64       2*32(%r8),$R31
        vmovdqa64       3*32(%r8),$R21
        vmovdqa64       4*32(%r8),$R41
        vmovdqa64       5*32(%r8),$R11

        mov     $bsz,%rax

.Loop_squeeze_avx512vl:
        mov     @A_jagged[$i]-96($A_flat),%r8
___
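# The loop below emits 25 unrolled steps, one per state word in @A_jagged
# order: each stores a full quadword to the output, diverts to
# .Ltail_squeeze_avx512vl when fewer than 8 bytes remain, finishes at
# .Ldone_squeeze_avx512vl once the request is satisfied, and re-permutes
# the state via .Lextend_output_avx512vl after a whole block is squeezed.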
for (my $i=0; $i<25; $i++) {
$code.=<<___;
        sub     \$8,$len
        jc      .Ltail_squeeze_avx512vl
        mov     %r8,($out)
        lea     8($out),$out
        je      .Ldone_squeeze_avx512vl
        dec     %eax
        je      .Lextend_output_avx512vl
        mov     @A_jagged[$i+1]-120($A_flat),%r8
___
}
$code.=<<___;
.Lextend_output_avx512vl:
        call    __KeccakF1600

        vmovq   %xmm0,-96($A_flat)
        vmovdqu $A01,8+32*0-96($A_flat)
        vmovdqu $A20,8+32*1-96($A_flat)
        vmovdqu $A31,8+32*2-96($A_flat)
        vmovdqu $A21,8+32*3-96($A_flat)
        vmovdqu $A41,8+32*4-96($A_flat)
        vmovdqu $A11,8+32*5-96($A_flat)

        mov     $bsz,%rax
        jmp     .Loop_squeeze_avx512vl

.Ltail_squeeze_avx512vl:
        add     \$8,$len

.Loop_tail_avx512vl:
        mov     %r8b,($out)
        lea     1($out),$out
        shr     \$8,%r8
        dec     $len
        jnz     .Loop_tail_avx512vl

.Ldone_squeeze_avx512vl:
        vzeroupper

        lea     (%r11),%rsp
        ret
.size   SHA3_squeeze,.-SHA3_squeeze

.align  64
rhotates_left:
        .quad   3,  18, 36, 41          # [2][0] [4][0] [1][0] [3][0]
        .quad   1,  62, 28, 27          # [0][1] [0][2] [0][3] [0][4]
        .quad   45, 6,  56, 39          # [3][1] [1][2] [4][3] [2][4]
        .quad   10, 61, 55, 8           # [2][1] [4][2] [1][3] [3][4]
        .quad   2,  15, 25, 20          # [4][1] [3][2] [2][3] [1][4]
        .quad   44, 43, 21, 14          # [1][1] [2][2] [3][3] [4][4]
iotas:
        .quad   0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
        .quad   0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
        .quad   0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
        .quad   0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
        .quad   0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
        .quad   0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
        .quad   0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
        .quad   0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
        .quad   0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
        .quad   0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
        .quad   0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
        .quad   0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
        .quad   0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
        .quad   0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
        .quad   0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
        .quad   0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
        .quad   0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
        .quad   0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
        .quad   0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
        .quad   0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
        .quad   0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
        .quad   0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
        .quad   0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
        .quad   0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

        .asciz  "Keccak-1600 absorb and squeeze for AVX512VL, CRYPTOGAMS by <appro\@openssl.org>"
___

$output=pop and open STDOUT,">$output";
print $code;
close STDOUT or die "error closing STDOUT: $!";