  1. #!/usr/bin/env perl
  2. # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # Keccak-1600 for x86_64.
  17. #
  18. # June 2017.
  19. #
  20. # Below code is [lane complementing] KECCAK_2X implementation (see
  21. # sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
  22. # instead of actually unrolling the loop pair-wise I simply flip
  23. # pointers to T[][] and A[][] at the end of round. Since number of
  24. # rounds is even, last round writes to A[][] and everything works out.
  25. # How does it compare to x86_64 assembly module in Keccak Code Package?
  26. # Depending on processor it's either as fast or faster by up to 15%...
  27. #
  28. ########################################################################
  29. # Numbers are cycles per processed byte out of large message.
  30. #
  31. # r=1088(*)
  32. #
  33. # P4 25.8
  34. # Core 2 12.9
  35. # Westmere 13.7
  36. # Sandy Bridge 12.9(**)
  37. # Haswell 9.6
  38. # Skylake 9.4
  39. # Silvermont 22.8
  40. # Goldmont 15.8
  41. # VIA Nano 17.3
  42. # Sledgehammer 13.3
  43. # Bulldozer 16.5
  44. # Ryzen 8.8
  45. #
# (*)	Corresponds to SHA3-256. Improvement over compiler-generated code
#	varies a lot, most common coefficient is 15% in comparison to
#	gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
  49. # (**) Sandy Bridge has broken rotate instruction. Performance can be
  50. # improved by 14% by replacing rotates with double-precision
  51. # shift with same register as source and destination.
  52. # $output is the last argument if it looks like a file (it has an extension)
  53. # $flavour is the first argument if it doesn't look like a file
  54. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  55. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  56. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  57. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  58. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  59. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  60. die "can't locate x86_64-xlate.pl";
  61. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
  62. or die "can't call $xlate: $!";
  63. *STDOUT=*OUT;
  64. my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
  65. 8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
  66. my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
  67. my @D = map("%r$_",(8..12));
  68. my @T = map("%r$_",(13..14));
  69. my $iotas = "%r15";
  70. my @rhotates = ([ 0, 1, 62, 28, 27 ],
  71. [ 36, 44, 6, 55, 20 ],
  72. [ 3, 10, 43, 25, 39 ],
  73. [ 41, 45, 15, 21, 8 ],
  74. [ 18, 2, 61, 56, 14 ]);
  75. $code.=<<___;
  76. .text
  77. .type __KeccakF1600,\@abi-omnipotent
  78. .align 32
  79. __KeccakF1600:
  80. mov $A[4][0](%rdi),@C[0]
  81. mov $A[4][1](%rdi),@C[1]
  82. mov $A[4][2](%rdi),@C[2]
  83. mov $A[4][3](%rdi),@C[3]
  84. mov $A[4][4](%rdi),@C[4]
  85. jmp .Loop
  86. .align 32
  87. .Loop:
  88. mov $A[0][0](%rdi),@D[0]
  89. mov $A[1][1](%rdi),@D[1]
  90. mov $A[2][2](%rdi),@D[2]
  91. mov $A[3][3](%rdi),@D[3]
  92. xor $A[0][2](%rdi),@C[2]
  93. xor $A[0][3](%rdi),@C[3]
  94. xor @D[0], @C[0]
  95. xor $A[0][1](%rdi),@C[1]
  96. xor $A[1][2](%rdi),@C[2]
  97. xor $A[1][0](%rdi),@C[0]
  98. mov @C[4],@D[4]
  99. xor $A[0][4](%rdi),@C[4]
  100. xor @D[2], @C[2]
  101. xor $A[2][0](%rdi),@C[0]
  102. xor $A[1][3](%rdi),@C[3]
  103. xor @D[1], @C[1]
  104. xor $A[1][4](%rdi),@C[4]
  105. xor $A[3][2](%rdi),@C[2]
  106. xor $A[3][0](%rdi),@C[0]
  107. xor $A[2][3](%rdi),@C[3]
  108. xor $A[2][1](%rdi),@C[1]
  109. xor $A[2][4](%rdi),@C[4]
  110. mov @C[2],@T[0]
  111. rol \$1,@C[2]
  112. xor @C[0],@C[2] # D[1] = ROL64(C[2], 1) ^ C[0]
  113. xor @D[3], @C[3]
  114. rol \$1,@C[0]
  115. xor @C[3],@C[0] # D[4] = ROL64(C[0], 1) ^ C[3]
  116. xor $A[3][1](%rdi),@C[1]
  117. rol \$1,@C[3]
  118. xor @C[1],@C[3] # D[2] = ROL64(C[3], 1) ^ C[1]
  119. xor $A[3][4](%rdi),@C[4]
  120. rol \$1,@C[1]
  121. xor @C[4],@C[1] # D[0] = ROL64(C[1], 1) ^ C[4]
  122. rol \$1,@C[4]
  123. xor @T[0],@C[4] # D[3] = ROL64(C[4], 1) ^ C[2]
  124. ___
  125. (@D[0..4], @C) = (@C[1..4,0], @D);
  126. $code.=<<___;
  127. xor @D[1],@C[1]
  128. xor @D[2],@C[2]
  129. rol \$$rhotates[1][1],@C[1]
  130. xor @D[3],@C[3]
  131. xor @D[4],@C[4]
  132. rol \$$rhotates[2][2],@C[2]
  133. xor @D[0],@C[0]
  134. mov @C[1],@T[0]
  135. rol \$$rhotates[3][3],@C[3]
  136. or @C[2],@C[1]
  137. xor @C[0],@C[1] # C[0] ^ ( C[1] | C[2])
  138. rol \$$rhotates[4][4],@C[4]
  139. xor ($iotas),@C[1]
  140. lea 8($iotas),$iotas
  141. mov @C[4],@T[1]
  142. and @C[3],@C[4]
  143. mov @C[1],$A[0][0](%rsi) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
  144. xor @C[2],@C[4] # C[2] ^ ( C[4] & C[3])
  145. not @C[2]
  146. mov @C[4],$A[0][2](%rsi) # R[0][2] = C[2] ^ ( C[4] & C[3])
  147. or @C[3],@C[2]
  148. mov $A[4][2](%rdi),@C[4]
  149. xor @T[0],@C[2] # C[1] ^ (~C[2] | C[3])
  150. mov @C[2],$A[0][1](%rsi) # R[0][1] = C[1] ^ (~C[2] | C[3])
  151. and @C[0],@T[0]
  152. mov $A[1][4](%rdi),@C[1]
  153. xor @T[1],@T[0] # C[4] ^ ( C[1] & C[0])
  154. mov $A[2][0](%rdi),@C[2]
  155. mov @T[0],$A[0][4](%rsi) # R[0][4] = C[4] ^ ( C[1] & C[0])
  156. or @C[0],@T[1]
  157. mov $A[0][3](%rdi),@C[0]
  158. xor @C[3],@T[1] # C[3] ^ ( C[4] | C[0])
  159. mov $A[3][1](%rdi),@C[3]
  160. mov @T[1],$A[0][3](%rsi) # R[0][3] = C[3] ^ ( C[4] | C[0])
  161. xor @D[3],@C[0]
  162. xor @D[2],@C[4]
  163. rol \$$rhotates[0][3],@C[0]
  164. xor @D[1],@C[3]
  165. xor @D[4],@C[1]
  166. rol \$$rhotates[4][2],@C[4]
  167. rol \$$rhotates[3][1],@C[3]
  168. xor @D[0],@C[2]
  169. rol \$$rhotates[1][4],@C[1]
  170. mov @C[0],@T[0]
  171. or @C[4],@C[0]
  172. rol \$$rhotates[2][0],@C[2]
  173. xor @C[3],@C[0] # C[3] ^ (C[0] | C[4])
  174. mov @C[0],$A[1][3](%rsi) # R[1][3] = C[3] ^ (C[0] | C[4])
  175. mov @C[1],@T[1]
  176. and @T[0],@C[1]
  177. mov $A[0][1](%rdi),@C[0]
  178. xor @C[4],@C[1] # C[4] ^ (C[1] & C[0])
  179. not @C[4]
  180. mov @C[1],$A[1][4](%rsi) # R[1][4] = C[4] ^ (C[1] & C[0])
  181. or @C[3],@C[4]
  182. mov $A[1][2](%rdi),@C[1]
  183. xor @C[2],@C[4] # C[2] ^ (~C[4] | C[3])
  184. mov @C[4],$A[1][2](%rsi) # R[1][2] = C[2] ^ (~C[4] | C[3])
  185. and @C[2],@C[3]
  186. mov $A[4][0](%rdi),@C[4]
  187. xor @T[1],@C[3] # C[1] ^ (C[3] & C[2])
  188. mov @C[3],$A[1][1](%rsi) # R[1][1] = C[1] ^ (C[3] & C[2])
  189. or @C[2],@T[1]
  190. mov $A[2][3](%rdi),@C[2]
  191. xor @T[0],@T[1] # C[0] ^ (C[1] | C[2])
  192. mov $A[3][4](%rdi),@C[3]
  193. mov @T[1],$A[1][0](%rsi) # R[1][0] = C[0] ^ (C[1] | C[2])
  194. xor @D[3],@C[2]
  195. xor @D[4],@C[3]
  196. rol \$$rhotates[2][3],@C[2]
  197. xor @D[2],@C[1]
  198. rol \$$rhotates[3][4],@C[3]
  199. xor @D[0],@C[4]
  200. rol \$$rhotates[1][2],@C[1]
  201. xor @D[1],@C[0]
  202. rol \$$rhotates[4][0],@C[4]
  203. mov @C[2],@T[0]
  204. and @C[3],@C[2]
  205. rol \$$rhotates[0][1],@C[0]
  206. not @C[3]
  207. xor @C[1],@C[2] # C[1] ^ ( C[2] & C[3])
  208. mov @C[2],$A[2][1](%rsi) # R[2][1] = C[1] ^ ( C[2] & C[3])
  209. mov @C[4],@T[1]
  210. and @C[3],@C[4]
  211. mov $A[2][1](%rdi),@C[2]
  212. xor @T[0],@C[4] # C[2] ^ ( C[4] & ~C[3])
  213. mov @C[4],$A[2][2](%rsi) # R[2][2] = C[2] ^ ( C[4] & ~C[3])
  214. or @C[1],@T[0]
  215. mov $A[4][3](%rdi),@C[4]
  216. xor @C[0],@T[0] # C[0] ^ ( C[2] | C[1])
  217. mov @T[0],$A[2][0](%rsi) # R[2][0] = C[0] ^ ( C[2] | C[1])
  218. and @C[0],@C[1]
  219. xor @T[1],@C[1] # C[4] ^ ( C[1] & C[0])
  220. mov @C[1],$A[2][4](%rsi) # R[2][4] = C[4] ^ ( C[1] & C[0])
  221. or @C[0],@T[1]
  222. mov $A[1][0](%rdi),@C[1]
  223. xor @C[3],@T[1] # ~C[3] ^ ( C[0] | C[4])
  224. mov $A[3][2](%rdi),@C[3]
  225. mov @T[1],$A[2][3](%rsi) # R[2][3] = ~C[3] ^ ( C[0] | C[4])
  226. mov $A[0][4](%rdi),@C[0]
  227. xor @D[1],@C[2]
  228. xor @D[2],@C[3]
  229. rol \$$rhotates[2][1],@C[2]
  230. xor @D[0],@C[1]
  231. rol \$$rhotates[3][2],@C[3]
  232. xor @D[3],@C[4]
  233. rol \$$rhotates[1][0],@C[1]
  234. xor @D[4],@C[0]
  235. rol \$$rhotates[4][3],@C[4]
  236. mov @C[2],@T[0]
  237. or @C[3],@C[2]
  238. rol \$$rhotates[0][4],@C[0]
  239. not @C[3]
  240. xor @C[1],@C[2] # C[1] ^ ( C[2] | C[3])
  241. mov @C[2],$A[3][1](%rsi) # R[3][1] = C[1] ^ ( C[2] | C[3])
  242. mov @C[4],@T[1]
  243. or @C[3],@C[4]
  244. xor @T[0],@C[4] # C[2] ^ ( C[4] | ~C[3])
  245. mov @C[4],$A[3][2](%rsi) # R[3][2] = C[2] ^ ( C[4] | ~C[3])
  246. and @C[1],@T[0]
  247. xor @C[0],@T[0] # C[0] ^ ( C[2] & C[1])
  248. mov @T[0],$A[3][0](%rsi) # R[3][0] = C[0] ^ ( C[2] & C[1])
  249. or @C[0],@C[1]
  250. xor @T[1],@C[1] # C[4] ^ ( C[1] | C[0])
  251. mov @C[1],$A[3][4](%rsi) # R[3][4] = C[4] ^ ( C[1] | C[0])
  252. and @T[1],@C[0]
  253. xor @C[3],@C[0] # ~C[3] ^ ( C[0] & C[4])
  254. mov @C[0],$A[3][3](%rsi) # R[3][3] = ~C[3] ^ ( C[0] & C[4])
  255. xor $A[0][2](%rdi),@D[2]
  256. xor $A[1][3](%rdi),@D[3]
  257. rol \$$rhotates[0][2],@D[2]
  258. xor $A[4][1](%rdi),@D[1]
  259. rol \$$rhotates[1][3],@D[3]
  260. xor $A[2][4](%rdi),@D[4]
  261. rol \$$rhotates[4][1],@D[1]
  262. xor $A[3][0](%rdi),@D[0]
  263. xchg %rsi,%rdi
  264. rol \$$rhotates[2][4],@D[4]
  265. rol \$$rhotates[3][0],@D[0]
  266. ___
  267. @C = @D[2..4,0,1];
  268. $code.=<<___;
  269. mov @C[0],@T[0]
  270. and @C[1],@C[0]
  271. not @C[1]
  272. xor @C[4],@C[0] # C[4] ^ ( C[0] & C[1])
  273. mov @C[0],$A[4][4](%rdi) # R[4][4] = C[4] ^ ( C[0] & C[1])
  274. mov @C[2],@T[1]
  275. and @C[1],@C[2]
  276. xor @T[0],@C[2] # C[0] ^ ( C[2] & ~C[1])
  277. mov @C[2],$A[4][0](%rdi) # R[4][0] = C[0] ^ ( C[2] & ~C[1])
  278. or @C[4],@T[0]
  279. xor @C[3],@T[0] # C[3] ^ ( C[0] | C[4])
  280. mov @T[0],$A[4][3](%rdi) # R[4][3] = C[3] ^ ( C[0] | C[4])
  281. and @C[3],@C[4]
  282. xor @T[1],@C[4] # C[2] ^ ( C[4] & C[3])
  283. mov @C[4],$A[4][2](%rdi) # R[4][2] = C[2] ^ ( C[4] & C[3])
  284. or @T[1],@C[3]
  285. xor @C[1],@C[3] # ~C[1] ^ ( C[2] | C[3])
  286. mov @C[3],$A[4][1](%rdi) # R[4][1] = ~C[1] ^ ( C[2] | C[3])
  287. mov @C[0],@C[1] # harmonize with the loop top
  288. mov @T[0],@C[0]
  289. test \$255,$iotas
  290. jnz .Loop
  291. lea -192($iotas),$iotas # rewind iotas
  292. ret
  293. .size __KeccakF1600,.-__KeccakF1600
  294. .type KeccakF1600,\@abi-omnipotent
  295. .align 32
  296. KeccakF1600:
  297. .cfi_startproc
  298. push %rbx
  299. .cfi_push %rbx
  300. push %rbp
  301. .cfi_push %rbp
  302. push %r12
  303. .cfi_push %r12
  304. push %r13
  305. .cfi_push %r13
  306. push %r14
  307. .cfi_push %r14
  308. push %r15
  309. .cfi_push %r15
  310. lea 100(%rdi),%rdi # size optimization
  311. sub \$200,%rsp
  312. .cfi_adjust_cfa_offset 200
  313. notq $A[0][1](%rdi)
  314. notq $A[0][2](%rdi)
  315. notq $A[1][3](%rdi)
  316. notq $A[2][2](%rdi)
  317. notq $A[3][2](%rdi)
  318. notq $A[4][0](%rdi)
  319. lea iotas(%rip),$iotas
  320. lea 100(%rsp),%rsi # size optimization
  321. call __KeccakF1600
  322. notq $A[0][1](%rdi)
  323. notq $A[0][2](%rdi)
  324. notq $A[1][3](%rdi)
  325. notq $A[2][2](%rdi)
  326. notq $A[3][2](%rdi)
  327. notq $A[4][0](%rdi)
  328. lea -100(%rdi),%rdi # preserve A[][]
  329. add \$200,%rsp
  330. .cfi_adjust_cfa_offset -200
  331. pop %r15
  332. .cfi_pop %r15
  333. pop %r14
  334. .cfi_pop %r14
  335. pop %r13
  336. .cfi_pop %r13
  337. pop %r12
  338. .cfi_pop %r12
  339. pop %rbp
  340. .cfi_pop %rbp
  341. pop %rbx
  342. .cfi_pop %rbx
  343. ret
  344. .cfi_endproc
  345. .size KeccakF1600,.-KeccakF1600
  346. ___
  347. { my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
  348. ($A_flat,$inp) = ("%r8","%r9");
  349. $code.=<<___;
  350. .globl SHA3_absorb
  351. .type SHA3_absorb,\@function,4
  352. .align 32
  353. SHA3_absorb:
  354. .cfi_startproc
  355. push %rbx
  356. .cfi_push %rbx
  357. push %rbp
  358. .cfi_push %rbp
  359. push %r12
  360. .cfi_push %r12
  361. push %r13
  362. .cfi_push %r13
  363. push %r14
  364. .cfi_push %r14
  365. push %r15
  366. .cfi_push %r15
  367. lea 100(%rdi),%rdi # size optimization
  368. sub \$232,%rsp
  369. .cfi_adjust_cfa_offset 232
  370. mov %rsi,$inp
  371. lea 100(%rsp),%rsi # size optimization
  372. notq $A[0][1](%rdi)
  373. notq $A[0][2](%rdi)
  374. notq $A[1][3](%rdi)
  375. notq $A[2][2](%rdi)
  376. notq $A[3][2](%rdi)
  377. notq $A[4][0](%rdi)
  378. lea iotas(%rip),$iotas
  379. mov $bsz,216-100(%rsi) # save bsz
  380. .Loop_absorb:
  381. cmp $bsz,$len
  382. jc .Ldone_absorb
  383. shr \$3,$bsz
  384. lea -100(%rdi),$A_flat
  385. .Lblock_absorb:
  386. mov ($inp),%rax
  387. lea 8($inp),$inp
  388. xor ($A_flat),%rax
  389. lea 8($A_flat),$A_flat
  390. sub \$8,$len
  391. mov %rax,-8($A_flat)
  392. sub \$1,$bsz
  393. jnz .Lblock_absorb
  394. mov $inp,200-100(%rsi) # save inp
  395. mov $len,208-100(%rsi) # save len
  396. call __KeccakF1600
  397. mov 200-100(%rsi),$inp # pull inp
  398. mov 208-100(%rsi),$len # pull len
  399. mov 216-100(%rsi),$bsz # pull bsz
  400. jmp .Loop_absorb
  401. .align 32
  402. .Ldone_absorb:
  403. mov $len,%rax # return value
  404. notq $A[0][1](%rdi)
  405. notq $A[0][2](%rdi)
  406. notq $A[1][3](%rdi)
  407. notq $A[2][2](%rdi)
  408. notq $A[3][2](%rdi)
  409. notq $A[4][0](%rdi)
  410. add \$232,%rsp
  411. .cfi_adjust_cfa_offset -232
  412. pop %r15
  413. .cfi_pop %r15
  414. pop %r14
  415. .cfi_pop %r14
  416. pop %r13
  417. .cfi_pop %r13
  418. pop %r12
  419. .cfi_pop %r12
  420. pop %rbp
  421. .cfi_pop %rbp
  422. pop %rbx
  423. .cfi_pop %rbx
  424. ret
  425. .cfi_endproc
  426. .size SHA3_absorb,.-SHA3_absorb
  427. ___
  428. }
  429. { my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
  430. ($out,$len,$bsz) = ("%r12","%r13","%r14");
  431. $code.=<<___;
  432. .globl SHA3_squeeze
  433. .type SHA3_squeeze,\@function,4
  434. .align 32
  435. SHA3_squeeze:
  436. .cfi_startproc
  437. push %r12
  438. .cfi_push %r12
  439. push %r13
  440. .cfi_push %r13
  441. push %r14
  442. .cfi_push %r14
  443. shr \$3,%rcx
  444. mov $A_flat,%r8
  445. mov %rsi,$out
  446. mov %rdx,$len
  447. mov %rcx,$bsz
  448. jmp .Loop_squeeze
  449. .align 32
  450. .Loop_squeeze:
  451. cmp \$8,$len
  452. jb .Ltail_squeeze
  453. mov (%r8),%rax
  454. lea 8(%r8),%r8
  455. mov %rax,($out)
  456. lea 8($out),$out
  457. sub \$8,$len # len -= 8
  458. jz .Ldone_squeeze
  459. sub \$1,%rcx # bsz--
  460. jnz .Loop_squeeze
  461. call KeccakF1600
  462. mov $A_flat,%r8
  463. mov $bsz,%rcx
  464. jmp .Loop_squeeze
  465. .Ltail_squeeze:
  466. mov %r8, %rsi
  467. mov $out,%rdi
  468. mov $len,%rcx
  469. .byte 0xf3,0xa4 # rep movsb
  470. .Ldone_squeeze:
  471. pop %r14
  472. .cfi_pop %r14
  473. pop %r13
  474. .cfi_pop %r13
  475. pop %r12
  476. .cfi_pop %r13
  477. ret
  478. .cfi_endproc
  479. .size SHA3_squeeze,.-SHA3_squeeze
  480. ___
  481. }
  482. $code.=<<___;
  483. .align 256
  484. .quad 0,0,0,0,0,0,0,0
  485. .type iotas,\@object
  486. iotas:
  487. .quad 0x0000000000000001
  488. .quad 0x0000000000008082
  489. .quad 0x800000000000808a
  490. .quad 0x8000000080008000
  491. .quad 0x000000000000808b
  492. .quad 0x0000000080000001
  493. .quad 0x8000000080008081
  494. .quad 0x8000000000008009
  495. .quad 0x000000000000008a
  496. .quad 0x0000000000000088
  497. .quad 0x0000000080008009
  498. .quad 0x000000008000000a
  499. .quad 0x000000008000808b
  500. .quad 0x800000000000008b
  501. .quad 0x8000000000008089
  502. .quad 0x8000000000008003
  503. .quad 0x8000000000008002
  504. .quad 0x8000000000000080
  505. .quad 0x000000000000800a
  506. .quad 0x800000008000000a
  507. .quad 0x8000000080008081
  508. .quad 0x8000000000008080
  509. .quad 0x0000000080000001
  510. .quad 0x8000000080008008
  511. .size iotas,.-iotas
  512. .asciz "Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  513. ___
  514. foreach (split("\n",$code)) {
  515. # Below replacement results in 11.2 on Sandy Bridge, 9.4 on
  516. # Haswell, but it hurts other processors by up to 2-3-4x...
  517. #s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
  518. # Below replacement results in 9.3 on Haswell [as well as
  519. # on Ryzen, i.e. it *hurts* Ryzen]...
  520. #s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;
  521. print $_, "\n";
  522. }
  523. close STDOUT;