keccak1600-x86_64.pl 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611
  1. #!/usr/bin/env perl
  2. # Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # Keccak-1600 for x86_64.
  17. #
  18. # June 2017.
  19. #
  20. # Below code is [lane complementing] KECCAK_2X implementation (see
  21. # sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
  22. # instead of actually unrolling the loop pair-wise I simply flip
  23. # pointers to T[][] and A[][] at the end of round. Since number of
  24. # rounds is even, last round writes to A[][] and everything works out.
  25. # How does it compare to x86_64 assembly module in Keccak Code Package?
  26. # Depending on processor it's either as fast or faster by up to 15%...
  27. #
  28. ########################################################################
  29. # Numbers are cycles per processed byte out of large message.
  30. #
  31. # r=1088(*)
  32. #
  33. # P4 25.8
  34. # Core 2 12.9
  35. # Westmere 13.7
  36. # Sandy Bridge 12.9(**)
  37. # Haswell 9.6
  38. # Skylake 9.4
  39. # Silvermont 22.8
  40. # Goldmont 15.8
  41. # VIA Nano 17.3
  42. # Sledgehammer 13.3
  43. # Bulldozer 16.5
  44. # Ryzen 8.8
  45. #
# (*) Corresponds to SHA3-256. Improvement over compiler-generated
  47. # varies a lot, most common coefficient is 15% in comparison to
  48. # gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
  49. # (**) Sandy Bridge has broken rotate instruction. Performance can be
  50. # improved by 14% by replacing rotates with double-precision
  51. # shift with same register as source and destination.
  52. # $output is the last argument if it looks like a file (it has an extension)
  53. # $flavour is the first argument if it doesn't look like a file
  54. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  55. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  56. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  57. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  58. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  59. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  60. die "can't locate x86_64-xlate.pl";
  61. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
  62. or die "can't call $xlate: $!";
  63. *STDOUT=*OUT;
  64. my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
  65. 8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
  66. my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
  67. my @D = map("%r$_",(8..12));
  68. my @T = map("%r$_",(13..14));
  69. my $iotas = "%r15";
  70. my @rhotates = ([ 0, 1, 62, 28, 27 ],
  71. [ 36, 44, 6, 55, 20 ],
  72. [ 3, 10, 43, 25, 39 ],
  73. [ 41, 45, 15, 21, 8 ],
  74. [ 18, 2, 61, 56, 14 ]);
  75. $code.=<<___;
  76. .text
  77. .type __KeccakF1600,\@abi-omnipotent
  78. .align 32
  79. __KeccakF1600:
  80. .cfi_startproc
  81. mov $A[4][0](%rdi),@C[0]
  82. mov $A[4][1](%rdi),@C[1]
  83. mov $A[4][2](%rdi),@C[2]
  84. mov $A[4][3](%rdi),@C[3]
  85. mov $A[4][4](%rdi),@C[4]
  86. jmp .Loop
  87. .align 32
  88. .Loop:
  89. mov $A[0][0](%rdi),@D[0]
  90. mov $A[1][1](%rdi),@D[1]
  91. mov $A[2][2](%rdi),@D[2]
  92. mov $A[3][3](%rdi),@D[3]
  93. xor $A[0][2](%rdi),@C[2]
  94. xor $A[0][3](%rdi),@C[3]
  95. xor @D[0], @C[0]
  96. xor $A[0][1](%rdi),@C[1]
  97. xor $A[1][2](%rdi),@C[2]
  98. xor $A[1][0](%rdi),@C[0]
  99. mov @C[4],@D[4]
  100. xor $A[0][4](%rdi),@C[4]
  101. xor @D[2], @C[2]
  102. xor $A[2][0](%rdi),@C[0]
  103. xor $A[1][3](%rdi),@C[3]
  104. xor @D[1], @C[1]
  105. xor $A[1][4](%rdi),@C[4]
  106. xor $A[3][2](%rdi),@C[2]
  107. xor $A[3][0](%rdi),@C[0]
  108. xor $A[2][3](%rdi),@C[3]
  109. xor $A[2][1](%rdi),@C[1]
  110. xor $A[2][4](%rdi),@C[4]
  111. mov @C[2],@T[0]
  112. rol \$1,@C[2]
  113. xor @C[0],@C[2] # D[1] = ROL64(C[2], 1) ^ C[0]
  114. xor @D[3], @C[3]
  115. rol \$1,@C[0]
  116. xor @C[3],@C[0] # D[4] = ROL64(C[0], 1) ^ C[3]
  117. xor $A[3][1](%rdi),@C[1]
  118. rol \$1,@C[3]
  119. xor @C[1],@C[3] # D[2] = ROL64(C[3], 1) ^ C[1]
  120. xor $A[3][4](%rdi),@C[4]
  121. rol \$1,@C[1]
  122. xor @C[4],@C[1] # D[0] = ROL64(C[1], 1) ^ C[4]
  123. rol \$1,@C[4]
  124. xor @T[0],@C[4] # D[3] = ROL64(C[4], 1) ^ C[2]
  125. ___
  126. (@D[0..4], @C) = (@C[1..4,0], @D);
  127. $code.=<<___;
  128. xor @D[1],@C[1]
  129. xor @D[2],@C[2]
  130. rol \$$rhotates[1][1],@C[1]
  131. xor @D[3],@C[3]
  132. xor @D[4],@C[4]
  133. rol \$$rhotates[2][2],@C[2]
  134. xor @D[0],@C[0]
  135. mov @C[1],@T[0]
  136. rol \$$rhotates[3][3],@C[3]
  137. or @C[2],@C[1]
  138. xor @C[0],@C[1] # C[0] ^ ( C[1] | C[2])
  139. rol \$$rhotates[4][4],@C[4]
  140. xor ($iotas),@C[1]
  141. lea 8($iotas),$iotas
  142. mov @C[4],@T[1]
  143. and @C[3],@C[4]
  144. mov @C[1],$A[0][0](%rsi) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
  145. xor @C[2],@C[4] # C[2] ^ ( C[4] & C[3])
  146. not @C[2]
  147. mov @C[4],$A[0][2](%rsi) # R[0][2] = C[2] ^ ( C[4] & C[3])
  148. or @C[3],@C[2]
  149. mov $A[4][2](%rdi),@C[4]
  150. xor @T[0],@C[2] # C[1] ^ (~C[2] | C[3])
  151. mov @C[2],$A[0][1](%rsi) # R[0][1] = C[1] ^ (~C[2] | C[3])
  152. and @C[0],@T[0]
  153. mov $A[1][4](%rdi),@C[1]
  154. xor @T[1],@T[0] # C[4] ^ ( C[1] & C[0])
  155. mov $A[2][0](%rdi),@C[2]
  156. mov @T[0],$A[0][4](%rsi) # R[0][4] = C[4] ^ ( C[1] & C[0])
  157. or @C[0],@T[1]
  158. mov $A[0][3](%rdi),@C[0]
  159. xor @C[3],@T[1] # C[3] ^ ( C[4] | C[0])
  160. mov $A[3][1](%rdi),@C[3]
  161. mov @T[1],$A[0][3](%rsi) # R[0][3] = C[3] ^ ( C[4] | C[0])
  162. xor @D[3],@C[0]
  163. xor @D[2],@C[4]
  164. rol \$$rhotates[0][3],@C[0]
  165. xor @D[1],@C[3]
  166. xor @D[4],@C[1]
  167. rol \$$rhotates[4][2],@C[4]
  168. rol \$$rhotates[3][1],@C[3]
  169. xor @D[0],@C[2]
  170. rol \$$rhotates[1][4],@C[1]
  171. mov @C[0],@T[0]
  172. or @C[4],@C[0]
  173. rol \$$rhotates[2][0],@C[2]
  174. xor @C[3],@C[0] # C[3] ^ (C[0] | C[4])
  175. mov @C[0],$A[1][3](%rsi) # R[1][3] = C[3] ^ (C[0] | C[4])
  176. mov @C[1],@T[1]
  177. and @T[0],@C[1]
  178. mov $A[0][1](%rdi),@C[0]
  179. xor @C[4],@C[1] # C[4] ^ (C[1] & C[0])
  180. not @C[4]
  181. mov @C[1],$A[1][4](%rsi) # R[1][4] = C[4] ^ (C[1] & C[0])
  182. or @C[3],@C[4]
  183. mov $A[1][2](%rdi),@C[1]
  184. xor @C[2],@C[4] # C[2] ^ (~C[4] | C[3])
  185. mov @C[4],$A[1][2](%rsi) # R[1][2] = C[2] ^ (~C[4] | C[3])
  186. and @C[2],@C[3]
  187. mov $A[4][0](%rdi),@C[4]
  188. xor @T[1],@C[3] # C[1] ^ (C[3] & C[2])
  189. mov @C[3],$A[1][1](%rsi) # R[1][1] = C[1] ^ (C[3] & C[2])
  190. or @C[2],@T[1]
  191. mov $A[2][3](%rdi),@C[2]
  192. xor @T[0],@T[1] # C[0] ^ (C[1] | C[2])
  193. mov $A[3][4](%rdi),@C[3]
  194. mov @T[1],$A[1][0](%rsi) # R[1][0] = C[0] ^ (C[1] | C[2])
  195. xor @D[3],@C[2]
  196. xor @D[4],@C[3]
  197. rol \$$rhotates[2][3],@C[2]
  198. xor @D[2],@C[1]
  199. rol \$$rhotates[3][4],@C[3]
  200. xor @D[0],@C[4]
  201. rol \$$rhotates[1][2],@C[1]
  202. xor @D[1],@C[0]
  203. rol \$$rhotates[4][0],@C[4]
  204. mov @C[2],@T[0]
  205. and @C[3],@C[2]
  206. rol \$$rhotates[0][1],@C[0]
  207. not @C[3]
  208. xor @C[1],@C[2] # C[1] ^ ( C[2] & C[3])
  209. mov @C[2],$A[2][1](%rsi) # R[2][1] = C[1] ^ ( C[2] & C[3])
  210. mov @C[4],@T[1]
  211. and @C[3],@C[4]
  212. mov $A[2][1](%rdi),@C[2]
  213. xor @T[0],@C[4] # C[2] ^ ( C[4] & ~C[3])
  214. mov @C[4],$A[2][2](%rsi) # R[2][2] = C[2] ^ ( C[4] & ~C[3])
  215. or @C[1],@T[0]
  216. mov $A[4][3](%rdi),@C[4]
  217. xor @C[0],@T[0] # C[0] ^ ( C[2] | C[1])
  218. mov @T[0],$A[2][0](%rsi) # R[2][0] = C[0] ^ ( C[2] | C[1])
  219. and @C[0],@C[1]
  220. xor @T[1],@C[1] # C[4] ^ ( C[1] & C[0])
  221. mov @C[1],$A[2][4](%rsi) # R[2][4] = C[4] ^ ( C[1] & C[0])
  222. or @C[0],@T[1]
  223. mov $A[1][0](%rdi),@C[1]
  224. xor @C[3],@T[1] # ~C[3] ^ ( C[0] | C[4])
  225. mov $A[3][2](%rdi),@C[3]
  226. mov @T[1],$A[2][3](%rsi) # R[2][3] = ~C[3] ^ ( C[0] | C[4])
  227. mov $A[0][4](%rdi),@C[0]
  228. xor @D[1],@C[2]
  229. xor @D[2],@C[3]
  230. rol \$$rhotates[2][1],@C[2]
  231. xor @D[0],@C[1]
  232. rol \$$rhotates[3][2],@C[3]
  233. xor @D[3],@C[4]
  234. rol \$$rhotates[1][0],@C[1]
  235. xor @D[4],@C[0]
  236. rol \$$rhotates[4][3],@C[4]
  237. mov @C[2],@T[0]
  238. or @C[3],@C[2]
  239. rol \$$rhotates[0][4],@C[0]
  240. not @C[3]
  241. xor @C[1],@C[2] # C[1] ^ ( C[2] | C[3])
  242. mov @C[2],$A[3][1](%rsi) # R[3][1] = C[1] ^ ( C[2] | C[3])
  243. mov @C[4],@T[1]
  244. or @C[3],@C[4]
  245. xor @T[0],@C[4] # C[2] ^ ( C[4] | ~C[3])
  246. mov @C[4],$A[3][2](%rsi) # R[3][2] = C[2] ^ ( C[4] | ~C[3])
  247. and @C[1],@T[0]
  248. xor @C[0],@T[0] # C[0] ^ ( C[2] & C[1])
  249. mov @T[0],$A[3][0](%rsi) # R[3][0] = C[0] ^ ( C[2] & C[1])
  250. or @C[0],@C[1]
  251. xor @T[1],@C[1] # C[4] ^ ( C[1] | C[0])
  252. mov @C[1],$A[3][4](%rsi) # R[3][4] = C[4] ^ ( C[1] | C[0])
  253. and @T[1],@C[0]
  254. xor @C[3],@C[0] # ~C[3] ^ ( C[0] & C[4])
  255. mov @C[0],$A[3][3](%rsi) # R[3][3] = ~C[3] ^ ( C[0] & C[4])
  256. xor $A[0][2](%rdi),@D[2]
  257. xor $A[1][3](%rdi),@D[3]
  258. rol \$$rhotates[0][2],@D[2]
  259. xor $A[4][1](%rdi),@D[1]
  260. rol \$$rhotates[1][3],@D[3]
  261. xor $A[2][4](%rdi),@D[4]
  262. rol \$$rhotates[4][1],@D[1]
  263. xor $A[3][0](%rdi),@D[0]
  264. xchg %rsi,%rdi
  265. rol \$$rhotates[2][4],@D[4]
  266. rol \$$rhotates[3][0],@D[0]
  267. ___
  268. @C = @D[2..4,0,1];
  269. $code.=<<___;
  270. mov @C[0],@T[0]
  271. and @C[1],@C[0]
  272. not @C[1]
  273. xor @C[4],@C[0] # C[4] ^ ( C[0] & C[1])
  274. mov @C[0],$A[4][4](%rdi) # R[4][4] = C[4] ^ ( C[0] & C[1])
  275. mov @C[2],@T[1]
  276. and @C[1],@C[2]
  277. xor @T[0],@C[2] # C[0] ^ ( C[2] & ~C[1])
  278. mov @C[2],$A[4][0](%rdi) # R[4][0] = C[0] ^ ( C[2] & ~C[1])
  279. or @C[4],@T[0]
  280. xor @C[3],@T[0] # C[3] ^ ( C[0] | C[4])
  281. mov @T[0],$A[4][3](%rdi) # R[4][3] = C[3] ^ ( C[0] | C[4])
  282. and @C[3],@C[4]
  283. xor @T[1],@C[4] # C[2] ^ ( C[4] & C[3])
  284. mov @C[4],$A[4][2](%rdi) # R[4][2] = C[2] ^ ( C[4] & C[3])
  285. or @T[1],@C[3]
  286. xor @C[1],@C[3] # ~C[1] ^ ( C[2] | C[3])
  287. mov @C[3],$A[4][1](%rdi) # R[4][1] = ~C[1] ^ ( C[2] | C[3])
  288. mov @C[0],@C[1] # harmonize with the loop top
  289. mov @T[0],@C[0]
  290. test \$255,$iotas
  291. jnz .Loop
  292. lea -192($iotas),$iotas # rewind iotas
  293. ret
  294. .cfi_endproc
  295. .size __KeccakF1600,.-__KeccakF1600
  296. .type KeccakF1600,\@abi-omnipotent
  297. .align 32
  298. KeccakF1600:
  299. .cfi_startproc
  300. push %rbx
  301. .cfi_push %rbx
  302. push %rbp
  303. .cfi_push %rbp
  304. push %r12
  305. .cfi_push %r12
  306. push %r13
  307. .cfi_push %r13
  308. push %r14
  309. .cfi_push %r14
  310. push %r15
  311. .cfi_push %r15
  312. lea 100(%rdi),%rdi # size optimization
  313. sub \$200,%rsp
  314. .cfi_adjust_cfa_offset 200
  315. notq $A[0][1](%rdi)
  316. notq $A[0][2](%rdi)
  317. notq $A[1][3](%rdi)
  318. notq $A[2][2](%rdi)
  319. notq $A[3][2](%rdi)
  320. notq $A[4][0](%rdi)
  321. lea iotas(%rip),$iotas
  322. lea 100(%rsp),%rsi # size optimization
  323. call __KeccakF1600
  324. notq $A[0][1](%rdi)
  325. notq $A[0][2](%rdi)
  326. notq $A[1][3](%rdi)
  327. notq $A[2][2](%rdi)
  328. notq $A[3][2](%rdi)
  329. notq $A[4][0](%rdi)
  330. lea -100(%rdi),%rdi # preserve A[][]
  331. add \$200,%rsp
  332. .cfi_adjust_cfa_offset -200
  333. pop %r15
  334. .cfi_pop %r15
  335. pop %r14
  336. .cfi_pop %r14
  337. pop %r13
  338. .cfi_pop %r13
  339. pop %r12
  340. .cfi_pop %r12
  341. pop %rbp
  342. .cfi_pop %rbp
  343. pop %rbx
  344. .cfi_pop %rbx
  345. ret
  346. .cfi_endproc
  347. .size KeccakF1600,.-KeccakF1600
  348. ___
  349. { my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
  350. ($A_flat,$inp) = ("%r8","%r9");
  351. $code.=<<___;
  352. .globl SHA3_absorb
  353. .type SHA3_absorb,\@function,4
  354. .align 32
  355. SHA3_absorb:
  356. .cfi_startproc
  357. push %rbx
  358. .cfi_push %rbx
  359. push %rbp
  360. .cfi_push %rbp
  361. push %r12
  362. .cfi_push %r12
  363. push %r13
  364. .cfi_push %r13
  365. push %r14
  366. .cfi_push %r14
  367. push %r15
  368. .cfi_push %r15
  369. lea 100(%rdi),%rdi # size optimization
  370. sub \$232,%rsp
  371. .cfi_adjust_cfa_offset 232
  372. mov %rsi,$inp
  373. lea 100(%rsp),%rsi # size optimization
  374. notq $A[0][1](%rdi)
  375. notq $A[0][2](%rdi)
  376. notq $A[1][3](%rdi)
  377. notq $A[2][2](%rdi)
  378. notq $A[3][2](%rdi)
  379. notq $A[4][0](%rdi)
  380. lea iotas(%rip),$iotas
  381. mov $bsz,216-100(%rsi) # save bsz
  382. .Loop_absorb:
  383. cmp $bsz,$len
  384. jc .Ldone_absorb
  385. shr \$3,$bsz
  386. lea -100(%rdi),$A_flat
  387. .Lblock_absorb:
  388. mov ($inp),%rax
  389. lea 8($inp),$inp
  390. xor ($A_flat),%rax
  391. lea 8($A_flat),$A_flat
  392. sub \$8,$len
  393. mov %rax,-8($A_flat)
  394. sub \$1,$bsz
  395. jnz .Lblock_absorb
  396. mov $inp,200-100(%rsi) # save inp
  397. mov $len,208-100(%rsi) # save len
  398. call __KeccakF1600
  399. mov 200-100(%rsi),$inp # pull inp
  400. mov 208-100(%rsi),$len # pull len
  401. mov 216-100(%rsi),$bsz # pull bsz
  402. jmp .Loop_absorb
  403. .align 32
  404. .Ldone_absorb:
  405. mov $len,%rax # return value
  406. notq $A[0][1](%rdi)
  407. notq $A[0][2](%rdi)
  408. notq $A[1][3](%rdi)
  409. notq $A[2][2](%rdi)
  410. notq $A[3][2](%rdi)
  411. notq $A[4][0](%rdi)
  412. add \$232,%rsp
  413. .cfi_adjust_cfa_offset -232
  414. pop %r15
  415. .cfi_pop %r15
  416. pop %r14
  417. .cfi_pop %r14
  418. pop %r13
  419. .cfi_pop %r13
  420. pop %r12
  421. .cfi_pop %r12
  422. pop %rbp
  423. .cfi_pop %rbp
  424. pop %rbx
  425. .cfi_pop %rbx
  426. ret
  427. .cfi_endproc
  428. .size SHA3_absorb,.-SHA3_absorb
  429. ___
  430. }
  431. { my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
  432. ($out,$len,$bsz) = ("%r12","%r13","%r14");
  433. $code.=<<___;
  434. .globl SHA3_squeeze
  435. .type SHA3_squeeze,\@function,4
  436. .align 32
  437. SHA3_squeeze:
  438. .cfi_startproc
  439. push %r12
  440. .cfi_push %r12
  441. push %r13
  442. .cfi_push %r13
  443. push %r14
  444. .cfi_push %r14
  445. shr \$3,%rcx
  446. mov $A_flat,%r8
  447. mov %rsi,$out
  448. mov %rdx,$len
  449. mov %rcx,$bsz
  450. jmp .Loop_squeeze
  451. .align 32
  452. .Loop_squeeze:
  453. cmp \$8,$len
  454. jb .Ltail_squeeze
  455. mov (%r8),%rax
  456. lea 8(%r8),%r8
  457. mov %rax,($out)
  458. lea 8($out),$out
  459. sub \$8,$len # len -= 8
  460. jz .Ldone_squeeze
  461. sub \$1,%rcx # bsz--
  462. jnz .Loop_squeeze
  463. call KeccakF1600
  464. mov $A_flat,%r8
  465. mov $bsz,%rcx
  466. jmp .Loop_squeeze
  467. .Ltail_squeeze:
  468. mov %r8, %rsi
  469. mov $out,%rdi
  470. mov $len,%rcx
  471. .byte 0xf3,0xa4 # rep movsb
  472. .Ldone_squeeze:
  473. pop %r14
  474. .cfi_pop %r14
  475. pop %r13
  476. .cfi_pop %r13
  477. pop %r12
  478. .cfi_pop %r13
  479. ret
  480. .cfi_endproc
  481. .size SHA3_squeeze,.-SHA3_squeeze
  482. ___
  483. }
  484. $code.=<<___;
  485. .align 256
  486. .quad 0,0,0,0,0,0,0,0
  487. .type iotas,\@object
  488. iotas:
  489. .quad 0x0000000000000001
  490. .quad 0x0000000000008082
  491. .quad 0x800000000000808a
  492. .quad 0x8000000080008000
  493. .quad 0x000000000000808b
  494. .quad 0x0000000080000001
  495. .quad 0x8000000080008081
  496. .quad 0x8000000000008009
  497. .quad 0x000000000000008a
  498. .quad 0x0000000000000088
  499. .quad 0x0000000080008009
  500. .quad 0x000000008000000a
  501. .quad 0x000000008000808b
  502. .quad 0x800000000000008b
  503. .quad 0x8000000000008089
  504. .quad 0x8000000000008003
  505. .quad 0x8000000000008002
  506. .quad 0x8000000000000080
  507. .quad 0x000000000000800a
  508. .quad 0x800000008000000a
  509. .quad 0x8000000080008081
  510. .quad 0x8000000000008080
  511. .quad 0x0000000080000001
  512. .quad 0x8000000080008008
  513. .size iotas,.-iotas
  514. .asciz "Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  515. ___
  516. foreach (split("\n",$code)) {
  517. # Below replacement results in 11.2 on Sandy Bridge, 9.4 on
  518. # Haswell, but it hurts other processors by up to 2-3-4x...
  519. #s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
  520. # Below replacement results in 9.3 on Haswell [as well as
  521. # on Ryzen, i.e. it *hurts* Ryzen]...
  522. #s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;
  523. print $_, "\n";
  524. }
  525. close STDOUT or die "error closing STDOUT: $!";