keccak1600-x86_64.pl 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607
  1. #!/usr/bin/env perl
  2. # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # Keccak-1600 for x86_64.
  17. #
  18. # June 2017.
  19. #
  20. # Below code is [lane complementing] KECCAK_2X implementation (see
  21. # sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
  22. # instead of actually unrolling the loop pair-wise I simply flip
  23. # pointers to T[][] and A[][] at the end of round. Since number of
  24. # rounds is even, last round writes to A[][] and everything works out.
  25. # How does it compare to x86_64 assembly module in Keccak Code Package?
  26. # Depending on processor it's either as fast or faster by up to 15%...
  27. #
  28. ########################################################################
  29. # Numbers are cycles per processed byte out of large message.
  30. #
  31. # r=1088(*)
  32. #
  33. # P4 25.8
  34. # Core 2 12.9
  35. # Westmere 13.7
  36. # Sandy Bridge 12.9(**)
  37. # Haswell 9.6
  38. # Skylake 9.4
  39. # Silvermont 22.8
  40. # Goldmont 15.8
  41. # VIA Nano 17.3
  42. # Sledgehammer 13.3
  43. # Bulldozer 16.5
  44. # Ryzen 8.8
  45. #
  46. # (*) Corresponds to SHA3-256. Improvement over compiler-generated
  47. # code varies a lot, most common coefficient is 15% in comparison to
  48. # gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
  49. # (**) Sandy Bridge has broken rotate instruction. Performance can be
  50. # improved by 14% by replacing rotates with double-precision
  51. # shift with same register as source and destination.
  # Command-line handling and output plumbing.
  #   argv[0]  flavour handed on to x86_64-xlate.pl; if it contains a dot it
  #            is taken to be the output file name and no flavour is passed.
  #   argv[1]  output file name.
  # $win64 is set for nasm/masm/mingw64 flavours or a ".asm" output name.
  # The translator is looked up next to this script first, then under
  # ../../perlasm/ relative to it.
  52. $flavour = shift;
  53. $output = shift;
  54. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  55. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  56. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  57. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  58. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  59. die "can't locate x86_64-xlate.pl";
  # Everything printed to STDOUT from here on goes through the translator.
  # Fail loudly if the pipe cannot be created instead of silently producing
  # no output.
  60. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
  61. *STDOUT=*OUT;
  # $A[i][j] is the displacement of lane (i,j) from the biased state pointer:
  # lane (i,j) sits at byte 8*(5*i+j) of the state and every entry is lowered
  # by 100 to match the "lea 100(...)" bias applied by the entry points, so
  # the 25 displacements span -100..92. The code labels that bias a "size
  # optimization"; presumably it keeps each displacement within a signed byte.
  62. my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
  63. 8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
  # Register bank used by the round loop: five C[] registers, five D[]
  # registers, two temporaries and the round-constant pointer. The generator
  # re-assigns @C and @D between heredocs to rename registers without
  # emitting moves.
  64. my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
  65. my @D = map("%r$_",(8..12));
  66. my @T = map("%r$_",(13..14));
  67. my $iotas = "%r15";
  # Left-rotation counts, indexed [i][j] like @A; interpolated as the
  # immediate operand of the "rol" instructions in the round body.
  68. my @rhotates = ([ 0, 1, 62, 28, 27 ],
  69. [ 36, 44, 6, 55, 20 ],
  70. [ 3, 10, 43, 25, 39 ],
  71. [ 41, 45, 15, 21, 8 ],
  72. [ 18, 2, 61, 56, 14 ]);
  # __KeccakF1600: the round loop shared by KeccakF1600 and SHA3_absorb.
  # Expects %rdi and %rsi to be biased lane-buffer pointers (both callers set
  # them up with "lea 100(...)") and $iotas to point at the round constants.
  # Every pass through .Loop reads lanes through %rdi and writes the new
  # rows 0..3 through %rsi, swaps the two pointers with xchg, and writes row 4
  # through the swapped %rdi, i.e. into the same output buffer. $iotas
  # advances by 8 per round; the loop stops once its low 8 bits are zero and
  # the pointer is then rewound by 192 bytes.
  #
  # This first heredoc preloads row 4 into @C, accumulates the five column
  # parities into @C and turns them into the D[] values named in the
  # comments. Along the way A[0][0], A[1][1], A[2][2], A[3][3] and the
  # incoming @C[4] (A[4][4]) are parked in @D.
  73. $code.=<<___;
  74. .text
  75. .type __KeccakF1600,\@abi-omnipotent
  76. .align 32
  77. __KeccakF1600:
  78. mov $A[4][0](%rdi),@C[0]
  79. mov $A[4][1](%rdi),@C[1]
  80. mov $A[4][2](%rdi),@C[2]
  81. mov $A[4][3](%rdi),@C[3]
  82. mov $A[4][4](%rdi),@C[4]
  83. jmp .Loop
  84. .align 32
  85. .Loop:
  86. mov $A[0][0](%rdi),@D[0]
  87. mov $A[1][1](%rdi),@D[1]
  88. mov $A[2][2](%rdi),@D[2]
  89. mov $A[3][3](%rdi),@D[3]
  90. xor $A[0][2](%rdi),@C[2]
  91. xor $A[0][3](%rdi),@C[3]
  92. xor @D[0], @C[0]
  93. xor $A[0][1](%rdi),@C[1]
  94. xor $A[1][2](%rdi),@C[2]
  95. xor $A[1][0](%rdi),@C[0]
  96. mov @C[4],@D[4]
  97. xor $A[0][4](%rdi),@C[4]
  98. xor @D[2], @C[2]
  99. xor $A[2][0](%rdi),@C[0]
  100. xor $A[1][3](%rdi),@C[3]
  101. xor @D[1], @C[1]
  102. xor $A[1][4](%rdi),@C[4]
  103. xor $A[3][2](%rdi),@C[2]
  104. xor $A[3][0](%rdi),@C[0]
  105. xor $A[2][3](%rdi),@C[3]
  106. xor $A[2][1](%rdi),@C[1]
  107. xor $A[2][4](%rdi),@C[4]
  108. mov @C[2],@T[0]
  109. rol \$1,@C[2]
  110. xor @C[0],@C[2] # D[1] = ROL64(C[2], 1) ^ C[0]
  111. xor @D[3], @C[3]
  112. rol \$1,@C[0]
  113. xor @C[3],@C[0] # D[4] = ROL64(C[0], 1) ^ C[3]
  114. xor $A[3][1](%rdi),@C[1]
  115. rol \$1,@C[3]
  116. xor @C[1],@C[3] # D[2] = ROL64(C[3], 1) ^ C[1]
  117. xor $A[3][4](%rdi),@C[4]
  118. rol \$1,@C[1]
  119. xor @C[4],@C[1] # D[0] = ROL64(C[1], 1) ^ C[4]
  120. rol \$1,@C[4]
  121. xor @T[0],@C[4] # D[3] = ROL64(C[4], 1) ^ C[2]
  122. ___
  # Rename rather than move: per the D[n] comments above, the registers that
  # were @C[1..4,0] now carry D[0..4], while the old @D registers carry the
  # parked lanes A[0][0], A[1][1], A[2][2], A[3][3], A[4][4] and become the @C
  # inputs of output row 0.
  #
  # The next heredoc produces output rows 0..3. For each row it xors the
  # matching D[] into five source lanes, rotates them by @rhotates, combines
  # them as spelled out in the R[i][j] comments and stores the results
  # through %rsi, while already loading the next row's source lanes through
  # %rdi. The round constant at ($iotas) is folded into R[0][0]. At the end
  # the D registers themselves are xored with the five remaining source lanes
  # and rotated, and %rsi/%rdi are exchanged.
  123. (@D[0..4], @C) = (@C[1..4,0], @D);
  124. $code.=<<___;
  125. xor @D[1],@C[1]
  126. xor @D[2],@C[2]
  127. rol \$$rhotates[1][1],@C[1]
  128. xor @D[3],@C[3]
  129. xor @D[4],@C[4]
  130. rol \$$rhotates[2][2],@C[2]
  131. xor @D[0],@C[0]
  132. mov @C[1],@T[0]
  133. rol \$$rhotates[3][3],@C[3]
  134. or @C[2],@C[1]
  135. xor @C[0],@C[1] # C[0] ^ ( C[1] | C[2])
  136. rol \$$rhotates[4][4],@C[4]
  137. xor ($iotas),@C[1]
  138. lea 8($iotas),$iotas
  139. mov @C[4],@T[1]
  140. and @C[3],@C[4]
  141. mov @C[1],$A[0][0](%rsi) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
  142. xor @C[2],@C[4] # C[2] ^ ( C[4] & C[3])
  143. not @C[2]
  144. mov @C[4],$A[0][2](%rsi) # R[0][2] = C[2] ^ ( C[4] & C[3])
  145. or @C[3],@C[2]
  146. mov $A[4][2](%rdi),@C[4]
  147. xor @T[0],@C[2] # C[1] ^ (~C[2] | C[3])
  148. mov @C[2],$A[0][1](%rsi) # R[0][1] = C[1] ^ (~C[2] | C[3])
  149. and @C[0],@T[0]
  150. mov $A[1][4](%rdi),@C[1]
  151. xor @T[1],@T[0] # C[4] ^ ( C[1] & C[0])
  152. mov $A[2][0](%rdi),@C[2]
  153. mov @T[0],$A[0][4](%rsi) # R[0][4] = C[4] ^ ( C[1] & C[0])
  154. or @C[0],@T[1]
  155. mov $A[0][3](%rdi),@C[0]
  156. xor @C[3],@T[1] # C[3] ^ ( C[4] | C[0])
  157. mov $A[3][1](%rdi),@C[3]
  158. mov @T[1],$A[0][3](%rsi) # R[0][3] = C[3] ^ ( C[4] | C[0])
  159. xor @D[3],@C[0]
  160. xor @D[2],@C[4]
  161. rol \$$rhotates[0][3],@C[0]
  162. xor @D[1],@C[3]
  163. xor @D[4],@C[1]
  164. rol \$$rhotates[4][2],@C[4]
  165. rol \$$rhotates[3][1],@C[3]
  166. xor @D[0],@C[2]
  167. rol \$$rhotates[1][4],@C[1]
  168. mov @C[0],@T[0]
  169. or @C[4],@C[0]
  170. rol \$$rhotates[2][0],@C[2]
  171. xor @C[3],@C[0] # C[3] ^ (C[0] | C[4])
  172. mov @C[0],$A[1][3](%rsi) # R[1][3] = C[3] ^ (C[0] | C[4])
  173. mov @C[1],@T[1]
  174. and @T[0],@C[1]
  175. mov $A[0][1](%rdi),@C[0]
  176. xor @C[4],@C[1] # C[4] ^ (C[1] & C[0])
  177. not @C[4]
  178. mov @C[1],$A[1][4](%rsi) # R[1][4] = C[4] ^ (C[1] & C[0])
  179. or @C[3],@C[4]
  180. mov $A[1][2](%rdi),@C[1]
  181. xor @C[2],@C[4] # C[2] ^ (~C[4] | C[3])
  182. mov @C[4],$A[1][2](%rsi) # R[1][2] = C[2] ^ (~C[4] | C[3])
  183. and @C[2],@C[3]
  184. mov $A[4][0](%rdi),@C[4]
  185. xor @T[1],@C[3] # C[1] ^ (C[3] & C[2])
  186. mov @C[3],$A[1][1](%rsi) # R[1][1] = C[1] ^ (C[3] & C[2])
  187. or @C[2],@T[1]
  188. mov $A[2][3](%rdi),@C[2]
  189. xor @T[0],@T[1] # C[0] ^ (C[1] | C[2])
  190. mov $A[3][4](%rdi),@C[3]
  191. mov @T[1],$A[1][0](%rsi) # R[1][0] = C[0] ^ (C[1] | C[2])
  192. xor @D[3],@C[2]
  193. xor @D[4],@C[3]
  194. rol \$$rhotates[2][3],@C[2]
  195. xor @D[2],@C[1]
  196. rol \$$rhotates[3][4],@C[3]
  197. xor @D[0],@C[4]
  198. rol \$$rhotates[1][2],@C[1]
  199. xor @D[1],@C[0]
  200. rol \$$rhotates[4][0],@C[4]
  201. mov @C[2],@T[0]
  202. and @C[3],@C[2]
  203. rol \$$rhotates[0][1],@C[0]
  204. not @C[3]
  205. xor @C[1],@C[2] # C[1] ^ ( C[2] & C[3])
  206. mov @C[2],$A[2][1](%rsi) # R[2][1] = C[1] ^ ( C[2] & C[3])
  207. mov @C[4],@T[1]
  208. and @C[3],@C[4]
  209. mov $A[2][1](%rdi),@C[2]
  210. xor @T[0],@C[4] # C[2] ^ ( C[4] & ~C[3])
  211. mov @C[4],$A[2][2](%rsi) # R[2][2] = C[2] ^ ( C[4] & ~C[3])
  212. or @C[1],@T[0]
  213. mov $A[4][3](%rdi),@C[4]
  214. xor @C[0],@T[0] # C[0] ^ ( C[2] | C[1])
  215. mov @T[0],$A[2][0](%rsi) # R[2][0] = C[0] ^ ( C[2] | C[1])
  216. and @C[0],@C[1]
  217. xor @T[1],@C[1] # C[4] ^ ( C[1] & C[0])
  218. mov @C[1],$A[2][4](%rsi) # R[2][4] = C[4] ^ ( C[1] & C[0])
  219. or @C[0],@T[1]
  220. mov $A[1][0](%rdi),@C[1]
  221. xor @C[3],@T[1] # ~C[3] ^ ( C[0] | C[4])
  222. mov $A[3][2](%rdi),@C[3]
  223. mov @T[1],$A[2][3](%rsi) # R[2][3] = ~C[3] ^ ( C[0] | C[4])
  224. mov $A[0][4](%rdi),@C[0]
  225. xor @D[1],@C[2]
  226. xor @D[2],@C[3]
  227. rol \$$rhotates[2][1],@C[2]
  228. xor @D[0],@C[1]
  229. rol \$$rhotates[3][2],@C[3]
  230. xor @D[3],@C[4]
  231. rol \$$rhotates[1][0],@C[1]
  232. xor @D[4],@C[0]
  233. rol \$$rhotates[4][3],@C[4]
  234. mov @C[2],@T[0]
  235. or @C[3],@C[2]
  236. rol \$$rhotates[0][4],@C[0]
  237. not @C[3]
  238. xor @C[1],@C[2] # C[1] ^ ( C[2] | C[3])
  239. mov @C[2],$A[3][1](%rsi) # R[3][1] = C[1] ^ ( C[2] | C[3])
  240. mov @C[4],@T[1]
  241. or @C[3],@C[4]
  242. xor @T[0],@C[4] # C[2] ^ ( C[4] | ~C[3])
  243. mov @C[4],$A[3][2](%rsi) # R[3][2] = C[2] ^ ( C[4] | ~C[3])
  244. and @C[1],@T[0]
  245. xor @C[0],@T[0] # C[0] ^ ( C[2] & C[1])
  246. mov @T[0],$A[3][0](%rsi) # R[3][0] = C[0] ^ ( C[2] & C[1])
  247. or @C[0],@C[1]
  248. xor @T[1],@C[1] # C[4] ^ ( C[1] | C[0])
  249. mov @C[1],$A[3][4](%rsi) # R[3][4] = C[4] ^ ( C[1] | C[0])
  250. and @T[1],@C[0]
  251. xor @C[3],@C[0] # ~C[3] ^ ( C[0] & C[4])
  252. mov @C[0],$A[3][3](%rsi) # R[3][3] = ~C[3] ^ ( C[0] & C[4])
  253. xor $A[0][2](%rdi),@D[2]
  254. xor $A[1][3](%rdi),@D[3]
  255. rol \$$rhotates[0][2],@D[2]
  256. xor $A[4][1](%rdi),@D[1]
  257. rol \$$rhotates[1][3],@D[3]
  258. xor $A[2][4](%rdi),@D[4]
  259. rol \$$rhotates[4][1],@D[1]
  260. xor $A[3][0](%rdi),@D[0]
  261. xchg %rsi,%rdi
  262. rol \$$rhotates[2][4],@D[4]
  263. rol \$$rhotates[3][0],@D[0]
  264. ___
  # The D registers now hold the five rotated inputs of output row 4; alias
  # them as @C in the order the row-4 code below consumes them. With this
  # aliasing three of the row-4 results already end up in the registers the
  # loop top uses for A[4][0..2]; the two "harmonize" moves place R[4][4] and
  # R[4][3] into the remaining two, so the next round needs no reload of
  # row 4.
  #
  # The same heredoc also emits KeccakF1600, a wrapper that takes the state
  # pointer in %rdi. It saves %rbx, %rbp and %r12-%r15, biases %rdi by 100,
  # reserves 200 bytes of stack as the second lane buffer (%rsi points 100
  # bytes into it), complements lanes A[0][1], A[0][2], A[1][3], A[2][2],
  # A[3][2] and A[4][0] before and after the call (the lane-complementing
  # form mentioned in the file header), and finally restores %rdi, the stack
  # and the saved registers. No .globl directive is emitted for it here.
  265. @C = @D[2..4,0,1];
  266. $code.=<<___;
  267. mov @C[0],@T[0]
  268. and @C[1],@C[0]
  269. not @C[1]
  270. xor @C[4],@C[0] # C[4] ^ ( C[0] & C[1])
  271. mov @C[0],$A[4][4](%rdi) # R[4][4] = C[4] ^ ( C[0] & C[1])
  272. mov @C[2],@T[1]
  273. and @C[1],@C[2]
  274. xor @T[0],@C[2] # C[0] ^ ( C[2] & ~C[1])
  275. mov @C[2],$A[4][0](%rdi) # R[4][0] = C[0] ^ ( C[2] & ~C[1])
  276. or @C[4],@T[0]
  277. xor @C[3],@T[0] # C[3] ^ ( C[0] | C[4])
  278. mov @T[0],$A[4][3](%rdi) # R[4][3] = C[3] ^ ( C[0] | C[4])
  279. and @C[3],@C[4]
  280. xor @T[1],@C[4] # C[2] ^ ( C[4] & C[3])
  281. mov @C[4],$A[4][2](%rdi) # R[4][2] = C[2] ^ ( C[4] & C[3])
  282. or @T[1],@C[3]
  283. xor @C[1],@C[3] # ~C[1] ^ ( C[2] | C[3])
  284. mov @C[3],$A[4][1](%rdi) # R[4][1] = ~C[1] ^ ( C[2] | C[3])
  285. mov @C[0],@C[1] # harmonize with the loop top
  286. mov @T[0],@C[0]
  287. test \$255,$iotas
  288. jnz .Loop
  289. lea -192($iotas),$iotas # rewind iotas
  290. ret
  291. .size __KeccakF1600,.-__KeccakF1600
  292. .type KeccakF1600,\@abi-omnipotent
  293. .align 32
  294. KeccakF1600:
  295. .cfi_startproc
  296. push %rbx
  297. .cfi_push %rbx
  298. push %rbp
  299. .cfi_push %rbp
  300. push %r12
  301. .cfi_push %r12
  302. push %r13
  303. .cfi_push %r13
  304. push %r14
  305. .cfi_push %r14
  306. push %r15
  307. .cfi_push %r15
  308. lea 100(%rdi),%rdi # size optimization
  309. sub \$200,%rsp
  310. .cfi_adjust_cfa_offset 200
  311. notq $A[0][1](%rdi)
  312. notq $A[0][2](%rdi)
  313. notq $A[1][3](%rdi)
  314. notq $A[2][2](%rdi)
  315. notq $A[3][2](%rdi)
  316. notq $A[4][0](%rdi)
  317. lea iotas(%rip),$iotas
  318. lea 100(%rsp),%rsi # size optimization
  319. call __KeccakF1600
  320. notq $A[0][1](%rdi)
  321. notq $A[0][2](%rdi)
  322. notq $A[1][3](%rdi)
  323. notq $A[2][2](%rdi)
  324. notq $A[3][2](%rdi)
  325. notq $A[4][0](%rdi)
  326. lea -100(%rdi),%rdi # preserve A[][]
  327. add \$200,%rsp
  328. .cfi_adjust_cfa_offset -200
  329. pop %r15
  330. .cfi_pop %r15
  331. pop %r14
  332. .cfi_pop %r14
  333. pop %r13
  334. .cfi_pop %r13
  335. pop %r12
  336. .cfi_pop %r12
  337. pop %rbp
  338. .cfi_pop %rbp
  339. pop %rbx
  340. .cfi_pop %rbx
  341. ret
  342. .cfi_endproc
  343. .size KeccakF1600,.-KeccakF1600
  344. ___
  # SHA3_absorb: exported, declared as a 4-argument function. The arguments
  # arrive as state pointer (%rdi), input pointer (%rsi), input length (%rdx)
  # and block size in bytes (%rcx). While at least one whole block remains,
  # it xors bsz/8 quadwords of input into the start of the state, runs
  # __KeccakF1600 and repeats; the number of unprocessed bytes is returned
  # in %rax. The six complemented lanes are flipped on entry and flipped
  # back before returning.
  345. { my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
  # %rdi and %rsi serve as the two lane-buffer pointers of the round loop, so
  # the input pointer is moved to %r9 and %r8 becomes the walking pointer
  # into the state while a block is xored in.
  346. ($A_flat,$inp) = ("%r8","%r9");
  # Frame: 232 bytes = 200-byte scratch lane buffer plus three 8-byte slots
  # at offsets 200/208/216 for inp/len/bsz. %r9, %rdx and %rcx are among the
  # registers the round loop uses for its C[]/D[] bank, and bsz is also
  # consumed by the per-block countdown, hence the save/pull around the call.
  347. $code.=<<___;
  348. .globl SHA3_absorb
  349. .type SHA3_absorb,\@function,4
  350. .align 32
  351. SHA3_absorb:
  352. .cfi_startproc
  353. push %rbx
  354. .cfi_push %rbx
  355. push %rbp
  356. .cfi_push %rbp
  357. push %r12
  358. .cfi_push %r12
  359. push %r13
  360. .cfi_push %r13
  361. push %r14
  362. .cfi_push %r14
  363. push %r15
  364. .cfi_push %r15
  365. lea 100(%rdi),%rdi # size optimization
  366. sub \$232,%rsp
  367. .cfi_adjust_cfa_offset 232
  368. mov %rsi,$inp
  369. lea 100(%rsp),%rsi # size optimization
  370. notq $A[0][1](%rdi)
  371. notq $A[0][2](%rdi)
  372. notq $A[1][3](%rdi)
  373. notq $A[2][2](%rdi)
  374. notq $A[3][2](%rdi)
  375. notq $A[4][0](%rdi)
  376. lea iotas(%rip),$iotas
  377. mov $bsz,216-100(%rsi) # save bsz
  378. .Loop_absorb:
  379. cmp $bsz,$len
  380. jc .Ldone_absorb
  381. shr \$3,$bsz
  382. lea -100(%rdi),$A_flat
  383. .Lblock_absorb:
  384. mov ($inp),%rax
  385. lea 8($inp),$inp
  386. xor ($A_flat),%rax
  387. lea 8($A_flat),$A_flat
  388. sub \$8,$len
  389. mov %rax,-8($A_flat)
  390. sub \$1,$bsz
  391. jnz .Lblock_absorb
  392. mov $inp,200-100(%rsi) # save inp
  393. mov $len,208-100(%rsi) # save len
  394. call __KeccakF1600
  395. mov 200-100(%rsi),$inp # pull inp
  396. mov 208-100(%rsi),$len # pull len
  397. mov 216-100(%rsi),$bsz # pull bsz
  398. jmp .Loop_absorb
  399. .align 32
  400. .Ldone_absorb:
  401. mov $len,%rax # return value
  402. notq $A[0][1](%rdi)
  403. notq $A[0][2](%rdi)
  404. notq $A[1][3](%rdi)
  405. notq $A[2][2](%rdi)
  406. notq $A[3][2](%rdi)
  407. notq $A[4][0](%rdi)
  408. add \$232,%rsp
  409. .cfi_adjust_cfa_offset -232
  410. pop %r15
  411. .cfi_pop %r15
  412. pop %r14
  413. .cfi_pop %r14
  414. pop %r13
  415. .cfi_pop %r13
  416. pop %r12
  417. .cfi_pop %r12
  418. pop %rbp
  419. .cfi_pop %rbp
  420. pop %rbx
  421. .cfi_pop %rbx
  422. ret
  423. .cfi_endproc
  424. .size SHA3_absorb,.-SHA3_absorb
  425. ___
  426. }
  # SHA3_squeeze: exported, declared as a 4-argument function. The arguments
  # arrive as state pointer (%rdi), output pointer (%rsi), output length
  # (%rdx) and block size in bytes (%rcx). It copies the state to the output
  # 8 bytes at a time; %r8 walks the state and %rcx counts the quadwords left
  # in the current block. When a block is exhausted it calls KeccakF1600 and
  # restarts from the beginning of the state. A final remainder shorter than
  # 8 bytes is copied with "rep movsb".
  427. { my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
  # out/len/bsz are kept in %r12-%r14, which KeccakF1600 saves and restores,
  # so they survive the call; %r8 and %rcx are reloaded after it.
  428. ($out,$len,$bsz) = ("%r12","%r13","%r14");
  # Each .cfi_pop below names the register popped by the instruction just
  # before it, mirroring the push/.cfi_push pairs of the prologue.
  429. $code.=<<___;
  430. .globl SHA3_squeeze
  431. .type SHA3_squeeze,\@function,4
  432. .align 32
  433. SHA3_squeeze:
  434. .cfi_startproc
  435. push %r12
  436. .cfi_push %r12
  437. push %r13
  438. .cfi_push %r13
  439. push %r14
  440. .cfi_push %r14
  441. shr \$3,%rcx
  442. mov $A_flat,%r8
  443. mov %rsi,$out
  444. mov %rdx,$len
  445. mov %rcx,$bsz
  446. jmp .Loop_squeeze
  447. .align 32
  448. .Loop_squeeze:
  449. cmp \$8,$len
  450. jb .Ltail_squeeze
  451. mov (%r8),%rax
  452. lea 8(%r8),%r8
  453. mov %rax,($out)
  454. lea 8($out),$out
  455. sub \$8,$len # len -= 8
  456. jz .Ldone_squeeze
  457. sub \$1,%rcx # bsz--
  458. jnz .Loop_squeeze
  459. call KeccakF1600
  460. mov $A_flat,%r8
  461. mov $bsz,%rcx
  462. jmp .Loop_squeeze
  463. .Ltail_squeeze:
  464. mov %r8, %rsi
  465. mov $out,%rdi
  466. mov $len,%rcx
  467. .byte 0xf3,0xa4 # rep movsb
  468. .Ldone_squeeze:
  469. pop %r14
  470. .cfi_pop %r14
  471. pop %r13
  472. .cfi_pop %r13
  473. pop %r12
  474. .cfi_pop %r12
  475. ret
  476. .cfi_endproc
  477. .size SHA3_squeeze,.-SHA3_squeeze
  478. ___
  479. }
  # Round-constant table plus the module's identification string. The layout
  # is load-bearing: ".align 256", then eight zero quadwords (64 bytes), then
  # the 24 constants (192 bytes), so the table ends exactly on a 256-byte
  # boundary. The round loop uses that as its termination test (low 8 bits
  # of the constant pointer are zero after the last round) and rewinds the
  # pointer by 192 afterwards. Do not add, remove or realign entries without
  # updating that logic.
  480. $code.=<<___;
  481. .align 256
  482. .quad 0,0,0,0,0,0,0,0
  483. .type iotas,\@object
  484. iotas:
  485. .quad 0x0000000000000001
  486. .quad 0x0000000000008082
  487. .quad 0x800000000000808a
  488. .quad 0x8000000080008000
  489. .quad 0x000000000000808b
  490. .quad 0x0000000080000001
  491. .quad 0x8000000080008081
  492. .quad 0x8000000000008009
  493. .quad 0x000000000000008a
  494. .quad 0x0000000000000088
  495. .quad 0x0000000080008009
  496. .quad 0x000000008000000a
  497. .quad 0x000000008000808b
  498. .quad 0x800000000000008b
  499. .quad 0x8000000000008089
  500. .quad 0x8000000000008003
  501. .quad 0x8000000000008002
  502. .quad 0x8000000000000080
  503. .quad 0x000000000000800a
  504. .quad 0x800000008000000a
  505. .quad 0x8000000080008081
  506. .quad 0x8000000000008080
  507. .quad 0x0000000080000001
  508. .quad 0x8000000080008008
  509. .size iotas,.-iotas
  510. .asciz "Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  511. ___
  # Emit the accumulated assembly one line at a time on STDOUT. The two
  # commented-out substitutions are alternative encodings of the rotates,
  # kept for reference together with their measured effect; they are
  # intentionally disabled.
  512. foreach (split("\n",$code)) {
  513. # Below replacement results in 11.2 on Sandy Bridge, 9.4 on
  514. # Haswell, but it hurts other processors by up to 2-3-4x...
  515. #s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
  516. # Below replacement results in 9.3 on Haswell [as well as
  517. # on Ryzen, i.e. it *hurts* Ryzen]...
  518. #s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;
  519. print $_, "\n";
  520. }
  # Closing a piped handle flushes the buffered output and waits for the
  # child; a false return means a write error or a non-zero exit status of
  # the translator, so stop with an error rather than leave a truncated file
  # behind unnoticed.
  521. close STDOUT or die "error closing STDOUT: $!";