keccak1600-x86_64.pl 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608
  1. #!/usr/bin/env perl
  2. # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # Keccak-1600 for x86_64.
  17. #
  18. # June 2017.
  19. #
  20. # Below code is [lane complementing] KECCAK_2X implementation (see
  21. # sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
  22. # instead of actually unrolling the loop pair-wise I simply flip
  23. # pointers to T[][] and A[][] at the end of round. Since number of
  24. # rounds is even, last round writes to A[][] and everything works out.
  25. # How does it compare to x86_64 assembly module in Keccak Code Package?
  26. # Depending on processor it's either as fast or faster by up to 15%...
  27. #
  28. ########################################################################
  29. # Numbers are cycles per processed byte out of large message.
  30. #
  31. # r=1088(*)
  32. #
  33. # P4 25.8
  34. # Core 2 12.9
  35. # Westmere 13.7
  36. # Sandy Bridge 12.9(**)
  37. # Haswell 9.6
  38. # Skylake 9.4
  39. # Silvermont 22.8
  40. # Goldmont 15.8
  41. # VIA Nano 17.3
  42. # Sledgehammer 13.3
  43. # Bulldozer 16.5
  44. # Ryzen 8.8
  45. #
  46. # (*) Corresponds to SHA3-256. Improvement over compiler-generated
  47. # code varies a lot, most common coefficient is 15% in comparison to
  48. # gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
  49. # (**) Sandy Bridge has broken rotate instruction. Performance can be
  50. # improved by 14% by replacing rotates with double-precision
  51. # shift with same register as source and destination.
  # Command line: [flavour] output-file.  A lone argument that contains a dot
  # is taken to be the output file, leaving the flavour undefined.
  $flavour = shift;
  $output  = shift;
  if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

  # Win64 targets are recognised by flavour or by an .asm output name.
  $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

  # Locate the x86_64-xlate.pl translator: first next to this script, then
  # under ../../perlasm/ relative to it.
  $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  die "can't locate x86_64-xlate.pl";

  # Everything printed to STDOUT from here on is piped through the translator.
  # Die if the pipe cannot be started; otherwise the generated code would be
  # silently discarded.
  open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
      or die "can't call $xlate: $!";
  *STDOUT=*OUT;

  # @A[i][j] is the byte displacement of lane A[i][j], i.e. 8*(5*i+j)-100.
  # The assembly below biases its state pointers by +100 ("size
  # optimization"), which keeps all 25 displacements between -100 and 92.
  my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
                8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));

  # Register assignment used by the generated code.
  my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");   # C[0..4]
  my @D = map("%r$_",(8..12));                    # D[0..4]
  my @T = map("%r$_",(13..14));                   # two temporaries
  my $iotas = "%r15";                             # round-constant pointer

  # Rotation count for lane A[i][j], indexed as $rhotates[i][j].
  my @rhotates = ([  0,  1, 62, 28, 27 ],
                  [ 36, 44,  6, 55, 20 ],
                  [  3, 10, 43, 25, 39 ],
                  [ 41, 45, 15, 21,  8 ],
                  [ 18,  2, 61, 56, 14 ]);
  # Opening part of the generated __KeccakF1600 routine.  The prologue loads
  # row A[4][0..4] from the state addressed by %rdi into the @C registers.
  # Each pass through .Loop then folds rows 0..3 into @C to form the five
  # column parities and derives D[0..4] from them in place, as annotated on
  # the rol/xor pairs.  Along the way @D[0..3] receive A[0][0], A[1][1],
  # A[2][2] and A[3][3], and @D[4] keeps the unmodified A[4][4], so those
  # lanes are already in registers when the next fragment consumes them.
  73. $code.=<<___;
  74. .text
  75. .type __KeccakF1600,\@function
  76. .align 32
  77. __KeccakF1600:
  78. mov $A[4][0](%rdi),@C[0]
  79. mov $A[4][1](%rdi),@C[1]
  80. mov $A[4][2](%rdi),@C[2]
  81. mov $A[4][3](%rdi),@C[3]
  82. mov $A[4][4](%rdi),@C[4]
  83. jmp .Loop
  84. .align 32
  85. .Loop:
  86. mov $A[0][0](%rdi),@D[0]
  87. mov $A[1][1](%rdi),@D[1]
  88. mov $A[2][2](%rdi),@D[2]
  89. mov $A[3][3](%rdi),@D[3]
  90. xor $A[0][2](%rdi),@C[2]
  91. xor $A[0][3](%rdi),@C[3]
  92. xor @D[0], @C[0]
  93. xor $A[0][1](%rdi),@C[1]
  94. xor $A[1][2](%rdi),@C[2]
  95. xor $A[1][0](%rdi),@C[0]
  96. mov @C[4],@D[4]
  97. xor $A[0][4](%rdi),@C[4]
  98. xor @D[2], @C[2]
  99. xor $A[2][0](%rdi),@C[0]
  100. xor $A[1][3](%rdi),@C[3]
  101. xor @D[1], @C[1]
  102. xor $A[1][4](%rdi),@C[4]
  103. xor $A[3][2](%rdi),@C[2]
  104. xor $A[3][0](%rdi),@C[0]
  105. xor $A[2][3](%rdi),@C[3]
  106. xor $A[2][1](%rdi),@C[1]
  107. xor $A[2][4](%rdi),@C[4]
  108. mov @C[2],@T[0]
  109. rol \$1,@C[2]
  110. xor @C[0],@C[2] # D[1] = ROL64(C[2], 1) ^ C[0]
  111. xor @D[3], @C[3]
  112. rol \$1,@C[0]
  113. xor @C[3],@C[0] # D[4] = ROL64(C[0], 1) ^ C[3]
  114. xor $A[3][1](%rdi),@C[1]
  115. rol \$1,@C[3]
  116. xor @C[1],@C[3] # D[2] = ROL64(C[3], 1) ^ C[1]
  117. xor $A[3][4](%rdi),@C[4]
  118. rol \$1,@C[1]
  119. xor @C[4],@C[1] # D[0] = ROL64(C[1], 1) ^ C[4]
  120. rol \$1,@C[4]
  121. xor @T[0],@C[4] # D[3] = ROL64(C[4], 1) ^ C[2]
  122. ___
  # Rename the register lists instead of moving data.  After the code emitted
  # above, the old @C[1..4,0] hold D[0..4] and the old @D hold A[0][0],
  # A[1][1], A[2][2], A[3][3] and A[4][4].  From here on @D[i] names the
  # register holding D[i] and @C names the registers holding those five lanes.
  123. (@D[0..4], @C) = (@C[1..4,0], @D);
  # Rows 0..3 of the round output.  Each group xors five input lanes (read
  # through %rdi) with their D[] value, rotates them by the @rhotates entry
  # named in each rol, combines them as annotated and stores the results
  # through %rsi.  The round constant at ($iotas) is xored into R[0][0] and
  # $iotas is advanced by 8.  The tail of the fragment starts row 4 by xoring
  # A[0][2], A[1][3], A[4][1], A[2][4] and A[3][0] into the @D registers and
  # rotating them, and swaps %rsi with %rdi so the buffer just written is
  # addressed by %rdi afterwards.
  124. $code.=<<___;
  125. xor @D[1],@C[1]
  126. xor @D[2],@C[2]
  127. rol \$$rhotates[1][1],@C[1]
  128. xor @D[3],@C[3]
  129. xor @D[4],@C[4]
  130. rol \$$rhotates[2][2],@C[2]
  131. xor @D[0],@C[0]
  132. mov @C[1],@T[0]
  133. rol \$$rhotates[3][3],@C[3]
  134. or @C[2],@C[1]
  135. xor @C[0],@C[1] # C[0] ^ ( C[1] | C[2])
  136. rol \$$rhotates[4][4],@C[4]
  137. xor ($iotas),@C[1]
  138. lea 8($iotas),$iotas
  139. mov @C[4],@T[1]
  140. and @C[3],@C[4]
  141. mov @C[1],$A[0][0](%rsi) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
  142. xor @C[2],@C[4] # C[2] ^ ( C[4] & C[3])
  143. not @C[2]
  144. mov @C[4],$A[0][2](%rsi) # R[0][2] = C[2] ^ ( C[4] & C[3])
  145. or @C[3],@C[2]
  146. mov $A[4][2](%rdi),@C[4]
  147. xor @T[0],@C[2] # C[1] ^ (~C[2] | C[3])
  148. mov @C[2],$A[0][1](%rsi) # R[0][1] = C[1] ^ (~C[2] | C[3])
  149. and @C[0],@T[0]
  150. mov $A[1][4](%rdi),@C[1]
  151. xor @T[1],@T[0] # C[4] ^ ( C[1] & C[0])
  152. mov $A[2][0](%rdi),@C[2]
  153. mov @T[0],$A[0][4](%rsi) # R[0][4] = C[4] ^ ( C[1] & C[0])
  154. or @C[0],@T[1]
  155. mov $A[0][3](%rdi),@C[0]
  156. xor @C[3],@T[1] # C[3] ^ ( C[4] | C[0])
  157. mov $A[3][1](%rdi),@C[3]
  158. mov @T[1],$A[0][3](%rsi) # R[0][3] = C[3] ^ ( C[4] | C[0])
  159. xor @D[3],@C[0]
  160. xor @D[2],@C[4]
  161. rol \$$rhotates[0][3],@C[0]
  162. xor @D[1],@C[3]
  163. xor @D[4],@C[1]
  164. rol \$$rhotates[4][2],@C[4]
  165. rol \$$rhotates[3][1],@C[3]
  166. xor @D[0],@C[2]
  167. rol \$$rhotates[1][4],@C[1]
  168. mov @C[0],@T[0]
  169. or @C[4],@C[0]
  170. rol \$$rhotates[2][0],@C[2]
  171. xor @C[3],@C[0] # C[3] ^ (C[0] | C[4])
  172. mov @C[0],$A[1][3](%rsi) # R[1][3] = C[3] ^ (C[0] | C[4])
  173. mov @C[1],@T[1]
  174. and @T[0],@C[1]
  175. mov $A[0][1](%rdi),@C[0]
  176. xor @C[4],@C[1] # C[4] ^ (C[1] & C[0])
  177. not @C[4]
  178. mov @C[1],$A[1][4](%rsi) # R[1][4] = C[4] ^ (C[1] & C[0])
  179. or @C[3],@C[4]
  180. mov $A[1][2](%rdi),@C[1]
  181. xor @C[2],@C[4] # C[2] ^ (~C[4] | C[3])
  182. mov @C[4],$A[1][2](%rsi) # R[1][2] = C[2] ^ (~C[4] | C[3])
  183. and @C[2],@C[3]
  184. mov $A[4][0](%rdi),@C[4]
  185. xor @T[1],@C[3] # C[1] ^ (C[3] & C[2])
  186. mov @C[3],$A[1][1](%rsi) # R[1][1] = C[1] ^ (C[3] & C[2])
  187. or @C[2],@T[1]
  188. mov $A[2][3](%rdi),@C[2]
  189. xor @T[0],@T[1] # C[0] ^ (C[1] | C[2])
  190. mov $A[3][4](%rdi),@C[3]
  191. mov @T[1],$A[1][0](%rsi) # R[1][0] = C[0] ^ (C[1] | C[2])
  192. xor @D[3],@C[2]
  193. xor @D[4],@C[3]
  194. rol \$$rhotates[2][3],@C[2]
  195. xor @D[2],@C[1]
  196. rol \$$rhotates[3][4],@C[3]
  197. xor @D[0],@C[4]
  198. rol \$$rhotates[1][2],@C[1]
  199. xor @D[1],@C[0]
  200. rol \$$rhotates[4][0],@C[4]
  201. mov @C[2],@T[0]
  202. and @C[3],@C[2]
  203. rol \$$rhotates[0][1],@C[0]
  204. not @C[3]
  205. xor @C[1],@C[2] # C[1] ^ ( C[2] & C[3])
  206. mov @C[2],$A[2][1](%rsi) # R[2][1] = C[1] ^ ( C[2] & C[3])
  207. mov @C[4],@T[1]
  208. and @C[3],@C[4]
  209. mov $A[2][1](%rdi),@C[2]
  210. xor @T[0],@C[4] # C[2] ^ ( C[4] & ~C[3])
  211. mov @C[4],$A[2][2](%rsi) # R[2][2] = C[2] ^ ( C[4] & ~C[3])
  212. or @C[1],@T[0]
  213. mov $A[4][3](%rdi),@C[4]
  214. xor @C[0],@T[0] # C[0] ^ ( C[2] | C[1])
  215. mov @T[0],$A[2][0](%rsi) # R[2][0] = C[0] ^ ( C[2] | C[1])
  216. and @C[0],@C[1]
  217. xor @T[1],@C[1] # C[4] ^ ( C[1] & C[0])
  218. mov @C[1],$A[2][4](%rsi) # R[2][4] = C[4] ^ ( C[1] & C[0])
  219. or @C[0],@T[1]
  220. mov $A[1][0](%rdi),@C[1]
  221. xor @C[3],@T[1] # ~C[3] ^ ( C[0] | C[4])
  222. mov $A[3][2](%rdi),@C[3]
  223. mov @T[1],$A[2][3](%rsi) # R[2][3] = ~C[3] ^ ( C[0] | C[4])
  224. mov $A[0][4](%rdi),@C[0]
  225. xor @D[1],@C[2]
  226. xor @D[2],@C[3]
  227. rol \$$rhotates[2][1],@C[2]
  228. xor @D[0],@C[1]
  229. rol \$$rhotates[3][2],@C[3]
  230. xor @D[3],@C[4]
  231. rol \$$rhotates[1][0],@C[1]
  232. xor @D[4],@C[0]
  233. rol \$$rhotates[4][3],@C[4]
  234. mov @C[2],@T[0]
  235. or @C[3],@C[2]
  236. rol \$$rhotates[0][4],@C[0]
  237. not @C[3]
  238. xor @C[1],@C[2] # C[1] ^ ( C[2] | C[3])
  239. mov @C[2],$A[3][1](%rsi) # R[3][1] = C[1] ^ ( C[2] | C[3])
  240. mov @C[4],@T[1]
  241. or @C[3],@C[4]
  242. xor @T[0],@C[4] # C[2] ^ ( C[4] | ~C[3])
  243. mov @C[4],$A[3][2](%rsi) # R[3][2] = C[2] ^ ( C[4] | ~C[3])
  244. and @C[1],@T[0]
  245. xor @C[0],@T[0] # C[0] ^ ( C[2] & C[1])
  246. mov @T[0],$A[3][0](%rsi) # R[3][0] = C[0] ^ ( C[2] & C[1])
  247. or @C[0],@C[1]
  248. xor @T[1],@C[1] # C[4] ^ ( C[1] | C[0])
  249. mov @C[1],$A[3][4](%rsi) # R[3][4] = C[4] ^ ( C[1] | C[0])
  250. and @T[1],@C[0]
  251. xor @C[3],@C[0] # ~C[3] ^ ( C[0] & C[4])
  252. mov @C[0],$A[3][3](%rsi) # R[3][3] = ~C[3] ^ ( C[0] & C[4])
  253. xor $A[0][2](%rdi),@D[2]
  254. xor $A[1][3](%rdi),@D[3]
  255. rol \$$rhotates[0][2],@D[2]
  256. xor $A[4][1](%rdi),@D[1]
  257. rol \$$rhotates[1][3],@D[3]
  258. xor $A[2][4](%rdi),@D[4]
  259. rol \$$rhotates[4][1],@D[1]
  260. xor $A[3][0](%rdi),@D[0]
  261. xchg %rsi,%rdi
  262. rol \$$rhotates[2][4],@D[4]
  263. rol \$$rhotates[3][0],@D[0]
  264. ___
  # The @D registers now hold the five rotated input lanes for row 4; list
  # them as @C in the order the combining code below refers to them.
  265. @C = @D[2..4,0,1];
  # Row 4 of the round output is stored through %rdi, which after the xchg
  # emitted just before addresses the buffer being written.  The two movs
  # marked "harmonize with the loop top" copy R[4][4] and R[4][3] into the
  # registers the loop top reads as C[4] and C[3].  The loop repeats until
  # the low 8 bits of $iotas are zero, after which $iotas is rewound by 192
  # bytes (24 constants of 8 bytes).
  #
  # KeccakF1600 is the exported wrapper: it saves %rbx, %rbp and %r12-%r15,
  # biases %rdi by 100, reserves 200 bytes of stack addressed through %rsi
  # as the second buffer, complements six lanes of the state before and
  # after calling __KeccakF1600, and undoes the %rdi bias before returning.
  266. $code.=<<___;
  267. mov @C[0],@T[0]
  268. and @C[1],@C[0]
  269. not @C[1]
  270. xor @C[4],@C[0] # C[4] ^ ( C[0] & C[1])
  271. mov @C[0],$A[4][4](%rdi) # R[4][4] = C[4] ^ ( C[0] & C[1])
  272. mov @C[2],@T[1]
  273. and @C[1],@C[2]
  274. xor @T[0],@C[2] # C[0] ^ ( C[2] & ~C[1])
  275. mov @C[2],$A[4][0](%rdi) # R[4][0] = C[0] ^ ( C[2] & ~C[1])
  276. or @C[4],@T[0]
  277. xor @C[3],@T[0] # C[3] ^ ( C[0] | C[4])
  278. mov @T[0],$A[4][3](%rdi) # R[4][3] = C[3] ^ ( C[0] | C[4])
  279. and @C[3],@C[4]
  280. xor @T[1],@C[4] # C[2] ^ ( C[4] & C[3])
  281. mov @C[4],$A[4][2](%rdi) # R[4][2] = C[2] ^ ( C[4] & C[3])
  282. or @T[1],@C[3]
  283. xor @C[1],@C[3] # ~C[1] ^ ( C[2] | C[3])
  284. mov @C[3],$A[4][1](%rdi) # R[4][1] = ~C[1] ^ ( C[2] | C[3])
  285. mov @C[0],@C[1] # harmonize with the loop top
  286. mov @T[0],@C[0]
  287. test \$255,$iotas
  288. jnz .Loop
  289. lea -192($iotas),$iotas # rewind iotas
  290. ret
  291. .size __KeccakF1600,.-__KeccakF1600
  292. .globl KeccakF1600
  293. .type KeccakF1600,\@function
  294. .align 32
  295. KeccakF1600:
  296. .cfi_startproc
  297. push %rbx
  298. .cfi_push %rbx
  299. push %rbp
  300. .cfi_push %rbp
  301. push %r12
  302. .cfi_push %r12
  303. push %r13
  304. .cfi_push %r13
  305. push %r14
  306. .cfi_push %r14
  307. push %r15
  308. .cfi_push %r15
  309. lea 100(%rdi),%rdi # size optimization
  310. sub \$200,%rsp
  311. .cfi_adjust_cfa_offset 200
  312. notq $A[0][1](%rdi)
  313. notq $A[0][2](%rdi)
  314. notq $A[1][3](%rdi)
  315. notq $A[2][2](%rdi)
  316. notq $A[3][2](%rdi)
  317. notq $A[4][0](%rdi)
  318. lea iotas(%rip),$iotas
  319. lea 100(%rsp),%rsi # size optimization
  320. call __KeccakF1600
  321. notq $A[0][1](%rdi)
  322. notq $A[0][2](%rdi)
  323. notq $A[1][3](%rdi)
  324. notq $A[2][2](%rdi)
  325. notq $A[3][2](%rdi)
  326. notq $A[4][0](%rdi)
  327. lea -100(%rdi),%rdi # preserve A[][]
  328. add \$200,%rsp
  329. .cfi_adjust_cfa_offset -200
  330. pop %r15
  331. .cfi_pop %r15
  332. pop %r14
  333. .cfi_pop %r14
  334. pop %r13
  335. .cfi_pop %r13
  336. pop %r12
  337. .cfi_pop %r12
  338. pop %rbp
  339. .cfi_pop %rbp
  340. pop %rbx
  341. .cfi_pop %rbx
  342. ret
  343. .cfi_endproc
  344. .size KeccakF1600,.-KeccakF1600
  345. ___
  # Generator for SHA3_absorb.  As emitted, the routine receives the state in
  # %rdi, the input pointer in %rsi, the byte count in %rdx and the block
  # size in %rcx.  %rsi is reused for the scratch buffer, so the state cursor
  # and the input pointer are renamed to %r8 and %r9 below.
  346. { my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
  347. ($A_flat,$inp) = ("%r8","%r9");
  # While len >= bsz, the loop xors bsz/8 quadwords of input into the state
  # and calls __KeccakF1600.  inp, len and bsz live in registers that
  # __KeccakF1600 uses as @D/@C, so they are spilled to the three slots above
  # the 200-byte scratch area and reloaded after each call.  The leftover len
  # is returned in %rax.  The same six lanes that KeccakF1600 complements are
  # complemented on entry and again on exit.
  # NOTE(review): bsz is shifted right by 3 and used as a countdown, so this
  # presumably expects bsz to be a non-zero multiple of 8 - confirm with callers.
  348. $code.=<<___;
  349. .globl SHA3_absorb
  350. .type SHA3_absorb,\@function
  351. .align 32
  352. SHA3_absorb:
  353. .cfi_startproc
  354. push %rbx
  355. .cfi_push %rbx
  356. push %rbp
  357. .cfi_push %rbp
  358. push %r12
  359. .cfi_push %r12
  360. push %r13
  361. .cfi_push %r13
  362. push %r14
  363. .cfi_push %r14
  364. push %r15
  365. .cfi_push %r15
  366. lea 100(%rdi),%rdi # size optimization
  367. sub \$232,%rsp
  368. .cfi_adjust_cfa_offset 232
  369. mov %rsi,$inp
  370. lea 100(%rsp),%rsi # size optimization
  371. notq $A[0][1](%rdi)
  372. notq $A[0][2](%rdi)
  373. notq $A[1][3](%rdi)
  374. notq $A[2][2](%rdi)
  375. notq $A[3][2](%rdi)
  376. notq $A[4][0](%rdi)
  377. lea iotas(%rip),$iotas
  378. mov $bsz,216-100(%rsi) # save bsz
  379. .Loop_absorb:
  380. cmp $bsz,$len
  381. jc .Ldone_absorb
  382. shr \$3,$bsz
  383. lea -100(%rdi),$A_flat
  384. .Lblock_absorb:
  385. mov ($inp),%rax
  386. lea 8($inp),$inp
  387. xor ($A_flat),%rax
  388. lea 8($A_flat),$A_flat
  389. sub \$8,$len
  390. mov %rax,-8($A_flat)
  391. sub \$1,$bsz
  392. jnz .Lblock_absorb
  393. mov $inp,200-100(%rsi) # save inp
  394. mov $len,208-100(%rsi) # save len
  395. call __KeccakF1600
  396. mov 200-100(%rsi),$inp # pull inp
  397. mov 208-100(%rsi),$len # pull len
  398. mov 216-100(%rsi),$bsz # pull bsz
  399. jmp .Loop_absorb
  400. .align 32
  401. .Ldone_absorb:
  402. mov $len,%rax # return value
  403. notq $A[0][1](%rdi)
  404. notq $A[0][2](%rdi)
  405. notq $A[1][3](%rdi)
  406. notq $A[2][2](%rdi)
  407. notq $A[3][2](%rdi)
  408. notq $A[4][0](%rdi)
  409. add \$232,%rsp
  410. .cfi_adjust_cfa_offset -232
  411. pop %r15
  412. .cfi_pop %r15
  413. pop %r14
  414. .cfi_pop %r14
  415. pop %r13
  416. .cfi_pop %r13
  417. pop %r12
  418. .cfi_pop %r12
  419. pop %rbp
  420. .cfi_pop %rbp
  421. pop %rbx
  422. .cfi_pop %rbx
  423. ret
  424. .cfi_endproc
  425. .size SHA3_absorb,.-SHA3_absorb
  426. ___
  427. }
  # Generator for SHA3_squeeze.  As emitted, the routine receives the state
  # in %rdi, the output pointer in %rsi, the byte count in %rdx and the block
  # size in %rcx.  out, len and bsz/8 are moved to %r12-%r14, which
  # KeccakF1600 saves and restores, so they survive the call; %r8 (state
  # cursor) and %rcx (quadwords left in the current block) are reloaded after
  # each call.
  #
  # The loop copies the state to the output one quadword at a time and calls
  # KeccakF1600 whenever a full block has been consumed.  Fewer than 8
  # remaining bytes are copied with rep movsb.
  # NOTE(review): bsz is shifted right by 3 and used as a countdown, so this
  # presumably expects bsz to be a non-zero multiple of 8 - confirm with callers.
  #
  # Fix: the epilogue used to annotate `pop %r12` with `.cfi_pop %r13`, which
  # described %r13 as restored twice and never %r12.  Each pop is now paired
  # with the .cfi_pop of the same register.
  { my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
    ($out,$len,$bsz) = ("%r12","%r13","%r14");

  $code.=<<___;
.globl SHA3_squeeze
.type SHA3_squeeze,\@function
.align 32
SHA3_squeeze:
.cfi_startproc
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
shr \$3,%rcx
mov $A_flat,%r8
mov %rsi,$out
mov %rdx,$len
mov %rcx,$bsz
jmp .Loop_squeeze
.align 32
.Loop_squeeze:
cmp \$8,$len
jb .Ltail_squeeze
mov (%r8),%rax
lea 8(%r8),%r8
mov %rax,($out)
lea 8($out),$out
sub \$8,$len # len -= 8
jz .Ldone_squeeze
sub \$1,%rcx # bsz--
jnz .Loop_squeeze
call KeccakF1600
mov $A_flat,%r8
mov $bsz,%rcx
jmp .Loop_squeeze
.Ltail_squeeze:
mov %r8, %rsi
mov $out,%rdi
mov $len,%rcx
.byte 0xf3,0xa4 # rep movsb
.Ldone_squeeze:
pop %r14
.cfi_pop %r14
pop %r13
.cfi_pop %r13
pop %r12
.cfi_pop %r12
ret
.cfi_endproc
.size SHA3_squeeze,.-SHA3_squeeze
___
  }
  # Round-constant table: 24 quadwords labelled iotas, placed 64 bytes (eight
  # zero quadwords) after a 256-byte boundary.  With that layout the address
  # 192 bytes past iotas has its low 8 bits clear, which is the condition the
  # `test $255` loop exit in __KeccakF1600 checks for.  The trailing .asciz
  # is an identification string.
  481. $code.=<<___;
  482. .align 256
  483. .quad 0,0,0,0,0,0,0,0
  484. .type iotas,\@object
  485. iotas:
  486. .quad 0x0000000000000001
  487. .quad 0x0000000000008082
  488. .quad 0x800000000000808a
  489. .quad 0x8000000080008000
  490. .quad 0x000000000000808b
  491. .quad 0x0000000080000001
  492. .quad 0x8000000080008081
  493. .quad 0x8000000000008009
  494. .quad 0x000000000000008a
  495. .quad 0x0000000000000088
  496. .quad 0x0000000080008009
  497. .quad 0x000000008000000a
  498. .quad 0x000000008000808b
  499. .quad 0x800000000000008b
  500. .quad 0x8000000000008089
  501. .quad 0x8000000000008003
  502. .quad 0x8000000000008002
  503. .quad 0x8000000000000080
  504. .quad 0x000000000000800a
  505. .quad 0x800000008000000a
  506. .quad 0x8000000080008081
  507. .quad 0x8000000000008080
  508. .quad 0x0000000080000001
  509. .quad 0x8000000080008008
  510. .size iotas,.-iotas
  511. .asciz "Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  512. ___
  # Write the accumulated assembly line by line to STDOUT, which the preamble
  # redirected into the x86_64-xlate.pl pipe.  The two commented-out
  # substitutions are alternative encodings of the rotates, kept for
  # reference together with the measurements that ruled them out.
  foreach (split("\n",$code)) {
      # Below replacement results in 11.2 on Sandy Bridge, 9.4 on
      # Haswell, but it hurts other processors by up to 2-3-4x...
      #s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
      # Below replacement results in 9.3 on Haswell [as well as
      # on Ryzen, i.e. it *hurts* Ryzen]...
      #s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;

      print $_, "\n";
  }

  # Closing a pipe waits for the child and reports its failure, and buffered
  # write errors surface here too.  Die so that a failed translation does not
  # leave a truncated output file behind an exit status of 0.
  close STDOUT or die "error closing STDOUT: $!";