sha512-x86_64.pl 62 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. Rights for redistribution and usage in source and binary
  12. # forms are granted according to the OpenSSL license.
  13. # ====================================================================
  14. #
  15. # sha256/512_block procedure for x86_64.
  16. #
  17. # 40% improvement over compiler-generated code on Opteron. On EM64T
  18. # sha256 was observed to run >80% faster and sha512 - >40%. No magical
  19. # tricks, just straight implementation... I really wonder why gcc
  20. # [being armed with inline assembler] fails to generate as fast code.
  21. # The only thing which is cool about this module is that the very
  22. # same instruction sequence is used for both SHA-256 and SHA-512. In
  23. # former case the instructions operate on 32-bit operands, while in
  24. # latter - on 64-bit ones. All I had to do is to get one flavor right,
  25. # the other one passed the test right away:-)
  26. #
  27. # sha256_block runs in ~1005 cycles on Opteron, which gives you
  28. # asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
  29. # frequency in GHz. sha512_block runs in ~1275 cycles, which results
  30. # in 128*1000/1275=100MBps per GHz. Is there room for improvement?
  31. # Well, if you compare it to IA-64 implementation, which maintains
  32. # X[16] in register bank[!], tends to 4 instructions per CPU clock
  33. # cycle and runs in 1003 cycles, 1275 is very good result for 3-way
  34. # issue Opteron pipeline and X[16] maintained in memory. So that *if*
  35. # there is a way to improve it, *then* the only way would be to try to
  36. # offload X[16] updates to SSE unit, but that would require "deeper"
  37. # loop unroll, which in turn would naturally cause size blow-up, not
  38. # to mention increased complexity! And once again, only *if* it's
  39. # actually possible to noticeably improve overall ILP, instruction
  40. # level parallelism, on a given CPU implementation in this case.
  41. #
  42. # Special note on Intel EM64T. While Opteron CPU exhibits perfect
  43. # performance ratio of 1.5 between 64- and 32-bit flavors [see above],
  44. # [currently available] EM64T CPUs apparently are far from it. On the
  45. # contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
  46. # sha256_block:-( This is presumably because 64-bit shifts/rotates
  47. # apparently are not atomic instructions, but implemented in microcode.
  48. #
  49. # May 2012.
  50. #
  51. # Optimization including one of Pavel Semjanov's ideas, alternative
  52. # Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
  53. # unfortunately -2% SHA512 on P4 [which nobody should care about
  54. # that much].
  55. #
  56. # June 2012.
  57. #
  58. # Add SIMD code paths, see below for improvement coefficients. SSSE3
  59. # code path was not attempted for SHA512, because improvement is not
  60. # estimated to be high enough, noticeably less than 9%, to justify
  61. # the effort, not on pre-AVX processors. [Obviously with exclusion
  62. # for VIA Nano, but it has SHA512 instruction that is faster and
  63. # should be used instead.] For reference, corresponding estimated
  64. # upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
  65. # higher coefficients are observed on VIA Nano and Bulldozer has more
  66. # to do with specifics of their architecture [which is topic for
  67. # separate discussion].
  68. #
  69. # November 2012.
  70. #
  71. # Add AVX2 code path. Two consecutive input blocks are loaded to
  72. # 256-bit %ymm registers, with data from first block to least
  73. # significant 128-bit halves and data from second to most significant.
  74. # The data is then processed with same SIMD instruction sequence as
  75. # for AVX, but with %ymm as operands. Side effect is increased stack
  76. # frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
  77. # code size increase.
  78. #
  79. # March 2014.
  80. #
  81. # Add support for Intel SHA Extensions.
  82. ######################################################################
  83. # Current performance in cycles per processed byte (less is better):
  84. #
  85. # SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*)
  86. #
  87. # AMD K8 14.9 - - 9.57 -
  88. # P4 17.3 - - 30.8 -
  89. # Core 2 15.6 13.8(+13%) - 9.97 -
  90. # Westmere 14.8 12.3(+19%) - 9.58 -
  91. # Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**))
  92. # Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%)
  93. # Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%)
  94. # Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%)
  95. # Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%)
  96. # Ryzen 11.0 9.02(+22%) 2.05(+440%) 7.05 5.67(+20%)
  97. # VIA Nano 23.0 16.5(+39%) - 14.7 -
  98. # Atom 23.0 18.9(+22%) - 14.7 -
  99. # Silvermont 27.4 20.6(+33%) - 17.5 -
  100. # Knights L 27.4 21.0(+30%) 19.6(+40%) 17.5 12.8(+37%)
  101. # Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 -
  102. #
  103. # (*) whichever best applicable, including SHAEXT;
  104. # (**) switch from ror to shrd stands for fair share of improvement;
  105. # (***) execution time is fully determined by remaining integer-only
  106. # part, body_00_15; reducing the amount of SIMD instructions
  107. # below certain limit makes no difference/sense; to conserve
  108. # space SHA256 XOP code path is therefore omitted;
  # ---- Command line and toolchain probing ------------------------------
  # Arguments: [flavour] output.  When the first argument contains a dot it
  # is taken to be the output file name and the flavour is left undefined.
  109. $flavour = shift;
  110. $output = shift;
  111. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  # Win64 mode is selected by a nasm/masm/mingw64 flavour or a .asm output.
  112. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  # Locate the x86_64-xlate.pl translator: first in this script's own
  # directory, then in ../../perlasm/ relative to it; give up otherwise.
  113. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  114. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  115. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  116. die "can't locate x86_64-xlate.pl";
  # $avx is a 0/1/2 capability level for the assembler in use.  Each probe
  # below scores one point per version threshold that is met.  Later in
  # this file a non-zero $avx enables the XOP/AVX dispatch code and
  # $avx>1 additionally enables the AVX2 dispatch.
  # Probe 1: GNU assembler, driven through the compiler named in $ENV{CC}.
  117. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  118. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  119. $avx = ($1>=2.19) + ($1>=2.22);
  120. }
  # Probe 2: NASM; only tried for Win64 builds when no level was found yet.
  121. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  122. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  123. $avx = ($1>=2.09) + ($1>=2.10);
  124. }
  # Probe 3: ml64 (MASM); same conditions, keyed on the major version only.
  125. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  126. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  127. $avx = ($1>=10) + ($1>=11);
  128. }
  # Probe 4: clang/LLVM, recognised from the output of "$ENV{CC} -v".
  129. if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
  130. $avx = ($2>=3.0) + ($2>3.0);
  131. }
  # $shaext gates the SHA-extensions code path.  When it is switched off
  # the capability level is capped at 1, which also disables the AVX2 dispatch.
  132. $shaext=1; ### set to zero if compiling for 1.0.1
  133. $avx=1 if (!$shaext && $avx);
  # Route everything printed to STDOUT through the perlasm translator, which
  # is run with the current perl binary ($^X) and is handed the flavour and
  # the output file name.  Abort with a diagnostic if the pipe cannot be
  # created: an unchecked failure here would leave every later print going
  # to a dead handle and the build without an output file.
  134. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
  135. *STDOUT=*OUT;
  # ---- Per-digest parameters -------------------------------------------
  # The output file name selects what is generated: a name containing "512"
  # yields SHA-512 (64-bit registers, 8-byte words, 80 rounds), anything
  # else SHA-256 (32-bit registers, 4-byte words, 64 rounds).
  #   $func, $TABLE     names of the emitted function and constant table
  #   $SZ               word size in bytes
  #   @ROT / $A..$H     registers holding the eight working variables
  #   $T1, $a0..$a3     scratch registers used by the round code
  #   @Sigma0, @Sigma1  rotate amounts used by the round function
  #   @sigma0, @sigma1  two rotate amounts followed by one shift amount,
  #                     used by the message schedule
  #   $rounds           number of rounds per block
  136. if ($output =~ /512/) {
  137. $func="sha512_block_data_order";
  138. $TABLE="K512";
  139. $SZ=8;
  140. @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
  141. "%r8", "%r9", "%r10","%r11");
  142. ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
  143. @Sigma0=(28,34,39);
  144. @Sigma1=(14,18,41);
  145. @sigma0=(1, 8, 7);
  146. @sigma1=(19,61, 6);
  147. $rounds=80;
  148. } else {
  149. $func="sha256_block_data_order";
  150. $TABLE="K256";
  151. $SZ=4;
  152. @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
  153. "%r8d","%r9d","%r10d","%r11d");
  154. ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
  155. @Sigma0=( 2,13,22);
  156. @Sigma1=( 6,11,25);
  157. @sigma0=( 7,18, 3);
  158. @sigma1=(17,19,10);
  159. $rounds=64;
  160. }
  # Argument registers.  $ctx shares %rdi with $a3, so the context pointer
  # is kept in the stack frame and reloaded from there when needed.
  161. $ctx="%rdi"; # 1st arg, zapped by $a3
  162. $inp="%rsi"; # 2nd arg
  163. $Tbl="%rbp";
  # Stack frame layout, as offsets from the aligned %rsp: 16 words of $SZ
  # bytes for the message schedule X[], followed by four 8-byte slots for
  # the saved ctx, inp, end-of-input pointer and the caller's %rsp.
  # NOTE(review): only $_rsp wraps its offset in backticks; presumably so
  # the translator reduces it to a plain number before it is used in the
  # .cfi_cfa_expression directive -- confirm against x86_64-xlate.pl.
  164. $_ctx="16*$SZ+0*8(%rsp)";
  165. $_inp="16*$SZ+1*8(%rsp)";
  166. $_end="16*$SZ+2*8(%rsp)";
  167. $_rsp="`16*$SZ+3*8`(%rsp)";
  168. $framesz="16*$SZ+4*8";
  # ROUND_00_15($i, $a,$b,$c,$d,$e,$f,$g,$h)
  #
  # Appends the assembly for one compression round to $code.  $i is the
  # round index and $a..$h are the register names of the working
  # variables in their current rotation.
  #
  # Register state expected on entry (set up by the callers in this file):
  #   $T1  the message word for this round; it is stored to X[$i&15]
  #   $a0  a copy of e, $a1 a copy of a (the Sigma values are built in place)
  #   $a3  b^c, left over from the previous round or from the "magic" xor
  #        emitted before round 0
  # The emitted code updates $d and $h and advances $Tbl to the next
  # round constant.
  #
  # The closing "h+=Sigma0(a)" is emitted only for $i<15.  For later rounds
  # that add is deferred ("modulo-scheduled") to the start of the following
  # ROUND_16_XX, or to the code emitted after the round loop.
  #
  # Generator side effect: the file-level $a2 and $a3 are swapped on return,
  # so the a^b computed in this round is what the next round uses as b^c.
  169. sub ROUND_00_15()
  170. { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  # Distance to the next constant.  The table stores every 16-byte row
  # twice, so after the last word of a row the duplicate is skipped too.
  171. my $STRIDE=$SZ;
  172. $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
  173. $code.=<<___;
  174. ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
  175. mov $f,$a2
  176. xor $e,$a0
  177. ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
  178. xor $g,$a2 # f^g
  179. mov $T1,`$SZ*($i&0xf)`(%rsp)
  180. xor $a,$a1
  181. and $e,$a2 # (f^g)&e
  182. ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
  183. add $h,$T1 # T1+=h
  184. xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
  185. ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
  186. xor $e,$a0
  187. add $a2,$T1 # T1+=Ch(e,f,g)
  188. mov $a,$a2
  189. add ($Tbl),$T1 # T1+=K[round]
  190. xor $a,$a1
  191. xor $b,$a2 # a^b, b^c in next round
  192. ror \$$Sigma1[0],$a0 # Sigma1(e)
  193. mov $b,$h
  194. and $a2,$a3
  195. ror \$$Sigma0[0],$a1 # Sigma0(a)
  196. add $a0,$T1 # T1+=Sigma1(e)
  197. xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
  198. add $T1,$d # d+=T1
  199. add $T1,$h # h+=T1
  200. lea $STRIDE($Tbl),$Tbl # round++
  201. ___
  202. $code.=<<___ if ($i<15);
  203. add $a1,$h # h+=Sigma0(a)
  204. ___
  205. ($a2,$a3) = ($a3,$a2);
  206. }
  # ROUND_16_XX($i, $a,$b,$c,$d,$e,$f,$g,$h)
  #
  # Appends one round that first derives its message word from the 16-word
  # circular buffer kept on the stack:
  #   T1 = sigma0(X[(i+1)&15]) + sigma1(X[(i+14)&15])
  #        + X[(i+9)&15] + X[i&15]
  # It then loads $a0 with e and $a1 with a and hands the same argument
  # list to ROUND_00_15, which stores T1 back into X[i&15] and performs the
  # round proper.
  #
  # Interleaved with the schedule, "add $a1,$a" completes the previous
  # round's deferred h+=Sigma0(a); at that point $a1 still holds the value
  # left by the previous round and $a names the register that round
  # produced as h.
  207. sub ROUND_16_XX()
  208. { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  209. $code.=<<___;
  210. mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
  211. mov `$SZ*(($i+14)&0xf)`(%rsp),$a2
  212. mov $a0,$T1
  213. ror \$`$sigma0[1]-$sigma0[0]`,$a0
  214. add $a1,$a # modulo-scheduled h+=Sigma0(a)
  215. mov $a2,$a1
  216. ror \$`$sigma1[1]-$sigma1[0]`,$a2
  217. xor $T1,$a0
  218. shr \$$sigma0[2],$T1
  219. ror \$$sigma0[0],$a0
  220. xor $a1,$a2
  221. shr \$$sigma1[2],$a1
  222. ror \$$sigma1[0],$a2
  223. xor $a0,$T1 # sigma0(X[(i+1)&0xf])
  224. xor $a1,$a2 # sigma1(X[(i+14)&0xf])
  225. add `$SZ*(($i+9)&0xf)`(%rsp),$T1
  226. add `$SZ*($i&0xf)`(%rsp),$T1
  227. mov $e,$a0
  228. add $a2,$T1
  229. mov $a,$a1
  230. ___
  231. &ROUND_00_15(@_);
  232. }
  # ---- Integer-only $func ------------------------------------------------
  # Everything from here to the closing .size directive builds the generic
  # (non-SIMD) entry point in $code:
  #   1. Capability dispatch.  For SHA-256, or whenever $avx is set, three
  #      32-bit words of OPENSSL_ia32cap_P are loaded and tested.  Depending
  #      on $SZ, $shaext and $avx this emits jumps to _shaext_shortcut,
  #      .Lxop_shortcut, .Lavx2_shortcut, .Lavx_shortcut and
  #      .Lssse3_shortcut.
  #   2. Prologue.  Saves %rbx, %rbp and %r12-%r15, turns the block count in
  #      %rdx into an end-of-input pointer (inp + num*16*$SZ), carves out a
  #      64-byte-aligned frame and stores ctx, inp, end and the original
  #      %rsp in it.
  #   3. .Lloop, executed once per input block.  Rounds 0-15 load each input
  #      word, byte-swap it and run ROUND_00_15.  .Lrounds_16_xx holds 16
  #      unrolled ROUND_16_XX rounds and is re-entered until the byte at
  #      $SZ-1($Tbl) reads zero.
  #      NOTE(review): this appears to rely on the word that follows the
  #      last round constant (the start of the byte-swap mask in the table)
  #      having a zero most-significant byte -- confirm against the table.
  #   4. The context pointer is reloaded from the frame, the last deferred
  #      Sigma0 add is applied, the working variables are added into the
  #      context, $inp advances by one block and the loop repeats while
  #      $inp is below the saved end pointer.
  #   5. Epilogue.  Restores the callee-saved registers and %rsp from the
  #      saved copy of the original stack pointer.
  # @ROT is rotated by one position after each emitted round; after the 16
  # (and 32) rounds emitted here it is back in its initial order.
  233. $code=<<___;
  234. .text
  235. .extern OPENSSL_ia32cap_P
  236. .globl $func
  237. .type $func,\@function,3
  238. .align 16
  239. $func:
  240. .cfi_startproc
  241. ___
  242. $code.=<<___ if ($SZ==4 || $avx);
  243. lea OPENSSL_ia32cap_P(%rip),%r11
  244. mov 0(%r11),%r9d
  245. mov 4(%r11),%r10d
  246. mov 8(%r11),%r11d
  247. ___
  248. $code.=<<___ if ($SZ==4 && $shaext);
  249. test \$`1<<29`,%r11d # check for SHA
  250. jnz _shaext_shortcut
  251. ___
  252. $code.=<<___ if ($avx && $SZ==8);
  253. test \$`1<<11`,%r10d # check for XOP
  254. jnz .Lxop_shortcut
  255. ___
  256. $code.=<<___ if ($avx>1);
  257. and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
  258. cmp \$`1<<8|1<<5|1<<3`,%r11d
  259. je .Lavx2_shortcut
  260. ___
  261. $code.=<<___ if ($avx);
  262. and \$`1<<30`,%r9d # mask "Intel CPU" bit
  263. and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
  264. or %r9d,%r10d
  265. cmp \$`1<<28|1<<9|1<<30`,%r10d
  266. je .Lavx_shortcut
  267. ___
  268. $code.=<<___ if ($SZ==4);
  269. test \$`1<<9`,%r10d
  270. jnz .Lssse3_shortcut
  271. ___
  272. $code.=<<___;
  273. mov %rsp,%rax # copy %rsp
  274. .cfi_def_cfa_register %rax
  275. push %rbx
  276. .cfi_push %rbx
  277. push %rbp
  278. .cfi_push %rbp
  279. push %r12
  280. .cfi_push %r12
  281. push %r13
  282. .cfi_push %r13
  283. push %r14
  284. .cfi_push %r14
  285. push %r15
  286. .cfi_push %r15
  287. shl \$4,%rdx # num*16
  288. sub \$$framesz,%rsp
  289. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  290. and \$-64,%rsp # align stack frame
  291. mov $ctx,$_ctx # save ctx, 1st arg
  292. mov $inp,$_inp # save inp, 2nd arh
  293. mov %rdx,$_end # save end pointer, "3rd" arg
  294. mov %rax,$_rsp # save copy of %rsp
  295. .cfi_cfa_expression $_rsp,deref,+8
  296. .Lprologue:
  297. mov $SZ*0($ctx),$A
  298. mov $SZ*1($ctx),$B
  299. mov $SZ*2($ctx),$C
  300. mov $SZ*3($ctx),$D
  301. mov $SZ*4($ctx),$E
  302. mov $SZ*5($ctx),$F
  303. mov $SZ*6($ctx),$G
  304. mov $SZ*7($ctx),$H
  305. jmp .Lloop
  306. .align 16
  307. .Lloop:
  308. mov $B,$a3
  309. lea $TABLE(%rip),$Tbl
  310. xor $C,$a3 # magic
  311. ___
  312. for($i=0;$i<16;$i++) {
  313. $code.=" mov $SZ*$i($inp),$T1\n";
  314. $code.=" mov @ROT[4],$a0\n";
  315. $code.=" mov @ROT[0],$a1\n";
  316. $code.=" bswap $T1\n";
  317. &ROUND_00_15($i,@ROT);
  318. unshift(@ROT,pop(@ROT));
  319. }
  320. $code.=<<___;
  321. jmp .Lrounds_16_xx
  322. .align 16
  323. .Lrounds_16_xx:
  324. ___
  325. for(;$i<32;$i++) {
  326. &ROUND_16_XX($i,@ROT);
  327. unshift(@ROT,pop(@ROT));
  328. }
  329. $code.=<<___;
  330. cmpb \$0,`$SZ-1`($Tbl)
  331. jnz .Lrounds_16_xx
  332. mov $_ctx,$ctx
  333. add $a1,$A # modulo-scheduled h+=Sigma0(a)
  334. lea 16*$SZ($inp),$inp
  335. add $SZ*0($ctx),$A
  336. add $SZ*1($ctx),$B
  337. add $SZ*2($ctx),$C
  338. add $SZ*3($ctx),$D
  339. add $SZ*4($ctx),$E
  340. add $SZ*5($ctx),$F
  341. add $SZ*6($ctx),$G
  342. add $SZ*7($ctx),$H
  343. cmp $_end,$inp
  344. mov $A,$SZ*0($ctx)
  345. mov $B,$SZ*1($ctx)
  346. mov $C,$SZ*2($ctx)
  347. mov $D,$SZ*3($ctx)
  348. mov $E,$SZ*4($ctx)
  349. mov $F,$SZ*5($ctx)
  350. mov $G,$SZ*6($ctx)
  351. mov $H,$SZ*7($ctx)
  352. jb .Lloop
  353. mov $_rsp,%rsi
  354. .cfi_def_cfa %rsi,8
  355. mov -48(%rsi),%r15
  356. .cfi_restore %r15
  357. mov -40(%rsi),%r14
  358. .cfi_restore %r14
  359. mov -32(%rsi),%r13
  360. .cfi_restore %r13
  361. mov -24(%rsi),%r12
  362. .cfi_restore %r12
  363. mov -16(%rsi),%rbp
  364. .cfi_restore %rbp
  365. mov -8(%rsi),%rbx
  366. .cfi_restore %rbx
  367. lea (%rsi),%rsp
  368. .cfi_def_cfa_register %rsp
  369. .Lepilogue:
  370. ret
  371. .cfi_endproc
  372. .size $func,.-$func
  373. ___
  # ---- Constant table ($TABLE) ---------------------------------------------
  # Emits K256 for SHA-256 or K512 for SHA-512, aligned to 64 bytes.
  # Layout, as relied on by the code in this file:
  #   - Every 16-byte row of round constants is emitted twice in a row.  The
  #     integer-only rounds step over the duplicate (see $STRIDE in
  #     ROUND_00_15) and the SHA-extensions code reads rows at a 32-byte
  #     stride.
  #   - The rows are followed by shuffle masks, again emitted twice each.
  #     The first mask sits at offset $SZ*2*$rounds from the table start and
  #     is the one loaded as the byte-swap mask by the SIMD code paths.
  #     K256 carries two further masks after it.
  #   - A NUL-terminated identification string closes the table.
  374. if ($SZ==4) {
  375. $code.=<<___;
  376. .align 64
  377. .type $TABLE,\@object
  378. $TABLE:
  379. .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
  380. .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
  381. .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
  382. .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
  383. .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
  384. .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
  385. .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
  386. .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
  387. .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
  388. .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
  389. .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
  390. .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
  391. .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
  392. .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
  393. .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
  394. .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
  395. .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
  396. .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
  397. .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
  398. .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
  399. .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
  400. .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
  401. .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
  402. .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
  403. .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
  404. .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
  405. .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
  406. .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
  407. .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  408. .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  409. .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  410. .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  411. .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
  412. .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
  413. .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
  414. .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
  415. .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
  416. .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
  417. .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  418. ___
  419. } else {
  420. $code.=<<___;
  421. .align 64
  422. .type $TABLE,\@object
  423. $TABLE:
  424. .quad 0x428a2f98d728ae22,0x7137449123ef65cd
  425. .quad 0x428a2f98d728ae22,0x7137449123ef65cd
  426. .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
  427. .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
  428. .quad 0x3956c25bf348b538,0x59f111f1b605d019
  429. .quad 0x3956c25bf348b538,0x59f111f1b605d019
  430. .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
  431. .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
  432. .quad 0xd807aa98a3030242,0x12835b0145706fbe
  433. .quad 0xd807aa98a3030242,0x12835b0145706fbe
  434. .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
  435. .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
  436. .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
  437. .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
  438. .quad 0x9bdc06a725c71235,0xc19bf174cf692694
  439. .quad 0x9bdc06a725c71235,0xc19bf174cf692694
  440. .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
  441. .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
  442. .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
  443. .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
  444. .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
  445. .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
  446. .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
  447. .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
  448. .quad 0x983e5152ee66dfab,0xa831c66d2db43210
  449. .quad 0x983e5152ee66dfab,0xa831c66d2db43210
  450. .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
  451. .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
  452. .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
  453. .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
  454. .quad 0x06ca6351e003826f,0x142929670a0e6e70
  455. .quad 0x06ca6351e003826f,0x142929670a0e6e70
  456. .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
  457. .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
  458. .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
  459. .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
  460. .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
  461. .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
  462. .quad 0x81c2c92e47edaee6,0x92722c851482353b
  463. .quad 0x81c2c92e47edaee6,0x92722c851482353b
  464. .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
  465. .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
  466. .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
  467. .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
  468. .quad 0xd192e819d6ef5218,0xd69906245565a910
  469. .quad 0xd192e819d6ef5218,0xd69906245565a910
  470. .quad 0xf40e35855771202a,0x106aa07032bbd1b8
  471. .quad 0xf40e35855771202a,0x106aa07032bbd1b8
  472. .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
  473. .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
  474. .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
  475. .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
  476. .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
  477. .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
  478. .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
  479. .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
  480. .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
  481. .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
  482. .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
  483. .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
  484. .quad 0x90befffa23631e28,0xa4506cebde82bde9
  485. .quad 0x90befffa23631e28,0xa4506cebde82bde9
  486. .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
  487. .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
  488. .quad 0xca273eceea26619c,0xd186b8c721c0c207
  489. .quad 0xca273eceea26619c,0xd186b8c721c0c207
  490. .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
  491. .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
  492. .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
  493. .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
  494. .quad 0x113f9804bef90dae,0x1b710b35131c471b
  495. .quad 0x113f9804bef90dae,0x1b710b35131c471b
  496. .quad 0x28db77f523047d84,0x32caab7b40c72493
  497. .quad 0x28db77f523047d84,0x32caab7b40c72493
  498. .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
  499. .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
  500. .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
  501. .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
  502. .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
  503. .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
  504. .quad 0x0001020304050607,0x08090a0b0c0d0e0f
  505. .quad 0x0001020304050607,0x08090a0b0c0d0e0f
  506. .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  507. ___
  508. }
  509. ######################################################################
  510. # SIMD code paths
  511. #
  # ---- SHA-256 using the SHA extensions ------------------------------------
  # Emitted only for SHA-256 and only when $shaext is set.  The generic entry
  # point jumps to _shaext_shortcut when it finds the SHA capability bit.
  # Names below are lexicals local to this block; in particular $Tbl here is
  # %rcx, not the file-level %rbp.
  #   $ctx, $inp, $num       the three function arguments; $num counts
  #                          64-byte blocks and is decremented once per loop
  #   $ABEF, $CDGH           hash state, shuffled into this order on entry
  #                          and back to memory order on exit
  #   $ABEF_SAVE, $CDGH_SAVE state at the start of the block, added back in
  #                          at the end of it
  #   @MSG                   four registers holding the 16 message words;
  #                          the Perl loop rotates the array after each group
  #   $Wi                    message words plus round constants, the implicit
  #                          operand of sha256rnds2
  #   $TMP, $BSWAP           byte-swap mask and scratch; $BSWAP keeps the
  #                          mask while $TMP is reused as a temporary
  # $Tbl is biased by 0x80 and constants are read at i*32-0x80($Tbl), i.e.
  # every other 16-byte row of K256; the byte-swap mask is read from
  # 0x200-0x80($Tbl).
  # On Win64 %xmm6-%xmm10 are saved and restored relative to %rax.
  # NOTE(review): %rax is not set anywhere in this block; presumably the
  # translator's Win64 function prologue leaves a copy of %rsp there --
  # confirm against x86_64-xlate.pl.
  512. if ($SZ==4 && $shaext) {{{
  513. ######################################################################
  514. # Intel SHA Extensions implementation of SHA256 update function.
  515. #
  516. my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
  517. my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
  518. my @MSG=map("%xmm$_",(3..6));
  519. $code.=<<___;
  520. .type sha256_block_data_order_shaext,\@function,3
  521. .align 64
  522. sha256_block_data_order_shaext:
  523. _shaext_shortcut:
  524. ___
  525. $code.=<<___ if ($win64);
  526. lea `-8-5*16`(%rsp),%rsp
  527. movaps %xmm6,-8-5*16(%rax)
  528. movaps %xmm7,-8-4*16(%rax)
  529. movaps %xmm8,-8-3*16(%rax)
  530. movaps %xmm9,-8-2*16(%rax)
  531. movaps %xmm10,-8-1*16(%rax)
  532. .Lprologue_shaext:
  533. ___
  534. $code.=<<___;
  535. lea K256+0x80(%rip),$Tbl
  536. movdqu ($ctx),$ABEF # DCBA
  537. movdqu 16($ctx),$CDGH # HGFE
  538. movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
  539. pshufd \$0x1b,$ABEF,$Wi # ABCD
  540. pshufd \$0xb1,$ABEF,$ABEF # CDAB
  541. pshufd \$0x1b,$CDGH,$CDGH # EFGH
  542. movdqa $TMP,$BSWAP # offload
  543. palignr \$8,$CDGH,$ABEF # ABEF
  544. punpcklqdq $Wi,$CDGH # CDGH
  545. jmp .Loop_shaext
  546. .align 16
  547. .Loop_shaext:
  548. movdqu ($inp),@MSG[0]
  549. movdqu 0x10($inp),@MSG[1]
  550. movdqu 0x20($inp),@MSG[2]
  551. pshufb $TMP,@MSG[0]
  552. movdqu 0x30($inp),@MSG[3]
  553. movdqa 0*32-0x80($Tbl),$Wi
  554. paddd @MSG[0],$Wi
  555. pshufb $TMP,@MSG[1]
  556. movdqa $CDGH,$CDGH_SAVE # offload
  557. sha256rnds2 $ABEF,$CDGH # 0-3
  558. pshufd \$0x0e,$Wi,$Wi
  559. nop
  560. movdqa $ABEF,$ABEF_SAVE # offload
  561. sha256rnds2 $CDGH,$ABEF
  562. movdqa 1*32-0x80($Tbl),$Wi
  563. paddd @MSG[1],$Wi
  564. pshufb $TMP,@MSG[2]
  565. sha256rnds2 $ABEF,$CDGH # 4-7
  566. pshufd \$0x0e,$Wi,$Wi
  567. lea 0x40($inp),$inp
  568. sha256msg1 @MSG[1],@MSG[0]
  569. sha256rnds2 $CDGH,$ABEF
  570. movdqa 2*32-0x80($Tbl),$Wi
  571. paddd @MSG[2],$Wi
  572. pshufb $TMP,@MSG[3]
  573. sha256rnds2 $ABEF,$CDGH # 8-11
  574. pshufd \$0x0e,$Wi,$Wi
  575. movdqa @MSG[3],$TMP
  576. palignr \$4,@MSG[2],$TMP
  577. nop
  578. paddd $TMP,@MSG[0]
  579. sha256msg1 @MSG[2],@MSG[1]
  580. sha256rnds2 $CDGH,$ABEF
  581. movdqa 3*32-0x80($Tbl),$Wi
  582. paddd @MSG[3],$Wi
  583. sha256msg2 @MSG[3],@MSG[0]
  584. sha256rnds2 $ABEF,$CDGH # 12-15
  585. pshufd \$0x0e,$Wi,$Wi
  586. movdqa @MSG[0],$TMP
  587. palignr \$4,@MSG[3],$TMP
  588. nop
  589. paddd $TMP,@MSG[1]
  590. sha256msg1 @MSG[3],@MSG[2]
  591. sha256rnds2 $CDGH,$ABEF
  592. ___
  593. for($i=4;$i<16-3;$i++) {
  594. $code.=<<___;
  595. movdqa $i*32-0x80($Tbl),$Wi
  596. paddd @MSG[0],$Wi
  597. sha256msg2 @MSG[0],@MSG[1]
  598. sha256rnds2 $ABEF,$CDGH # 16-19...
  599. pshufd \$0x0e,$Wi,$Wi
  600. movdqa @MSG[1],$TMP
  601. palignr \$4,@MSG[0],$TMP
  602. nop
  603. paddd $TMP,@MSG[2]
  604. sha256msg1 @MSG[0],@MSG[3]
  605. sha256rnds2 $CDGH,$ABEF
  606. ___
  607. push(@MSG,shift(@MSG));
  608. }
  609. $code.=<<___;
  610. movdqa 13*32-0x80($Tbl),$Wi
  611. paddd @MSG[0],$Wi
  612. sha256msg2 @MSG[0],@MSG[1]
  613. sha256rnds2 $ABEF,$CDGH # 52-55
  614. pshufd \$0x0e,$Wi,$Wi
  615. movdqa @MSG[1],$TMP
  616. palignr \$4,@MSG[0],$TMP
  617. sha256rnds2 $CDGH,$ABEF
  618. paddd $TMP,@MSG[2]
  619. movdqa 14*32-0x80($Tbl),$Wi
  620. paddd @MSG[1],$Wi
  621. sha256rnds2 $ABEF,$CDGH # 56-59
  622. pshufd \$0x0e,$Wi,$Wi
  623. sha256msg2 @MSG[1],@MSG[2]
  624. movdqa $BSWAP,$TMP
  625. sha256rnds2 $CDGH,$ABEF
  626. movdqa 15*32-0x80($Tbl),$Wi
  627. paddd @MSG[2],$Wi
  628. nop
  629. sha256rnds2 $ABEF,$CDGH # 60-63
  630. pshufd \$0x0e,$Wi,$Wi
  631. dec $num
  632. nop
  633. sha256rnds2 $CDGH,$ABEF
  634. paddd $CDGH_SAVE,$CDGH
  635. paddd $ABEF_SAVE,$ABEF
  636. jnz .Loop_shaext
  637. pshufd \$0xb1,$CDGH,$CDGH # DCHG
  638. pshufd \$0x1b,$ABEF,$TMP # FEBA
  639. pshufd \$0xb1,$ABEF,$ABEF # BAFE
  640. punpckhqdq $CDGH,$ABEF # DCBA
  641. palignr \$8,$TMP,$CDGH # HGFE
  642. movdqu $ABEF,($ctx)
  643. movdqu $CDGH,16($ctx)
  644. ___
  645. $code.=<<___ if ($win64);
  646. movaps -8-5*16(%rax),%xmm6
  647. movaps -8-4*16(%rax),%xmm7
  648. movaps -8-3*16(%rax),%xmm8
  649. movaps -8-2*16(%rax),%xmm9
  650. movaps -8-1*16(%rax),%xmm10
  651. mov %rax,%rsp
  652. .Lepilogue_shaext:
  653. ___
  654. $code.=<<___;
  655. ret
  656. .size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
  657. ___
  658. }}}
  659. {{{
  660. my $a4=$T1;
  661. my ($a,$b,$c,$d,$e,$f,$g,$h);
  # AUTOLOAD: catch-all for calls to subs that are not defined, so that
  # instructions can be written as Perl calls such as &ror($reg,$count).
  #   - The called name, minus its package qualifier, becomes the mnemonic.
  #   - The last argument is emitted first.  If it is a plain number (it
  #     compares equal to itself after a numeric round trip) it is given a
  #     "$" prefix to make it an immediate.
  #   - The remaining arguments follow in reverse order, so a call written
  #     destination-first produces a source-first operand list.
  # The resulting line is appended to the global $code; nothing useful is
  # returned.
  662. sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
  663. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  664. my $arg = pop;
  665. $arg = "\$$arg" if ($arg*1 eq $arg);
  666. $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
  667. }
  # body_00_15: returns one compression round as a list of 26 strings of
  # Perl source.  Nothing is emitted here; callers eval the strings one at
  # a time so that they can interleave them with SIMD instructions.  Each
  # string calls an instruction thunk (handled by AUTOLOAD) with its
  # operands written destination-first.
  #   - The first string also loads $a..$h from the current @ROT.
  #   - The last string also swaps $a2/$a3, rotates @ROT by one position
  #     and increments $i, so consecutive expansions describe consecutive
  #     rounds.
  # Unlike ROUND_00_15, the round accumulates directly into $h and reads a
  # single stack word that, per the comment on that line, already holds
  # X[i]+K[i].  $a4 is the enclosing scope's alias for $T1.
  # The strings refer to $a..$h, $a0..$a4, $i, $SZ and the Sigma arrays by
  # name, so they must be eval'ed where those variables are visible.
  668. sub body_00_15 () {
  669. (
  670. '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
  671. '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
  672. '&mov ($a,$a1)',
  673. '&mov ($a4,$f)',
  674. '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
  675. '&xor ($a0,$e)',
  676. '&xor ($a4,$g)', # f^g
  677. '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
  678. '&xor ($a1,$a)',
  679. '&and ($a4,$e)', # (f^g)&e
  680. '&xor ($a0,$e)',
  681. '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
  682. '&mov ($a2,$a)',
  683. '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
  684. '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
  685. '&xor ($a2,$b)', # a^b, b^c in next round
  686. '&add ($h,$a4)', # h+=Ch(e,f,g)
  687. '&ror ($a0,$Sigma1[0])', # Sigma1(e)
  688. '&and ($a3,$a2)', # (b^c)&(a^b)
  689. '&xor ($a1,$a)',
  690. '&add ($h,$a0)', # h+=Sigma1(e)
  691. '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
  692. '&ror ($a1,$Sigma0[0])', # Sigma0(a)
  693. '&add ($d,$h)', # d+=h
  694. '&add ($h,$a3)', # h+=Maj(a,b,c)
  695. '&mov ($a0,$d)',
  696. '&add ($a1,$h);'. # h+=Sigma0(a)
  697. '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
  698. );
  699. }
  700. ######################################################################
  701. # SSSE3 code path
  702. #
  703. if ($SZ==4) { # SHA256 only
  704. my @X = map("%xmm$_",(0..3));
  705. my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
  706. $code.=<<___;
  707. .type ${func}_ssse3,\@function,3
  708. .align 64
  709. ${func}_ssse3:
  710. .cfi_startproc
  711. .Lssse3_shortcut:
  712. mov %rsp,%rax # copy %rsp
  713. .cfi_def_cfa_register %rax
  714. push %rbx
  715. .cfi_push %rbx
  716. push %rbp
  717. .cfi_push %rbp
  718. push %r12
  719. .cfi_push %r12
  720. push %r13
  721. .cfi_push %r13
  722. push %r14
  723. .cfi_push %r14
  724. push %r15
  725. .cfi_push %r15
  726. shl \$4,%rdx # num*16
  727. sub \$`$framesz+$win64*16*4`,%rsp
  728. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  729. and \$-64,%rsp # align stack frame
  730. mov $ctx,$_ctx # save ctx, 1st arg
  731. mov $inp,$_inp # save inp, 2nd arh
  732. mov %rdx,$_end # save end pointer, "3rd" arg
  733. mov %rax,$_rsp # save copy of %rsp
  734. .cfi_cfa_expression $_rsp,deref,+8
  735. ___
  736. $code.=<<___ if ($win64);
  737. movaps %xmm6,16*$SZ+32(%rsp)
  738. movaps %xmm7,16*$SZ+48(%rsp)
  739. movaps %xmm8,16*$SZ+64(%rsp)
  740. movaps %xmm9,16*$SZ+80(%rsp)
  741. ___
  742. $code.=<<___;
  743. .Lprologue_ssse3:
  744. mov $SZ*0($ctx),$A
  745. mov $SZ*1($ctx),$B
  746. mov $SZ*2($ctx),$C
  747. mov $SZ*3($ctx),$D
  748. mov $SZ*4($ctx),$E
  749. mov $SZ*5($ctx),$F
  750. mov $SZ*6($ctx),$G
  751. mov $SZ*7($ctx),$H
  752. ___
  753. $code.=<<___;
  754. #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
  755. #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
  756. jmp .Lloop_ssse3
  757. .align 16
  758. .Lloop_ssse3:
  759. movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  760. movdqu 0x00($inp),@X[0]
  761. movdqu 0x10($inp),@X[1]
  762. movdqu 0x20($inp),@X[2]
  763. pshufb $t3,@X[0]
  764. movdqu 0x30($inp),@X[3]
  765. lea $TABLE(%rip),$Tbl
  766. pshufb $t3,@X[1]
  767. movdqa 0x00($Tbl),$t0
  768. movdqa 0x20($Tbl),$t1
  769. pshufb $t3,@X[2]
  770. paddd @X[0],$t0
  771. movdqa 0x40($Tbl),$t2
  772. pshufb $t3,@X[3]
  773. movdqa 0x60($Tbl),$t3
  774. paddd @X[1],$t1
  775. paddd @X[2],$t2
  776. paddd @X[3],$t3
  777. movdqa $t0,0x00(%rsp)
  778. mov $A,$a1
  779. movdqa $t1,0x10(%rsp)
  780. mov $B,$a3
  781. movdqa $t2,0x20(%rsp)
  782. xor $C,$a3 # magic
  783. movdqa $t3,0x30(%rsp)
  784. mov $E,$a0
  785. jmp .Lssse3_00_47
  786. .align 16
  787. .Lssse3_00_47:
  788. sub \$`-16*2*$SZ`,$Tbl # size optimization
  789. ___
  # Xupdate_256_SSSE3: returns the SHA-256 message-schedule update for four
  # words as a list of strings of Perl source (33 list elements carrying 36
  # instruction calls).  Nothing is emitted here; a caller evals the strings
  # in order, where each call goes through the AUTOLOAD instruction thunk.
  # Per the inline comments, the sequence adds X[9..12] and sigma0(X[1..4])
  # into @X[0], then adds sigma1 in two halves (X[14..15] first, then the
  # freshly produced X[16..17]), and loads a table row into $t2 along the
  # way.
  # The strings refer to @X, $t0..$t5, $j, $Tbl, $SZ and the sigma arrays
  # of the scope in which they are eval'ed.
  # NOTE(review): the sequence shuffles through $t4 and $t5, whose loads
  # are commented out in the SSSE3 prologue visible in this file, and the
  # only visible use of this list is inside an "if (0)" branch -- it looks
  # like a reference version of the hand-interleaved code; confirm before
  # enabling it.
  790. sub Xupdate_256_SSSE3 () {
  791. (
  792. '&movdqa ($t0,@X[1]);',
  793. '&movdqa ($t3,@X[3])',
  794. '&palignr ($t0,@X[0],$SZ)', # X[1..4]
  795. '&palignr ($t3,@X[2],$SZ);', # X[9..12]
  796. '&movdqa ($t1,$t0)',
  797. '&movdqa ($t2,$t0);',
  798. '&psrld ($t0,$sigma0[2])',
  799. '&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
  800. '&psrld ($t2,$sigma0[0])',
  801. '&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
  802. '&pslld ($t1,8*$SZ-$sigma0[1]);'.
  803. '&pxor ($t0,$t2)',
  804. '&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
  805. '&pxor ($t0,$t1)',
  806. '&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
  807. '&pxor ($t0,$t2);',
  808. '&movdqa ($t2,$t3)',
  809. '&pxor ($t0,$t1);', # sigma0(X[1..4])
  810. '&psrld ($t3,$sigma1[2])',
  811. '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
  812. '&psrlq ($t2,$sigma1[0])',
  813. '&pxor ($t3,$t2);',
  814. '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
  815. '&pxor ($t3,$t2)',
  816. '&pshufb ($t3,$t4)', # sigma1(X[14..15])
  817. '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
  818. '&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
  819. '&movdqa ($t2,$t3);',
  820. '&psrld ($t3,$sigma1[2])',
  821. '&psrlq ($t2,$sigma1[0])',
  822. '&pxor ($t3,$t2);',
  823. '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
  824. '&pxor ($t3,$t2);',
  825. '&movdqa ($t2,16*2*$j."($Tbl)")',
  826. '&pshufb ($t3,$t5)',
  827. '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
  828. );
  829. }
  # SSSE3_256_00_47($j, $body, @X)
  #
  # Emits one hand-scheduled step of the SHA-256 message-schedule update on
  # @X[0] (the movdqa/palignr/psrld/pslld/pxor/paddd sequence annotated
  # below), interleaved with the instruction strings returned by four calls
  # to $body.  Afterwards it adds the updated @X[0] to the value loaded from
  # 16*2*$j($Tbl) and stores the sum at 16*$j(%rsp).
  #
  #   $j    - step index; only used to form the $Tbl and %rsp offsets
  #   $body - code ref returning a list of instruction strings
  #   @X    - register names; the caller rotates this list between calls
  #
  # The strings are run through string-eval inside this sub, so any lexical
  # they name (e.g. @X and $j in the Xupdate_256_SSSE3 strings) resolves to
  # this sub's copies.  Do not rename the locals.
  830. sub SSSE3_256_00_47 () {
  831. my $j = shift;
  832. my $body = shift;
  833. my @X = @_;
  834. my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
  835. if (0) { # disabled: uniform interleave of the Xupdate_256_SSSE3 strings
  836. foreach (Xupdate_256_SSSE3()) { # 36 instructions
  837. eval;
  838. eval(shift(@insns));
  839. eval(shift(@insns));
  840. eval(shift(@insns));
  841. }
  842. } else { # squeeze extra 4% on Westmere and 19% on Atom
  843. eval(shift(@insns)); #@
  844. &movdqa ($t0,@X[1]);
  845. eval(shift(@insns));
  846. eval(shift(@insns));
  847. &movdqa ($t3,@X[3]);
  848. eval(shift(@insns)); #@
  849. eval(shift(@insns));
  850. eval(shift(@insns));
  851. eval(shift(@insns)); #@
  852. eval(shift(@insns));
  853. &palignr ($t0,@X[0],$SZ); # X[1..4]
  854. eval(shift(@insns));
  855. eval(shift(@insns));
  856. &palignr ($t3,@X[2],$SZ); # X[9..12]
  857. eval(shift(@insns));
  858. eval(shift(@insns));
  859. eval(shift(@insns));
  860. eval(shift(@insns)); #@
  861. &movdqa ($t1,$t0);
  862. eval(shift(@insns));
  863. eval(shift(@insns));
  864. &movdqa ($t2,$t0);
  865. eval(shift(@insns)); #@
  866. eval(shift(@insns));
  867. &psrld ($t0,$sigma0[2]);
  868. eval(shift(@insns));
  869. eval(shift(@insns));
  870. eval(shift(@insns));
  871. &paddd (@X[0],$t3); # X[0..3] += X[9..12]
  872. eval(shift(@insns)); #@
  873. eval(shift(@insns));
  874. &psrld ($t2,$sigma0[0]);
  875. eval(shift(@insns));
  876. eval(shift(@insns));
  877. &pshufd ($t3,@X[3],0b11111010); # X[14..15]
  878. eval(shift(@insns));
  879. eval(shift(@insns)); #@
  880. &pslld ($t1,8*$SZ-$sigma0[1]);
  881. eval(shift(@insns));
  882. eval(shift(@insns));
  883. &pxor ($t0,$t2);
  884. eval(shift(@insns)); #@
  885. eval(shift(@insns));
  886. eval(shift(@insns));
  887. eval(shift(@insns)); #@
  888. &psrld ($t2,$sigma0[1]-$sigma0[0]);
  889. eval(shift(@insns));
  890. &pxor ($t0,$t1);
  891. eval(shift(@insns));
  892. eval(shift(@insns));
  893. &pslld ($t1,$sigma0[1]-$sigma0[0]);
  894. eval(shift(@insns));
  895. eval(shift(@insns));
  896. &pxor ($t0,$t2);
  897. eval(shift(@insns));
  898. eval(shift(@insns)); #@
  899. &movdqa ($t2,$t3);
  900. eval(shift(@insns));
  901. eval(shift(@insns));
  902. &pxor ($t0,$t1); # sigma0(X[1..4])
  903. eval(shift(@insns)); #@
  904. eval(shift(@insns));
  905. eval(shift(@insns));
  906. &psrld ($t3,$sigma1[2]);
  907. eval(shift(@insns));
  908. eval(shift(@insns));
  909. &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
  910. eval(shift(@insns)); #@
  911. eval(shift(@insns));
  912. &psrlq ($t2,$sigma1[0]);
  913. eval(shift(@insns));
  914. eval(shift(@insns));
  915. eval(shift(@insns));
  916. &pxor ($t3,$t2);
  917. eval(shift(@insns)); #@
  918. eval(shift(@insns));
  919. eval(shift(@insns));
  920. eval(shift(@insns)); #@
  921. &psrlq ($t2,$sigma1[1]-$sigma1[0]);
  922. eval(shift(@insns));
  923. eval(shift(@insns));
  924. &pxor ($t3,$t2);
  925. eval(shift(@insns)); #@
  926. eval(shift(@insns));
  927. eval(shift(@insns));
  928. #&pshufb ($t3,$t4); # sigma1(X[14..15])
  929. &pshufd ($t3,$t3,0b10000000); # with psrldq below, used in place of the pshufb above
  930. eval(shift(@insns));
  931. eval(shift(@insns));
  932. eval(shift(@insns));
  933. &psrldq ($t3,8);
  934. eval(shift(@insns));
  935. eval(shift(@insns)); #@
  936. eval(shift(@insns));
  937. eval(shift(@insns));
  938. eval(shift(@insns)); #@
  939. &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
  940. eval(shift(@insns));
  941. eval(shift(@insns));
  942. eval(shift(@insns));
  943. &pshufd ($t3,@X[0],0b01010000); # X[16..17]
  944. eval(shift(@insns));
  945. eval(shift(@insns)); #@
  946. eval(shift(@insns));
  947. &movdqa ($t2,$t3);
  948. eval(shift(@insns));
  949. eval(shift(@insns));
  950. &psrld ($t3,$sigma1[2]);
  951. eval(shift(@insns));
  952. eval(shift(@insns)); #@
  953. &psrlq ($t2,$sigma1[0]);
  954. eval(shift(@insns));
  955. eval(shift(@insns));
  956. &pxor ($t3,$t2);
  957. eval(shift(@insns)); #@
  958. eval(shift(@insns));
  959. eval(shift(@insns));
  960. eval(shift(@insns)); #@
  961. eval(shift(@insns));
  962. &psrlq ($t2,$sigma1[1]-$sigma1[0]);
  963. eval(shift(@insns));
  964. eval(shift(@insns));
  965. eval(shift(@insns));
  966. &pxor ($t3,$t2);
  967. eval(shift(@insns));
  968. eval(shift(@insns));
  969. eval(shift(@insns)); #@
  970. #&pshufb ($t3,$t5);
  971. &pshufd ($t3,$t3,0b00001000); # with pslldq below, used in place of the pshufb above
  972. eval(shift(@insns));
  973. eval(shift(@insns));
  974. &movdqa ($t2,16*2*$j."($Tbl)"); # fetch the $Tbl entry for step $j
  975. eval(shift(@insns)); #@
  976. eval(shift(@insns));
  977. &pslldq ($t3,8);
  978. eval(shift(@insns));
  979. eval(shift(@insns));
  980. eval(shift(@insns));
  981. &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
  982. eval(shift(@insns)); #@
  983. eval(shift(@insns));
  984. eval(shift(@insns));
  985. }
  986. &paddd ($t2,@X[0]); # $t2 = $Tbl entry + updated @X[0]
  987. foreach (@insns) { eval; } # remaining instructions
  988. &movdqa (16*$j."(%rsp)",$t2); # store the sum at 16*$j(%rsp)
  989. }
  # Emit four SSSE3 schedule steps ($j = 0..3).  @X is rotated after each
  # call so that the next step sees the following register as @X[0].
  990. for ($i=0,$j=0; $j<4; $j++) {
  991. &SSSE3_256_00_47($j,\&body_00_15,@X);
  992. push(@X,shift(@X)); # rotate(@X)
  993. }
  # Emit a branch back to .Lssse3_00_47 that is taken while the byte at this
  # $Tbl offset is non-zero.
  994. &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
  995. &jne (".Lssse3_00_47");
  # The loop header does not advance $i; presumably the eval'ed body_00_15
  # strings increment it (body_00_15 is defined outside this section - verify).
  996. for ($i=0; $i<16; ) {
  997. foreach(body_00_15()) { eval; }
  998. }
  999. $code.=<<___;
  1000. mov $_ctx,$ctx
  1001. mov $a1,$A
  1002. add $SZ*0($ctx),$A
  1003. lea 16*$SZ($inp),$inp
  1004. add $SZ*1($ctx),$B
  1005. add $SZ*2($ctx),$C
  1006. add $SZ*3($ctx),$D
  1007. add $SZ*4($ctx),$E
  1008. add $SZ*5($ctx),$F
  1009. add $SZ*6($ctx),$G
  1010. add $SZ*7($ctx),$H
  1011. cmp $_end,$inp
  1012. mov $A,$SZ*0($ctx)
  1013. mov $B,$SZ*1($ctx)
  1014. mov $C,$SZ*2($ctx)
  1015. mov $D,$SZ*3($ctx)
  1016. mov $E,$SZ*4($ctx)
  1017. mov $F,$SZ*5($ctx)
  1018. mov $G,$SZ*6($ctx)
  1019. mov $H,$SZ*7($ctx)
  1020. jb .Lloop_ssse3
  1021. mov $_rsp,%rsi
  1022. .cfi_def_cfa %rsi,8
  1023. ___
  1024. $code.=<<___ if ($win64);
  1025. movaps 16*$SZ+32(%rsp),%xmm6
  1026. movaps 16*$SZ+48(%rsp),%xmm7
  1027. movaps 16*$SZ+64(%rsp),%xmm8
  1028. movaps 16*$SZ+80(%rsp),%xmm9
  1029. ___
  1030. $code.=<<___;
  1031. mov -48(%rsi),%r15
  1032. .cfi_restore %r15
  1033. mov -40(%rsi),%r14
  1034. .cfi_restore %r14
  1035. mov -32(%rsi),%r13
  1036. .cfi_restore %r13
  1037. mov -24(%rsi),%r12
  1038. .cfi_restore %r12
  1039. mov -16(%rsi),%rbp
  1040. .cfi_restore %rbp
  1041. mov -8(%rsi),%rbx
  1042. .cfi_restore %rbx
  1043. lea (%rsi),%rsp
  1044. .cfi_def_cfa_register %rsp
  1045. .Lepilogue_ssse3:
  1046. ret
  1047. .cfi_endproc
  1048. .size ${func}_ssse3,.-${func}_ssse3
  1049. ___
  1050. }
  1051. if ($avx) {{
  1052. ######################################################################
  1053. # XOP code path
  1054. #
  1055. if ($SZ==8) { # SHA512 only
  1056. $code.=<<___;
  1057. .type ${func}_xop,\@function,3
  1058. .align 64
  1059. ${func}_xop:
  1060. .cfi_startproc
  1061. .Lxop_shortcut:
  1062. mov %rsp,%rax # copy %rsp
  1063. .cfi_def_cfa_register %rax
  1064. push %rbx
  1065. .cfi_push %rbx
  1066. push %rbp
  1067. .cfi_push %rbp
  1068. push %r12
  1069. .cfi_push %r12
  1070. push %r13
  1071. .cfi_push %r13
  1072. push %r14
  1073. .cfi_push %r14
  1074. push %r15
  1075. .cfi_push %r15
  1076. shl \$4,%rdx # num*16
  1077. sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
  1078. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  1079. and \$-64,%rsp # align stack frame
  1080. mov $ctx,$_ctx # save ctx, 1st arg
  1081. mov $inp,$_inp # save inp, 2nd arh
  1082. mov %rdx,$_end # save end pointer, "3rd" arg
  1083. mov %rax,$_rsp # save copy of %rsp
  1084. .cfi_cfa_expression $_rsp,deref,+8
  1085. ___
  1086. $code.=<<___ if ($win64);
  1087. movaps %xmm6,16*$SZ+32(%rsp)
  1088. movaps %xmm7,16*$SZ+48(%rsp)
  1089. movaps %xmm8,16*$SZ+64(%rsp)
  1090. movaps %xmm9,16*$SZ+80(%rsp)
  1091. ___
  1092. $code.=<<___ if ($win64 && $SZ>4);
  1093. movaps %xmm10,16*$SZ+96(%rsp)
  1094. movaps %xmm11,16*$SZ+112(%rsp)
  1095. ___
  1096. $code.=<<___;
  1097. .Lprologue_xop:
  1098. vzeroupper
  1099. mov $SZ*0($ctx),$A
  1100. mov $SZ*1($ctx),$B
  1101. mov $SZ*2($ctx),$C
  1102. mov $SZ*3($ctx),$D
  1103. mov $SZ*4($ctx),$E
  1104. mov $SZ*5($ctx),$F
  1105. mov $SZ*6($ctx),$G
  1106. mov $SZ*7($ctx),$H
  1107. jmp .Lloop_xop
  1108. ___
  1109. if ($SZ==4) { # SHA256
  1110. my @X = map("%xmm$_",(0..3));
  1111. my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
  1112. $code.=<<___;
  1113. .align 16
  1114. .Lloop_xop:
  1115. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  1116. vmovdqu 0x00($inp),@X[0]
  1117. vmovdqu 0x10($inp),@X[1]
  1118. vmovdqu 0x20($inp),@X[2]
  1119. vmovdqu 0x30($inp),@X[3]
  1120. vpshufb $t3,@X[0],@X[0]
  1121. lea $TABLE(%rip),$Tbl
  1122. vpshufb $t3,@X[1],@X[1]
  1123. vpshufb $t3,@X[2],@X[2]
  1124. vpaddd 0x00($Tbl),@X[0],$t0
  1125. vpshufb $t3,@X[3],@X[3]
  1126. vpaddd 0x20($Tbl),@X[1],$t1
  1127. vpaddd 0x40($Tbl),@X[2],$t2
  1128. vpaddd 0x60($Tbl),@X[3],$t3
  1129. vmovdqa $t0,0x00(%rsp)
  1130. mov $A,$a1
  1131. vmovdqa $t1,0x10(%rsp)
  1132. mov $B,$a3
  1133. vmovdqa $t2,0x20(%rsp)
  1134. xor $C,$a3 # magic
  1135. vmovdqa $t3,0x30(%rsp)
  1136. mov $E,$a0
  1137. jmp .Lxop_00_47
  1138. .align 16
  1139. .Lxop_00_47:
  1140. sub \$`-16*2*$SZ`,$Tbl # size optimization
  1141. ___
  # XOP_256_00_47($j, $body, @X)
  #
  # Emits one step of the SHA-256 message-schedule update on @X[0] using
  # vprotd/vpsrld/vpxor, interleaved with the instruction strings returned
  # by four calls to $body.  It then emits a vpaddd of @X[0] with
  # 16*2*$j($Tbl) into $t2 and a vmovdqa of $t2 to 16*$j(%rsp).
  #
  #   $j    - step index; only used to form the $Tbl and %rsp offsets
  #   $body - code ref returning a list of instruction strings
  #   @X    - register names; the caller rotates this list between calls
  #
  # NOTE(review): the only caller sits in an "if ($SZ==4)" branch nested in
  # the enclosing "if ($SZ==8) { # SHA512 only" block, so this path appears
  # to be unreachable as written - confirm before relying on it.
  1142. sub XOP_256_00_47 () {
  1143. my $j = shift;
  1144. my $body = shift;
  1145. my @X = @_;
  1146. my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
  1147. &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
  1148. eval(shift(@insns));
  1149. eval(shift(@insns));
  1150. &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
  1151. eval(shift(@insns));
  1152. eval(shift(@insns));
  1153. &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
  1154. eval(shift(@insns));
  1155. eval(shift(@insns));
  1156. &vpsrld ($t0,$t0,$sigma0[2]);
  1157. eval(shift(@insns));
  1158. eval(shift(@insns));
  1159. &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
  1160. eval(shift(@insns));
  1161. eval(shift(@insns));
  1162. eval(shift(@insns));
  1163. eval(shift(@insns));
  1164. &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
  1165. eval(shift(@insns));
  1166. eval(shift(@insns));
  1167. &vpxor ($t0,$t0,$t1);
  1168. eval(shift(@insns));
  1169. eval(shift(@insns));
  1170. eval(shift(@insns));
  1171. eval(shift(@insns));
  1172. &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
  1173. eval(shift(@insns));
  1174. eval(shift(@insns));
  1175. &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
  1176. eval(shift(@insns));
  1177. eval(shift(@insns));
  1178. &vpsrld ($t2,@X[3],$sigma1[2]);
  1179. eval(shift(@insns));
  1180. eval(shift(@insns));
  1181. &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
  1182. eval(shift(@insns));
  1183. eval(shift(@insns));
  1184. &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
  1185. eval(shift(@insns));
  1186. eval(shift(@insns));
  1187. &vpxor ($t3,$t3,$t2);
  1188. eval(shift(@insns));
  1189. eval(shift(@insns));
  1190. eval(shift(@insns));
  1191. eval(shift(@insns));
  1192. &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
  1193. eval(shift(@insns));
  1194. eval(shift(@insns));
  1195. eval(shift(@insns));
  1196. eval(shift(@insns));
  1197. &vpsrldq ($t3,$t3,8);
  1198. eval(shift(@insns));
  1199. eval(shift(@insns));
  1200. eval(shift(@insns));
  1201. eval(shift(@insns));
  1202. &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
  1203. eval(shift(@insns));
  1204. eval(shift(@insns));
  1205. eval(shift(@insns));
  1206. eval(shift(@insns));
  1207. &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
  1208. eval(shift(@insns));
  1209. eval(shift(@insns));
  1210. &vpsrld ($t2,@X[0],$sigma1[2]);
  1211. eval(shift(@insns));
  1212. eval(shift(@insns));
  1213. &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
  1214. eval(shift(@insns));
  1215. eval(shift(@insns));
  1216. &vpxor ($t3,$t3,$t2);
  1217. eval(shift(@insns));
  1218. eval(shift(@insns));
  1219. eval(shift(@insns));
  1220. eval(shift(@insns));
  1221. &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
  1222. eval(shift(@insns));
  1223. eval(shift(@insns));
  1224. eval(shift(@insns));
  1225. eval(shift(@insns));
  1226. &vpslldq ($t3,$t3,8); # 22 instructions
  1227. eval(shift(@insns));
  1228. eval(shift(@insns));
  1229. eval(shift(@insns));
  1230. eval(shift(@insns));
  1231. &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
  1232. eval(shift(@insns));
  1233. eval(shift(@insns));
  1234. eval(shift(@insns));
  1235. eval(shift(@insns));
  1236. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); # $t2 = updated @X[0] + $Tbl entry for step $j
  1237. foreach (@insns) { eval; } # remaining instructions
  1238. &vmovdqa (16*$j."(%rsp)",$t2); # store the sum at 16*$j(%rsp)
  1239. }
  # Emit four XOP schedule steps ($j = 0..3).  @X is rotated after each call
  # so that the next step sees the following register as @X[0].
  1240. for ($i=0,$j=0; $j<4; $j++) {
  1241. &XOP_256_00_47($j,\&body_00_15,@X);
  1242. push(@X,shift(@X)); # rotate(@X)
  1243. }
  # Emit a branch back to .Lxop_00_47 that is taken while the byte at this
  # $Tbl offset is non-zero.
  1244. &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
  1245. &jne (".Lxop_00_47");
  # The loop header does not advance $i; presumably the eval'ed body_00_15
  # strings increment it (body_00_15 is defined outside this section - verify).
  1246. for ($i=0; $i<16; ) {
  1247. foreach(body_00_15()) { eval; }
  1248. }
  1249. } else { # SHA512
  1250. my @X = map("%xmm$_",(0..7));
  1251. my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
  1252. $code.=<<___;
  1253. .align 16
  1254. .Lloop_xop:
  1255. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  1256. vmovdqu 0x00($inp),@X[0]
  1257. lea $TABLE+0x80(%rip),$Tbl # size optimization
  1258. vmovdqu 0x10($inp),@X[1]
  1259. vmovdqu 0x20($inp),@X[2]
  1260. vpshufb $t3,@X[0],@X[0]
  1261. vmovdqu 0x30($inp),@X[3]
  1262. vpshufb $t3,@X[1],@X[1]
  1263. vmovdqu 0x40($inp),@X[4]
  1264. vpshufb $t3,@X[2],@X[2]
  1265. vmovdqu 0x50($inp),@X[5]
  1266. vpshufb $t3,@X[3],@X[3]
  1267. vmovdqu 0x60($inp),@X[6]
  1268. vpshufb $t3,@X[4],@X[4]
  1269. vmovdqu 0x70($inp),@X[7]
  1270. vpshufb $t3,@X[5],@X[5]
  1271. vpaddq -0x80($Tbl),@X[0],$t0
  1272. vpshufb $t3,@X[6],@X[6]
  1273. vpaddq -0x60($Tbl),@X[1],$t1
  1274. vpshufb $t3,@X[7],@X[7]
  1275. vpaddq -0x40($Tbl),@X[2],$t2
  1276. vpaddq -0x20($Tbl),@X[3],$t3
  1277. vmovdqa $t0,0x00(%rsp)
  1278. vpaddq 0x00($Tbl),@X[4],$t0
  1279. vmovdqa $t1,0x10(%rsp)
  1280. vpaddq 0x20($Tbl),@X[5],$t1
  1281. vmovdqa $t2,0x20(%rsp)
  1282. vpaddq 0x40($Tbl),@X[6],$t2
  1283. vmovdqa $t3,0x30(%rsp)
  1284. vpaddq 0x60($Tbl),@X[7],$t3
  1285. vmovdqa $t0,0x40(%rsp)
  1286. mov $A,$a1
  1287. vmovdqa $t1,0x50(%rsp)
  1288. mov $B,$a3
  1289. vmovdqa $t2,0x60(%rsp)
  1290. xor $C,$a3 # magic
  1291. vmovdqa $t3,0x70(%rsp)
  1292. mov $E,$a0
  1293. jmp .Lxop_00_47
  1294. .align 16
  1295. .Lxop_00_47:
  1296. add \$`16*2*$SZ`,$Tbl
  1297. ___
  # XOP_512_00_47($j, $body, @X)
  #
  # Emits one step of the SHA-512 message-schedule update on @X[0] using
  # vprotq/vpsrlq/vpxor, interleaved with the instruction strings returned
  # by two calls to $body.  It then emits a vpaddq of @X[0] with
  # (16*2*$j-0x80)($Tbl) into $t2 and a vmovdqa of $t2 to 16*$j(%rsp).
  #
  #   $j    - step index; only used to form the $Tbl and %rsp offsets
  #   $body - code ref returning a list of instruction strings
  #   @X    - register names; the caller rotates this list between calls
  #
  # The -0x80 in the table offset matches the "lea $TABLE+0x80(%rip),$Tbl"
  # emitted before the loop, which that line labels a size optimization.
  1298. sub XOP_512_00_47 () {
  1299. my $j = shift;
  1300. my $body = shift;
  1301. my @X = @_;
  1302. my @insns = (&$body,&$body); # 52 instructions
  1303. &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
  1304. eval(shift(@insns));
  1305. eval(shift(@insns));
  1306. &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
  1307. eval(shift(@insns));
  1308. eval(shift(@insns));
  1309. &vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
  1310. eval(shift(@insns));
  1311. eval(shift(@insns));
  1312. &vpsrlq ($t0,$t0,$sigma0[2]);
  1313. eval(shift(@insns));
  1314. eval(shift(@insns));
  1315. &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
  1316. eval(shift(@insns));
  1317. eval(shift(@insns));
  1318. eval(shift(@insns));
  1319. eval(shift(@insns));
  1320. &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
  1321. eval(shift(@insns));
  1322. eval(shift(@insns));
  1323. &vpxor ($t0,$t0,$t1);
  1324. eval(shift(@insns));
  1325. eval(shift(@insns));
  1326. eval(shift(@insns));
  1327. eval(shift(@insns));
  1328. &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
  1329. eval(shift(@insns));
  1330. eval(shift(@insns));
  1331. &vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
  1332. eval(shift(@insns));
  1333. eval(shift(@insns));
  1334. &vpsrlq ($t2,@X[7],$sigma1[2]);
  1335. eval(shift(@insns));
  1336. eval(shift(@insns));
  1337. &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
  1338. eval(shift(@insns));
  1339. eval(shift(@insns));
  1340. &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
  1341. eval(shift(@insns));
  1342. eval(shift(@insns));
  1343. &vpxor ($t3,$t3,$t2);
  1344. eval(shift(@insns));
  1345. eval(shift(@insns));
  1346. eval(shift(@insns));
  1347. eval(shift(@insns));
  1348. &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
  1349. eval(shift(@insns));
  1350. eval(shift(@insns));
  1351. eval(shift(@insns));
  1352. eval(shift(@insns));
  1353. &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
  1354. eval(shift(@insns));
  1355. eval(shift(@insns));
  1356. eval(shift(@insns));
  1357. eval(shift(@insns));
  1358. &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); # $t2 = updated @X[0] + $Tbl entry for step $j
  1359. foreach (@insns) { eval; } # remaining instructions
  1360. &vmovdqa (16*$j."(%rsp)",$t2); # store the sum at 16*$j(%rsp)
  1361. }
  # Emit eight XOP schedule steps ($j = 0..7).  @X is rotated after each
  # call so that the next step sees the following register as @X[0].
  1362. for ($i=0,$j=0; $j<8; $j++) {
  1363. &XOP_512_00_47($j,\&body_00_15,@X);
  1364. push(@X,shift(@X)); # rotate(@X)
  1365. }
  # Emit a branch back to .Lxop_00_47 that is taken while the byte at this
  # $Tbl offset is non-zero.
  1366. &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
  1367. &jne (".Lxop_00_47");
  # The loop header does not advance $i; presumably the eval'ed body_00_15
  # strings increment it (body_00_15 is defined outside this section - verify).
  1368. for ($i=0; $i<16; ) {
  1369. foreach(body_00_15()) { eval; }
  1370. }
  1371. }
  1372. $code.=<<___;
  1373. mov $_ctx,$ctx
  1374. mov $a1,$A
  1375. add $SZ*0($ctx),$A
  1376. lea 16*$SZ($inp),$inp
  1377. add $SZ*1($ctx),$B
  1378. add $SZ*2($ctx),$C
  1379. add $SZ*3($ctx),$D
  1380. add $SZ*4($ctx),$E
  1381. add $SZ*5($ctx),$F
  1382. add $SZ*6($ctx),$G
  1383. add $SZ*7($ctx),$H
  1384. cmp $_end,$inp
  1385. mov $A,$SZ*0($ctx)
  1386. mov $B,$SZ*1($ctx)
  1387. mov $C,$SZ*2($ctx)
  1388. mov $D,$SZ*3($ctx)
  1389. mov $E,$SZ*4($ctx)
  1390. mov $F,$SZ*5($ctx)
  1391. mov $G,$SZ*6($ctx)
  1392. mov $H,$SZ*7($ctx)
  1393. jb .Lloop_xop
  1394. mov $_rsp,%rsi
  1395. .cfi_def_cfa %rsi,8
  1396. vzeroupper
  1397. ___
  1398. $code.=<<___ if ($win64);
  1399. movaps 16*$SZ+32(%rsp),%xmm6
  1400. movaps 16*$SZ+48(%rsp),%xmm7
  1401. movaps 16*$SZ+64(%rsp),%xmm8
  1402. movaps 16*$SZ+80(%rsp),%xmm9
  1403. ___
  1404. $code.=<<___ if ($win64 && $SZ>4);
  1405. movaps 16*$SZ+96(%rsp),%xmm10
  1406. movaps 16*$SZ+112(%rsp),%xmm11
  1407. ___
  1408. $code.=<<___;
  1409. mov -48(%rsi),%r15
  1410. .cfi_restore %r15
  1411. mov -40(%rsi),%r14
  1412. .cfi_restore %r14
  1413. mov -32(%rsi),%r13
  1414. .cfi_restore %r13
  1415. mov -24(%rsi),%r12
  1416. .cfi_restore %r12
  1417. mov -16(%rsi),%rbp
  1418. .cfi_restore %rbp
  1419. mov -8(%rsi),%rbx
  1420. .cfi_restore %rbx
  1421. lea (%rsi),%rsp
  1422. .cfi_def_cfa_register %rsp
  1423. .Lepilogue_xop:
  1424. ret
  1425. .cfi_endproc
  1426. .size ${func}_xop,.-${func}_xop
  1427. ___
  1428. }
  1429. ######################################################################
  1430. # AVX+shrd code path
  1431. #
  # For the rest of the enclosing dynamic scope, ror(x, ...) is redirected to
  # shrd(x, x, ...): the first argument is passed twice, followed by the
  # remaining arguments.
  1432. local *ror = sub { &shrd(@_[0],@_) };
  1433. $code.=<<___;
  1434. .type ${func}_avx,\@function,3
  1435. .align 64
  1436. ${func}_avx:
  1437. .cfi_startproc
  1438. .Lavx_shortcut:
  1439. mov %rsp,%rax # copy %rsp
  1440. .cfi_def_cfa_register %rax
  1441. push %rbx
  1442. .cfi_push %rbx
  1443. push %rbp
  1444. .cfi_push %rbp
  1445. push %r12
  1446. .cfi_push %r12
  1447. push %r13
  1448. .cfi_push %r13
  1449. push %r14
  1450. .cfi_push %r14
  1451. push %r15
  1452. .cfi_push %r15
  1453. shl \$4,%rdx # num*16
  1454. sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
  1455. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  1456. and \$-64,%rsp # align stack frame
  1457. mov $ctx,$_ctx # save ctx, 1st arg
  1458. mov $inp,$_inp # save inp, 2nd arh
  1459. mov %rdx,$_end # save end pointer, "3rd" arg
  1460. mov %rax,$_rsp # save copy of %rsp
  1461. .cfi_cfa_expression $_rsp,deref,+8
  1462. ___
  1463. $code.=<<___ if ($win64);
  1464. movaps %xmm6,16*$SZ+32(%rsp)
  1465. movaps %xmm7,16*$SZ+48(%rsp)
  1466. movaps %xmm8,16*$SZ+64(%rsp)
  1467. movaps %xmm9,16*$SZ+80(%rsp)
  1468. ___
  1469. $code.=<<___ if ($win64 && $SZ>4);
  1470. movaps %xmm10,16*$SZ+96(%rsp)
  1471. movaps %xmm11,16*$SZ+112(%rsp)
  1472. ___
  1473. $code.=<<___;
  1474. .Lprologue_avx:
  1475. vzeroupper
  1476. mov $SZ*0($ctx),$A
  1477. mov $SZ*1($ctx),$B
  1478. mov $SZ*2($ctx),$C
  1479. mov $SZ*3($ctx),$D
  1480. mov $SZ*4($ctx),$E
  1481. mov $SZ*5($ctx),$F
  1482. mov $SZ*6($ctx),$G
  1483. mov $SZ*7($ctx),$H
  1484. ___
  1485. if ($SZ==4) { # SHA256
  1486. my @X = map("%xmm$_",(0..3));
  1487. my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
  1488. $code.=<<___;
  1489. vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
  1490. vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
  1491. jmp .Lloop_avx
  1492. .align 16
  1493. .Lloop_avx:
  1494. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  1495. vmovdqu 0x00($inp),@X[0]
  1496. vmovdqu 0x10($inp),@X[1]
  1497. vmovdqu 0x20($inp),@X[2]
  1498. vmovdqu 0x30($inp),@X[3]
  1499. vpshufb $t3,@X[0],@X[0]
  1500. lea $TABLE(%rip),$Tbl
  1501. vpshufb $t3,@X[1],@X[1]
  1502. vpshufb $t3,@X[2],@X[2]
  1503. vpaddd 0x00($Tbl),@X[0],$t0
  1504. vpshufb $t3,@X[3],@X[3]
  1505. vpaddd 0x20($Tbl),@X[1],$t1
  1506. vpaddd 0x40($Tbl),@X[2],$t2
  1507. vpaddd 0x60($Tbl),@X[3],$t3
  1508. vmovdqa $t0,0x00(%rsp)
  1509. mov $A,$a1
  1510. vmovdqa $t1,0x10(%rsp)
  1511. mov $B,$a3
  1512. vmovdqa $t2,0x20(%rsp)
  1513. xor $C,$a3 # magic
  1514. vmovdqa $t3,0x30(%rsp)
  1515. mov $E,$a0
  1516. jmp .Lavx_00_47
  1517. .align 16
  1518. .Lavx_00_47:
  1519. sub \$`-16*2*$SZ`,$Tbl # size optimization
  1520. ___
  # Xupdate_256_AVX()
  #
  # Returns the SHA-256 message-schedule update for @X[0] as a list of
  # instruction strings (AVX three-operand forms).  Nothing is emitted here:
  # the callers eval each string, interleaved with round-body strings.
  # The strings name @X, $t0..$t5, $SZ, @sigma0 and @sigma1, which must be
  # visible at the point of the eval.
  1521. sub Xupdate_256_AVX () {
  1522. (
  1523. '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
  1524. '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
  1525. '&vpsrld ($t2,$t0,$sigma0[0]);',
  1526. '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
  1527. '&vpsrld ($t3,$t0,$sigma0[2])',
  1528. '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
  1529. '&vpxor ($t0,$t3,$t2)',
  1530. '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
  1531. '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
  1532. '&vpxor ($t0,$t0,$t1)',
  1533. '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
  1534. '&vpxor ($t0,$t0,$t2)',
  1535. '&vpsrld ($t2,$t3,$sigma1[2]);',
  1536. '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
  1537. '&vpsrlq ($t3,$t3,$sigma1[0]);',
  1538. '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
  1539. '&vpxor ($t2,$t2,$t3);',
  1540. '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
  1541. '&vpxor ($t2,$t2,$t3)',
  1542. '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
  1543. '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
  1544. '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
  1545. '&vpsrld ($t2,$t3,$sigma1[2])',
  1546. '&vpsrlq ($t3,$t3,$sigma1[0])',
  1547. '&vpxor ($t2,$t2,$t3);',
  1548. '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
  1549. '&vpxor ($t2,$t2,$t3)',
  1550. '&vpshufb ($t2,$t2,$t5)',
  1551. '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
  1552. );
  1553. }
  # AVX_256_00_47($j, $body, @X)
  #
  # Emits one SHA-256 schedule step: each Xupdate_256_AVX string is eval'ed
  # and followed by three strings taken from four calls to $body.  It then
  # emits a vpaddd of @X[0] with 16*2*$j($Tbl) into $t2, the leftover body
  # strings, and a vmovdqa of $t2 to 16*$j(%rsp).
  #
  #   $j    - step index; only used to form the $Tbl and %rsp offsets
  #   $body - code ref returning a list of instruction strings
  #   @X    - register names; the caller rotates this list between calls
  #
  # The Xupdate strings are eval'ed in this scope and refer to @X by name,
  # so the local must keep that name.
  1554. sub AVX_256_00_47 () {
  1555. my $j = shift;
  1556. my $body = shift;
  1557. my @X = @_;
  1558. my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
  1559. foreach (Xupdate_256_AVX()) { # 29 instructions
  1560. eval;
  1561. eval(shift(@insns));
  1562. eval(shift(@insns));
  1563. eval(shift(@insns));
  1564. }
  1565. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); # $t2 = updated @X[0] + $Tbl entry for step $j
  1566. foreach (@insns) { eval; } # remaining instructions
  1567. &vmovdqa (16*$j."(%rsp)",$t2); # store the sum at 16*$j(%rsp)
  1568. }
  # Emit four AVX schedule steps ($j = 0..3).  @X is rotated after each call
  # so that the next step sees the following register as @X[0].
  1569. for ($i=0,$j=0; $j<4; $j++) {
  1570. &AVX_256_00_47($j,\&body_00_15,@X);
  1571. push(@X,shift(@X)); # rotate(@X)
  1572. }
  # Emit a branch back to .Lavx_00_47 that is taken while the byte at this
  # $Tbl offset is non-zero.
  1573. &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
  1574. &jne (".Lavx_00_47");
  # The loop header does not advance $i; presumably the eval'ed body_00_15
  # strings increment it (body_00_15 is defined outside this section - verify).
  1575. for ($i=0; $i<16; ) {
  1576. foreach(body_00_15()) { eval; }
  1577. }
  1578. } else { # SHA512
  1579. my @X = map("%xmm$_",(0..7));
  1580. my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
  1581. $code.=<<___;
  1582. jmp .Lloop_avx
  1583. .align 16
  1584. .Lloop_avx:
  1585. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  1586. vmovdqu 0x00($inp),@X[0]
  1587. lea $TABLE+0x80(%rip),$Tbl # size optimization
  1588. vmovdqu 0x10($inp),@X[1]
  1589. vmovdqu 0x20($inp),@X[2]
  1590. vpshufb $t3,@X[0],@X[0]
  1591. vmovdqu 0x30($inp),@X[3]
  1592. vpshufb $t3,@X[1],@X[1]
  1593. vmovdqu 0x40($inp),@X[4]
  1594. vpshufb $t3,@X[2],@X[2]
  1595. vmovdqu 0x50($inp),@X[5]
  1596. vpshufb $t3,@X[3],@X[3]
  1597. vmovdqu 0x60($inp),@X[6]
  1598. vpshufb $t3,@X[4],@X[4]
  1599. vmovdqu 0x70($inp),@X[7]
  1600. vpshufb $t3,@X[5],@X[5]
  1601. vpaddq -0x80($Tbl),@X[0],$t0
  1602. vpshufb $t3,@X[6],@X[6]
  1603. vpaddq -0x60($Tbl),@X[1],$t1
  1604. vpshufb $t3,@X[7],@X[7]
  1605. vpaddq -0x40($Tbl),@X[2],$t2
  1606. vpaddq -0x20($Tbl),@X[3],$t3
  1607. vmovdqa $t0,0x00(%rsp)
  1608. vpaddq 0x00($Tbl),@X[4],$t0
  1609. vmovdqa $t1,0x10(%rsp)
  1610. vpaddq 0x20($Tbl),@X[5],$t1
  1611. vmovdqa $t2,0x20(%rsp)
  1612. vpaddq 0x40($Tbl),@X[6],$t2
  1613. vmovdqa $t3,0x30(%rsp)
  1614. vpaddq 0x60($Tbl),@X[7],$t3
  1615. vmovdqa $t0,0x40(%rsp)
  1616. mov $A,$a1
  1617. vmovdqa $t1,0x50(%rsp)
  1618. mov $B,$a3
  1619. vmovdqa $t2,0x60(%rsp)
  1620. xor $C,$a3 # magic
  1621. vmovdqa $t3,0x70(%rsp)
  1622. mov $E,$a0
  1623. jmp .Lavx_00_47
  1624. .align 16
  1625. .Lavx_00_47:
  1626. add \$`16*2*$SZ`,$Tbl
  1627. ___
  # Xupdate_512_AVX()
  #
  # Returns the SHA-512 message-schedule update for @X[0] as a list of
  # instruction strings (AVX three-operand forms, shift pairs combined with
  # vpxor).  Nothing is emitted here: the callers eval each string,
  # interleaved with round-body strings.  The strings name @X, $t0..$t3,
  # $SZ, @sigma0 and @sigma1, which must be visible at the point of the eval.
  1628. sub Xupdate_512_AVX () {
  1629. (
  1630. '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
  1631. '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
  1632. '&vpsrlq ($t2,$t0,$sigma0[0])',
  1633. '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
  1634. '&vpsrlq ($t3,$t0,$sigma0[2])',
  1635. '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
  1636. '&vpxor ($t0,$t3,$t2)',
  1637. '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
  1638. '&vpxor ($t0,$t0,$t1)',
  1639. '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
  1640. '&vpxor ($t0,$t0,$t2)',
  1641. '&vpsrlq ($t3,@X[7],$sigma1[2]);',
  1642. '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
  1643. '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
  1644. '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
  1645. '&vpsrlq ($t1,@X[7],$sigma1[0]);',
  1646. '&vpxor ($t3,$t3,$t2)',
  1647. '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
  1648. '&vpxor ($t3,$t3,$t1)',
  1649. '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
  1650. '&vpxor ($t3,$t3,$t2)',
  1651. '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
  1652. '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
  1653. );
  1654. }
  # AVX_512_00_47($j, $body, @X)
  #
  # Emits one SHA-512 schedule step: each Xupdate_512_AVX string is eval'ed
  # and followed by two strings taken from two calls to $body.  It then
  # emits a vpaddq of @X[0] with (16*2*$j-0x80)($Tbl) into $t2, the leftover
  # body strings, and a vmovdqa of $t2 to 16*$j(%rsp).
  #
  #   $j    - step index; only used to form the $Tbl and %rsp offsets
  #   $body - code ref returning a list of instruction strings
  #   @X    - register names; the caller rotates this list between calls
  #
  # The Xupdate strings are eval'ed in this scope and refer to @X by name,
  # so the local must keep that name.
  1655. sub AVX_512_00_47 () {
  1656. my $j = shift;
  1657. my $body = shift;
  1658. my @X = @_;
  1659. my @insns = (&$body,&$body); # 52 instructions
  1660. foreach (Xupdate_512_AVX()) { # 23 instructions
  1661. eval;
  1662. eval(shift(@insns));
  1663. eval(shift(@insns));
  1664. }
  1665. &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); # $t2 = updated @X[0] + $Tbl entry for step $j
  1666. foreach (@insns) { eval; } # remaining instructions
  1667. &vmovdqa (16*$j."(%rsp)",$t2); # store the sum at 16*$j(%rsp)
  1668. }
  # Emit eight AVX schedule steps ($j = 0..7).  @X is rotated after each
  # call so that the next step sees the following register as @X[0].
  1669. for ($i=0,$j=0; $j<8; $j++) {
  1670. &AVX_512_00_47($j,\&body_00_15,@X);
  1671. push(@X,shift(@X)); # rotate(@X)
  1672. }
  # Emit a branch back to .Lavx_00_47 that is taken while the byte at this
  # $Tbl offset is non-zero.
  1673. &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
  1674. &jne (".Lavx_00_47");
  # The loop header does not advance $i; presumably the eval'ed body_00_15
  # strings increment it (body_00_15 is defined outside this section - verify).
  1675. for ($i=0; $i<16; ) {
  1676. foreach(body_00_15()) { eval; }
  1677. }
  1678. }
  1679. $code.=<<___;
  1680. mov $_ctx,$ctx
  1681. mov $a1,$A
  1682. add $SZ*0($ctx),$A
  1683. lea 16*$SZ($inp),$inp
  1684. add $SZ*1($ctx),$B
  1685. add $SZ*2($ctx),$C
  1686. add $SZ*3($ctx),$D
  1687. add $SZ*4($ctx),$E
  1688. add $SZ*5($ctx),$F
  1689. add $SZ*6($ctx),$G
  1690. add $SZ*7($ctx),$H
  1691. cmp $_end,$inp
  1692. mov $A,$SZ*0($ctx)
  1693. mov $B,$SZ*1($ctx)
  1694. mov $C,$SZ*2($ctx)
  1695. mov $D,$SZ*3($ctx)
  1696. mov $E,$SZ*4($ctx)
  1697. mov $F,$SZ*5($ctx)
  1698. mov $G,$SZ*6($ctx)
  1699. mov $H,$SZ*7($ctx)
  1700. jb .Lloop_avx
  1701. mov $_rsp,%rsi
  1702. .cfi_def_cfa %rsi,8
  1703. vzeroupper
  1704. ___
  1705. $code.=<<___ if ($win64);
  1706. movaps 16*$SZ+32(%rsp),%xmm6
  1707. movaps 16*$SZ+48(%rsp),%xmm7
  1708. movaps 16*$SZ+64(%rsp),%xmm8
  1709. movaps 16*$SZ+80(%rsp),%xmm9
  1710. ___
  1711. $code.=<<___ if ($win64 && $SZ>4);
  1712. movaps 16*$SZ+96(%rsp),%xmm10
  1713. movaps 16*$SZ+112(%rsp),%xmm11
  1714. ___
  1715. $code.=<<___;
  1716. mov -48(%rsi),%r15
  1717. .cfi_restore %r15
  1718. mov -40(%rsi),%r14
  1719. .cfi_restore %r14
  1720. mov -32(%rsi),%r13
  1721. .cfi_restore %r13
  1722. mov -24(%rsi),%r12
  1723. .cfi_restore %r12
  1724. mov -16(%rsi),%rbp
  1725. .cfi_restore %rbp
  1726. mov -8(%rsi),%rbx
  1727. .cfi_restore %rbx
  1728. lea (%rsi),%rsp
  1729. .cfi_def_cfa_register %rsp
  1730. .Lepilogue_avx:
  1731. ret
  1732. .cfi_endproc
  1733. .size ${func}_avx,.-${func}_avx
  1734. ___
  1735. if ($avx>1) {{
  1736. ######################################################################
  1737. # AVX2+BMI code path
  1738. #
  1739. my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
  1740. my $PUSH8=8*2*$SZ;
  1741. use integer;
  # bodyx_00_15()
  #
  # Returns one round of the compression function as a list of instruction
  # strings for the AVX2+BMI path (rorx/andn/lea forms).  Nothing is emitted
  # here; the callers eval the strings one at a time so they can be
  # interleaved with vector code.
  #
  # Side effects happen when the strings are eval'ed, not when this sub runs:
  #   - the first string reloads ($a..$h) from @ROT before emitting the add;
  #   - the last string swaps $a2/$a3, rotates @ROT by one and increments $i.
  # The first string also reads $base and $PUSH8, so both must be visible at
  # the point of the eval (the callers in this section declare "my $base").
  1742. sub bodyx_00_15 () {
  1743. # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
  1744. (
  1745. '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
  1746. '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
  1747. '&and ($a4,$e)', # f&e
  1748. '&rorx ($a0,$e,$Sigma1[2])',
  1749. '&rorx ($a2,$e,$Sigma1[1])',
  1750. '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
  1751. '&lea ($h,"($h,$a4)")',
  1752. '&andn ($a4,$e,$g)', # ~e&g
  1753. '&xor ($a0,$a2)',
  1754. '&rorx ($a1,$e,$Sigma1[0])',
  1755. '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
  1756. '&xor ($a0,$a1)', # Sigma1(e)
  1757. '&mov ($a2,$a)',
  1758. '&rorx ($a4,$a,$Sigma0[2])',
  1759. '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
  1760. '&xor ($a2,$b)', # a^b, b^c in next round
  1761. '&rorx ($a1,$a,$Sigma0[1])',
  1762. '&rorx ($a0,$a,$Sigma0[0])',
  1763. '&lea ($d,"($d,$h)")', # d+=h
  1764. '&and ($a3,$a2)', # (b^c)&(a^b)
  1765. '&xor ($a1,$a4)',
  1766. '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
  1767. '&xor ($a1,$a0)', # Sigma0(a)
  1768. '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
  1769. '&mov ($a4,$e)', # copy of f in future
  1770. '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
  1771. );
  1772. # and at the finish one has to $a+=$a1
  1773. }
  1774. $code.=<<___;
  1775. .type ${func}_avx2,\@function,3
  1776. .align 64
  1777. ${func}_avx2:
  1778. .cfi_startproc
  1779. .Lavx2_shortcut:
  1780. mov %rsp,%rax # copy %rsp
  1781. .cfi_def_cfa_register %rax
  1782. push %rbx
  1783. .cfi_push %rbx
  1784. push %rbp
  1785. .cfi_push %rbp
  1786. push %r12
  1787. .cfi_push %r12
  1788. push %r13
  1789. .cfi_push %r13
  1790. push %r14
  1791. .cfi_push %r14
  1792. push %r15
  1793. .cfi_push %r15
  1794. sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
  1795. shl \$4,%rdx # num*16
  1796. and \$-256*$SZ,%rsp # align stack frame
  1797. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  1798. add \$`2*$SZ*($rounds-8)`,%rsp
  1799. mov $ctx,$_ctx # save ctx, 1st arg
  1800. mov $inp,$_inp # save inp, 2nd arh
  1801. mov %rdx,$_end # save end pointer, "3rd" arg
  1802. mov %rax,$_rsp # save copy of %rsp
  1803. .cfi_cfa_expression $_rsp,deref,+8
  1804. ___
  1805. $code.=<<___ if ($win64);
  1806. movaps %xmm6,16*$SZ+32(%rsp)
  1807. movaps %xmm7,16*$SZ+48(%rsp)
  1808. movaps %xmm8,16*$SZ+64(%rsp)
  1809. movaps %xmm9,16*$SZ+80(%rsp)
  1810. ___
  1811. $code.=<<___ if ($win64 && $SZ>4);
  1812. movaps %xmm10,16*$SZ+96(%rsp)
  1813. movaps %xmm11,16*$SZ+112(%rsp)
  1814. ___
  1815. $code.=<<___;
  1816. .Lprologue_avx2:
  1817. vzeroupper
  1818. sub \$-16*$SZ,$inp # inp++, size optimization
  1819. mov $SZ*0($ctx),$A
  1820. mov $inp,%r12 # borrow $T1
  1821. mov $SZ*1($ctx),$B
  1822. cmp %rdx,$inp # $_end
  1823. mov $SZ*2($ctx),$C
  1824. cmove %rsp,%r12 # next block or random data
  1825. mov $SZ*3($ctx),$D
  1826. mov $SZ*4($ctx),$E
  1827. mov $SZ*5($ctx),$F
  1828. mov $SZ*6($ctx),$G
  1829. mov $SZ*7($ctx),$H
  1830. ___
  1831. if ($SZ==4) { # SHA256
  1832. my @X = map("%ymm$_",(0..3));
  1833. my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
  1834. $code.=<<___;
  1835. vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
  1836. vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
  1837. jmp .Loop_avx2
  1838. .align 16
  1839. .Loop_avx2:
  1840. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  1841. vmovdqu -16*$SZ+0($inp),%xmm0
  1842. vmovdqu -16*$SZ+16($inp),%xmm1
  1843. vmovdqu -16*$SZ+32($inp),%xmm2
  1844. vmovdqu -16*$SZ+48($inp),%xmm3
  1845. #mov $inp,$_inp # offload $inp
  1846. vinserti128 \$1,(%r12),@X[0],@X[0]
  1847. vinserti128 \$1,16(%r12),@X[1],@X[1]
  1848. vpshufb $t3,@X[0],@X[0]
  1849. vinserti128 \$1,32(%r12),@X[2],@X[2]
  1850. vpshufb $t3,@X[1],@X[1]
  1851. vinserti128 \$1,48(%r12),@X[3],@X[3]
  1852. lea $TABLE(%rip),$Tbl
  1853. vpshufb $t3,@X[2],@X[2]
  1854. vpaddd 0x00($Tbl),@X[0],$t0
  1855. vpshufb $t3,@X[3],@X[3]
  1856. vpaddd 0x20($Tbl),@X[1],$t1
  1857. vpaddd 0x40($Tbl),@X[2],$t2
  1858. vpaddd 0x60($Tbl),@X[3],$t3
  1859. vmovdqa $t0,0x00(%rsp)
  1860. xor $a1,$a1
  1861. vmovdqa $t1,0x20(%rsp)
  1862. lea -$PUSH8(%rsp),%rsp
  1863. mov $B,$a3
  1864. vmovdqa $t2,0x00(%rsp)
  1865. xor $C,$a3 # magic
  1866. vmovdqa $t3,0x20(%rsp)
  1867. mov $F,$a4
  1868. sub \$-16*2*$SZ,$Tbl # size optimization
  1869. jmp .Lavx2_00_47
  1870. .align 16
  1871. .Lavx2_00_47:
  1872. ___
  # AVX2_256_00_47($j, $body, @X)
  #
  # Emits one SHA-256 schedule step for the AVX2 path: each Xupdate_256_AVX
  # string is eval'ed and followed by three strings taken from four calls to
  # $body.  It then emits a vpaddd of @X[0] with 16*2*$j($Tbl) into $t2, the
  # leftover body strings, and a vmovdqa of $t2 to ((32*$j)%$PUSH8)(%rsp).
  #
  #   $j    - step index; forms the $Tbl and %rsp offsets, and on even
  #           values a "lea -$PUSH8(%rsp),%rsp" is emitted first
  #   $body - code ref returning a list of instruction strings
  #   @X    - register names; the caller rotates this list between calls
  #
  # $base is not used directly in this sub; it is declared here so that the
  # eval'ed bodyx_00_15 strings, which name $base, can resolve it.
  1873. sub AVX2_256_00_47 () {
  1874. my $j = shift;
  1875. my $body = shift;
  1876. my @X = @_;
  1877. my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
  1878. my $base = "+2*$PUSH8(%rsp)";
  1879. &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
  1880. foreach (Xupdate_256_AVX()) { # 29 instructions
  1881. eval;
  1882. eval(shift(@insns));
  1883. eval(shift(@insns));
  1884. eval(shift(@insns));
  1885. }
  1886. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); # $t2 = updated @X[0] + $Tbl entry for step $j
  1887. foreach (@insns) { eval; } # remaining instructions
  1888. &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); # store the sum in the current $PUSH8-byte stack slot
  1889. }
  # Emit four AVX2_256_00_47 steps, rotating @X left by one register after
  # each so the next step sees the following register as @X[0].
  1890. for ($i=0,$j=0; $j<4; $j++) {
  1891. &AVX2_256_00_47($j,\&bodyx_00_15,@X);
  1892. push(@X,shift(@X)); # rotate(@X)
  1893. }
  # Advance $Tbl by 16*2*$SZ bytes, then branch back to .Lavx2_00_47 while the
  # byte at ($SZ-1)($Tbl) compares non-zero.
  1894. &lea ($Tbl,16*2*$SZ."($Tbl)");
  1895. &cmpb (($SZ-1)."($Tbl)",0);
  1896. &jne (".Lavx2_00_47");
  # Emit strings from bodyx_00_15 until $i reaches 16. The loop has no
  # increment clause; $i is presumably advanced by the eval'ed strings
  # themselves - TODO confirm. $base is not used directly here either and is
  # presumably read by those strings.
  1897. for ($i=0; $i<16; ) {
  1898. my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
  1899. foreach(bodyx_00_15()) { eval; }
  1900. }
  1901. } else { # SHA512
  # SHA-512 flavour of the AVX2 main loop head. @X names %ymm0-%ymm7 and
  # $t0..$t3 name %ymm8-%ymm11. Everything between the heredoc markers is
  # assembly text appended to $code, not Perl.
  # Per .Loop_avx2 pass the emitted code loads eight 16-byte chunks from $inp
  # into %xmm0-%xmm7, inserts eight more from (%r12) with vinserti128,
  # shuffles each register with the mask loaded into $t2, adds the table
  # words at -0x80..0x60($Tbl) and stores the eight sums on the stack in two
  # groups of four (lowering %rsp by $PUSH8 between them) before jumping to
  # .Lavx2_00_47. $Tbl is set to $TABLE+0x80, which is why the table
  # displacements start at -0x80.
  1902. my @X = map("%ymm$_",(0..7));
  1903. my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
  1904. $code.=<<___;
  1905. jmp .Loop_avx2
  1906. .align 16
  1907. .Loop_avx2:
  1908. vmovdqu -16*$SZ($inp),%xmm0
  1909. vmovdqu -16*$SZ+16($inp),%xmm1
  1910. vmovdqu -16*$SZ+32($inp),%xmm2
  1911. lea $TABLE+0x80(%rip),$Tbl # size optimization
  1912. vmovdqu -16*$SZ+48($inp),%xmm3
  1913. vmovdqu -16*$SZ+64($inp),%xmm4
  1914. vmovdqu -16*$SZ+80($inp),%xmm5
  1915. vmovdqu -16*$SZ+96($inp),%xmm6
  1916. vmovdqu -16*$SZ+112($inp),%xmm7
  1917. #mov $inp,$_inp # offload $inp
  1918. vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2
  1919. vinserti128 \$1,(%r12),@X[0],@X[0]
  1920. vinserti128 \$1,16(%r12),@X[1],@X[1]
  1921. vpshufb $t2,@X[0],@X[0]
  1922. vinserti128 \$1,32(%r12),@X[2],@X[2]
  1923. vpshufb $t2,@X[1],@X[1]
  1924. vinserti128 \$1,48(%r12),@X[3],@X[3]
  1925. vpshufb $t2,@X[2],@X[2]
  1926. vinserti128 \$1,64(%r12),@X[4],@X[4]
  1927. vpshufb $t2,@X[3],@X[3]
  1928. vinserti128 \$1,80(%r12),@X[5],@X[5]
  1929. vpshufb $t2,@X[4],@X[4]
  1930. vinserti128 \$1,96(%r12),@X[6],@X[6]
  1931. vpshufb $t2,@X[5],@X[5]
  1932. vinserti128 \$1,112(%r12),@X[7],@X[7]
  1933. vpaddq -0x80($Tbl),@X[0],$t0
  1934. vpshufb $t2,@X[6],@X[6]
  1935. vpaddq -0x60($Tbl),@X[1],$t1
  1936. vpshufb $t2,@X[7],@X[7]
  1937. vpaddq -0x40($Tbl),@X[2],$t2
  1938. vpaddq -0x20($Tbl),@X[3],$t3
  1939. vmovdqa $t0,0x00(%rsp)
  1940. vpaddq 0x00($Tbl),@X[4],$t0
  1941. vmovdqa $t1,0x20(%rsp)
  1942. vpaddq 0x20($Tbl),@X[5],$t1
  1943. vmovdqa $t2,0x40(%rsp)
  1944. vpaddq 0x40($Tbl),@X[6],$t2
  1945. vmovdqa $t3,0x60(%rsp)
  1946. lea -$PUSH8(%rsp),%rsp
  1947. vpaddq 0x60($Tbl),@X[7],$t3
  1948. vmovdqa $t0,0x00(%rsp)
  1949. xor $a1,$a1
  1950. vmovdqa $t1,0x20(%rsp)
  1951. mov $B,$a3
  1952. vmovdqa $t2,0x40(%rsp)
  1953. xor $C,$a3 # magic
  1954. vmovdqa $t3,0x60(%rsp)
  1955. mov $F,$a4
  1956. add \$16*2*$SZ,$Tbl
  1957. jmp .Lavx2_00_47
  1958. .align 16
  1959. .Lavx2_00_47:
  1960. ___
  # Emit one step of the SHA-512 AVX2 loop body that follows .Lavx2_00_47.
  #   $j     step index (the visible caller passes 0..7)
  #   $body  code ref returning a list of Perl statement strings
  #   @X     register names, current head first
  # Each string returned by Xupdate_512_AVX() is eval'ed. Unless that string
  # ends in a semicolon, it is followed by three strings taken from two
  # concatenated $body results. Whatever is left of @insns is flushed after
  # the &vpaddq call.
  # NOTE(review): the () prototype is never enforced, because the visible
  # caller uses the &name(...) call form, which bypasses prototypes.
  1961. sub AVX2_512_00_47 () {
  1962. my $j = shift;
  1963. my $body = shift;
  1964. my @X = @_;
  1965. my @insns = (&$body,&$body); # 48 instructions
  # $base is not referenced directly in this sub. It is presumably read by
  # name from the eval'ed strings (string eval sees enclosing lexicals), so it
  # must not be renamed or removed - confirm against bodyx_00_15.
  1966. my $base = "+2*$PUSH8(%rsp)";
  # On every fourth step, emit a lea involving %rsp and -$PUSH8(%rsp);
  # presumably this lowers the stack pointer by $PUSH8 - confirm the &lea
  # operand order.
  1967. &lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0);
  1968. foreach (Xupdate_512_AVX()) { # 23 instructions
  1969. eval;
  # Strings ending in ';' get no interleaved round strings after them.
  1970. if ($_ !~ /\;$/) {
  1971. eval(shift(@insns));
  1972. eval(shift(@insns));
  1973. eval(shift(@insns));
  1974. }
  1975. }
  # Combine @X[0] with the table entry at (32*$j-0x80)($Tbl) into $t2
  # (presumably destination-first argument order - TODO confirm), then store
  # $t2 to the stack slot at (32*$j)%$PUSH8 after the remaining strings.
  1976. &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
  1977. foreach (@insns) { eval; } # remaining instructions
  1978. &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
  1979. }
  # Emit eight AVX2_512_00_47 steps, rotating @X left by one register after
  # each so the next step sees the following register as @X[0].
  1980. for ($i=0,$j=0; $j<8; $j++) {
  1981. &AVX2_512_00_47($j,\&bodyx_00_15,@X);
  1982. push(@X,shift(@X)); # rotate(@X)
  1983. }
  # Advance $Tbl by 16*2*$SZ bytes, then branch back to .Lavx2_00_47 while the
  # byte at ($SZ-1-0x80)($Tbl) compares non-zero.
  1984. &lea ($Tbl,16*2*$SZ."($Tbl)");
  1985. &cmpb (($SZ-1-0x80)."($Tbl)",0);
  1986. &jne (".Lavx2_00_47");
  # Emit strings from bodyx_00_15 until $i reaches 16. The loop has no
  # increment clause; $i is presumably advanced by the eval'ed strings
  # themselves - TODO confirm. $base is not used directly here either and is
  # presumably read by those strings.
  1987. for ($i=0; $i<16; ) {
  1988. my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
  1989. foreach(bodyx_00_15()) { eval; }
  1990. }
  1991. }
  # Emitted after the round code: reload $ctx from the stack, fold $a1 into
  # $A, add the eight context words to $A..$H and store them back. Then
  # compare $inp with the end pointer kept at `$PUSH8+2*8`($Tbl): when equal,
  # jump to .Ldone_avx2, otherwise re-initialise $a1/$a3/$a4 and enter
  # .Lower_avx2.
  1992. $code.=<<___;
  1993. mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
  1994. add $a1,$A
  1995. #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
  1996. lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
  1997. add $SZ*0($ctx),$A
  1998. add $SZ*1($ctx),$B
  1999. add $SZ*2($ctx),$C
  2000. add $SZ*3($ctx),$D
  2001. add $SZ*4($ctx),$E
  2002. add $SZ*5($ctx),$F
  2003. add $SZ*6($ctx),$G
  2004. add $SZ*7($ctx),$H
  2005. mov $A,$SZ*0($ctx)
  2006. mov $B,$SZ*1($ctx)
  2007. mov $C,$SZ*2($ctx)
  2008. mov $D,$SZ*3($ctx)
  2009. mov $E,$SZ*4($ctx)
  2010. mov $F,$SZ*5($ctx)
  2011. mov $G,$SZ*6($ctx)
  2012. mov $H,$SZ*7($ctx)
  2013. cmp `$PUSH8+2*8`($Tbl),$inp # $_end
  2014. je .Ldone_avx2
  2015. xor $a1,$a1
  2016. mov $B,$a3
  2017. xor $C,$a3 # magic
  2018. mov $F,$a4
  2019. jmp .Lower_avx2
  2020. .align 16
  2021. .Lower_avx2:
  2022. ___
  # Body of the .Lower_avx2 loop: strings from bodyx_00_15 are eval'ed with
  # $base set to "+16($Tbl)" until $i reaches 8. The loop has no increment
  # clause; $i is presumably advanced by the eval'ed strings - TODO confirm.
  2023. for ($i=0; $i<8; ) {
  2024. my $base="+16($Tbl)";
  2025. foreach(bodyx_00_15()) { eval; }
  2026. }
  # Step $Tbl down by $PUSH8 and repeat .Lower_avx2 while $Tbl is not below
  # %rsp. Then update the context again, advance $inp by 2*16*$SZ, point %r12
  # at $inp (or at %rsp when $inp equals $_end) and loop to .Loop_avx2 while
  # $inp is below or equal to $_end. .Ldone_avx2 restores %rsp from $Tbl,
  # loads $_rsp into %rsi and issues vzeroupper.
  2027. $code.=<<___;
  2028. lea -$PUSH8($Tbl),$Tbl
  2029. cmp %rsp,$Tbl
  2030. jae .Lower_avx2
  2031. mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
  2032. add $a1,$A
  2033. #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
  2034. lea `2*$SZ*($rounds-8)`(%rsp),%rsp
  2035. add $SZ*0($ctx),$A
  2036. add $SZ*1($ctx),$B
  2037. add $SZ*2($ctx),$C
  2038. add $SZ*3($ctx),$D
  2039. add $SZ*4($ctx),$E
  2040. add $SZ*5($ctx),$F
  2041. lea `2*16*$SZ`($inp),$inp # inp+=2
  2042. add $SZ*6($ctx),$G
  2043. mov $inp,%r12
  2044. add $SZ*7($ctx),$H
  2045. cmp $_end,$inp
  2046. mov $A,$SZ*0($ctx)
  2047. cmove %rsp,%r12 # next block or stale data
  2048. mov $B,$SZ*1($ctx)
  2049. mov $C,$SZ*2($ctx)
  2050. mov $D,$SZ*3($ctx)
  2051. mov $E,$SZ*4($ctx)
  2052. mov $F,$SZ*5($ctx)
  2053. mov $G,$SZ*6($ctx)
  2054. mov $H,$SZ*7($ctx)
  2055. jbe .Loop_avx2
  2056. lea (%rsp),$Tbl
  2057. .Ldone_avx2:
  2058. lea ($Tbl),%rsp
  2059. mov $_rsp,%rsi
  2060. .cfi_def_cfa %rsi,8
  2061. vzeroupper
  2062. ___
  # Win64 only: reload %xmm6-%xmm9 from the save area at 16*$SZ+32(%rsp).
  2063. $code.=<<___ if ($win64);
  2064. movaps 16*$SZ+32(%rsp),%xmm6
  2065. movaps 16*$SZ+48(%rsp),%xmm7
  2066. movaps 16*$SZ+64(%rsp),%xmm8
  2067. movaps 16*$SZ+80(%rsp),%xmm9
  2068. ___
  # Win64 with $SZ>4 additionally reloads %xmm10 and %xmm11.
  2069. $code.=<<___ if ($win64 && $SZ>4);
  2070. movaps 16*$SZ+96(%rsp),%xmm10
  2071. movaps 16*$SZ+112(%rsp),%xmm11
  2072. ___
  # Function epilogue: reload %r15..%rbx from -48..-8(%rsi), switch %rsp back
  # to %rsi and return. .Lepilogue_avx2 marks the ret instruction.
  2073. $code.=<<___;
  2074. mov -48(%rsi),%r15
  2075. .cfi_restore %r15
  2076. mov -40(%rsi),%r14
  2077. .cfi_restore %r14
  2078. mov -32(%rsi),%r13
  2079. .cfi_restore %r13
  2080. mov -24(%rsi),%r12
  2081. .cfi_restore %r12
  2082. mov -16(%rsi),%rbp
  2083. .cfi_restore %rbp
  2084. mov -8(%rsi),%rbx
  2085. .cfi_restore %rbx
  2086. lea (%rsi),%rsp
  2087. .cfi_def_cfa_register %rsp
  2088. .Lepilogue_avx2:
  2089. ret
  2090. .cfi_endproc
  2091. .size ${func}_avx2,.-${func}_avx2
  2092. ___
  2093. }}
  2094. }}}}}
  2095. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  2096. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  # Win64 only: emit the exception handlers and the .pdata/.xdata tables that
  # reference them.
  2097. if ($win64) {
  # Register names bound to the four handler arguments
  # (rec, frame, context, disp).
  2098. $rec="%rcx";
  2099. $frame="%rdx";
  2100. $context="%r8";
  2101. $disp="%r9";
  # First part of se_handler: save registers, pull context->Rax and
  # context->Rip (kept in %rbx), and jump to .Lin_prologue when Rip is below
  # HandlerData[0] or at/above HandlerData[1]. Both HandlerData entries are
  # 32-bit values added to disp->ImageBase. Between the two checks %rax is
  # replaced with context->Rsp.
  # NOTE(review): "HanderlData" in the emitted comment below is a typo for
  # HandlerData; it is left untouched because it is part of the emitted text.
  2102. $code.=<<___;
  2103. .extern __imp_RtlVirtualUnwind
  2104. .type se_handler,\@abi-omnipotent
  2105. .align 16
  2106. se_handler:
  2107. push %rsi
  2108. push %rdi
  2109. push %rbx
  2110. push %rbp
  2111. push %r12
  2112. push %r13
  2113. push %r14
  2114. push %r15
  2115. pushfq
  2116. sub \$64,%rsp
  2117. mov 120($context),%rax # pull context->Rax
  2118. mov 248($context),%rbx # pull context->Rip
  2119. mov 8($disp),%rsi # disp->ImageBase
  2120. mov 56($disp),%r11 # disp->HanderlData
  2121. mov 0(%r11),%r10d # HandlerData[0]
  2122. lea (%rsi,%r10),%r10 # prologue label
  2123. cmp %r10,%rbx # context->Rip<prologue label
  2124. jb .Lin_prologue
  2125. mov 152($context),%rax # pull context->Rsp
  2126. mov 4(%r11),%r10d # HandlerData[1]
  2127. lea (%rsi,%r10),%r10 # epilogue label
  2128. cmp %r10,%rbx # context->Rip>=epilogue label
  2129. jae .Lin_prologue
  2130. ___
  # Only when AVX2 code is generated: if Rip is at or beyond .Lavx2_shortcut,
  # recompute the frame base from the live Rsp by rounding %rax down to a
  # 256*$SZ boundary and adding 2*$SZ*($rounds-8).
  2131. $code.=<<___ if ($avx>1);
  2132. lea .Lavx2_shortcut(%rip),%r10
  2133. cmp %r10,%rbx # context->Rip<avx2_shortcut
  2134. jb .Lnot_in_avx2
  2135. and \$-256*$SZ,%rax
  2136. add \$`2*$SZ*($rounds-8)`,%rax
  2137. .Lnot_in_avx2:
  2138. ___
  # Remainder of se_handler, the shared Win64 exception handler. On entry to
  # this part %rax holds the frame base derived from context->Rsp. The code
  # loads the saved stack pointer from the frame, copies the six callee-saved
  # GPRs stored below it into the CONTEXT record and, for frames that have
  # one, copies the %xmm6+ save area into context.Xmm6 onwards. From
  # .Lin_prologue on it restores Rsp/Rsi/Rdi in the CONTEXT, copies the
  # CONTEXT to disp->ContextRecord, calls RtlVirtualUnwind and returns
  # ExceptionContinueSearch.
  #
  # %rbx carries context->Rip up to this point but is reused below for the
  # saved %rbx value. Rip is therefore re-read before the comparison against
  # .Lepilogue, which decides whether the frame has an %xmm save area at all
  # ("non-AVX code" does not). Without the reload the saved %rbx value would
  # be compared instead of the faulting address.
  $code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
	mov	248($context),%rbx	# re-pull context->Rip
	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lepilogue
	jb	.Lin_prologue		# non-AVX code
	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$`$SZ==4?8:12`,%ecx
	.long	0xa548f3fc		# cld; rep movsq
.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi
	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq
	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)
	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
	.size	se_handler,.-se_handler
___
  # Handler for the SHA-extension code path, emitted only for $SZ==4 when
  # $shaext is set. When Rip lies between .Lprologue_shaext and
  # .Lepilogue_shaext it copies 10 quadwords from -8-5*16(%rax) into
  # context.Xmm6 onwards. Every path ends by jumping to se_handler's
  # .Lin_prologue tail.
  2198. $code.=<<___ if ($SZ==4 && $shaext);
  2199. .type shaext_handler,\@abi-omnipotent
  2200. .align 16
  2201. shaext_handler:
  2202. push %rsi
  2203. push %rdi
  2204. push %rbx
  2205. push %rbp
  2206. push %r12
  2207. push %r13
  2208. push %r14
  2209. push %r15
  2210. pushfq
  2211. sub \$64,%rsp
  2212. mov 120($context),%rax # pull context->Rax
  2213. mov 248($context),%rbx # pull context->Rip
  2214. lea .Lprologue_shaext(%rip),%r10
  2215. cmp %r10,%rbx # context->Rip<.Lprologue
  2216. jb .Lin_prologue
  2217. lea .Lepilogue_shaext(%rip),%r10
  2218. cmp %r10,%rbx # context->Rip>=.Lepilogue
  2219. jae .Lin_prologue
  2220. lea -8-5*16(%rax),%rsi
  2221. lea 512($context),%rdi # &context.Xmm6
  2222. mov \$10,%ecx
  2223. .long 0xa548f3fc # cld; rep movsq
  2224. jmp .Lin_prologue
  2225. .size shaext_handler,.-shaext_handler
  2226. ___
  # .pdata: one begin/end/info RVA triple per generated function. The base
  # function's triple is unconditional; the others are guarded by the same
  # conditions as their .xdata entries further down.
  2227. $code.=<<___;
  2228. .section .pdata
  2229. .align 4
  2230. .rva .LSEH_begin_$func
  2231. .rva .LSEH_end_$func
  2232. .rva .LSEH_info_$func
  2233. ___
  2234. $code.=<<___ if ($SZ==4 && $shaext);
  2235. .rva .LSEH_begin_${func}_shaext
  2236. .rva .LSEH_end_${func}_shaext
  2237. .rva .LSEH_info_${func}_shaext
  2238. ___
  2239. $code.=<<___ if ($SZ==4);
  2240. .rva .LSEH_begin_${func}_ssse3
  2241. .rva .LSEH_end_${func}_ssse3
  2242. .rva .LSEH_info_${func}_ssse3
  2243. ___
  2244. $code.=<<___ if ($avx && $SZ==8);
  2245. .rva .LSEH_begin_${func}_xop
  2246. .rva .LSEH_end_${func}_xop
  2247. .rva .LSEH_info_${func}_xop
  2248. ___
  2249. $code.=<<___ if ($avx);
  2250. .rva .LSEH_begin_${func}_avx
  2251. .rva .LSEH_end_${func}_avx
  2252. .rva .LSEH_info_${func}_avx
  2253. ___
  2254. $code.=<<___ if ($avx>1);
  2255. .rva .LSEH_begin_${func}_avx2
  2256. .rva .LSEH_end_${func}_avx2
  2257. .rva .LSEH_info_${func}_avx2
  2258. ___
  # .xdata: each info record is the bytes 9,0,0,0 followed by the handler RVA.
  # Records that use se_handler append a prologue/epilogue label pair, which
  # se_handler reads as HandlerData[0] and HandlerData[1]. The shaext record
  # has no such pair, since shaext_handler references its labels directly.
  2259. $code.=<<___;
  2260. .section .xdata
  2261. .align 8
  2262. .LSEH_info_$func:
  2263. .byte 9,0,0,0
  2264. .rva se_handler
  2265. .rva .Lprologue,.Lepilogue # HandlerData[]
  2266. ___
  2267. $code.=<<___ if ($SZ==4 && $shaext);
  2268. .LSEH_info_${func}_shaext:
  2269. .byte 9,0,0,0
  2270. .rva shaext_handler
  2271. ___
  2272. $code.=<<___ if ($SZ==4);
  2273. .LSEH_info_${func}_ssse3:
  2274. .byte 9,0,0,0
  2275. .rva se_handler
  2276. .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
  2277. ___
  2278. $code.=<<___ if ($avx && $SZ==8);
  2279. .LSEH_info_${func}_xop:
  2280. .byte 9,0,0,0
  2281. .rva se_handler
  2282. .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
  2283. ___
  2284. $code.=<<___ if ($avx);
  2285. .LSEH_info_${func}_avx:
  2286. .byte 9,0,0,0
  2287. .rva se_handler
  2288. .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
  2289. ___
  2290. $code.=<<___ if ($avx>1);
  2291. .LSEH_info_${func}_avx2:
  2292. .byte 9,0,0,0
  2293. .rva se_handler
  2294. .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
  2295. ___
  2296. }
  # Translate one SHA-256 extension instruction into a raw ".byte" directive,
  # presumably so that assemblers lacking these mnemonics can still assemble
  # the generated output.
  #
  #   $instr  mnemonic; sha256rnds2, sha256msg1 and sha256msg2 are recognised
  #   $_[0]   operand text as written after the mnemonic, e.g. "%xmm1,%xmm2"
  #
  # Returns ".byte\t" followed by the comma-separated (decimal) bytes
  # 0x0f,0x38,<opcode>,<ModR/M> when the mnemonic is recognised and both
  # operands are %xmm0-%xmm7. Otherwise the instruction is returned as
  # "<mnemonic>\t<operands>" for the assembler to handle.
  sub sha256op38 {
      my $instr = shift;
      my $args  = $_[0];
      my %opcodelet = (
          "sha256rnds2" => 0xcb,
          "sha256msg1"  => 0xcc,
          "sha256msg2"  => 0xcd,
      );

      # Only %xmm0-%xmm7 fit the prefix-less register-direct ModR/M byte built
      # below. The \b after each register number stops %xmm10-%xmm15 from
      # matching on their first digit and being encoded as %xmm1.
      if (defined($opcodelet{$instr})
          && $args =~ /%xmm([0-7])\b,\s*%xmm([0-7])\b/) {
          my @opcode = (0x0f, 0x38, $opcodelet{$instr});
          push @opcode, 0xc0|($1&7)|(($2&7)<<3);	# ModR/M
          return ".byte\t".join(',',@opcode);
      }

      return $instr."\t".$args;
  }
  # Final pass over the accumulated assembly text, one line at a time:
  # replace every `...` span with the result of evaluating it as Perl, hand
  # any word beginning with "sha256" that is followed by operands to
  # sha256op38(), and print the line.
  foreach (split("\n",$code)) {
      s/\`([^\`]*)\`/eval $1/geo;

      s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;

      print $_,"\n";
  }

  # Buffered write errors only surface when the handle is closed. Fail loudly
  # instead of leaving a silently truncated assembly file behind.
  close STDOUT or die "error closing STDOUT: $!";