2
0

sha512-x86_64.pl 64 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. Rights for redistribution and usage in source and binary
  12. # forms are granted according to the License.
  13. # ====================================================================
  14. #
  15. # sha256/512_block procedure for x86_64.
  16. #
  17. # 40% improvement over compiler-generated code on Opteron. On EM64T
  18. # sha256 was observed to run >80% faster and sha512 - >40%. No magical
  19. # tricks, just straight implementation... I really wonder why gcc
  20. # [being armed with inline assembler] fails to generate as fast code.
  21. # The only thing which is cool about this module is that it's very
  22. # same instruction sequence used for both SHA-256 and SHA-512. In
  23. # former case the instructions operate on 32-bit operands, while in
  24. # latter - on 64-bit ones. All I had to do is to get one flavor right,
  25. # the other one passed the test right away:-)
  26. #
  27. # sha256_block runs in ~1005 cycles on Opteron, which gives you
  28. # asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
  29. # frequency in GHz. sha512_block runs in ~1275 cycles, which results
  30. # in 128*1000/1275=100MBps per GHz. Is there room for improvement?
  31. # Well, if you compare it to IA-64 implementation, which maintains
  32. # X[16] in register bank[!], tends to 4 instructions per CPU clock
  33. # cycle and runs in 1003 cycles, 1275 is very good result for 3-way
  34. # issue Opteron pipeline and X[16] maintained in memory. So that *if*
  35. # there is a way to improve it, *then* the only way would be to try to
  36. # offload X[16] updates to SSE unit, but that would require "deeper"
  37. # loop unroll, which in turn would naturally cause size blow-up, not
  38. # to mention increased complexity! And once again, only *if* it's
  39. # actually possible to noticeably improve overall ILP, instruction
  40. # level parallelism, on a given CPU implementation in this case.
  41. #
  42. # Special note on Intel EM64T. While Opteron CPU exhibits perfect
  43. # performance ratio of 1.5 between 64- and 32-bit flavors [see above],
  44. # [currently available] EM64T CPUs apparently are far from it. On the
  45. # contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
  46. # sha256_block:-( This is presumably because 64-bit shifts/rotates
  47. # apparently are not atomic instructions, but implemented in microcode.
  48. #
  49. # May 2012.
  50. #
  51. # Optimization including one of Pavel Semjanov's ideas, alternative
  52. # Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
  53. # unfortunately -2% SHA512 on P4 [which nobody should care about
  54. # that much].
  55. #
  56. # June 2012.
  57. #
  58. # Add SIMD code paths, see below for improvement coefficients. SSSE3
  59. # code path was not attempted for SHA512, because improvement is not
  60. # estimated to be high enough, noticeably less than 9%, to justify
  61. # the effort, not on pre-AVX processors. [Obviously with exclusion
  62. # for VIA Nano, but it has SHA512 instruction that is faster and
  63. # should be used instead.] For reference, corresponding estimated
  64. # upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
  65. # higher coefficients are observed on VIA Nano and Bulldozer has more
  66. # to do with specifics of their architecture [which is topic for
  67. # separate discussion].
  68. #
  69. # November 2012.
  70. #
  71. # Add AVX2 code path. Two consecutive input blocks are loaded to
  72. # 256-bit %ymm registers, with data from first block to least
  73. # significant 128-bit halves and data from second to most significant.
  74. # The data is then processed with same SIMD instruction sequence as
  75. # for AVX, but with %ymm as operands. Side effect is increased stack
  76. # frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
  77. # code size increase.
  78. #
  79. # March 2014.
  80. #
  81. # Add support for Intel SHA Extensions.
  82. ######################################################################
  83. # Current performance in cycles per processed byte (less is better):
  84. #
  85. #               SHA256  SSSE3        AVX/XOP(*)       SHA512  AVX/XOP(*)
  86. #
  87. # AMD K8        14.9    -            -                9.57    -
  88. # P4            17.3    -            -                30.8    -
  89. # Core 2        15.6    13.8(+13%)   -                9.97    -
  90. # Westmere      14.8    12.3(+19%)   -                9.58    -
  91. # Sandy Bridge  17.4    14.2(+23%)   11.6(+50%(**))   11.2    8.10(+38%(**))
  92. # Ivy Bridge    12.6    10.5(+20%)   10.3(+22%)       8.17    7.22(+13%)
  93. # Haswell       12.2    9.28(+31%)   7.80(+56%)       7.66    5.40(+42%)
  94. # Skylake       11.4    9.03(+26%)   7.70(+48%)       7.25    5.20(+40%)
  95. # Bulldozer     21.1    13.6(+54%)   13.6(+54%(***))  13.5    8.58(+57%)
  96. # Ryzen         11.0    9.02(+22%)   2.05(+440%)      7.05    5.67(+20%)
  97. # VIA Nano      23.0    16.5(+39%)   -                14.7    -
  98. # Atom          23.0    18.9(+22%)   -                14.7    -
  99. # Silvermont    27.4    20.6(+33%)   -                17.5    -
  100. # Knights L     27.4    21.0(+30%)   19.6(+40%)       17.5    12.8(+37%)
  101. # Goldmont      18.9    14.3(+32%)   4.16(+350%)      12.0    -
  102. #
  103. # (*) whichever best applicable, including SHAEXT;
  104. # (**) switch from ror to shrd stands for fair share of improvement;
  105. # (***) execution time is fully determined by remaining integer-only
  106. # part, body_00_15; reducing the amount of SIMD instructions
  107. # below certain limit makes no difference/sense; to conserve
  108. # space SHA256 XOP code path is therefore omitted;
  109. # $output is the last argument if it looks like a file (it has an extension)
  110. # $flavour is the first argument if it doesn't look like a file
  111. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  112. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  # Windows-style output is selected either by a nasm/masm/mingw64 flavour
  # or by an output file name ending in ".asm".
  113. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  # $dir is the directory part of this script's own path ($0). The translator
  # is looked for next to the script first, then under ../../perlasm/.
  114. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  115. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  116. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  117. die "can't locate x86_64-xlate.pl";
  # Assembler capability probe. Every comparison contributes 0 or 1, so $avx
  # ends up as 0, 1 or 2. Further down a non-zero $avx enables the XOP/AVX
  # dispatch tests and $avx>1 additionally enables the AVX2 dispatch test.
  # The probes are tried in order (GNU as, nasm, ml64, clang/LLVM); each of
  # the later ones is skipped once $avx is already non-zero.
  # NOTE(review): captured versions are compared as decimal numbers, so a
  # captured "2.9" would rank above 2.19 - confirm that no supported
  # toolchain reports such a version.
  118. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  119. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  120. $avx = ($1>=2.19) + ($1>=2.22);
  121. }
  122. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  123. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  124. $avx = ($1>=2.09) + ($1>=2.10);
  125. }
  126. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  127. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  128. $avx = ($1>=10) + ($1>=11);
  129. }
  130. if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
  131. $avx = ($2>=3.0) + ($2>3.0);
  132. }
  # $shaext gates the SHA-extension code path (emitted only when $SZ==4).
  # With $shaext cleared, $avx is capped at 1, which suppresses the $avx>1
  # (AVX2) dispatch further down.
  133. $shaext=1; ### set to zero if compiling for 1.0.1
  134. $avx=1 if (!$shaext && $avx);
  # Everything printed to STDOUT from here on is piped through the translator
  # script, which receives the flavour and the output file name as arguments.
  135. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
  136. or die "can't call $xlate: $!";
  137. *STDOUT=*OUT;
  # Select SHA-512 or SHA-256 parameters from the output file name: a name
  # containing "512" yields the 64-bit flavour, anything else the 32-bit one.
  # $SZ is the word size in bytes. @ROT lists the registers holding the eight
  # working variables a..h; $T1 and $a0..$a3 are scratch registers.
  # @Sigma0/@Sigma1 are used as rotate counts only, while @sigma0/@sigma1 hold
  # two rotate counts followed by a shift count (the third element is used
  # with shr in ROUND_16_XX).
  138. if ($output =~ /512/) {
  139. $func="sha512_block_data_order";
  140. $TABLE="K512";
  141. $SZ=8;
  142. @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
  143. "%r8", "%r9", "%r10","%r11");
  144. ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
  145. @Sigma0=(28,34,39);
  146. @Sigma1=(14,18,41);
  147. @sigma0=(1, 8, 7);
  148. @sigma1=(19,61, 6);
  149. $rounds=80;
  150. } else {
  151. $func="sha256_block_data_order";
  152. $TABLE="K256";
  153. $SZ=4;
  154. @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
  155. "%r8d","%r9d","%r10d","%r11d");
  156. ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
  157. @Sigma0=( 2,13,22);
  158. @Sigma1=( 6,11,25);
  159. @sigma0=( 7,18, 3);
  160. @sigma1=(17,19,10);
  161. $rounds=64;
  162. }
  # $ctx shares its register with $a3 in both flavours (%rdi / %edi), which is
  # why the generated code saves ctx on the stack and reloads it after the
  # rounds.
  163. $ctx="%rdi"; # 1st arg, zapped by $a3
  164. $inp="%rsi"; # 2nd arg
  165. $Tbl="%rbp";
  # Frame layout relative to the aligned %rsp: the first 16*$SZ bytes hold the
  # 16-entry message schedule X[] (ROUND_00_15 stores to $SZ*($i&0xf)(%rsp)),
  # followed by four 8-byte slots for ctx, inp, the end-of-input pointer and
  # the caller's %rsp. $framesz is the total of the two parts.
  166. $_ctx="16*$SZ+0*8(%rsp)";
  167. $_inp="16*$SZ+1*8(%rsp)";
  168. $_end="16*$SZ+2*8(%rsp)";
  169. $_rsp="`16*$SZ+3*8`(%rsp)";
  170. $framesz="16*$SZ+4*8";
  # ROUND_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h) - append one compression round to
  # $code.
  #   $i      round index; selects the X[] stack slot ($i&0xf) and the stride
  #   $a..$h  register names of the working variables in their current roles
  # On entry the generated code expects X[i] in $T1, a copy of e in $a0 and a
  # copy of a in $a1 (both callers in this file set these up). The round
  # stores $T1 into the X[] slot, accumulates T1 = h + Ch + K + Sigma1(e),
  # adds it to d and writes Maj + T1 into h.
  # h += Sigma0(a) is emitted here only while $i<15; otherwise it is left in
  # $a1 and added by the code that follows (the "modulo-scheduled" add at the
  # top of ROUND_16_XX or after the round loop).
  171. sub ROUND_00_15()
  172. { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  # Normally the table pointer advances by one word. After every
  # (16/$SZ)-th round another 16 bytes are added, which steps over the
  # duplicate copy of each 16-byte row in the K tables emitted below.
  173. my $STRIDE=$SZ;
  174. $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
  175. $code.=<<___;
  176. ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
  177. mov $f,$a2
  178. xor $e,$a0
  179. ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
  180. xor $g,$a2 # f^g
  181. mov $T1,`$SZ*($i&0xf)`(%rsp)
  182. xor $a,$a1
  183. and $e,$a2 # (f^g)&e
  184. ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
  185. add $h,$T1 # T1+=h
  186. xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
  187. ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
  188. xor $e,$a0
  189. add $a2,$T1 # T1+=Ch(e,f,g)
  190. mov $a,$a2
  191. add ($Tbl),$T1 # T1+=K[round]
  192. xor $a,$a1
  193. xor $b,$a2 # a^b, b^c in next round
  194. ror \$$Sigma1[0],$a0 # Sigma1(e)
  195. mov $b,$h
  196. and $a2,$a3
  197. ror \$$Sigma0[0],$a1 # Sigma0(a)
  198. add $a0,$T1 # T1+=Sigma1(e)
  199. xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
  200. add $T1,$d # d+=T1
  201. add $T1,$h # h+=T1
  202. lea $STRIDE($Tbl),$Tbl # round++
  203. ___
  # Last of the first sixteen rounds and all later rounds defer this add.
  204. $code.=<<___ if ($i<15);
  205. add $a1,$h # h+=Sigma0(a)
  206. ___
  # Swap the two scratch names so that the a^b left in $a2 by this round is
  # consumed as b^c (through $a3) by the next one.
  207. ($a2,$a3) = ($a3,$a2);
  208. }
  # ROUND_16_XX($i,$a,$b,$c,$d,$e,$f,$g,$h) - append one round with message
  # schedule expansion to $code; takes the same arguments as ROUND_00_15.
  # The generated code loads X[(i+1)&0xf] and X[(i+14)&0xf] from the stack,
  # computes sigma0 and sigma1 of them, and adds X[(i+9)&0xf] and X[i&0xf] so
  # that $T1 holds the new schedule word. In between it performs the
  # h+=Sigma0(a) add that the previous round left pending in $a1. It then
  # copies e to $a0 and a to $a1 and hands over to ROUND_00_15 with the
  # unchanged argument list for the round proper.
  209. sub ROUND_16_XX()
  210. { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  211. $code.=<<___;
  212. mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
  213. mov `$SZ*(($i+14)&0xf)`(%rsp),$a2
  214. mov $a0,$T1
  215. ror \$`$sigma0[1]-$sigma0[0]`,$a0
  216. add $a1,$a # modulo-scheduled h+=Sigma0(a)
  217. mov $a2,$a1
  218. ror \$`$sigma1[1]-$sigma1[0]`,$a2
  219. xor $T1,$a0
  220. shr \$$sigma0[2],$T1
  221. ror \$$sigma0[0],$a0
  222. xor $a1,$a2
  223. shr \$$sigma1[2],$a1
  224. ror \$$sigma1[0],$a2
  225. xor $a0,$T1 # sigma0(X[(i+1)&0xf])
  226. xor $a1,$a2 # sigma1(X[(i+14)&0xf])
  227. add `$SZ*(($i+9)&0xf)`(%rsp),$T1
  228. add `$SZ*($i&0xf)`(%rsp),$T1
  229. mov $e,$a0
  230. add $a2,$T1
  231. mov $a,$a1
  232. ___
  233. &ROUND_00_15(@_);
  234. }
  # Start of the generated output: text section, external capability vector,
  # and the global entry point named by $func.
  235. $code=<<___;
  236. .text
  237. .extern OPENSSL_ia32cap_P
  238. .globl $func
  239. .type $func,\@function,3
  240. .align 16
  241. $func:
  242. .cfi_startproc
  243. ___
  # Run-time dispatch. Three 32-bit words of OPENSSL_ia32cap_P are loaded into
  # %r9d/%r10d/%r11d; this is emitted for SHA256 always and for SHA512 only
  # when $avx is set.
  244. $code.=<<___ if ($SZ==4 || $avx);
  245. lea OPENSSL_ia32cap_P(%rip),%r11
  246. mov 0(%r11),%r9d
  247. mov 4(%r11),%r10d
  248. mov 8(%r11),%r11d
  249. ___
  # SHA256 with $shaext: branch to the SHA-extension routine.
  250. $code.=<<___ if ($SZ==4 && $shaext);
  251. test \$`1<<29`,%r11d # check for SHA
  252. jnz _shaext_shortcut
  253. ___
  # SHA512 with $avx: branch to the XOP routine.
  254. $code.=<<___ if ($avx && $SZ==8);
  255. test \$`1<<11`,%r10d # check for XOP
  256. jnz .Lxop_shortcut
  257. ___
  # $avx>1: branch to the AVX2 routine only if all three tested bits are set.
  258. $code.=<<___ if ($avx>1);
  259. and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
  260. cmp \$`1<<8|1<<5|1<<3`,%r11d
  261. je .Lavx2_shortcut
  262. ___
  # $avx: branch to the AVX routine only if the "Intel CPU", AVX and SSSE3
  # bits are all set.
  263. $code.=<<___ if ($avx);
  264. and \$`1<<30`,%r9d # mask "Intel CPU" bit
  265. and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
  266. or %r9d,%r10d
  267. cmp \$`1<<28|1<<9|1<<30`,%r10d
  268. je .Lavx_shortcut
  269. ___
  # SHA256 only: branch to the SSSE3 routine.
  270. $code.=<<___ if ($SZ==4);
  271. test \$`1<<9`,%r10d
  272. jnz .Lssse3_shortcut
  273. ___
  # Integer-only path. The prologue pushes six callee-saved registers, turns
  # the block count in %rdx into an end-of-input pointer, reserves and aligns
  # the frame, spills ctx/inp/end/original %rsp into it, and loads the eight
  # state words. .Lloop then starts one block: $a3 is primed with b^c and
  # $Tbl is pointed at the start of the constant table.
  274. $code.=<<___;
  275. mov %rsp,%rax # copy %rsp
  276. .cfi_def_cfa_register %rax
  277. push %rbx
  278. .cfi_push %rbx
  279. push %rbp
  280. .cfi_push %rbp
  281. push %r12
  282. .cfi_push %r12
  283. push %r13
  284. .cfi_push %r13
  285. push %r14
  286. .cfi_push %r14
  287. push %r15
  288. .cfi_push %r15
  289. shl \$4,%rdx # num*16
  290. sub \$$framesz,%rsp
  291. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  292. and \$-64,%rsp # align stack frame
  293. mov $ctx,$_ctx # save ctx, 1st arg
  294. mov $inp,$_inp # save inp, 2nd arh
  295. mov %rdx,$_end # save end pointer, "3rd" arg
  296. mov %rax,$_rsp # save copy of %rsp
  297. .cfi_cfa_expression $_rsp,deref,+8
  298. .Lprologue:
  299. mov $SZ*0($ctx),$A
  300. mov $SZ*1($ctx),$B
  301. mov $SZ*2($ctx),$C
  302. mov $SZ*3($ctx),$D
  303. mov $SZ*4($ctx),$E
  304. mov $SZ*5($ctx),$F
  305. mov $SZ*6($ctx),$G
  306. mov $SZ*7($ctx),$H
  307. jmp .Lloop
  308. .align 16
  309. .Lloop:
  310. mov $B,$a3
  311. lea $TABLE(%rip),$Tbl
  312. xor $C,$a3 # magic
  313. ___
  # Rounds 0..15, fully unrolled. Each round loads input word $i into $T1,
  # copies e and a into $a0/$a1 and byte-swaps $T1 before the round body.
  # @ROT is rotated by one position after every round, so the registers
  # change roles instead of their contents being moved.
  314. for($i=0;$i<16;$i++) {
  315. $code.=" mov $SZ*$i($inp),$T1\n";
  316. $code.=" mov @ROT[4],$a0\n";
  317. $code.=" mov @ROT[0],$a1\n";
  318. $code.=" bswap $T1\n";
  319. &ROUND_00_15($i,@ROT);
  320. unshift(@ROT,pop(@ROT));
  321. }
  322. $code.=<<___;
  323. jmp .Lrounds_16_xx
  324. .align 16
  325. .Lrounds_16_xx:
  326. ___
  # Sixteen rounds with schedule expansion are generated once ($i = 16..31)
  # and executed repeatedly at run time (see the cmpb/jnz below).
  327. for(;$i<32;$i++) {
  328. &ROUND_16_XX($i,@ROT);
  329. unshift(@ROT,pop(@ROT));
  330. }
  # Loop control and block epilogue. The 16-round group repeats while the
  # byte at offset $SZ-1 of the next table entry is non-zero; the mask rows
  # stored after the last round constant have a zero there, which ends the
  # loop. Afterwards ctx is reloaded from the frame (its register doubled as
  # $a3), the pending h+=Sigma0(a) is applied, the state is added to the
  # context and written back, and the outer loop continues while inp is
  # below the saved end pointer. Finally the callee-saved registers and
  # %rsp are restored from the saved copy of the original stack pointer.
  331. $code.=<<___;
  332. cmpb \$0,`$SZ-1`($Tbl)
  333. jnz .Lrounds_16_xx
  334. mov $_ctx,$ctx
  335. add $a1,$A # modulo-scheduled h+=Sigma0(a)
  336. lea 16*$SZ($inp),$inp
  337. add $SZ*0($ctx),$A
  338. add $SZ*1($ctx),$B
  339. add $SZ*2($ctx),$C
  340. add $SZ*3($ctx),$D
  341. add $SZ*4($ctx),$E
  342. add $SZ*5($ctx),$F
  343. add $SZ*6($ctx),$G
  344. add $SZ*7($ctx),$H
  345. cmp $_end,$inp
  346. mov $A,$SZ*0($ctx)
  347. mov $B,$SZ*1($ctx)
  348. mov $C,$SZ*2($ctx)
  349. mov $D,$SZ*3($ctx)
  350. mov $E,$SZ*4($ctx)
  351. mov $F,$SZ*5($ctx)
  352. mov $G,$SZ*6($ctx)
  353. mov $H,$SZ*7($ctx)
  354. jb .Lloop
  355. mov $_rsp,%rsi
  356. .cfi_def_cfa %rsi,8
  357. mov -48(%rsi),%r15
  358. .cfi_restore %r15
  359. mov -40(%rsi),%r14
  360. .cfi_restore %r14
  361. mov -32(%rsi),%r13
  362. .cfi_restore %r13
  363. mov -24(%rsi),%r12
  364. .cfi_restore %r12
  365. mov -16(%rsi),%rbp
  366. .cfi_restore %rbp
  367. mov -8(%rsi),%rbx
  368. .cfi_restore %rbx
  369. lea (%rsi),%rsp
  370. .cfi_def_cfa_register %rsp
  371. .Lepilogue:
  372. ret
  373. .cfi_endproc
  374. .size $func,.-$func
  375. ___
  # Constant tables, emitted into a 64-byte aligned read-only section.
  # K256: every row of four round constants is written twice in a row; the
  # extra 16-byte stride in ROUND_00_15 steps over the second copy. The six
  # .long rows after the last round constant are shuffle masks, not round
  # constants. The first mask pair starts at offset $SZ*2*$rounds, which is
  # where the SSSE3 path loads its pshufb mask and where the SHA-extension
  # path reads its "byte swap mask".
  376. if ($SZ==4) {
  377. $code.=<<___;
  378. .section .rodata align=64
  379. .align 64
  380. .type $TABLE,\@object
  381. $TABLE:
  382. .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
  383. .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
  384. .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
  385. .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
  386. .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
  387. .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
  388. .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
  389. .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
  390. .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
  391. .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
  392. .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
  393. .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
  394. .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
  395. .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
  396. .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
  397. .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
  398. .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
  399. .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
  400. .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
  401. .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
  402. .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
  403. .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
  404. .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
  405. .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
  406. .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
  407. .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
  408. .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
  409. .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
  410. .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  411. .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  412. .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  413. .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  414. .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
  415. .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
  416. .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
  417. .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
  418. .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
  419. .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
  420. .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  421. .previous
  422. ___
  # K512: same scheme with two 64-bit round constants per row, each row
  # written twice, followed by one duplicated mask row.
  423. } else {
  424. $code.=<<___;
  425. .section .rodata align=64
  426. .align 64
  427. .type $TABLE,\@object
  428. $TABLE:
  429. .quad 0x428a2f98d728ae22,0x7137449123ef65cd
  430. .quad 0x428a2f98d728ae22,0x7137449123ef65cd
  431. .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
  432. .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
  433. .quad 0x3956c25bf348b538,0x59f111f1b605d019
  434. .quad 0x3956c25bf348b538,0x59f111f1b605d019
  435. .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
  436. .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
  437. .quad 0xd807aa98a3030242,0x12835b0145706fbe
  438. .quad 0xd807aa98a3030242,0x12835b0145706fbe
  439. .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
  440. .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
  441. .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
  442. .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
  443. .quad 0x9bdc06a725c71235,0xc19bf174cf692694
  444. .quad 0x9bdc06a725c71235,0xc19bf174cf692694
  445. .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
  446. .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
  447. .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
  448. .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
  449. .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
  450. .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
  451. .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
  452. .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
  453. .quad 0x983e5152ee66dfab,0xa831c66d2db43210
  454. .quad 0x983e5152ee66dfab,0xa831c66d2db43210
  455. .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
  456. .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
  457. .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
  458. .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
  459. .quad 0x06ca6351e003826f,0x142929670a0e6e70
  460. .quad 0x06ca6351e003826f,0x142929670a0e6e70
  461. .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
  462. .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
  463. .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
  464. .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
  465. .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
  466. .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
  467. .quad 0x81c2c92e47edaee6,0x92722c851482353b
  468. .quad 0x81c2c92e47edaee6,0x92722c851482353b
  469. .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
  470. .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
  471. .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
  472. .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
  473. .quad 0xd192e819d6ef5218,0xd69906245565a910
  474. .quad 0xd192e819d6ef5218,0xd69906245565a910
  475. .quad 0xf40e35855771202a,0x106aa07032bbd1b8
  476. .quad 0xf40e35855771202a,0x106aa07032bbd1b8
  477. .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
  478. .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
  479. .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
  480. .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
  481. .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
  482. .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
  483. .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
  484. .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
  485. .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
  486. .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
  487. .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
  488. .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
  489. .quad 0x90befffa23631e28,0xa4506cebde82bde9
  490. .quad 0x90befffa23631e28,0xa4506cebde82bde9
  491. .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
  492. .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
  493. .quad 0xca273eceea26619c,0xd186b8c721c0c207
  494. .quad 0xca273eceea26619c,0xd186b8c721c0c207
  495. .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
  496. .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
  497. .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
  498. .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
  499. .quad 0x113f9804bef90dae,0x1b710b35131c471b
  500. .quad 0x113f9804bef90dae,0x1b710b35131c471b
  501. .quad 0x28db77f523047d84,0x32caab7b40c72493
  502. .quad 0x28db77f523047d84,0x32caab7b40c72493
  503. .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
  504. .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
  505. .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
  506. .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
  507. .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
  508. .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
  509. .quad 0x0001020304050607,0x08090a0b0c0d0e0f
  510. .quad 0x0001020304050607,0x08090a0b0c0d0e0f
  511. .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  512. .previous
  513. ___
  514. }
  515. ######################################################################
  516. # SIMD code paths
  517. #
  # Emitted for SHA256 only, and only when $shaext is set.
  518. if ($SZ==4 && $shaext) {{{
  519. ######################################################################
  520. # Intel SHA Extensions implementation of SHA256 update function.
  521. #
  # Register assignment local to this block: the arguments stay in
  # %rdi/%rsi/%rdx and $Tbl is %rcx here (not %rbp as in the integer path).
  # $Wi/$ABEF/$CDGH are %xmm0..%xmm2, the message words @MSG are
  # %xmm3..%xmm6, and $TMP/$BSWAP/$ABEF_SAVE/$CDGH_SAVE are %xmm7..%xmm10.
  522. my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
  523. my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
  524. my @MSG=map("%xmm$_",(3..6));
  525. $code.=<<___;
  526. .type sha256_block_data_order_shaext,\@function,3
  527. .align 64
  528. sha256_block_data_order_shaext:
  529. _shaext_shortcut:
  530. .cfi_startproc
  531. ___
  # Win64 only: reserve stack space and save %xmm6..%xmm10; they are restored
  # before the ret below.
  # NOTE(review): the saves are addressed through %rax, which this heredoc
  # never sets - presumably the translator's Win64 function prologue provides
  # it; confirm in x86_64-xlate.pl.
  532. $code.=<<___ if ($win64);
  533. lea `-8-5*16`(%rsp),%rsp
  534. movaps %xmm6,-8-5*16(%rax)
  535. movaps %xmm7,-8-4*16(%rax)
  536. movaps %xmm8,-8-3*16(%rax)
  537. movaps %xmm9,-8-2*16(%rax)
  538. movaps %xmm10,-8-1*16(%rax)
  539. .Lprologue_shaext:
  540. ___
  # Load and rearrange the state into the ABEF/CDGH register pair, then
  # process the first 16 rounds of a block. $Tbl points 0x80 bytes into K256,
  # so every table offset below is written relative to that bias; constants
  # are fetched at a 32-byte pitch because each 16-byte row is stored twice.
  541. $code.=<<___;
  542. lea K256+0x80(%rip),$Tbl
  543. movdqu ($ctx),$ABEF # DCBA
  544. movdqu 16($ctx),$CDGH # HGFE
  545. movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
  546. pshufd \$0x1b,$ABEF,$Wi # ABCD
  547. pshufd \$0xb1,$ABEF,$ABEF # CDAB
  548. pshufd \$0x1b,$CDGH,$CDGH # EFGH
  549. movdqa $TMP,$BSWAP # offload
  550. palignr \$8,$CDGH,$ABEF # ABEF
  551. punpcklqdq $Wi,$CDGH # CDGH
  552. jmp .Loop_shaext
  553. .align 16
  554. .Loop_shaext:
  555. movdqu ($inp),@MSG[0]
  556. movdqu 0x10($inp),@MSG[1]
  557. movdqu 0x20($inp),@MSG[2]
  558. pshufb $TMP,@MSG[0]
  559. movdqu 0x30($inp),@MSG[3]
  560. movdqa 0*32-0x80($Tbl),$Wi
  561. paddd @MSG[0],$Wi
  562. pshufb $TMP,@MSG[1]
  563. movdqa $CDGH,$CDGH_SAVE # offload
  564. sha256rnds2 $ABEF,$CDGH # 0-3
  565. pshufd \$0x0e,$Wi,$Wi
  566. nop
  567. movdqa $ABEF,$ABEF_SAVE # offload
  568. sha256rnds2 $CDGH,$ABEF
  569. movdqa 1*32-0x80($Tbl),$Wi
  570. paddd @MSG[1],$Wi
  571. pshufb $TMP,@MSG[2]
  572. sha256rnds2 $ABEF,$CDGH # 4-7
  573. pshufd \$0x0e,$Wi,$Wi
  574. lea 0x40($inp),$inp
  575. sha256msg1 @MSG[1],@MSG[0]
  576. sha256rnds2 $CDGH,$ABEF
  577. movdqa 2*32-0x80($Tbl),$Wi
  578. paddd @MSG[2],$Wi
  579. pshufb $TMP,@MSG[3]
  580. sha256rnds2 $ABEF,$CDGH # 8-11
  581. pshufd \$0x0e,$Wi,$Wi
  582. movdqa @MSG[3],$TMP
  583. palignr \$4,@MSG[2],$TMP
  584. nop
  585. paddd $TMP,@MSG[0]
  586. sha256msg1 @MSG[2],@MSG[1]
  587. sha256rnds2 $CDGH,$ABEF
  588. movdqa 3*32-0x80($Tbl),$Wi
  589. paddd @MSG[3],$Wi
  590. sha256msg2 @MSG[3],@MSG[0]
  591. sha256rnds2 $ABEF,$CDGH # 12-15
  592. pshufd \$0x0e,$Wi,$Wi
  593. movdqa @MSG[0],$TMP
  594. palignr \$4,@MSG[3],$TMP
  595. nop
  596. paddd $TMP,@MSG[1]
  597. sha256msg1 @MSG[3],@MSG[2]
  598. sha256rnds2 $CDGH,$ABEF
  599. ___
  # Rounds 16..51: nine iterations ($i = 4..12) of one 4-round pattern.
  # @MSG is rotated after each iteration so the four message registers take
  # turns in each role.
  600. for($i=4;$i<16-3;$i++) {
  601. $code.=<<___;
  602. movdqa $i*32-0x80($Tbl),$Wi
  603. paddd @MSG[0],$Wi
  604. sha256msg2 @MSG[0],@MSG[1]
  605. sha256rnds2 $ABEF,$CDGH # 16-19...
  606. pshufd \$0x0e,$Wi,$Wi
  607. movdqa @MSG[1],$TMP
  608. palignr \$4,@MSG[0],$TMP
  609. nop
  610. paddd $TMP,@MSG[2]
  611. sha256msg1 @MSG[0],@MSG[3]
  612. sha256rnds2 $CDGH,$ABEF
  613. ___
  614. push(@MSG,shift(@MSG));
  615. }
  # Rounds 52..63 are written out separately. Along the way $TMP gets the
  # byte swap mask back from $BSWAP for the next block, the saved state is
  # added back in, and $num is decremented (the jnz after the two paddd
  # presumably consumes the flags set by that dec). After the loop the state
  # is shuffled back into memory order and stored to ctx.
  616. $code.=<<___;
  617. movdqa 13*32-0x80($Tbl),$Wi
  618. paddd @MSG[0],$Wi
  619. sha256msg2 @MSG[0],@MSG[1]
  620. sha256rnds2 $ABEF,$CDGH # 52-55
  621. pshufd \$0x0e,$Wi,$Wi
  622. movdqa @MSG[1],$TMP
  623. palignr \$4,@MSG[0],$TMP
  624. sha256rnds2 $CDGH,$ABEF
  625. paddd $TMP,@MSG[2]
  626. movdqa 14*32-0x80($Tbl),$Wi
  627. paddd @MSG[1],$Wi
  628. sha256rnds2 $ABEF,$CDGH # 56-59
  629. pshufd \$0x0e,$Wi,$Wi
  630. sha256msg2 @MSG[1],@MSG[2]
  631. movdqa $BSWAP,$TMP
  632. sha256rnds2 $CDGH,$ABEF
  633. movdqa 15*32-0x80($Tbl),$Wi
  634. paddd @MSG[2],$Wi
  635. nop
  636. sha256rnds2 $ABEF,$CDGH # 60-63
  637. pshufd \$0x0e,$Wi,$Wi
  638. dec $num
  639. nop
  640. sha256rnds2 $CDGH,$ABEF
  641. paddd $CDGH_SAVE,$CDGH
  642. paddd $ABEF_SAVE,$ABEF
  643. jnz .Loop_shaext
  644. pshufd \$0xb1,$CDGH,$CDGH # DCHG
  645. pshufd \$0x1b,$ABEF,$TMP # FEBA
  646. pshufd \$0xb1,$ABEF,$ABEF # BAFE
  647. punpckhqdq $CDGH,$ABEF # DCBA
  648. palignr \$8,$TMP,$CDGH # HGFE
  649. movdqu $ABEF,($ctx)
  650. movdqu $CDGH,16($ctx)
  651. ___
  # Win64 only: restore %xmm6..%xmm10 and the stack pointer.
  652. $code.=<<___ if ($win64);
  653. movaps -8-5*16(%rax),%xmm6
  654. movaps -8-4*16(%rax),%xmm7
  655. movaps -8-3*16(%rax),%xmm8
  656. movaps -8-2*16(%rax),%xmm9
  657. movaps -8-1*16(%rax),%xmm10
  658. mov %rax,%rsp
  659. .Lepilogue_shaext:
  660. ___
  661. $code.=<<___;
  662. ret
  663. .cfi_endproc
  664. .size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
  665. ___
  666. }}}
  667. {{{
  # $a4 is a second name for the $T1 register; body_00_15 below uses it as
  # the scratch register for Ch(e,f,g). The lexicals $a..$h are filled from
  # @ROT by the first code string that body_00_15 returns.
  668. my $a4=$T1;
  669. my ($a,$b,$c,$d,$e,$f,$g,$h);
  # Catch-all for calls to undefined subs such as &ror(...) or &movdqa(...):
  # the sub name (package prefix stripped) becomes the opcode. The last
  # argument is popped and, if it is numeric (its numeric value stringifies
  # back to itself), prefixed with '$' to form an immediate. The operands
  # are then emitted last-argument-first, so a call written
  # destination-first yields a source-first assembly line appended to $code.
  670. sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
  671. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  672. my $arg = pop;
  673. $arg = "\$$arg" if ($arg*1 eq $arg);
  674. $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
  675. }
  # body_00_15() - return one round as a list of 26 strings of Perl code,
  # one emitted instruction per string, rather than appending it to $code.
  # The caller evals the strings one at a time (see SSSE3_256_00_47), which
  # lets it interleave the scalar round with SIMD instructions.
  # The first string also loads $a..$h from @ROT; the last one also swaps
  # $a2/$a3, rotates @ROT and increments $i, so evaluating all 26 strings
  # advances the generator state by exactly one round.
  # Unlike ROUND_00_15, the round adds a pre-computed X[i]+K[i] straight from
  # the stack slot $SZ*($i&15)(%rsp) and accumulates directly in $h.
  676. sub body_00_15 () {
  677. (
  678. '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
  679. '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
  680. '&mov ($a,$a1)',
  681. '&mov ($a4,$f)',
  682. '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
  683. '&xor ($a0,$e)',
  684. '&xor ($a4,$g)', # f^g
  685. '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
  686. '&xor ($a1,$a)',
  687. '&and ($a4,$e)', # (f^g)&e
  688. '&xor ($a0,$e)',
  689. '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
  690. '&mov ($a2,$a)',
  691. '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
  692. '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
  693. '&xor ($a2,$b)', # a^b, b^c in next round
  694. '&add ($h,$a4)', # h+=Ch(e,f,g)
  695. '&ror ($a0,$Sigma1[0])', # Sigma1(e)
  696. '&and ($a3,$a2)', # (b^c)&(a^b)
  697. '&xor ($a1,$a)',
  698. '&add ($h,$a0)', # h+=Sigma1(e)
  699. '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
  700. '&ror ($a1,$Sigma0[0])', # Sigma0(a)
  701. '&add ($d,$h)', # d+=h
  702. '&add ($h,$a3)', # h+=Maj(a,b,c)
  703. '&mov ($a0,$d)',
  704. '&add ($a1,$h);'. # h+=Sigma0(a)
  705. '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
  706. );
  707. }
  708. ######################################################################
  709. # SSSE3 code path
  710. #
  711. if ($SZ==4) { # SHA256 only
  # @X are the four xmm registers (%xmm0..%xmm3) that receive the 64-byte
  # input block; $t0..$t5 (%xmm4..%xmm9) are temporaries.
  712. my @X = map("%xmm$_",(0..3));
  713. my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
  # Entry point of the SSSE3 routine. The prologue is the same as in the
  # integer-only path except that the frame grows by 16*4 bytes on Win64.
  714. $code.=<<___;
  715. .type ${func}_ssse3,\@function,3
  716. .align 64
  717. ${func}_ssse3:
  718. .cfi_startproc
  719. .Lssse3_shortcut:
  720. mov %rsp,%rax # copy %rsp
  721. .cfi_def_cfa_register %rax
  722. push %rbx
  723. .cfi_push %rbx
  724. push %rbp
  725. .cfi_push %rbp
  726. push %r12
  727. .cfi_push %r12
  728. push %r13
  729. .cfi_push %r13
  730. push %r14
  731. .cfi_push %r14
  732. push %r15
  733. .cfi_push %r15
  734. shl \$4,%rdx # num*16
  735. sub \$`$framesz+$win64*16*4`,%rsp
  736. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  737. and \$-64,%rsp # align stack frame
  738. mov $ctx,$_ctx # save ctx, 1st arg
  739. mov $inp,$_inp # save inp, 2nd arh
  740. mov %rdx,$_end # save end pointer, "3rd" arg
  741. mov %rax,$_rsp # save copy of %rsp
  742. .cfi_cfa_expression $_rsp,deref,+8
  743. ___
  # Win64 only: save %xmm6..%xmm9 in the extra frame space, directly above
  # the four 8-byte save slots.
  744. $code.=<<___ if ($win64);
  745. movaps %xmm6,16*$SZ+32(%rsp)
  746. movaps %xmm7,16*$SZ+48(%rsp)
  747. movaps %xmm8,16*$SZ+64(%rsp)
  748. movaps %xmm9,16*$SZ+80(%rsp)
  749. ___
  # Load the eight state words into the working registers.
  750. $code.=<<___;
  751. .Lprologue_ssse3:
  752. mov $SZ*0($ctx),$A
  753. mov $SZ*1($ctx),$B
  754. mov $SZ*2($ctx),$C
  755. mov $SZ*3($ctx),$D
  756. mov $SZ*4($ctx),$E
  757. mov $SZ*5($ctx),$F
  758. mov $SZ*6($ctx),$G
  759. mov $SZ*7($ctx),$H
  760. ___
  # Per-block setup. The two loads of $t4/$t5 at the top are written with a
  # leading '#' in the generated text. The block is loaded into @X and
  # shuffled with the mask stored after the round constants; the first four
  # constant rows are added and the sums stored to the first 64 bytes of the
  # frame; $a1, $a3 and $a0 are primed the way the round body expects.
  # .Lssse3_00_47 then advances $Tbl by 16*2*$SZ bytes.
  761. $code.=<<___;
  762. #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
  763. #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
  764. jmp .Lloop_ssse3
  765. .align 16
  766. .Lloop_ssse3:
  767. movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  768. movdqu 0x00($inp),@X[0]
  769. movdqu 0x10($inp),@X[1]
  770. movdqu 0x20($inp),@X[2]
  771. pshufb $t3,@X[0]
  772. movdqu 0x30($inp),@X[3]
  773. lea $TABLE(%rip),$Tbl
  774. pshufb $t3,@X[1]
  775. movdqa 0x00($Tbl),$t0
  776. movdqa 0x20($Tbl),$t1
  777. pshufb $t3,@X[2]
  778. paddd @X[0],$t0
  779. movdqa 0x40($Tbl),$t2
  780. pshufb $t3,@X[3]
  781. movdqa 0x60($Tbl),$t3
  782. paddd @X[1],$t1
  783. paddd @X[2],$t2
  784. paddd @X[3],$t3
  785. movdqa $t0,0x00(%rsp)
  786. mov $A,$a1
  787. movdqa $t1,0x10(%rsp)
  788. mov $B,$a3
  789. movdqa $t2,0x20(%rsp)
  790. xor $C,$a3 # magic
  791. movdqa $t3,0x30(%rsp)
  792. mov $E,$a0
  793. jmp .Lssse3_00_47
  794. .align 16
  795. .Lssse3_00_47:
  796. sub \$`-16*2*$SZ`,$Tbl # size optimization
  797. ___
  # Xupdate_256_SSSE3() - return the SSSE3 message-schedule update for four
  # words as a list of Perl code strings, in the same eval-one-at-a-time
  # style as body_00_15. Where two strings are joined with '.', a single eval
  # emits two instructions.
  # The strings refer to @X, $t0..$t5, $Tbl and $j by name, so they have to
  # be evaluated where those are in scope. The only use visible here is the
  # disabled "if (0)" branch of SSSE3_256_00_47.
  # NOTE(review): the pshufb steps use $t4 and $t5 as masks, but the loads of
  # those registers are commented out in the generated prologue - confirm
  # before enabling this variant.
  798. sub Xupdate_256_SSSE3 () {
  799. (
  800. '&movdqa ($t0,@X[1]);',
  801. '&movdqa ($t3,@X[3])',
  802. '&palignr ($t0,@X[0],$SZ)', # X[1..4]
  803. '&palignr ($t3,@X[2],$SZ);', # X[9..12]
  804. '&movdqa ($t1,$t0)',
  805. '&movdqa ($t2,$t0);',
  806. '&psrld ($t0,$sigma0[2])',
  807. '&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
  808. '&psrld ($t2,$sigma0[0])',
  809. '&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
  810. '&pslld ($t1,8*$SZ-$sigma0[1]);'.
  811. '&pxor ($t0,$t2)',
  812. '&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
  813. '&pxor ($t0,$t1)',
  814. '&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
  815. '&pxor ($t0,$t2);',
  816. '&movdqa ($t2,$t3)',
  817. '&pxor ($t0,$t1);', # sigma0(X[1..4])
  818. '&psrld ($t3,$sigma1[2])',
  819. '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
  820. '&psrlq ($t2,$sigma1[0])',
  821. '&pxor ($t3,$t2);',
  822. '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
  823. '&pxor ($t3,$t2)',
  824. '&pshufb ($t3,$t4)', # sigma1(X[14..15])
  825. '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
  826. '&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
  827. '&movdqa ($t2,$t3);',
  828. '&psrld ($t3,$sigma1[2])',
  829. '&psrlq ($t2,$sigma1[0])',
  830. '&pxor ($t3,$t2);',
  831. '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
  832. '&pxor ($t3,$t2);',
  833. '&movdqa ($t2,16*2*$j."($Tbl)")',
  834. '&pshufb ($t3,$t5)',
  835. '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
  836. );
  837. }
  # SSSE3_256_00_47($j, $body, @X)
  #
  # Emits one step of the .Lssse3_00_47 loop (SHA-256 flavour, going by the
  # name and the 32-bit psrld/pslld/paddd operations): four copies of the
  # instruction list returned by $body, interleaved with the SIMD message
  # schedule update that rewrites @X[0] in place.
  #
  #   $j    - step index (the caller loops 0..3); selects the table row at
  #           16*2*$j($Tbl) and the stack slot at 16*$j(%rsp)
  #   $body - code ref returning a list of instruction strings to eval
  #   @X    - vector registers holding the schedule; the caller rotates the
  #           list so that the register to update is always @X[0]
  #
  # The order of statements below IS the instruction schedule, and the
  # eval'd strings resolve $j, @X and $t0..$t5 by name in this scope, so
  # nothing here should be reordered or renamed casually.
  # NOTE(review): the empty prototype is bypassed because the sub is called
  # with a leading '&'; the meaning of the '#@' tags is not evident from
  # this section.
  838. sub SSSE3_256_00_47 () {
  839. my $j = shift;
  840. my $body = shift;
  841. my @X = @_;
  842. my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
  843. if (0) { # disabled variant: generic Xupdate list, 3 round insns after each
  844. foreach (Xupdate_256_SSSE3()) { # 36 instructions
  845. eval;
  846. eval(shift(@insns));
  847. eval(shift(@insns));
  848. eval(shift(@insns));
  849. }
  850. } else { # squeeze extra 4% on Westmere and 19% on Atom
  851. eval(shift(@insns)); #@
  852. &movdqa ($t0,@X[1]);
  853. eval(shift(@insns));
  854. eval(shift(@insns));
  855. &movdqa ($t3,@X[3]);
  856. eval(shift(@insns)); #@
  857. eval(shift(@insns));
  858. eval(shift(@insns));
  859. eval(shift(@insns)); #@
  860. eval(shift(@insns));
  861. &palignr ($t0,@X[0],$SZ); # X[1..4]
  862. eval(shift(@insns));
  863. eval(shift(@insns));
  864. &palignr ($t3,@X[2],$SZ); # X[9..12]
  865. eval(shift(@insns));
  866. eval(shift(@insns));
  867. eval(shift(@insns));
  868. eval(shift(@insns)); #@
  869. &movdqa ($t1,$t0);
  870. eval(shift(@insns));
  871. eval(shift(@insns));
  872. &movdqa ($t2,$t0);
  873. eval(shift(@insns)); #@
  874. eval(shift(@insns));
  875. &psrld ($t0,$sigma0[2]);
  876. eval(shift(@insns));
  877. eval(shift(@insns));
  878. eval(shift(@insns));
  879. &paddd (@X[0],$t3); # X[0..3] += X[9..12]
  880. eval(shift(@insns)); #@
  881. eval(shift(@insns));
  882. &psrld ($t2,$sigma0[0]);
  883. eval(shift(@insns));
  884. eval(shift(@insns));
  885. &pshufd ($t3,@X[3],0b11111010); # X[14..15]
  886. eval(shift(@insns));
  887. eval(shift(@insns)); #@
  888. &pslld ($t1,8*$SZ-$sigma0[1]);
  889. eval(shift(@insns));
  890. eval(shift(@insns));
  891. &pxor ($t0,$t2);
  892. eval(shift(@insns)); #@
  893. eval(shift(@insns));
  894. eval(shift(@insns));
  895. eval(shift(@insns)); #@
  896. &psrld ($t2,$sigma0[1]-$sigma0[0]);
  897. eval(shift(@insns));
  898. &pxor ($t0,$t1);
  899. eval(shift(@insns));
  900. eval(shift(@insns));
  901. &pslld ($t1,$sigma0[1]-$sigma0[0]);
  902. eval(shift(@insns));
  903. eval(shift(@insns));
  904. &pxor ($t0,$t2);
  905. eval(shift(@insns));
  906. eval(shift(@insns)); #@
  907. &movdqa ($t2,$t3);
  908. eval(shift(@insns));
  909. eval(shift(@insns));
  910. &pxor ($t0,$t1); # sigma0(X[1..4])
  911. eval(shift(@insns)); #@
  912. eval(shift(@insns));
  913. eval(shift(@insns));
  914. &psrld ($t3,$sigma1[2]);
  915. eval(shift(@insns));
  916. eval(shift(@insns));
  917. &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
  918. eval(shift(@insns)); #@
  919. eval(shift(@insns));
  920. &psrlq ($t2,$sigma1[0]);
  921. eval(shift(@insns));
  922. eval(shift(@insns));
  923. eval(shift(@insns));
  924. &pxor ($t3,$t2);
  925. eval(shift(@insns)); #@
  926. eval(shift(@insns));
  927. eval(shift(@insns));
  928. eval(shift(@insns)); #@
  929. &psrlq ($t2,$sigma1[1]-$sigma1[0]);
  930. eval(shift(@insns));
  931. eval(shift(@insns));
  932. &pxor ($t3,$t2);
  933. eval(shift(@insns)); #@
  934. eval(shift(@insns));
  935. eval(shift(@insns));
  # pshufd followed by psrldq below stands in for the pshufb via $t4 that
  # the generic Xupdate list uses (kept here commented out for reference)
  936. #&pshufb ($t3,$t4); # sigma1(X[14..15])
  937. &pshufd ($t3,$t3,0b10000000);
  938. eval(shift(@insns));
  939. eval(shift(@insns));
  940. eval(shift(@insns));
  941. &psrldq ($t3,8);
  942. eval(shift(@insns));
  943. eval(shift(@insns)); #@
  944. eval(shift(@insns));
  945. eval(shift(@insns));
  946. eval(shift(@insns)); #@
  947. &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
  948. eval(shift(@insns));
  949. eval(shift(@insns));
  950. eval(shift(@insns));
  951. &pshufd ($t3,@X[0],0b01010000); # X[16..17]
  952. eval(shift(@insns));
  953. eval(shift(@insns)); #@
  954. eval(shift(@insns));
  955. &movdqa ($t2,$t3);
  956. eval(shift(@insns));
  957. eval(shift(@insns));
  958. &psrld ($t3,$sigma1[2]);
  959. eval(shift(@insns));
  960. eval(shift(@insns)); #@
  961. &psrlq ($t2,$sigma1[0]);
  962. eval(shift(@insns));
  963. eval(shift(@insns));
  964. &pxor ($t3,$t2);
  965. eval(shift(@insns)); #@
  966. eval(shift(@insns));
  967. eval(shift(@insns));
  968. eval(shift(@insns)); #@
  969. eval(shift(@insns));
  970. &psrlq ($t2,$sigma1[1]-$sigma1[0]);
  971. eval(shift(@insns));
  972. eval(shift(@insns));
  973. eval(shift(@insns));
  974. &pxor ($t3,$t2);
  975. eval(shift(@insns));
  976. eval(shift(@insns));
  977. eval(shift(@insns)); #@
  # as above: pshufd plus the later pslldq replaces the pshufb via $t5
  978. #&pshufb ($t3,$t5);
  979. &pshufd ($t3,$t3,0b00001000);
  980. eval(shift(@insns));
  981. eval(shift(@insns));
  982. &movdqa ($t2,16*2*$j."($Tbl)"); # fetch table row $j; $t2 is free from here on
  983. eval(shift(@insns)); #@
  984. eval(shift(@insns));
  985. &pslldq ($t3,8);
  986. eval(shift(@insns));
  987. eval(shift(@insns));
  988. eval(shift(@insns));
  989. &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
  990. eval(shift(@insns)); #@
  991. eval(shift(@insns));
  992. eval(shift(@insns));
  993. }
  994. &paddd ($t2,@X[0]); # table row (loaded into $t2 above) + updated X[0..3]
  995. foreach (@insns) { eval; } # remaining instructions
  996. &movdqa (16*$j."(%rsp)",$t2); # park the sum in stack slot $j
  997. }
  # Emit the body of the .Lssse3_00_47 loop: four schedule-update steps.
  # @X is rotated after each step so the register to update is always @X[0].
  998. for ($i=0,$j=0; $j<4; $j++) {
  999. &SSSE3_256_00_47($j,\&body_00_15,@X);
  1000. push(@X,shift(@X)); # rotate(@X)
  1001. }
  # Repeat the loop while the byte at $SZ-1+16*2*$SZ($Tbl) is non-zero.
  # NOTE(review): presumably a terminator placed after the last table row -
  # confirm against the $TABLE definition.
  1002. &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
  1003. &jne (".Lssse3_00_47");
  # Emit 16 more rounds with no schedule update.  The for statement has no
  # increment clause, so $i has to be advanced by the eval'd strings.
  # NOTE(review): body_00_15 is defined outside this section - confirm it
  # increments $i.
  1004. for ($i=0; $i<16; ) {
  1005. foreach(body_00_15()) { eval; }
  1006. }
  # End of one input block: add the working variables to the hash state in
  # the context and store them back, advance $inp by 16*$SZ bytes, and loop
  # to .Lloop_ssse3 while $inp is below $_end.  On exit the value kept in
  # $_rsp is reloaded into %rsi and becomes the CFA base.
  1007. $code.=<<___;
  1008. mov $_ctx,$ctx
  1009. mov $a1,$A
  1010. add $SZ*0($ctx),$A
  1011. lea 16*$SZ($inp),$inp
  1012. add $SZ*1($ctx),$B
  1013. add $SZ*2($ctx),$C
  1014. add $SZ*3($ctx),$D
  1015. add $SZ*4($ctx),$E
  1016. add $SZ*5($ctx),$F
  1017. add $SZ*6($ctx),$G
  1018. add $SZ*7($ctx),$H
  1019. cmp $_end,$inp
  1020. mov $A,$SZ*0($ctx)
  1021. mov $B,$SZ*1($ctx)
  1022. mov $C,$SZ*2($ctx)
  1023. mov $D,$SZ*3($ctx)
  1024. mov $E,$SZ*4($ctx)
  1025. mov $F,$SZ*5($ctx)
  1026. mov $G,$SZ*6($ctx)
  1027. mov $H,$SZ*7($ctx)
  1028. jb .Lloop_ssse3
  1029. mov $_rsp,%rsi
  1030. .cfi_def_cfa %rsi,8
  1031. ___
  # Win64 only: reload %xmm6-%xmm9 from their frame slots.
  1032. $code.=<<___ if ($win64);
  1033. movaps 16*$SZ+32(%rsp),%xmm6
  1034. movaps 16*$SZ+48(%rsp),%xmm7
  1035. movaps 16*$SZ+64(%rsp),%xmm8
  1036. movaps 16*$SZ+80(%rsp),%xmm9
  1037. ___
  # Reload the six general-purpose registers from below the address in
  # %rsi, restore %rsp from %rsi, and return.
  1038. $code.=<<___;
  1039. mov -48(%rsi),%r15
  1040. .cfi_restore %r15
  1041. mov -40(%rsi),%r14
  1042. .cfi_restore %r14
  1043. mov -32(%rsi),%r13
  1044. .cfi_restore %r13
  1045. mov -24(%rsi),%r12
  1046. .cfi_restore %r12
  1047. mov -16(%rsi),%rbp
  1048. .cfi_restore %rbp
  1049. mov -8(%rsi),%rbx
  1050. .cfi_restore %rbx
  1051. lea (%rsi),%rsp
  1052. .cfi_def_cfa_register %rsp
  1053. .Lepilogue_ssse3:
  1054. ret
  1055. .cfi_endproc
  1056. .size ${func}_ssse3,.-${func}_ssse3
  1057. ___
  1058. }
  1059. if ($avx) {{
  1060. ######################################################################
  1061. # XOP code path
  1062. #
  1063. if ($SZ==8) { # SHA512 only
  # Function entry for ${func}_xop: keep a copy of the incoming %rsp in
  # %rax, push the six callee-saved general-purpose registers, turn the
  # block count in %rdx into an end pointer, reserve and 64-byte-align the
  # frame, and stash ctx, inp, the end pointer and the %rsp copy in it.
  1064. $code.=<<___;
  1065. .type ${func}_xop,\@function,3
  1066. .align 64
  1067. ${func}_xop:
  1068. .cfi_startproc
  1069. .Lxop_shortcut:
  1070. mov %rsp,%rax # copy %rsp
  1071. .cfi_def_cfa_register %rax
  1072. push %rbx
  1073. .cfi_push %rbx
  1074. push %rbp
  1075. .cfi_push %rbp
  1076. push %r12
  1077. .cfi_push %r12
  1078. push %r13
  1079. .cfi_push %r13
  1080. push %r14
  1081. .cfi_push %r14
  1082. push %r15
  1083. .cfi_push %r15
  1084. shl \$4,%rdx # num*16
  1085. sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
  1086. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  1087. and \$-64,%rsp # align stack frame
  1088. mov $ctx,$_ctx # save ctx, 1st arg
  1089. mov $inp,$_inp # save inp, 2nd arh
  1090. mov %rdx,$_end # save end pointer, "3rd" arg
  1091. mov %rax,$_rsp # save copy of %rsp
  1092. .cfi_cfa_expression $_rsp,deref,+8
  1093. ___
  # Win64 only: spill %xmm6-%xmm9 to the frame ...
  1094. $code.=<<___ if ($win64);
  1095. movaps %xmm6,16*$SZ+32(%rsp)
  1096. movaps %xmm7,16*$SZ+48(%rsp)
  1097. movaps %xmm8,16*$SZ+64(%rsp)
  1098. movaps %xmm9,16*$SZ+80(%rsp)
  1099. ___
  # ... plus %xmm10/%xmm11 when $SZ>4.
  1100. $code.=<<___ if ($win64 && $SZ>4);
  1101. movaps %xmm10,16*$SZ+96(%rsp)
  1102. movaps %xmm11,16*$SZ+112(%rsp)
  1103. ___
  # Load the eight state words from the context and enter the block loop.
  1104. $code.=<<___;
  1105. .Lprologue_xop:
  1106. vzeroupper
  1107. mov $SZ*0($ctx),$A
  1108. mov $SZ*1($ctx),$B
  1109. mov $SZ*2($ctx),$C
  1110. mov $SZ*3($ctx),$D
  1111. mov $SZ*4($ctx),$E
  1112. mov $SZ*5($ctx),$F
  1113. mov $SZ*6($ctx),$G
  1114. mov $SZ*7($ctx),$H
  1115. jmp .Lloop_xop
  1116. ___
  1117. if ($SZ==4) { # SHA256
  # NOTE(review): this branch is nested inside the enclosing
  # "if ($SZ==8) { # SHA512 only", so as the file stands it is never taken.
  1118. my @X = map("%xmm$_",(0..3));
  1119. my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
  # Per-block setup: load 64 input bytes into @X[0..3], shuffle each with
  # the mask stored at $TABLE+$SZ*2*$rounds (presumably a byte-order swap -
  # confirm against $TABLE), add the first four table rows (rows are
  # addressed at a 32-byte stride) and park the sums in the first four
  # 16-byte stack slots.  .Lxop_00_47 then advances $Tbl by 16*2*$SZ bytes
  # per pass, written as "sub" of the negated value.
  1120. $code.=<<___;
  1121. .align 16
  1122. .Lloop_xop:
  1123. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  1124. vmovdqu 0x00($inp),@X[0]
  1125. vmovdqu 0x10($inp),@X[1]
  1126. vmovdqu 0x20($inp),@X[2]
  1127. vmovdqu 0x30($inp),@X[3]
  1128. vpshufb $t3,@X[0],@X[0]
  1129. lea $TABLE(%rip),$Tbl
  1130. vpshufb $t3,@X[1],@X[1]
  1131. vpshufb $t3,@X[2],@X[2]
  1132. vpaddd 0x00($Tbl),@X[0],$t0
  1133. vpshufb $t3,@X[3],@X[3]
  1134. vpaddd 0x20($Tbl),@X[1],$t1
  1135. vpaddd 0x40($Tbl),@X[2],$t2
  1136. vpaddd 0x60($Tbl),@X[3],$t3
  1137. vmovdqa $t0,0x00(%rsp)
  1138. mov $A,$a1
  1139. vmovdqa $t1,0x10(%rsp)
  1140. mov $B,$a3
  1141. vmovdqa $t2,0x20(%rsp)
  1142. xor $C,$a3 # magic
  1143. vmovdqa $t3,0x30(%rsp)
  1144. mov $E,$a0
  1145. jmp .Lxop_00_47
  1146. .align 16
  1147. .Lxop_00_47:
  1148. sub \$`-16*2*$SZ`,$Tbl # size optimization
  1149. ___
  # XOP_256_00_47($j, $body, @X)
  #
  # Emits one step of the .Lxop_00_47 loop for the 32-bit-word flavour: four
  # copies of the instruction list returned by $body, interleaved with an
  # XOP-based message schedule update of @X[0].  The rotations in
  # sigma0/sigma1 are done with vprotd (packed rotate).
  #
  #   $j    - step index (the caller loops 0..3); selects the table row at
  #           16*2*$j($Tbl) and the stack slot at 16*$j(%rsp)
  #   $body - code ref returning a list of instruction strings to eval
  #   @X    - schedule registers, rotated by the caller so that the register
  #           to update is @X[0]
  #
  # The statement order is the instruction schedule, and the eval'd strings
  # resolve names in this scope; do not reorder or rename casually.
  1150. sub XOP_256_00_47 () {
  1151. my $j = shift;
  1152. my $body = shift;
  1153. my @X = @_;
  1154. my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
  1155. &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
  1156. eval(shift(@insns));
  1157. eval(shift(@insns));
  1158. &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
  1159. eval(shift(@insns));
  1160. eval(shift(@insns));
  1161. &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
  1162. eval(shift(@insns));
  1163. eval(shift(@insns));
  1164. &vpsrld ($t0,$t0,$sigma0[2]);
  1165. eval(shift(@insns));
  1166. eval(shift(@insns));
  1167. &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
  1168. eval(shift(@insns));
  1169. eval(shift(@insns));
  1170. eval(shift(@insns));
  1171. eval(shift(@insns));
  1172. &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
  1173. eval(shift(@insns));
  1174. eval(shift(@insns));
  1175. &vpxor ($t0,$t0,$t1);
  1176. eval(shift(@insns));
  1177. eval(shift(@insns));
  1178. eval(shift(@insns));
  1179. eval(shift(@insns));
  1180. &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
  1181. eval(shift(@insns));
  1182. eval(shift(@insns));
  1183. &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
  1184. eval(shift(@insns));
  1185. eval(shift(@insns));
  1186. &vpsrld ($t2,@X[3],$sigma1[2]);
  1187. eval(shift(@insns));
  1188. eval(shift(@insns));
  1189. &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
  1190. eval(shift(@insns));
  1191. eval(shift(@insns));
  1192. &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
  1193. eval(shift(@insns));
  1194. eval(shift(@insns));
  1195. &vpxor ($t3,$t3,$t2);
  1196. eval(shift(@insns));
  1197. eval(shift(@insns));
  1198. eval(shift(@insns));
  1199. eval(shift(@insns));
  1200. &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
  1201. eval(shift(@insns));
  1202. eval(shift(@insns));
  1203. eval(shift(@insns));
  1204. eval(shift(@insns));
  1205. &vpsrldq ($t3,$t3,8); # move the upper two lanes down, clearing the upper half
  1206. eval(shift(@insns));
  1207. eval(shift(@insns));
  1208. eval(shift(@insns));
  1209. eval(shift(@insns));
  1210. &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
  1211. eval(shift(@insns));
  1212. eval(shift(@insns));
  1213. eval(shift(@insns));
  1214. eval(shift(@insns));
  1215. &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
  1216. eval(shift(@insns));
  1217. eval(shift(@insns));
  1218. &vpsrld ($t2,@X[0],$sigma1[2]);
  1219. eval(shift(@insns));
  1220. eval(shift(@insns));
  1221. &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
  1222. eval(shift(@insns));
  1223. eval(shift(@insns));
  1224. &vpxor ($t3,$t3,$t2);
  1225. eval(shift(@insns));
  1226. eval(shift(@insns));
  1227. eval(shift(@insns));
  1228. eval(shift(@insns));
  1229. &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
  1230. eval(shift(@insns));
  1231. eval(shift(@insns));
  1232. eval(shift(@insns));
  1233. eval(shift(@insns));
  1234. &vpslldq ($t3,$t3,8); # 22 instructions
  1235. eval(shift(@insns));
  1236. eval(shift(@insns));
  1237. eval(shift(@insns));
  1238. eval(shift(@insns));
  1239. &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
  1240. eval(shift(@insns));
  1241. eval(shift(@insns));
  1242. eval(shift(@insns));
  1243. eval(shift(@insns));
  1244. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); # updated X[0..3] + table row $j
  1245. foreach (@insns) { eval; } # remaining instructions
  1246. &vmovdqa (16*$j."(%rsp)",$t2); # park the sum in stack slot $j
  1247. }
  # Emit the body of the .Lxop_00_47 loop: four schedule-update steps, with
  # @X rotated after each so the register to update is always @X[0].
  1248. for ($i=0,$j=0; $j<4; $j++) {
  1249. &XOP_256_00_47($j,\&body_00_15,@X);
  1250. push(@X,shift(@X)); # rotate(@X)
  1251. }
  # Repeat while the byte at $SZ-1+16*2*$SZ($Tbl) is non-zero.
  # NOTE(review): presumably a terminator after the last table row - confirm.
  1252. &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
  1253. &jne (".Lxop_00_47");
  # 16 more rounds without schedule update; no increment clause here, so $i
  # must be advanced by the eval'd body_00_15 strings (defined elsewhere).
  1254. for ($i=0; $i<16; ) {
  1255. foreach(body_00_15()) { eval; }
  1256. }
  1257. } else { # SHA512
  # Eight schedule registers and four temporaries.
  1258. my @X = map("%xmm$_",(0..7));
  1259. my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
  # Per-block setup: load 128 input bytes into @X[0..7], shuffle each with
  # the mask stored at $TABLE+$SZ*2*$rounds (presumably a byte-order swap -
  # confirm against $TABLE), add the first eight table rows and park the
  # sums in stack slots 0x00..0x70.  $Tbl is biased by +0x80, so the rows
  # are addressed as -0x80..0x60 at a 32-byte stride.  .Lxop_00_47 then
  # advances $Tbl by 16*2*$SZ bytes per pass.
  1260. $code.=<<___;
  1261. .align 16
  1262. .Lloop_xop:
  1263. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  1264. vmovdqu 0x00($inp),@X[0]
  1265. lea $TABLE+0x80(%rip),$Tbl # size optimization
  1266. vmovdqu 0x10($inp),@X[1]
  1267. vmovdqu 0x20($inp),@X[2]
  1268. vpshufb $t3,@X[0],@X[0]
  1269. vmovdqu 0x30($inp),@X[3]
  1270. vpshufb $t3,@X[1],@X[1]
  1271. vmovdqu 0x40($inp),@X[4]
  1272. vpshufb $t3,@X[2],@X[2]
  1273. vmovdqu 0x50($inp),@X[5]
  1274. vpshufb $t3,@X[3],@X[3]
  1275. vmovdqu 0x60($inp),@X[6]
  1276. vpshufb $t3,@X[4],@X[4]
  1277. vmovdqu 0x70($inp),@X[7]
  1278. vpshufb $t3,@X[5],@X[5]
  1279. vpaddq -0x80($Tbl),@X[0],$t0
  1280. vpshufb $t3,@X[6],@X[6]
  1281. vpaddq -0x60($Tbl),@X[1],$t1
  1282. vpshufb $t3,@X[7],@X[7]
  1283. vpaddq -0x40($Tbl),@X[2],$t2
  1284. vpaddq -0x20($Tbl),@X[3],$t3
  1285. vmovdqa $t0,0x00(%rsp)
  1286. vpaddq 0x00($Tbl),@X[4],$t0
  1287. vmovdqa $t1,0x10(%rsp)
  1288. vpaddq 0x20($Tbl),@X[5],$t1
  1289. vmovdqa $t2,0x20(%rsp)
  1290. vpaddq 0x40($Tbl),@X[6],$t2
  1291. vmovdqa $t3,0x30(%rsp)
  1292. vpaddq 0x60($Tbl),@X[7],$t3
  1293. vmovdqa $t0,0x40(%rsp)
  1294. mov $A,$a1
  1295. vmovdqa $t1,0x50(%rsp)
  1296. mov $B,$a3
  1297. vmovdqa $t2,0x60(%rsp)
  1298. xor $C,$a3 # magic
  1299. vmovdqa $t3,0x70(%rsp)
  1300. mov $E,$a0
  1301. jmp .Lxop_00_47
  1302. .align 16
  1303. .Lxop_00_47:
  1304. add \$`16*2*$SZ`,$Tbl
  1305. ___
  # XOP_512_00_47($j, $body, @X)
  #
  # Emits one step of the .Lxop_00_47 loop for the 64-bit-word flavour: two
  # copies of the instruction list returned by $body, interleaved with an
  # XOP-based message schedule update of @X[0] (two words per register).
  # The rotations in sigma0/sigma1 are done with vprotq (packed rotate).
  #
  #   $j    - step index (the caller loops 0..7); selects the table row at
  #           16*2*$j-0x80($Tbl) and the stack slot at 16*$j(%rsp)
  #   $body - code ref returning a list of instruction strings to eval
  #   @X    - schedule registers, rotated by the caller so that the register
  #           to update is @X[0]
  #
  # The statement order is the instruction schedule, and the eval'd strings
  # resolve names in this scope; do not reorder or rename casually.
  1306. sub XOP_512_00_47 () {
  1307. my $j = shift;
  1308. my $body = shift;
  1309. my @X = @_;
  1310. my @insns = (&$body,&$body); # 52 instructions
  1311. &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
  1312. eval(shift(@insns));
  1313. eval(shift(@insns));
  1314. &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
  1315. eval(shift(@insns));
  1316. eval(shift(@insns));
  1317. &vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
  1318. eval(shift(@insns));
  1319. eval(shift(@insns));
  1320. &vpsrlq ($t0,$t0,$sigma0[2]);
  1321. eval(shift(@insns));
  1322. eval(shift(@insns));
  1323. &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
  1324. eval(shift(@insns));
  1325. eval(shift(@insns));
  1326. eval(shift(@insns));
  1327. eval(shift(@insns));
  1328. &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
  1329. eval(shift(@insns));
  1330. eval(shift(@insns));
  1331. &vpxor ($t0,$t0,$t1);
  1332. eval(shift(@insns));
  1333. eval(shift(@insns));
  1334. eval(shift(@insns));
  1335. eval(shift(@insns));
  1336. &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
  1337. eval(shift(@insns));
  1338. eval(shift(@insns));
  1339. &vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
  1340. eval(shift(@insns));
  1341. eval(shift(@insns));
  1342. &vpsrlq ($t2,@X[7],$sigma1[2]);
  1343. eval(shift(@insns));
  1344. eval(shift(@insns));
  1345. &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
  1346. eval(shift(@insns));
  1347. eval(shift(@insns));
  1348. &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
  1349. eval(shift(@insns));
  1350. eval(shift(@insns));
  1351. &vpxor ($t3,$t3,$t2);
  1352. eval(shift(@insns));
  1353. eval(shift(@insns));
  1354. eval(shift(@insns));
  1355. eval(shift(@insns));
  1356. &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
  1357. eval(shift(@insns));
  1358. eval(shift(@insns));
  1359. eval(shift(@insns));
  1360. eval(shift(@insns));
  1361. &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
  1362. eval(shift(@insns));
  1363. eval(shift(@insns));
  1364. eval(shift(@insns));
  1365. eval(shift(@insns));
  1366. &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); # updated X[0..1] + table row $j ($Tbl carries a +0x80 bias)
  1367. foreach (@insns) { eval; } # remaining instructions
  1368. &vmovdqa (16*$j."(%rsp)",$t2); # park the sum in stack slot $j
  1369. }
  # Emit the body of the .Lxop_00_47 loop: eight schedule-update steps, with
  # @X rotated after each so the register to update is always @X[0].
  1370. for ($i=0,$j=0; $j<8; $j++) {
  1371. &XOP_512_00_47($j,\&body_00_15,@X);
  1372. push(@X,shift(@X)); # rotate(@X)
  1373. }
  # Repeat while the byte at $SZ-1+16*2*$SZ-0x80($Tbl) is non-zero (the
  # -0x80 undoes the bias applied when $Tbl was loaded).
  # NOTE(review): presumably a terminator after the last table row - confirm.
  1374. &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
  1375. &jne (".Lxop_00_47");
  # 16 more rounds without schedule update; no increment clause here, so $i
  # must be advanced by the eval'd body_00_15 strings (defined elsewhere).
  1376. for ($i=0; $i<16; ) {
  1377. foreach(body_00_15()) { eval; }
  1378. }
  1379. }
  # End of one input block: add the working variables to the hash state in
  # the context and store them back, advance $inp by 16*$SZ bytes, and loop
  # to .Lloop_xop while $inp is below $_end.  On exit the value kept in
  # $_rsp is reloaded into %rsi and becomes the CFA base.
  1380. $code.=<<___;
  1381. mov $_ctx,$ctx
  1382. mov $a1,$A
  1383. add $SZ*0($ctx),$A
  1384. lea 16*$SZ($inp),$inp
  1385. add $SZ*1($ctx),$B
  1386. add $SZ*2($ctx),$C
  1387. add $SZ*3($ctx),$D
  1388. add $SZ*4($ctx),$E
  1389. add $SZ*5($ctx),$F
  1390. add $SZ*6($ctx),$G
  1391. add $SZ*7($ctx),$H
  1392. cmp $_end,$inp
  1393. mov $A,$SZ*0($ctx)
  1394. mov $B,$SZ*1($ctx)
  1395. mov $C,$SZ*2($ctx)
  1396. mov $D,$SZ*3($ctx)
  1397. mov $E,$SZ*4($ctx)
  1398. mov $F,$SZ*5($ctx)
  1399. mov $G,$SZ*6($ctx)
  1400. mov $H,$SZ*7($ctx)
  1401. jb .Lloop_xop
  1402. mov $_rsp,%rsi
  1403. .cfi_def_cfa %rsi,8
  1404. vzeroupper
  1405. ___
  # Win64 only: reload %xmm6-%xmm9 from their frame slots ...
  1406. $code.=<<___ if ($win64);
  1407. movaps 16*$SZ+32(%rsp),%xmm6
  1408. movaps 16*$SZ+48(%rsp),%xmm7
  1409. movaps 16*$SZ+64(%rsp),%xmm8
  1410. movaps 16*$SZ+80(%rsp),%xmm9
  1411. ___
  # ... plus %xmm10/%xmm11 when $SZ>4.
  1412. $code.=<<___ if ($win64 && $SZ>4);
  1413. movaps 16*$SZ+96(%rsp),%xmm10
  1414. movaps 16*$SZ+112(%rsp),%xmm11
  1415. ___
  # Reload the six general-purpose registers from below the address in
  # %rsi, restore %rsp from %rsi, and return.
  1416. $code.=<<___;
  1417. mov -48(%rsi),%r15
  1418. .cfi_restore %r15
  1419. mov -40(%rsi),%r14
  1420. .cfi_restore %r14
  1421. mov -32(%rsi),%r13
  1422. .cfi_restore %r13
  1423. mov -24(%rsi),%r12
  1424. .cfi_restore %r12
  1425. mov -16(%rsi),%rbp
  1426. .cfi_restore %rbp
  1427. mov -8(%rsi),%rbx
  1428. .cfi_restore %rbx
  1429. lea (%rsi),%rsp
  1430. .cfi_def_cfa_register %rsp
  1431. .Lepilogue_xop:
  1432. ret
  1433. .cfi_endproc
  1434. .size ${func}_xop,.-${func}_xop
  1435. ___
  1436. } # end of the "SHA512 only" XOP section
  1437. ######################################################################
  1438. # AVX+shrd code path
  1439. #
  # For the rest of the enclosing block, route ror() to shrd() with the
  # first argument repeated.  A double shift of a register with itself is a
  # rotate, hence the "AVX+shrd" name of this path.
  1440. local *ror = sub { &shrd(@_[0],@_) };
  # Function entry for ${func}_avx: keep a copy of the incoming %rsp in
  # %rax, push the six callee-saved general-purpose registers, turn the
  # block count in %rdx into an end pointer, reserve and 64-byte-align the
  # frame, and stash ctx, inp, the end pointer and the %rsp copy in it.
  1441. $code.=<<___;
  1442. .type ${func}_avx,\@function,3
  1443. .align 64
  1444. ${func}_avx:
  1445. .cfi_startproc
  1446. .Lavx_shortcut:
  1447. mov %rsp,%rax # copy %rsp
  1448. .cfi_def_cfa_register %rax
  1449. push %rbx
  1450. .cfi_push %rbx
  1451. push %rbp
  1452. .cfi_push %rbp
  1453. push %r12
  1454. .cfi_push %r12
  1455. push %r13
  1456. .cfi_push %r13
  1457. push %r14
  1458. .cfi_push %r14
  1459. push %r15
  1460. .cfi_push %r15
  1461. shl \$4,%rdx # num*16
  1462. sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
  1463. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  1464. and \$-64,%rsp # align stack frame
  1465. mov $ctx,$_ctx # save ctx, 1st arg
  1466. mov $inp,$_inp # save inp, 2nd arh
  1467. mov %rdx,$_end # save end pointer, "3rd" arg
  1468. mov %rax,$_rsp # save copy of %rsp
  1469. .cfi_cfa_expression $_rsp,deref,+8
  1470. ___
  # Win64 only: spill %xmm6-%xmm9 to the frame ...
  1471. $code.=<<___ if ($win64);
  1472. movaps %xmm6,16*$SZ+32(%rsp)
  1473. movaps %xmm7,16*$SZ+48(%rsp)
  1474. movaps %xmm8,16*$SZ+64(%rsp)
  1475. movaps %xmm9,16*$SZ+80(%rsp)
  1476. ___
  # ... plus %xmm10/%xmm11 when $SZ>4.
  1477. $code.=<<___ if ($win64 && $SZ>4);
  1478. movaps %xmm10,16*$SZ+96(%rsp)
  1479. movaps %xmm11,16*$SZ+112(%rsp)
  1480. ___
  # Load the eight state words from the context.
  1481. $code.=<<___;
  1482. .Lprologue_avx:
  1483. vzeroupper
  1484. mov $SZ*0($ctx),$A
  1485. mov $SZ*1($ctx),$B
  1486. mov $SZ*2($ctx),$C
  1487. mov $SZ*3($ctx),$D
  1488. mov $SZ*4($ctx),$E
  1489. mov $SZ*5($ctx),$F
  1490. mov $SZ*6($ctx),$G
  1491. mov $SZ*7($ctx),$H
  1492. ___
  1493. if ($SZ==4) { # SHA256
  # Four schedule registers and six temporaries; $t4/$t5 hold the two
  # shuffle masks consumed by the vpshufb steps in Xupdate_256_AVX.
  1494. my @X = map("%xmm$_",(0..3));
  1495. my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
  # $t4/$t5 are loaded once, ahead of .Lloop_avx.  Per-block setup: load 64
  # input bytes into @X[0..3], shuffle each with the mask stored at
  # $TABLE+$SZ*2*$rounds (presumably a byte-order swap - confirm against
  # $TABLE), add the first four table rows (32-byte stride) and park the
  # sums in the first four 16-byte stack slots.  .Lavx_00_47 then advances
  # $Tbl by 16*2*$SZ bytes per pass, written as "sub" of the negated value.
  1496. $code.=<<___;
  1497. vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
  1498. vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
  1499. jmp .Lloop_avx
  1500. .align 16
  1501. .Lloop_avx:
  1502. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  1503. vmovdqu 0x00($inp),@X[0]
  1504. vmovdqu 0x10($inp),@X[1]
  1505. vmovdqu 0x20($inp),@X[2]
  1506. vmovdqu 0x30($inp),@X[3]
  1507. vpshufb $t3,@X[0],@X[0]
  1508. lea $TABLE(%rip),$Tbl
  1509. vpshufb $t3,@X[1],@X[1]
  1510. vpshufb $t3,@X[2],@X[2]
  1511. vpaddd 0x00($Tbl),@X[0],$t0
  1512. vpshufb $t3,@X[3],@X[3]
  1513. vpaddd 0x20($Tbl),@X[1],$t1
  1514. vpaddd 0x40($Tbl),@X[2],$t2
  1515. vpaddd 0x60($Tbl),@X[3],$t3
  1516. vmovdqa $t0,0x00(%rsp)
  1517. mov $A,$a1
  1518. vmovdqa $t1,0x10(%rsp)
  1519. mov $B,$a3
  1520. vmovdqa $t2,0x20(%rsp)
  1521. xor $C,$a3 # magic
  1522. vmovdqa $t3,0x30(%rsp)
  1523. mov $E,$a0
  1524. jmp .Lavx_00_47
  1525. .align 16
  1526. .Lavx_00_47:
  1527. sub \$`-16*2*$SZ`,$Tbl # size optimization
  1528. ___
  # Xupdate_256_AVX()
  #
  # Returns the AVX message-schedule update for @X[0] as a list of 29
  # instruction strings.  Callers string-eval them one at a time between
  # round instructions, so @X, $t0..$t5, $SZ, @sigma0 and @sigma1 are
  # resolved where the eval runs, not here.  The list order is the
  # dependency order of the computation; keep it intact.
  1529. sub Xupdate_256_AVX () {
  1530. (
  1531. '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
  1532. '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
  1533. '&vpsrld ($t2,$t0,$sigma0[0]);',
  1534. '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
  1535. '&vpsrld ($t3,$t0,$sigma0[2])',
  1536. '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
  1537. '&vpxor ($t0,$t3,$t2)',
  1538. '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
  1539. '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
  1540. '&vpxor ($t0,$t0,$t1)',
  1541. '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
  1542. '&vpxor ($t0,$t0,$t2)',
  1543. '&vpsrld ($t2,$t3,$sigma1[2]);',
  1544. '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
  1545. '&vpsrlq ($t3,$t3,$sigma1[0]);',
  1546. '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
  1547. '&vpxor ($t2,$t2,$t3);',
  1548. '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
  1549. '&vpxor ($t2,$t2,$t3)',
  1550. '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
  1551. '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
  1552. '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
  1553. '&vpsrld ($t2,$t3,$sigma1[2])',
  1554. '&vpsrlq ($t3,$t3,$sigma1[0])',
  1555. '&vpxor ($t2,$t2,$t3);',
  1556. '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
  1557. '&vpxor ($t2,$t2,$t3)',
  1558. '&vpshufb ($t2,$t2,$t5)',
  1559. '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
  1560. );
  1561. }
  # AVX_256_00_47($j, $body, @X)
  #
  # Emits one step of the .Lavx_00_47 loop for the 32-bit-word flavour: each
  # of the 29 Xupdate_256_AVX instructions is followed by three instructions
  # taken from four copies of $body's list (87 of the 104); the rest are
  # emitted after the table add.
  #
  #   $j    - step index (the caller loops 0..3); selects the table row at
  #           16*2*$j($Tbl) and the stack slot at 16*$j(%rsp)
  #   $body - code ref returning a list of instruction strings to eval
  #   @X    - schedule registers, rotated by the caller so that the register
  #           to update is @X[0]; must keep this name, the eval'd strings
  #           refer to it
  1562. sub AVX_256_00_47 () {
  1563. my $j = shift;
  1564. my $body = shift;
  1565. my @X = @_;
  1566. my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
  1567. foreach (Xupdate_256_AVX()) { # 29 instructions
  1568. eval;
  1569. eval(shift(@insns));
  1570. eval(shift(@insns));
  1571. eval(shift(@insns));
  1572. }
  1573. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); # updated X[0..3] + table row $j
  1574. foreach (@insns) { eval; } # remaining instructions
  1575. &vmovdqa (16*$j."(%rsp)",$t2); # park the sum in stack slot $j
  1576. }
  # Emit the body of the .Lavx_00_47 loop: four schedule-update steps, with
  # @X rotated after each so the register to update is always @X[0].
  1577. for ($i=0,$j=0; $j<4; $j++) {
  1578. &AVX_256_00_47($j,\&body_00_15,@X);
  1579. push(@X,shift(@X)); # rotate(@X)
  1580. }
  # Repeat while the byte at $SZ-1+16*2*$SZ($Tbl) is non-zero.
  # NOTE(review): presumably a terminator after the last table row - confirm.
  1581. &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
  1582. &jne (".Lavx_00_47");
  # 16 more rounds without schedule update; no increment clause here, so $i
  # must be advanced by the eval'd body_00_15 strings (defined elsewhere).
  1583. for ($i=0; $i<16; ) {
  1584. foreach(body_00_15()) { eval; }
  1585. }
  1586. } else { # SHA512
  # Eight schedule registers and four temporaries.
  1587. my @X = map("%xmm$_",(0..7));
  1588. my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
  # Per-block setup: load 128 input bytes into @X[0..7], shuffle each with
  # the mask stored at $TABLE+$SZ*2*$rounds (presumably a byte-order swap -
  # confirm against $TABLE), add the first eight table rows and park the
  # sums in stack slots 0x00..0x70.  $Tbl is biased by +0x80, so the rows
  # are addressed as -0x80..0x60 at a 32-byte stride.  .Lavx_00_47 then
  # advances $Tbl by 16*2*$SZ bytes per pass.
  1589. $code.=<<___;
  1590. jmp .Lloop_avx
  1591. .align 16
  1592. .Lloop_avx:
  1593. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  1594. vmovdqu 0x00($inp),@X[0]
  1595. lea $TABLE+0x80(%rip),$Tbl # size optimization
  1596. vmovdqu 0x10($inp),@X[1]
  1597. vmovdqu 0x20($inp),@X[2]
  1598. vpshufb $t3,@X[0],@X[0]
  1599. vmovdqu 0x30($inp),@X[3]
  1600. vpshufb $t3,@X[1],@X[1]
  1601. vmovdqu 0x40($inp),@X[4]
  1602. vpshufb $t3,@X[2],@X[2]
  1603. vmovdqu 0x50($inp),@X[5]
  1604. vpshufb $t3,@X[3],@X[3]
  1605. vmovdqu 0x60($inp),@X[6]
  1606. vpshufb $t3,@X[4],@X[4]
  1607. vmovdqu 0x70($inp),@X[7]
  1608. vpshufb $t3,@X[5],@X[5]
  1609. vpaddq -0x80($Tbl),@X[0],$t0
  1610. vpshufb $t3,@X[6],@X[6]
  1611. vpaddq -0x60($Tbl),@X[1],$t1
  1612. vpshufb $t3,@X[7],@X[7]
  1613. vpaddq -0x40($Tbl),@X[2],$t2
  1614. vpaddq -0x20($Tbl),@X[3],$t3
  1615. vmovdqa $t0,0x00(%rsp)
  1616. vpaddq 0x00($Tbl),@X[4],$t0
  1617. vmovdqa $t1,0x10(%rsp)
  1618. vpaddq 0x20($Tbl),@X[5],$t1
  1619. vmovdqa $t2,0x20(%rsp)
  1620. vpaddq 0x40($Tbl),@X[6],$t2
  1621. vmovdqa $t3,0x30(%rsp)
  1622. vpaddq 0x60($Tbl),@X[7],$t3
  1623. vmovdqa $t0,0x40(%rsp)
  1624. mov $A,$a1
  1625. vmovdqa $t1,0x50(%rsp)
  1626. mov $B,$a3
  1627. vmovdqa $t2,0x60(%rsp)
  1628. xor $C,$a3 # magic
  1629. vmovdqa $t3,0x70(%rsp)
  1630. mov $E,$a0
  1631. jmp .Lavx_00_47
  1632. .align 16
  1633. .Lavx_00_47:
  1634. add \$`16*2*$SZ`,$Tbl
  1635. ___
  # Xupdate_512_AVX()
  #
  # Returns the AVX message-schedule update for @X[0] (two 64-bit words) as
  # a list of 23 instruction strings.  Callers string-eval them one at a
  # time between round instructions, so @X, $t0..$t3, $SZ, @sigma0 and
  # @sigma1 are resolved where the eval runs, not here.  Rotations are
  # built from shift-left/shift-right pairs combined with vpxor.  The list
  # order is the dependency order of the computation; keep it intact.
  1636. sub Xupdate_512_AVX () {
  1637. (
  1638. '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
  1639. '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
  1640. '&vpsrlq ($t2,$t0,$sigma0[0])',
  1641. '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
  1642. '&vpsrlq ($t3,$t0,$sigma0[2])',
  1643. '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
  1644. '&vpxor ($t0,$t3,$t2)',
  1645. '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
  1646. '&vpxor ($t0,$t0,$t1)',
  1647. '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
  1648. '&vpxor ($t0,$t0,$t2)',
  1649. '&vpsrlq ($t3,@X[7],$sigma1[2]);',
  1650. '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
  1651. '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
  1652. '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
  1653. '&vpsrlq ($t1,@X[7],$sigma1[0]);',
  1654. '&vpxor ($t3,$t3,$t2)',
  1655. '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
  1656. '&vpxor ($t3,$t3,$t1)',
  1657. '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
  1658. '&vpxor ($t3,$t3,$t2)',
  1659. '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
  1660. '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
  1661. );
  1662. }
  # AVX_512_00_47($j, $body, @X)
  #
  # Emits one step of the .Lavx_00_47 loop for the 64-bit-word flavour: each
  # of the 23 Xupdate_512_AVX instructions is followed by two instructions
  # taken from two copies of $body's list (46 of the 52); the rest are
  # emitted after the table add.
  #
  #   $j    - step index (the caller loops 0..7); selects the table row at
  #           16*2*$j-0x80($Tbl) and the stack slot at 16*$j(%rsp)
  #   $body - code ref returning a list of instruction strings to eval
  #   @X    - schedule registers, rotated by the caller so that the register
  #           to update is @X[0]; must keep this name, the eval'd strings
  #           refer to it
  1663. sub AVX_512_00_47 () {
  1664. my $j = shift;
  1665. my $body = shift;
  1666. my @X = @_;
  1667. my @insns = (&$body,&$body); # 52 instructions
  1668. foreach (Xupdate_512_AVX()) { # 23 instructions
  1669. eval;
  1670. eval(shift(@insns));
  1671. eval(shift(@insns));
  1672. }
  1673. &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); # updated X[0..1] + table row $j ($Tbl carries a +0x80 bias)
  1674. foreach (@insns) { eval; } # remaining instructions
  1675. &vmovdqa (16*$j."(%rsp)",$t2); # park the sum in stack slot $j
  1676. }
  # Emit the body of the .Lavx_00_47 loop: eight schedule-update steps, with
  # @X rotated after each so the register to update is always @X[0].
  1677. for ($i=0,$j=0; $j<8; $j++) {
  1678. &AVX_512_00_47($j,\&body_00_15,@X);
  1679. push(@X,shift(@X)); # rotate(@X)
  1680. }
  # Repeat while the byte at $SZ-1+16*2*$SZ-0x80($Tbl) is non-zero (the
  # -0x80 undoes the bias applied when $Tbl was loaded).
  # NOTE(review): presumably a terminator after the last table row - confirm.
  1681. &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
  1682. &jne (".Lavx_00_47");
  # 16 more rounds without schedule update; no increment clause here, so $i
  # must be advanced by the eval'd body_00_15 strings (defined elsewhere).
  1683. for ($i=0; $i<16; ) {
  1684. foreach(body_00_15()) { eval; }
  1685. }
  1686. }
  # End of one input block: add the working variables to the hash state in
  # the context and store them back, advance $inp by 16*$SZ bytes, and loop
  # to .Lloop_avx while $inp is below $_end.  On exit the value kept in
  # $_rsp is reloaded into %rsi and becomes the CFA base.
  1687. $code.=<<___;
  1688. mov $_ctx,$ctx
  1689. mov $a1,$A
  1690. add $SZ*0($ctx),$A
  1691. lea 16*$SZ($inp),$inp
  1692. add $SZ*1($ctx),$B
  1693. add $SZ*2($ctx),$C
  1694. add $SZ*3($ctx),$D
  1695. add $SZ*4($ctx),$E
  1696. add $SZ*5($ctx),$F
  1697. add $SZ*6($ctx),$G
  1698. add $SZ*7($ctx),$H
  1699. cmp $_end,$inp
  1700. mov $A,$SZ*0($ctx)
  1701. mov $B,$SZ*1($ctx)
  1702. mov $C,$SZ*2($ctx)
  1703. mov $D,$SZ*3($ctx)
  1704. mov $E,$SZ*4($ctx)
  1705. mov $F,$SZ*5($ctx)
  1706. mov $G,$SZ*6($ctx)
  1707. mov $H,$SZ*7($ctx)
  1708. jb .Lloop_avx
  1709. mov $_rsp,%rsi
  1710. .cfi_def_cfa %rsi,8
  1711. vzeroupper
  1712. ___
  # Win64 only: reload %xmm6-%xmm9 from their frame slots ...
  1713. $code.=<<___ if ($win64);
  1714. movaps 16*$SZ+32(%rsp),%xmm6
  1715. movaps 16*$SZ+48(%rsp),%xmm7
  1716. movaps 16*$SZ+64(%rsp),%xmm8
  1717. movaps 16*$SZ+80(%rsp),%xmm9
  1718. ___
  # ... plus %xmm10/%xmm11 when $SZ>4.
  1719. $code.=<<___ if ($win64 && $SZ>4);
  1720. movaps 16*$SZ+96(%rsp),%xmm10
  1721. movaps 16*$SZ+112(%rsp),%xmm11
  1722. ___
  # Reload the six general-purpose registers from below the address in
  # %rsi, restore %rsp from %rsi, and return.
  1723. $code.=<<___;
  1724. mov -48(%rsi),%r15
  1725. .cfi_restore %r15
  1726. mov -40(%rsi),%r14
  1727. .cfi_restore %r14
  1728. mov -32(%rsi),%r13
  1729. .cfi_restore %r13
  1730. mov -24(%rsi),%r12
  1731. .cfi_restore %r12
  1732. mov -16(%rsi),%rbp
  1733. .cfi_restore %rbp
  1734. mov -8(%rsi),%rbx
  1735. .cfi_restore %rbx
  1736. lea (%rsi),%rsp
  1737. .cfi_def_cfa_register %rsp
  1738. .Lepilogue_avx:
  1739. ret
  1740. .cfi_endproc
  1741. .size ${func}_avx,.-${func}_avx
  1742. ___
  1743. if ($avx>1) {{
  1744. ######################################################################
  1745. # AVX2+BMI code path
  1746. #
  1747. my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
  # Number of bytes by which %rsp is lowered each time the schedule code
  # pushes another batch of values (64 for $SZ==4, 128 for $SZ==8).
  1748. my $PUSH8=8*2*$SZ;
  # Integer arithmetic for the rest of the block: the offset expression in
  # bodyx_00_15 relies on "/" truncating.
  1749. use integer;
  # bodyx_00_15()
  #
  # Returns one round as a list of 24 instruction strings built on the BMI
  # rorx/andn instructions.  Callers string-eval the items one at a time,
  # so everything named inside the strings is resolved where the eval runs:
  #   @ROT            - the eight working-variable registers; rotated by one
  #                     position at the end of the round
  #   $i              - round counter; selects the X[i]+K[i] slot and is
  #                     incremented by the last item
  #   $base           - stack addressing suffix; a lexical that the calling
  #                     scope must define
  #   $a0..$a4        - scratch registers; $a2 and $a3 swap names at the end
  #                     of every round
  #   $PUSH8, $SZ, @Sigma0, @Sigma1 - constants from the enclosing scope
  # The slot offset is computed with integer division ("use integer" is in
  # effect in the enclosing block).
  1750. sub bodyx_00_15 () {
  1751. # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
  1752. (
  1753. '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
  1754. '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
  1755. '&and ($a4,$e)', # f&e
  1756. '&rorx ($a0,$e,$Sigma1[2])',
  1757. '&rorx ($a2,$e,$Sigma1[1])',
  1758. '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
  1759. '&lea ($h,"($h,$a4)")',
  1760. '&andn ($a4,$e,$g)', # ~e&g
  1761. '&xor ($a0,$a2)',
  1762. '&rorx ($a1,$e,$Sigma1[0])',
  1763. '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
  1764. '&xor ($a0,$a1)', # Sigma1(e)
  1765. '&mov ($a2,$a)',
  1766. '&rorx ($a4,$a,$Sigma0[2])',
  1767. '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
  1768. '&xor ($a2,$b)', # a^b, b^c in next round
  1769. '&rorx ($a1,$a,$Sigma0[1])',
  1770. '&rorx ($a0,$a,$Sigma0[0])',
  1771. '&lea ($d,"($d,$h)")', # d+=h
  1772. '&and ($a3,$a2)', # (b^c)&(a^b)
  1773. '&xor ($a1,$a4)',
  1774. '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
  1775. '&xor ($a1,$a0)', # Sigma0(a)
  1776. '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
  1777. '&mov ($a4,$e)', # copy of f in future
  1778. '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
  1779. );
  1780. # and at the finish one has to $a+=$a1
  1781. }
  # Function entry for ${func}_avx2: keep a copy of the incoming %rsp in
  # %rax, push the six callee-saved general-purpose registers, reserve the
  # frame and align it to 256*$SZ bytes, then move %rsp back up by
  # 2*$SZ*($rounds-8) bytes.  ctx, inp, the end pointer and the %rsp copy
  # are stashed in the frame.
  1782. $code.=<<___;
  1783. .type ${func}_avx2,\@function,3
  1784. .align 64
  1785. ${func}_avx2:
  1786. .cfi_startproc
  1787. .Lavx2_shortcut:
  1788. mov %rsp,%rax # copy %rsp
  1789. .cfi_def_cfa_register %rax
  1790. push %rbx
  1791. .cfi_push %rbx
  1792. push %rbp
  1793. .cfi_push %rbp
  1794. push %r12
  1795. .cfi_push %r12
  1796. push %r13
  1797. .cfi_push %r13
  1798. push %r14
  1799. .cfi_push %r14
  1800. push %r15
  1801. .cfi_push %r15
  1802. sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
  1803. shl \$4,%rdx # num*16
  1804. and \$-256*$SZ,%rsp # align stack frame
  1805. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  1806. add \$`2*$SZ*($rounds-8)`,%rsp
  1807. mov $ctx,$_ctx # save ctx, 1st arg
  1808. mov $inp,$_inp # save inp, 2nd arh
  1809. mov %rdx,$_end # save end pointer, "3rd" arg
  1810. mov %rax,$_rsp # save copy of %rsp
  1811. .cfi_cfa_expression $_rsp,deref,+8
  1812. ___
  # Win64 only: spill %xmm6-%xmm9 to the frame ...
  1813. $code.=<<___ if ($win64);
  1814. movaps %xmm6,16*$SZ+32(%rsp)
  1815. movaps %xmm7,16*$SZ+48(%rsp)
  1816. movaps %xmm8,16*$SZ+64(%rsp)
  1817. movaps %xmm9,16*$SZ+80(%rsp)
  1818. ___
  # ... plus %xmm10/%xmm11 when $SZ>4.
  1819. $code.=<<___ if ($win64 && $SZ>4);
  1820. movaps %xmm10,16*$SZ+96(%rsp)
  1821. movaps %xmm11,16*$SZ+112(%rsp)
  1822. ___
  # Point $inp one block ahead and load the state.  %r12 is set to the
  # advanced $inp, or to %rsp when that equals the end pointer, so the
  # loads through %r12 in the block loop always hit readable memory.
  1823. $code.=<<___;
  1824. .Lprologue_avx2:
  1825. vzeroupper
  1826. sub \$-16*$SZ,$inp # inp++, size optimization
  1827. mov $SZ*0($ctx),$A
  1828. mov $inp,%r12 # borrow $T1
  1829. mov $SZ*1($ctx),$B
  1830. cmp %rdx,$inp # $_end
  1831. mov $SZ*2($ctx),$C
  1832. cmove %rsp,%r12 # next block or random data
  1833. mov $SZ*3($ctx),$D
  1834. mov $SZ*4($ctx),$E
  1835. mov $SZ*5($ctx),$F
  1836. mov $SZ*6($ctx),$G
  1837. mov $SZ*7($ctx),$H
  1838. ___
  1839. if ($SZ==4) { # SHA256
  # Four 256-bit schedule registers and six temporaries; $t4/$t5 hold the
  # two shuffle masks consumed by the vpshufb steps in Xupdate_256_AVX.
  1840. my @X = map("%ymm$_",(0..3));
  1841. my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
  # Per-iteration setup: the low 128-bit halves of @X[0..3] are loaded from
  # the block just behind $inp, the high halves from (%r12).  All four are
  # shuffled with the mask at $TABLE+$SZ*2*$rounds (presumably a byte-order
  # swap - confirm against $TABLE) and added to the first four table rows.
  # The first two sums are stored at 0x00/0x20(%rsp) here.
  1842. $code.=<<___;
  1843. vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
  1844. vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
  1845. jmp .Loop_avx2
  1846. .align 16
  1847. .Loop_avx2:
  1848. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  1849. vmovdqu -16*$SZ+0($inp),%xmm0
  1850. vmovdqu -16*$SZ+16($inp),%xmm1
  1851. vmovdqu -16*$SZ+32($inp),%xmm2
  1852. vmovdqu -16*$SZ+48($inp),%xmm3
  1853. #mov $inp,$_inp # offload $inp
  1854. vinserti128 \$1,(%r12),@X[0],@X[0]
  1855. vinserti128 \$1,16(%r12),@X[1],@X[1]
  1856. vpshufb $t3,@X[0],@X[0]
  1857. vinserti128 \$1,32(%r12),@X[2],@X[2]
  1858. vpshufb $t3,@X[1],@X[1]
  1859. vinserti128 \$1,48(%r12),@X[3],@X[3]
  1860. lea $TABLE(%rip),$Tbl
  1861. vpshufb $t3,@X[2],@X[2]
  1862. vpaddd 0x00($Tbl),@X[0],$t0
  1863. vpshufb $t3,@X[3],@X[3]
  1864. vpaddd 0x20($Tbl),@X[1],$t1
  1865. vpaddd 0x40($Tbl),@X[2],$t2
  1866. vpaddd 0x60($Tbl),@X[3],$t3
  1867. vmovdqa $t0,0x00(%rsp)
  1868. xor $a1,$a1
  1869. vmovdqa $t1,0x20(%rsp)
  1870. ___
  # Non-Win64 only: keep the saved stack pointer in %rdi while %rsp moves.
  1871. $code.=<<___ if (!$win64);
  1872. # temporarily use %rdi as frame pointer
  1873. mov $_rsp,%rdi
  1874. .cfi_def_cfa %rdi,8
  1875. ___
  # Lower %rsp by $PUSH8 bytes to make room for the next two sums.
  1876. $code.=<<___;
  1877. lea -$PUSH8(%rsp),%rsp
  1878. ___
  # Non-Win64 only: drop a second copy of the frame pointer just below the
  # new %rsp and describe the CFA in terms of it.
  1879. $code.=<<___ if (!$win64);
  1880. # the frame info is at $_rsp, but the stack is moving...
  1881. # so a second frame pointer is saved at -8(%rsp)
  1882. # that is in the red zone
  1883. mov %rdi,-8(%rsp)
  1884. .cfi_cfa_expression %rsp-8,deref,+8
  1885. ___
  # Store the remaining two sums at the new %rsp, prime $a3/$a4 as
  # bodyx_00_15 expects (b^c and a copy of f), advance $Tbl by 16*2*$SZ
  # and fall into the .Lavx2_00_47 loop.
  1886. $code.=<<___;
  1887. mov $B,$a3
  1888. vmovdqa $t2,0x00(%rsp)
  1889. xor $C,$a3 # magic
  1890. vmovdqa $t3,0x20(%rsp)
  1891. mov $F,$a4
  1892. sub \$-16*2*$SZ,$Tbl # size optimization
  1893. jmp .Lavx2_00_47
  1894. .align 16
  1895. .Lavx2_00_47:
  1896. ___
  # AVX2_256_00_47($j, $body, @X) emits one step of the SHA-256 AVX2
  # .Lavx2_00_47 loop.
  #   $j    - step index; the driver below passes 0..3
  #   $body - code ref returning a list of instruction strings; it is called
  #           four times and the results are concatenated into @insns
  #   @X    - current rotation of the vector registers handed to the
  #           Xupdate_256_AVX() strings
  # Every instruction string is run through string eval inside this sub, so
  # the strings resolve names in this lexical scope (presumably $base, @X and
  # $j among them -- confirm against bodyx_00_15/Xupdate_256_AVX). Do not
  # rename or remove the locals below without checking those templates.
  1897. sub AVX2_256_00_47 () {
  1898. my $j = shift;
  1899. my $body = shift;
  1900. my @X = @_;
  1901. my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
  # Not referenced by name in this sub; presumably read by the eval'ed
  # instruction strings -- TODO confirm.
  1902. my $base = "+2*$PUSH8(%rsp)";
  # On every even step move %rsp down by $PUSH8 bytes. For non-Win64 builds
  # the heredoc also emits the CFI updates and re-copies the secondary frame
  # pointer to -8(%rsp) relative to the new stack pointer.
  1903. if (($j%2)==0) {
  1904. &lea ("%rsp","-$PUSH8(%rsp)");
  1905. $code.=<<___ if (!$win64);
  1906. .cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8
  1907. # copy secondary frame pointer to new location again at -8(%rsp)
  1908. pushq $PUSH8-8(%rsp)
  1909. .cfi_cfa_expression %rsp,deref,+8
  1910. lea 8(%rsp),%rsp
  1911. .cfi_cfa_expression %rsp-8,deref,+8
  1912. ___
  1913. }
  # Interleave: each Xupdate_256_AVX() string is followed by three strings
  # taken from the front of @insns.
  1914. foreach (Xupdate_256_AVX()) { # 29 instructions
  1915. eval;
  1916. eval(shift(@insns));
  1917. eval(shift(@insns));
  1918. eval(shift(@insns));
  1919. }
  # Presumably perlasm operand order (destination first): $t2 = @X[0] plus
  # the table entry at 16*2*$j($Tbl) -- confirm against the &vpaddd wrapper.
  1920. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
  1921. foreach (@insns) { eval; } # remaining instructions
  # Moves $t2 to the stack slot at (32*$j)%$PUSH8(%rsp).
  1922. &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
  1923. }
  # Driver: generate steps 0..3, rotating @X by one element after each step.
  1924. for ($i=0,$j=0; $j<4; $j++) {
  1925. &AVX2_256_00_47($j,\&bodyx_00_15,@X);
  1926. push(@X,shift(@X)); # rotate(@X)
  1927. }
  # Advance $Tbl by 16*2*$SZ bytes and loop back to .Lavx2_00_47 while the
  # byte at ($SZ-1)($Tbl) compares non-zero (presumably a table sentinel --
  # confirm against the $TABLE layout).
  1928. &lea ($Tbl,16*2*$SZ."($Tbl)");
  1929. &cmpb (($SZ-1)."($Tbl)",0);
  1930. &jne (".Lavx2_00_47");
  # The loop header has no increment: $i is presumably advanced by the
  # eval'ed bodyx_00_15() strings -- TODO confirm. $base is chosen from $i.
  1931. for ($i=0; $i<16; ) {
  1932. my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
  1933. foreach(bodyx_00_15()) { eval; }
  1934. }
  1935. } else { # SHA512
  1936. my @X = map("%ymm$_",(0..7));
  1937. my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
  1938. $code.=<<___;
  1939. jmp .Loop_avx2
  1940. .align 16
  1941. .Loop_avx2:
  1942. vmovdqu -16*$SZ($inp),%xmm0
  1943. vmovdqu -16*$SZ+16($inp),%xmm1
  1944. vmovdqu -16*$SZ+32($inp),%xmm2
  1945. lea $TABLE+0x80(%rip),$Tbl # size optimization
  1946. vmovdqu -16*$SZ+48($inp),%xmm3
  1947. vmovdqu -16*$SZ+64($inp),%xmm4
  1948. vmovdqu -16*$SZ+80($inp),%xmm5
  1949. vmovdqu -16*$SZ+96($inp),%xmm6
  1950. vmovdqu -16*$SZ+112($inp),%xmm7
  1951. #mov $inp,$_inp # offload $inp
  1952. vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2
  1953. vinserti128 \$1,(%r12),@X[0],@X[0]
  1954. vinserti128 \$1,16(%r12),@X[1],@X[1]
  1955. vpshufb $t2,@X[0],@X[0]
  1956. vinserti128 \$1,32(%r12),@X[2],@X[2]
  1957. vpshufb $t2,@X[1],@X[1]
  1958. vinserti128 \$1,48(%r12),@X[3],@X[3]
  1959. vpshufb $t2,@X[2],@X[2]
  1960. vinserti128 \$1,64(%r12),@X[4],@X[4]
  1961. vpshufb $t2,@X[3],@X[3]
  1962. vinserti128 \$1,80(%r12),@X[5],@X[5]
  1963. vpshufb $t2,@X[4],@X[4]
  1964. vinserti128 \$1,96(%r12),@X[6],@X[6]
  1965. vpshufb $t2,@X[5],@X[5]
  1966. vinserti128 \$1,112(%r12),@X[7],@X[7]
  1967. vpaddq -0x80($Tbl),@X[0],$t0
  1968. vpshufb $t2,@X[6],@X[6]
  1969. vpaddq -0x60($Tbl),@X[1],$t1
  1970. vpshufb $t2,@X[7],@X[7]
  1971. vpaddq -0x40($Tbl),@X[2],$t2
  1972. vpaddq -0x20($Tbl),@X[3],$t3
  1973. vmovdqa $t0,0x00(%rsp)
  1974. vpaddq 0x00($Tbl),@X[4],$t0
  1975. vmovdqa $t1,0x20(%rsp)
  1976. vpaddq 0x20($Tbl),@X[5],$t1
  1977. vmovdqa $t2,0x40(%rsp)
  1978. vpaddq 0x40($Tbl),@X[6],$t2
  1979. vmovdqa $t3,0x60(%rsp)
  1980. ___
  1981. $code.=<<___ if (!$win64);
  1982. # temporarily use %rdi as frame pointer
  1983. mov $_rsp,%rdi
  1984. .cfi_def_cfa %rdi,8
  1985. ___
  1986. $code.=<<___;
  1987. lea -$PUSH8(%rsp),%rsp
  1988. ___
  1989. $code.=<<___ if (!$win64);
  1990. # the frame info is at $_rsp, but the stack is moving...
  1991. # so a second frame pointer is saved at -8(%rsp)
  1992. # that is in the red zone
  1993. mov %rdi,-8(%rsp)
  1994. .cfi_cfa_expression %rsp-8,deref,+8
  1995. ___
  1996. $code.=<<___;
  1997. vpaddq 0x60($Tbl),@X[7],$t3
  1998. vmovdqa $t0,0x00(%rsp)
  1999. xor $a1,$a1
  2000. vmovdqa $t1,0x20(%rsp)
  2001. mov $B,$a3
  2002. vmovdqa $t2,0x40(%rsp)
  2003. xor $C,$a3 # magic
  2004. vmovdqa $t3,0x60(%rsp)
  2005. mov $F,$a4
  2006. add \$16*2*$SZ,$Tbl
  2007. jmp .Lavx2_00_47
  2008. .align 16
  2009. .Lavx2_00_47:
  2010. ___
  # AVX2_512_00_47($j, $body, @X) emits one step of the SHA-512 AVX2
  # .Lavx2_00_47 loop.
  #   $j    - step index; the driver below passes 0..7
  #   $body - code ref returning a list of instruction strings; it is called
  #           twice and the results are concatenated into @insns
  #   @X    - current rotation of the vector registers handed to the
  #           Xupdate_512_AVX() strings
  # Every instruction string is run through string eval inside this sub, so
  # the strings resolve names in this lexical scope (presumably $base, @X and
  # $j among them -- confirm against bodyx_00_15/Xupdate_512_AVX). Do not
  # rename or remove the locals below without checking those templates.
  2011. sub AVX2_512_00_47 () {
  2012. my $j = shift;
  2013. my $body = shift;
  2014. my @X = @_;
  2015. my @insns = (&$body,&$body); # 48 instructions
  # Not referenced by name in this sub; presumably read by the eval'ed
  # instruction strings -- TODO confirm.
  2016. my $base = "+2*$PUSH8(%rsp)";
  # On every fourth step move %rsp down by $PUSH8 bytes. For non-Win64 builds
  # the heredoc also emits the CFI updates and re-copies the secondary frame
  # pointer to -8(%rsp) relative to the new stack pointer.
  2017. if (($j%4)==0) {
  2018. &lea ("%rsp","-$PUSH8(%rsp)");
  2019. $code.=<<___ if (!$win64);
  2020. .cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8
  2021. # copy secondary frame pointer to new location again at -8(%rsp)
  2022. pushq $PUSH8-8(%rsp)
  2023. .cfi_cfa_expression %rsp,deref,+8
  2024. lea 8(%rsp),%rsp
  2025. .cfi_cfa_expression %rsp-8,deref,+8
  2026. ___
  2027. }
  # Interleave: an Xupdate_512_AVX() string is followed by three strings from
  # the front of @insns, except when the Xupdate string ends in ';'.
  2028. foreach (Xupdate_512_AVX()) { # 23 instructions
  2029. eval;
  2030. if ($_ !~ /\;$/) {
  2031. eval(shift(@insns));
  2032. eval(shift(@insns));
  2033. eval(shift(@insns));
  2034. }
  2035. }
  # Presumably perlasm operand order (destination first): $t2 = @X[0] plus
  # the table entry at (16*2*$j-0x80)($Tbl) -- confirm against the &vpaddq
  # wrapper.
  2036. &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
  2037. foreach (@insns) { eval; } # remaining instructions
  # Moves $t2 to the stack slot at (32*$j)%$PUSH8(%rsp).
  2038. &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
  2039. }
  # Driver: generate steps 0..7, rotating @X by one element after each step.
  2040. for ($i=0,$j=0; $j<8; $j++) {
  2041. &AVX2_512_00_47($j,\&bodyx_00_15,@X);
  2042. push(@X,shift(@X)); # rotate(@X)
  2043. }
  # Advance $Tbl by 16*2*$SZ bytes and loop back to .Lavx2_00_47 while the
  # byte at ($SZ-1-0x80)($Tbl) compares non-zero (presumably a table sentinel
  # -- confirm against the $TABLE layout).
  2044. &lea ($Tbl,16*2*$SZ."($Tbl)");
  2045. &cmpb (($SZ-1-0x80)."($Tbl)",0);
  2046. &jne (".Lavx2_00_47");
  # The loop header has no increment: $i is presumably advanced by the
  # eval'ed bodyx_00_15() strings -- TODO confirm. $base is chosen from $i.
  2047. for ($i=0; $i<16; ) {
  2048. my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
  2049. foreach(bodyx_00_15()) { eval; }
  2050. }
  2051. }
  2052. $code.=<<___;
  2053. mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
  2054. add $a1,$A
  2055. #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
  2056. lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
  2057. add $SZ*0($ctx),$A
  2058. add $SZ*1($ctx),$B
  2059. add $SZ*2($ctx),$C
  2060. add $SZ*3($ctx),$D
  2061. add $SZ*4($ctx),$E
  2062. add $SZ*5($ctx),$F
  2063. add $SZ*6($ctx),$G
  2064. add $SZ*7($ctx),$H
  2065. mov $A,$SZ*0($ctx)
  2066. mov $B,$SZ*1($ctx)
  2067. mov $C,$SZ*2($ctx)
  2068. mov $D,$SZ*3($ctx)
  2069. mov $E,$SZ*4($ctx)
  2070. mov $F,$SZ*5($ctx)
  2071. mov $G,$SZ*6($ctx)
  2072. mov $H,$SZ*7($ctx)
  2073. cmp `$PUSH8+2*8`($Tbl),$inp # $_end
  2074. je .Ldone_avx2
  2075. xor $a1,$a1
  2076. mov $B,$a3
  2077. xor $C,$a3 # magic
  2078. mov $F,$a4
  2079. jmp .Lower_avx2
  2080. .align 16
  2081. .Lower_avx2:
  2082. ___
  2083. for ($i=0; $i<8; ) {
  2084. my $base="+16($Tbl)";
  2085. foreach(bodyx_00_15()) { eval; }
  2086. }
  2087. $code.=<<___;
  2088. lea -$PUSH8($Tbl),$Tbl
  2089. cmp %rsp,$Tbl
  2090. jae .Lower_avx2
  2091. mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
  2092. add $a1,$A
  2093. #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
  2094. lea `2*$SZ*($rounds-8)`(%rsp),%rsp
  2095. # restore frame pointer to original location at $_rsp
  2096. .cfi_cfa_expression $_rsp,deref,+8
  2097. add $SZ*0($ctx),$A
  2098. add $SZ*1($ctx),$B
  2099. add $SZ*2($ctx),$C
  2100. add $SZ*3($ctx),$D
  2101. add $SZ*4($ctx),$E
  2102. add $SZ*5($ctx),$F
  2103. lea `2*16*$SZ`($inp),$inp # inp+=2
  2104. add $SZ*6($ctx),$G
  2105. mov $inp,%r12
  2106. add $SZ*7($ctx),$H
  2107. cmp $_end,$inp
  2108. mov $A,$SZ*0($ctx)
  2109. cmove %rsp,%r12 # next block or stale data
  2110. mov $B,$SZ*1($ctx)
  2111. mov $C,$SZ*2($ctx)
  2112. mov $D,$SZ*3($ctx)
  2113. mov $E,$SZ*4($ctx)
  2114. mov $F,$SZ*5($ctx)
  2115. mov $G,$SZ*6($ctx)
  2116. mov $H,$SZ*7($ctx)
  2117. jbe .Loop_avx2
  2118. lea (%rsp),$Tbl
  2119. # temporarily use $Tbl as index to $_rsp
  2120. # this avoids the need to save a secondary frame pointer at -8(%rsp)
  2121. .cfi_cfa_expression $Tbl+`16*$SZ+3*8`,deref,+8
  2122. .Ldone_avx2:
  2123. mov `16*$SZ+3*8`($Tbl),%rsi
  2124. .cfi_def_cfa %rsi,8
  2125. vzeroupper
  2126. ___
  2127. $code.=<<___ if ($win64);
  2128. movaps 16*$SZ+32($Tbl),%xmm6
  2129. movaps 16*$SZ+48($Tbl),%xmm7
  2130. movaps 16*$SZ+64($Tbl),%xmm8
  2131. movaps 16*$SZ+80($Tbl),%xmm9
  2132. ___
  2133. $code.=<<___ if ($win64 && $SZ>4);
  2134. movaps 16*$SZ+96($Tbl),%xmm10
  2135. movaps 16*$SZ+112($Tbl),%xmm11
  2136. ___
  2137. $code.=<<___;
  2138. mov -48(%rsi),%r15
  2139. .cfi_restore %r15
  2140. mov -40(%rsi),%r14
  2141. .cfi_restore %r14
  2142. mov -32(%rsi),%r13
  2143. .cfi_restore %r13
  2144. mov -24(%rsi),%r12
  2145. .cfi_restore %r12
  2146. mov -16(%rsi),%rbp
  2147. .cfi_restore %rbp
  2148. mov -8(%rsi),%rbx
  2149. .cfi_restore %rbx
  2150. lea (%rsi),%rsp
  2151. .cfi_def_cfa_register %rsp
  2152. .Lepilogue_avx2:
  2153. ret
  2154. .cfi_endproc
  2155. .size ${func}_avx2,.-${func}_avx2
  2156. ___
  2157. }}
  2158. }}}}}
  2159. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  2160. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  2161. if ($win64) {
  2162. $rec="%rcx";
  2163. $frame="%rdx";
  2164. $context="%r8";
  2165. $disp="%r9";
  # se_handler: Win64 exception handler named by the .xdata records that
  # carry a prologue/epilogue label pair as HandlerData[].
  #
  # Flow of the generated code:
  #   1. If context->Rip lies before HandlerData[0] or at/after
  #      HandlerData[1], nothing has to be recovered from the frame and
  #      control goes straight to .Lin_prologue.
  #   2. Otherwise the frame base is taken from context->Rsp (re-derived for
  #      code at or after .Lavx2_shortcut, whose stack pointer moves), the
  #      saved $_rsp is pulled from it, and the six callee-saved GPRs are
  #      copied back into the CONTEXT record.
  #   3. Unless Rip is below .Lepilogue (the "non-AVX code" case), the
  #      Xmm6+ save area is copied into context->Xmm6 onwards.
  #   4. .Lin_prologue restores Rsp/Rsi/Rdi, copies the CONTEXT record and
  #      calls RtlVirtualUnwind, then returns ExceptionContinueSearch.
  #
  # Fix: step 2 reuses %rbx for the saved Rbx value, so step 3 used to
  # compare that value -- not context->Rip -- against .Lepilogue. Rip is now
  # reloaded from the CONTEXT record right before the comparison. %rbx is not
  # read again before it is popped, and $context is still intact there.
  $code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
	push %rsi
	push %rdi
	push %rbx
	push %rbp
	push %r12
	push %r13
	push %r14
	push %r15
	pushfq
	sub \$64,%rsp
	mov 120($context),%rax # pull context->Rax
	mov 248($context),%rbx # pull context->Rip
	mov 8($disp),%rsi # disp->ImageBase
	mov 56($disp),%r11 # disp->HandlerData
	mov 0(%r11),%r10d # HandlerData[0]
	lea (%rsi,%r10),%r10 # prologue label
	cmp %r10,%rbx # context->Rip<prologue label
	jb .Lin_prologue
	mov 152($context),%rax # pull context->Rsp
	mov 4(%r11),%r10d # HandlerData[1]
	lea (%rsi,%r10),%r10 # epilogue label
	cmp %r10,%rbx # context->Rip>=epilogue label
	jae .Lin_prologue
___
  # AVX2 code keeps moving %rsp, so the frame base is recomputed from the
  # alignment of the stack pointer instead of being used directly.
  $code.=<<___ if ($avx>1);
	lea .Lavx2_shortcut(%rip),%r10
	cmp %r10,%rbx # context->Rip<avx2_shortcut
	jb .Lnot_in_avx2
	and \$-256*$SZ,%rax
	add \$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
  $code.=<<___;
	mov %rax,%rsi # put aside Rsp
	mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
	mov -8(%rax),%rbx
	mov -16(%rax),%rbp
	mov -24(%rax),%r12
	mov -32(%rax),%r13
	mov -40(%rax),%r14
	mov -48(%rax),%r15
	mov %rbx,144($context) # restore context->Rbx
	mov %rbp,160($context) # restore context->Rbp
	mov %r12,216($context) # restore context->R12
	mov %r13,224($context) # restore context->R13
	mov %r14,232($context) # restore context->R14
	mov %r15,240($context) # restore context->R15
	mov 248($context),%rbx # re-pull context->Rip (%rbx was reused above)
	lea .Lepilogue(%rip),%r10
	cmp %r10,%rbx # context->Rip<.Lepilogue
	jb .Lin_prologue # non-AVX code
	lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area
	lea 512($context),%rdi # &context.Xmm6
	mov \$`$SZ==4?8:12`,%ecx
	.long 0xa548f3fc # cld; rep movsq
.Lin_prologue:
	mov 8(%rax),%rdi
	mov 16(%rax),%rsi
	mov %rax,152($context) # restore context->Rsp
	mov %rsi,168($context) # restore context->Rsi
	mov %rdi,176($context) # restore context->Rdi
	mov 40($disp),%rdi # disp->ContextRecord
	mov $context,%rsi # context
	mov \$154,%ecx # sizeof(CONTEXT)
	.long 0xa548f3fc # cld; rep movsq
	mov $disp,%rsi
	xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
	mov 8(%rsi),%rdx # arg2, disp->ImageBase
	mov 0(%rsi),%r8 # arg3, disp->ControlPc
	mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
	mov 40(%rsi),%r10 # disp->ContextRecord
	lea 56(%rsi),%r11 # &disp->HandlerData
	lea 24(%rsi),%r12 # &disp->EstablisherFrame
	mov %r10,32(%rsp) # arg5
	mov %r11,40(%rsp) # arg6
	mov %r12,48(%rsp) # arg7
	mov %rcx,56(%rsp) # arg8, (NULL)
	call *__imp_RtlVirtualUnwind(%rip)
	mov \$1,%eax # ExceptionContinueSearch
	add \$64,%rsp
	popfq
	pop %r15
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	pop %rdi
	pop %rsi
	ret
.size se_handler,.-se_handler
___
  # shaext_handler: exception handler named by the _shaext unwind record.
  # Emitted only when $SZ==4 and $shaext is set.
  # When context->Rip is outside [.Lprologue_shaext, .Lepilogue_shaext) the
  # generated code branches straight to .Lin_prologue (a label inside
  # se_handler). Otherwise it first copies 10 quadwords from -8-5*16(%rax)
  # to context+512 (marked &context.Xmm6 below) and then takes that same
  # .Lin_prologue tail. The handler therefore has no return path of its own.
  2262. $code.=<<___ if ($SZ==4 && $shaext);
  2263. .type shaext_handler,\@abi-omnipotent
  2264. .align 16
  2265. shaext_handler:
  2266. push %rsi
  2267. push %rdi
  2268. push %rbx
  2269. push %rbp
  2270. push %r12
  2271. push %r13
  2272. push %r14
  2273. push %r15
  2274. pushfq
  2275. sub \$64,%rsp
  2276. mov 120($context),%rax # pull context->Rax
  2277. mov 248($context),%rbx # pull context->Rip
  2278. lea .Lprologue_shaext(%rip),%r10
  2279. cmp %r10,%rbx # context->Rip<.Lprologue
  2280. jb .Lin_prologue
  2281. lea .Lepilogue_shaext(%rip),%r10
  2282. cmp %r10,%rbx # context->Rip>=.Lepilogue
  2283. jae .Lin_prologue
  2284. lea -8-5*16(%rax),%rsi
  2285. lea 512($context),%rdi # &context.Xmm6
  2286. mov \$10,%ecx
  2287. .long 0xa548f3fc # cld; rep movsq
  2288. jmp .Lin_prologue
  2289. .size shaext_handler,.-shaext_handler
  2290. ___
  # .pdata: one triple of .rva entries (.LSEH_begin_*, .LSEH_end_*,
  # .LSEH_info_*) per generated function. The base $func triple is always
  # emitted; each further triple is guarded by the feature test on its own
  # heredoc.
  2291. $code.=<<___;
  2292. .section .pdata
  2293. .align 4
  2294. .rva .LSEH_begin_$func
  2295. .rva .LSEH_end_$func
  2296. .rva .LSEH_info_$func
  2297. ___
  # SHA-NI variant: SHA-256 builds with $shaext set.
  2298. $code.=<<___ if ($SZ==4 && $shaext);
  2299. .rva .LSEH_begin_${func}_shaext
  2300. .rva .LSEH_end_${func}_shaext
  2301. .rva .LSEH_info_${func}_shaext
  2302. ___
  # SSSE3 variant: SHA-256 builds only.
  2303. $code.=<<___ if ($SZ==4);
  2304. .rva .LSEH_begin_${func}_ssse3
  2305. .rva .LSEH_end_${func}_ssse3
  2306. .rva .LSEH_info_${func}_ssse3
  2307. ___
  # XOP variant: SHA-512 builds with $avx set.
  2308. $code.=<<___ if ($avx && $SZ==8);
  2309. .rva .LSEH_begin_${func}_xop
  2310. .rva .LSEH_end_${func}_xop
  2311. .rva .LSEH_info_${func}_xop
  2312. ___
  # AVX variant: any build with $avx set.
  2313. $code.=<<___ if ($avx);
  2314. .rva .LSEH_begin_${func}_avx
  2315. .rva .LSEH_end_${func}_avx
  2316. .rva .LSEH_info_${func}_avx
  2317. ___
  # AVX2 variant: builds with $avx greater than 1.
  2318. $code.=<<___ if ($avx>1);
  2319. .rva .LSEH_begin_${func}_avx2
  2320. .rva .LSEH_end_${func}_avx2
  2321. .rva .LSEH_info_${func}_avx2
  2322. ___
  # .xdata: one .LSEH_info_* record per .pdata triple above, under the same
  # guards. Each record names its handler. Records that use se_handler also
  # carry that variant's prologue/epilogue labels as HandlerData[], which
  # se_handler reads as HandlerData[0] and HandlerData[1]. The shaext record
  # names shaext_handler and carries no HandlerData.
  2323. $code.=<<___;
  2324. .section .xdata
  2325. .align 8
  2326. .LSEH_info_$func:
  2327. .byte 9,0,0,0
  2328. .rva se_handler
  2329. .rva .Lprologue,.Lepilogue # HandlerData[]
  2330. ___
  2331. $code.=<<___ if ($SZ==4 && $shaext);
  2332. .LSEH_info_${func}_shaext:
  2333. .byte 9,0,0,0
  2334. .rva shaext_handler
  2335. ___
  2336. $code.=<<___ if ($SZ==4);
  2337. .LSEH_info_${func}_ssse3:
  2338. .byte 9,0,0,0
  2339. .rva se_handler
  2340. .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
  2341. ___
  2342. $code.=<<___ if ($avx && $SZ==8);
  2343. .LSEH_info_${func}_xop:
  2344. .byte 9,0,0,0
  2345. .rva se_handler
  2346. .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
  2347. ___
  2348. $code.=<<___ if ($avx);
  2349. .LSEH_info_${func}_avx:
  2350. .byte 9,0,0,0
  2351. .rva se_handler
  2352. .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
  2353. ___
  2354. $code.=<<___ if ($avx>1);
  2355. .LSEH_info_${func}_avx2:
  2356. .byte 9,0,0,0
  2357. .rva se_handler
  2358. .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
  2359. ___
  2360. }
  # sha256op38($instr, $operands)
  #
  # Turn a register-to-register sha256rnds2/sha256msg1/sha256msg2 instruction
  # into a raw ".byte" sequence (0x0f,0x38,<opcode>,<ModR/M>).
  #
  #   $instr    - instruction mnemonic
  #   $operands - operand text as written in the source line; anything after
  #               the two registers (e.g. a trailing comment) is ignored
  #
  # Returns the ".byte\t..." string when the mnemonic is known and both
  # operands are %xmm0..%xmm7; otherwise returns "$instr\t$operands" so the
  # line is passed through unchanged.
  sub sha256op38 {
      my ($instr, $operands) = @_;

      my %opcodelet = (
          "sha256rnds2" => 0xcb,
          "sha256msg1"  => 0xcc,
          "sha256msg2"  => 0xcd );

      # Only three bits per register fit in the ModR/M byte built below. The
      # (?!\d) guard stops "%xmm10".."%xmm15" in the second position from
      # being read as "%xmm1" and silently encoded as the wrong register.
      if (defined($opcodelet{$instr})
          && $operands =~ /%xmm([0-7]),\s*%xmm([0-7])(?!\d)/) {
          my @opcode = (0x0f, 0x38, $opcodelet{$instr});
          # First operand goes into the low three bits, second into bits 3-5.
          push @opcode, 0xc0|($1&7)|(($2&7)<<3);    # ModR/M
          return ".byte\t".join(',',@opcode);
      }

      return $instr."\t".$operands;
  }
  # Post-process the accumulated assembly text line by line and write it to
  # STDOUT:
  #   - each `...` span is replaced by the value of the Perl expression
  #     between the backticks;
  #   - each sha256* instruction is handed to sha256op38().
  for my $asm_line (split("\n",$code)) {
      $asm_line =~ s/\`([^\`]*)\`/eval $1/ge;
      $asm_line =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/ge;
      print $asm_line,"\n";
  }

  # Buffered write errors only surface on close, so the result is checked.
  close STDOUT or die "error closing STDOUT: $!";