#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2012, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
#
# References:
# [1] S. Gueron, "Efficient Software Implementations of Modular
#     Exponentiation", http://eprint.iacr.org/2011/239
# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".
#     IEEE Proceedings of 9th International Conference on Information
#     Technology: New Generations (ITNG 2012), 821-823 (2012).
# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation
#     Journal of Cryptographic Engineering 2:31-43 (2012).
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
#     resistant 512-bit and 1024-bit modular exponentiation for optimizing
#     RSA1024 and RSA2048 on x86_64 platforms",
#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest
#
# While original submission covers 512- and 1024-bit exponentiation,
# this module is limited to 512-bit version only (and as such
# accelerates RSA1024 sign). This is because improvement for longer
# keys is not high enough to justify the effort, highest measured
# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
# for the moment of this writing!] Nor does this module implement
# "monolithic" complete exponentiation jumbo-subroutine, but adheres
# to more modular mixture of C and assembly. And it's optimized even
# for processors other than Intel Core family (see table below for
# improvement coefficients).
#                                               <appro@openssl.org>
#
# RSA1024 sign/sec  this/original   |this/rsax(*)   this/fips(*)
# ----------------+---------------------------
# Opteron           +13%            |+5%            +20%
# Bulldozer         -0%             |-1%            +10%
# P4                +11%            |+7%            +8%
# Westmere          +5%             |+14%           +17%
# Sandy Bridge      +2%             |+12%           +29%
# Ivy Bridge        +1%             |+11%           +35%
# Haswell(**)       -0%             |+12%           +39%
# Atom              +13%            |+11%           +4%
# VIA Nano          +70%            |+9%            +25%
#
# (*)   rsax engine and fips numbers are presented for reference
#       purposes;
# (**)  MULX was attempted, but found to give only marginal improvement;

  54. # $output is the last argument if it looks like a file (it has an extension)
  55. # $flavour is the first argument if it doesn't look like a file
  56. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  57. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  58. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  59. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  60. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  61. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  62. die "can't locate x86_64-xlate.pl";
  63. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
  64. or die "can't call $xlate: $!";
  65. *STDOUT=*OUT;
  66. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  67. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  68. $addx = ($1>=2.23);
  69. }
  70. if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  71. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  72. $addx = ($1>=2.10);
  73. }
  74. if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  75. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  76. $addx = ($1>=12);
  77. }
  78. if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
  79. my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
  80. $addx = ($ver>=3.03);
  81. }
  82. ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
  83. {
  84. my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
  85. $code.=<<___;
  86. .text
  87. .extern OPENSSL_ia32cap_P
  88. .globl rsaz_512_sqr
  89. .type rsaz_512_sqr,\@function,5
  90. .align 32
  91. rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
  92. .cfi_startproc
  93. push %rbx
  94. .cfi_push %rbx
  95. push %rbp
  96. .cfi_push %rbp
  97. push %r12
  98. .cfi_push %r12
  99. push %r13
  100. .cfi_push %r13
  101. push %r14
  102. .cfi_push %r14
  103. push %r15
  104. .cfi_push %r15
  105. subq \$128+24, %rsp
  106. .cfi_adjust_cfa_offset 128+24
  107. .Lsqr_body:
  108. movq $mod, %xmm1 # common off-load
  109. movq ($inp), %rdx
  110. movq 8($inp), %rax
  111. movq $n0, 128(%rsp)
  112. ___
  113. $code.=<<___ if ($addx);
  114. movl \$0x80100,%r11d
  115. andl OPENSSL_ia32cap_P+8(%rip),%r11d
  116. cmpl \$0x80100,%r11d # check for MULX and ADO/CX
  117. je .Loop_sqrx
  118. ___
  119. $code.=<<___;
  120. jmp .Loop_sqr
  121. .align 32
  122. .Loop_sqr:
  123. movl $times,128+8(%rsp)
  124. #first iteration
  125. movq %rdx, %rbx # 0($inp)
  126. mov %rax, %rbp # 8($inp)
  127. mulq %rdx
  128. movq %rax, %r8
  129. movq 16($inp), %rax
  130. movq %rdx, %r9
  131. mulq %rbx
  132. addq %rax, %r9
  133. movq 24($inp), %rax
  134. movq %rdx, %r10
  135. adcq \$0, %r10
  136. mulq %rbx
  137. addq %rax, %r10
  138. movq 32($inp), %rax
  139. movq %rdx, %r11
  140. adcq \$0, %r11
  141. mulq %rbx
  142. addq %rax, %r11
  143. movq 40($inp), %rax
  144. movq %rdx, %r12
  145. adcq \$0, %r12
  146. mulq %rbx
  147. addq %rax, %r12
  148. movq 48($inp), %rax
  149. movq %rdx, %r13
  150. adcq \$0, %r13
  151. mulq %rbx
  152. addq %rax, %r13
  153. movq 56($inp), %rax
  154. movq %rdx, %r14
  155. adcq \$0, %r14
  156. mulq %rbx
  157. addq %rax, %r14
  158. movq %rbx, %rax
  159. adcq \$0, %rdx
  160. xorq %rcx,%rcx # rcx:r8 = r8 << 1
  161. addq %r8, %r8
  162. movq %rdx, %r15
  163. adcq \$0, %rcx
  164. mulq %rax
  165. addq %r8, %rdx
  166. adcq \$0, %rcx
  167. movq %rax, (%rsp)
  168. movq %rdx, 8(%rsp)
  169. #second iteration
  170. movq 16($inp), %rax
  171. mulq %rbp
  172. addq %rax, %r10
  173. movq 24($inp), %rax
  174. movq %rdx, %rbx
  175. adcq \$0, %rbx
  176. mulq %rbp
  177. addq %rax, %r11
  178. movq 32($inp), %rax
  179. adcq \$0, %rdx
  180. addq %rbx, %r11
  181. movq %rdx, %rbx
  182. adcq \$0, %rbx
  183. mulq %rbp
  184. addq %rax, %r12
  185. movq 40($inp), %rax
  186. adcq \$0, %rdx
  187. addq %rbx, %r12
  188. movq %rdx, %rbx
  189. adcq \$0, %rbx
  190. mulq %rbp
  191. addq %rax, %r13
  192. movq 48($inp), %rax
  193. adcq \$0, %rdx
  194. addq %rbx, %r13
  195. movq %rdx, %rbx
  196. adcq \$0, %rbx
  197. mulq %rbp
  198. addq %rax, %r14
  199. movq 56($inp), %rax
  200. adcq \$0, %rdx
  201. addq %rbx, %r14
  202. movq %rdx, %rbx
  203. adcq \$0, %rbx
  204. mulq %rbp
  205. addq %rax, %r15
  206. movq %rbp, %rax
  207. adcq \$0, %rdx
  208. addq %rbx, %r15
  209. adcq \$0, %rdx
  210. xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1
  211. addq %r9, %r9
  212. movq %rdx, %r8
  213. adcq %r10, %r10
  214. adcq \$0, %rbx
  215. mulq %rax
  216. # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
  217. addq %rcx, %rax
  218. movq 16($inp), %rbp
  219. addq %rax, %r9
  220. movq 24($inp), %rax
  221. adcq %rdx, %r10
  222. adcq \$0, %rbx
  223. movq %r9, 16(%rsp)
  224. movq %r10, 24(%rsp)
  225. #third iteration
  226. mulq %rbp
  227. addq %rax, %r12
  228. movq 32($inp), %rax
  229. movq %rdx, %rcx
  230. adcq \$0, %rcx
  231. mulq %rbp
  232. addq %rax, %r13
  233. movq 40($inp), %rax
  234. adcq \$0, %rdx
  235. addq %rcx, %r13
  236. movq %rdx, %rcx
  237. adcq \$0, %rcx
  238. mulq %rbp
  239. addq %rax, %r14
  240. movq 48($inp), %rax
  241. adcq \$0, %rdx
  242. addq %rcx, %r14
  243. movq %rdx, %rcx
  244. adcq \$0, %rcx
  245. mulq %rbp
  246. addq %rax, %r15
  247. movq 56($inp), %rax
  248. adcq \$0, %rdx
  249. addq %rcx, %r15
  250. movq %rdx, %rcx
  251. adcq \$0, %rcx
  252. mulq %rbp
  253. addq %rax, %r8
  254. movq %rbp, %rax
  255. adcq \$0, %rdx
  256. addq %rcx, %r8
  257. adcq \$0, %rdx
  258. xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1
  259. addq %r11, %r11
  260. movq %rdx, %r9
  261. adcq %r12, %r12
  262. adcq \$0, %rcx
  263. mulq %rax
  264. # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
  265. addq %rbx, %rax
  266. movq 24($inp), %r10
  267. addq %rax, %r11
  268. movq 32($inp), %rax
  269. adcq %rdx, %r12
  270. adcq \$0, %rcx
  271. movq %r11, 32(%rsp)
  272. movq %r12, 40(%rsp)
  273. #fourth iteration
  274. mov %rax, %r11 # 32($inp)
  275. mulq %r10
  276. addq %rax, %r14
  277. movq 40($inp), %rax
  278. movq %rdx, %rbx
  279. adcq \$0, %rbx
  280. mov %rax, %r12 # 40($inp)
  281. mulq %r10
  282. addq %rax, %r15
  283. movq 48($inp), %rax
  284. adcq \$0, %rdx
  285. addq %rbx, %r15
  286. movq %rdx, %rbx
  287. adcq \$0, %rbx
  288. mov %rax, %rbp # 48($inp)
  289. mulq %r10
  290. addq %rax, %r8
  291. movq 56($inp), %rax
  292. adcq \$0, %rdx
  293. addq %rbx, %r8
  294. movq %rdx, %rbx
  295. adcq \$0, %rbx
  296. mulq %r10
  297. addq %rax, %r9
  298. movq %r10, %rax
  299. adcq \$0, %rdx
  300. addq %rbx, %r9
  301. adcq \$0, %rdx
  302. xorq %rbx, %rbx # rbx:r13:r14 = r13:r14 << 1
  303. addq %r13, %r13
  304. movq %rdx, %r10
  305. adcq %r14, %r14
  306. adcq \$0, %rbx
  307. mulq %rax
  308. # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
  309. addq %rcx, %rax
  310. addq %rax, %r13
  311. movq %r12, %rax # 40($inp)
  312. adcq %rdx, %r14
  313. adcq \$0, %rbx
  314. movq %r13, 48(%rsp)
  315. movq %r14, 56(%rsp)
  316. #fifth iteration
  317. mulq %r11
  318. addq %rax, %r8
  319. movq %rbp, %rax # 48($inp)
  320. movq %rdx, %rcx
  321. adcq \$0, %rcx
  322. mulq %r11
  323. addq %rax, %r9
  324. movq 56($inp), %rax
  325. adcq \$0, %rdx
  326. addq %rcx, %r9
  327. movq %rdx, %rcx
  328. adcq \$0, %rcx
  329. mov %rax, %r14 # 56($inp)
  330. mulq %r11
  331. addq %rax, %r10
  332. movq %r11, %rax
  333. adcq \$0, %rdx
  334. addq %rcx, %r10
  335. adcq \$0, %rdx
  336. xorq %rcx, %rcx # rcx:r8:r15 = r8:r15 << 1
  337. addq %r15, %r15
  338. movq %rdx, %r11
  339. adcq %r8, %r8
  340. adcq \$0, %rcx
  341. mulq %rax
  342. # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
  343. addq %rbx, %rax
  344. addq %rax, %r15
  345. movq %rbp, %rax # 48($inp)
  346. adcq %rdx, %r8
  347. adcq \$0, %rcx
  348. movq %r15, 64(%rsp)
  349. movq %r8, 72(%rsp)
  350. #sixth iteration
  351. mulq %r12
  352. addq %rax, %r10
  353. movq %r14, %rax # 56($inp)
  354. movq %rdx, %rbx
  355. adcq \$0, %rbx
  356. mulq %r12
  357. addq %rax, %r11
  358. movq %r12, %rax
  359. adcq \$0, %rdx
  360. addq %rbx, %r11
  361. adcq \$0, %rdx
  362. xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1
  363. addq %r9, %r9
  364. movq %rdx, %r12
  365. adcq %r10, %r10
  366. adcq \$0, %rbx
  367. mulq %rax
  368. # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
  369. addq %rcx, %rax
  370. addq %rax, %r9
  371. movq %r14, %rax # 56($inp)
  372. adcq %rdx, %r10
  373. adcq \$0, %rbx
  374. movq %r9, 80(%rsp)
  375. movq %r10, 88(%rsp)
  376. #seventh iteration
  377. mulq %rbp
  378. addq %rax, %r12
  379. movq %rbp, %rax
  380. adcq \$0, %rdx
  381. xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1
  382. addq %r11, %r11
  383. movq %rdx, %r13
  384. adcq %r12, %r12
  385. adcq \$0, %rcx
  386. mulq %rax
  387. # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
  388. addq %rbx, %rax
  389. addq %rax, %r11
  390. movq %r14, %rax # 56($inp)
  391. adcq %rdx, %r12
  392. adcq \$0, %rcx
  393. movq %r11, 96(%rsp)
  394. movq %r12, 104(%rsp)
  395. #eighth iteration
  396. xorq %rbx, %rbx # rbx:r13 = r13 << 1
  397. addq %r13, %r13
  398. adcq \$0, %rbx
  399. mulq %rax
  400. # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
  401. addq %rcx, %rax
  402. addq %r13, %rax
  403. adcq %rbx, %rdx
  404. movq (%rsp), %r8
  405. movq 8(%rsp), %r9
  406. movq 16(%rsp), %r10
  407. movq 24(%rsp), %r11
  408. movq 32(%rsp), %r12
  409. movq 40(%rsp), %r13
  410. movq 48(%rsp), %r14
  411. movq 56(%rsp), %r15
  412. movq %xmm1, %rbp
  413. movq %rax, 112(%rsp)
  414. movq %rdx, 120(%rsp)
  415. call __rsaz_512_reduce
  416. addq 64(%rsp), %r8
  417. adcq 72(%rsp), %r9
  418. adcq 80(%rsp), %r10
  419. adcq 88(%rsp), %r11
  420. adcq 96(%rsp), %r12
  421. adcq 104(%rsp), %r13
  422. adcq 112(%rsp), %r14
  423. adcq 120(%rsp), %r15
  424. sbbq %rcx, %rcx
  425. call __rsaz_512_subtract
  426. movq %r8, %rdx
  427. movq %r9, %rax
  428. movl 128+8(%rsp), $times
  429. movq $out, $inp
  430. decl $times
  431. jnz .Loop_sqr
  432. ___
  433. if ($addx) {
  434. $code.=<<___;
  435. jmp .Lsqr_tail
  436. .align 32
  437. .Loop_sqrx:
  438. movl $times,128+8(%rsp)
  439. movq $out, %xmm0 # off-load
  440. #first iteration
  441. mulx %rax, %r8, %r9
  442. mov %rax, %rbx
  443. mulx 16($inp), %rcx, %r10
  444. xor %rbp, %rbp # cf=0, of=0
  445. mulx 24($inp), %rax, %r11
  446. adcx %rcx, %r9
  447. .byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($inp), %rcx, %r12
  448. adcx %rax, %r10
  449. .byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00 # mulx 40($inp), %rax, %r13
  450. adcx %rcx, %r11
  451. mulx 48($inp), %rcx, %r14
  452. adcx %rax, %r12
  453. adcx %rcx, %r13
  454. mulx 56($inp), %rax, %r15
  455. adcx %rax, %r14
  456. adcx %rbp, %r15 # %rbp is 0
  457. mulx %rdx, %rax, $out
  458. mov %rbx, %rdx # 8($inp)
  459. xor %rcx, %rcx
  460. adox %r8, %r8
  461. adcx $out, %r8
  462. adox %rbp, %rcx
  463. adcx %rbp, %rcx
  464. mov %rax, (%rsp)
  465. mov %r8, 8(%rsp)
  466. #second iteration
  467. .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00 # mulx 16($inp), %rax, %rbx
  468. adox %rax, %r10
  469. adcx %rbx, %r11
  470. mulx 24($inp), $out, %r8
  471. adox $out, %r11
  472. .byte 0x66
  473. adcx %r8, %r12
  474. mulx 32($inp), %rax, %rbx
  475. adox %rax, %r12
  476. adcx %rbx, %r13
  477. mulx 40($inp), $out, %r8
  478. adox $out, %r13
  479. adcx %r8, %r14
  480. .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
  481. adox %rax, %r14
  482. adcx %rbx, %r15
  483. .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
  484. adox $out, %r15
  485. adcx %rbp, %r8
  486. mulx %rdx, %rax, $out
  487. adox %rbp, %r8
  488. .byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00 # mov 16($inp), %rdx
  489. xor %rbx, %rbx
  490. adox %r9, %r9
  491. # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
  492. adcx %rcx, %rax
  493. adox %r10, %r10
  494. adcx %rax, %r9
  495. adox %rbp, %rbx
  496. adcx $out, %r10
  497. adcx %rbp, %rbx
  498. mov %r9, 16(%rsp)
  499. .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
  500. #third iteration
  501. mulx 24($inp), $out, %r9
  502. adox $out, %r12
  503. adcx %r9, %r13
  504. mulx 32($inp), %rax, %rcx
  505. adox %rax, %r13
  506. adcx %rcx, %r14
  507. .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r9
  508. adox $out, %r14
  509. adcx %r9, %r15
  510. .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
  511. adox %rax, %r15
  512. adcx %rcx, %r8
  513. mulx 56($inp), $out, %r9
  514. adox $out, %r8
  515. adcx %rbp, %r9
  516. mulx %rdx, %rax, $out
  517. adox %rbp, %r9
  518. mov 24($inp), %rdx
  519. xor %rcx, %rcx
  520. adox %r11, %r11
  521. # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
  522. adcx %rbx, %rax
  523. adox %r12, %r12
  524. adcx %rax, %r11
  525. adox %rbp, %rcx
  526. adcx $out, %r12
  527. adcx %rbp, %rcx
  528. mov %r11, 32(%rsp)
  529. mov %r12, 40(%rsp)
  530. #fourth iteration
  531. mulx 32($inp), %rax, %rbx
  532. adox %rax, %r14
  533. adcx %rbx, %r15
  534. mulx 40($inp), $out, %r10
  535. adox $out, %r15
  536. adcx %r10, %r8
  537. mulx 48($inp), %rax, %rbx
  538. adox %rax, %r8
  539. adcx %rbx, %r9
  540. mulx 56($inp), $out, %r10
  541. adox $out, %r9
  542. adcx %rbp, %r10
  543. mulx %rdx, %rax, $out
  544. adox %rbp, %r10
  545. mov 32($inp), %rdx
  546. xor %rbx, %rbx
  547. adox %r13, %r13
  548. # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
  549. adcx %rcx, %rax
  550. adox %r14, %r14
  551. adcx %rax, %r13
  552. adox %rbp, %rbx
  553. adcx $out, %r14
  554. adcx %rbp, %rbx
  555. mov %r13, 48(%rsp)
  556. mov %r14, 56(%rsp)
  557. #fifth iteration
  558. mulx 40($inp), $out, %r11
  559. adox $out, %r8
  560. adcx %r11, %r9
  561. mulx 48($inp), %rax, %rcx
  562. adox %rax, %r9
  563. adcx %rcx, %r10
  564. mulx 56($inp), $out, %r11
  565. adox $out, %r10
  566. adcx %rbp, %r11
  567. mulx %rdx, %rax, $out
  568. mov 40($inp), %rdx
  569. adox %rbp, %r11
  570. xor %rcx, %rcx
  571. adox %r15, %r15
  572. # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
  573. adcx %rbx, %rax
  574. adox %r8, %r8
  575. adcx %rax, %r15
  576. adox %rbp, %rcx
  577. adcx $out, %r8
  578. adcx %rbp, %rcx
  579. mov %r15, 64(%rsp)
  580. mov %r8, 72(%rsp)
  581. #sixth iteration
  582. .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
  583. adox %rax, %r10
  584. adcx %rbx, %r11
  585. .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
  586. adox $out, %r11
  587. adcx %rbp, %r12
  588. mulx %rdx, %rax, $out
  589. adox %rbp, %r12
  590. mov 48($inp), %rdx
  591. xor %rbx, %rbx
  592. adox %r9, %r9
  593. # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
  594. adcx %rcx, %rax
  595. adox %r10, %r10
  596. adcx %rax, %r9
  597. adcx $out, %r10
  598. adox %rbp, %rbx
  599. adcx %rbp, %rbx
  600. mov %r9, 80(%rsp)
  601. mov %r10, 88(%rsp)
  602. #seventh iteration
  603. .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
  604. adox %rax, %r12
  605. adox %rbp, %r13
  606. mulx %rdx, %rax, $out
  607. xor %rcx, %rcx
  608. mov 56($inp), %rdx
  609. adox %r11, %r11
  610. # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
  611. adcx %rbx, %rax
  612. adox %r12, %r12
  613. adcx %rax, %r11
  614. adox %rbp, %rcx
  615. adcx $out, %r12
  616. adcx %rbp, %rcx
  617. .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
  618. .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
  619. #eighth iteration
  620. mulx %rdx, %rax, %rdx
  621. xor %rbx, %rbx
  622. adox %r13, %r13
  623. # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
  624. adcx %rcx, %rax
  625. adox %rbp, %rbx
  626. adcx %r13, %rax
  627. adcx %rdx, %rbx
  628. movq %xmm0, $out
  629. movq %xmm1, %rbp
  630. movq 128(%rsp), %rdx # pull $n0
  631. movq (%rsp), %r8
  632. movq 8(%rsp), %r9
  633. movq 16(%rsp), %r10
  634. movq 24(%rsp), %r11
  635. movq 32(%rsp), %r12
  636. movq 40(%rsp), %r13
  637. movq 48(%rsp), %r14
  638. movq 56(%rsp), %r15
  639. movq %rax, 112(%rsp)
  640. movq %rbx, 120(%rsp)
  641. call __rsaz_512_reducex
  642. addq 64(%rsp), %r8
  643. adcq 72(%rsp), %r9
  644. adcq 80(%rsp), %r10
  645. adcq 88(%rsp), %r11
  646. adcq 96(%rsp), %r12
  647. adcq 104(%rsp), %r13
  648. adcq 112(%rsp), %r14
  649. adcq 120(%rsp), %r15
  650. sbbq %rcx, %rcx
  651. call __rsaz_512_subtract
  652. movq %r8, %rdx
  653. movq %r9, %rax
  654. movl 128+8(%rsp), $times
  655. movq $out, $inp
  656. decl $times
  657. jnz .Loop_sqrx
  658. .Lsqr_tail:
  659. ___
  660. }
  661. $code.=<<___;
  662. leaq 128+24+48(%rsp), %rax
  663. .cfi_def_cfa %rax,8
  664. movq -48(%rax), %r15
  665. .cfi_restore %r15
  666. movq -40(%rax), %r14
  667. .cfi_restore %r14
  668. movq -32(%rax), %r13
  669. .cfi_restore %r13
  670. movq -24(%rax), %r12
  671. .cfi_restore %r12
  672. movq -16(%rax), %rbp
  673. .cfi_restore %rbp
  674. movq -8(%rax), %rbx
  675. .cfi_restore %rbx
  676. leaq (%rax), %rsp
  677. .cfi_def_cfa_register %rsp
  678. .Lsqr_epilogue:
  679. ret
  680. .cfi_endproc
  681. .size rsaz_512_sqr,.-rsaz_512_sqr
  682. ___
  683. }
  684. {
  685. my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
  686. $code.=<<___;
  687. .globl rsaz_512_mul
  688. .type rsaz_512_mul,\@function,5
  689. .align 32
  690. rsaz_512_mul:
  691. .cfi_startproc
  692. push %rbx
  693. .cfi_push %rbx
  694. push %rbp
  695. .cfi_push %rbp
  696. push %r12
  697. .cfi_push %r12
  698. push %r13
  699. .cfi_push %r13
  700. push %r14
  701. .cfi_push %r14
  702. push %r15
  703. .cfi_push %r15
  704. subq \$128+24, %rsp
  705. .cfi_adjust_cfa_offset 128+24
  706. .Lmul_body:
  707. movq $out, %xmm0 # off-load arguments
  708. movq $mod, %xmm1
  709. movq $n0, 128(%rsp)
  710. ___
  711. $code.=<<___ if ($addx);
  712. movl \$0x80100,%r11d
  713. andl OPENSSL_ia32cap_P+8(%rip),%r11d
  714. cmpl \$0x80100,%r11d # check for MULX and ADO/CX
  715. je .Lmulx
  716. ___
  717. $code.=<<___;
  718. movq ($bp), %rbx # pass b[0]
  719. movq $bp, %rbp # pass argument
  720. call __rsaz_512_mul
  721. movq %xmm0, $out
  722. movq %xmm1, %rbp
  723. movq (%rsp), %r8
  724. movq 8(%rsp), %r9
  725. movq 16(%rsp), %r10
  726. movq 24(%rsp), %r11
  727. movq 32(%rsp), %r12
  728. movq 40(%rsp), %r13
  729. movq 48(%rsp), %r14
  730. movq 56(%rsp), %r15
  731. call __rsaz_512_reduce
  732. ___
  733. $code.=<<___ if ($addx);
  734. jmp .Lmul_tail
  735. .align 32
  736. .Lmulx:
  737. movq $bp, %rbp # pass argument
  738. movq ($bp), %rdx # pass b[0]
  739. call __rsaz_512_mulx
  740. movq %xmm0, $out
  741. movq %xmm1, %rbp
  742. movq 128(%rsp), %rdx # pull $n0
  743. movq (%rsp), %r8
  744. movq 8(%rsp), %r9
  745. movq 16(%rsp), %r10
  746. movq 24(%rsp), %r11
  747. movq 32(%rsp), %r12
  748. movq 40(%rsp), %r13
  749. movq 48(%rsp), %r14
  750. movq 56(%rsp), %r15
  751. call __rsaz_512_reducex
  752. .Lmul_tail:
  753. ___
  754. $code.=<<___;
  755. addq 64(%rsp), %r8
  756. adcq 72(%rsp), %r9
  757. adcq 80(%rsp), %r10
  758. adcq 88(%rsp), %r11
  759. adcq 96(%rsp), %r12
  760. adcq 104(%rsp), %r13
  761. adcq 112(%rsp), %r14
  762. adcq 120(%rsp), %r15
  763. sbbq %rcx, %rcx
  764. call __rsaz_512_subtract
  765. leaq 128+24+48(%rsp), %rax
  766. .cfi_def_cfa %rax,8
  767. movq -48(%rax), %r15
  768. .cfi_restore %r15
  769. movq -40(%rax), %r14
  770. .cfi_restore %r14
  771. movq -32(%rax), %r13
  772. .cfi_restore %r13
  773. movq -24(%rax), %r12
  774. .cfi_restore %r12
  775. movq -16(%rax), %rbp
  776. .cfi_restore %rbp
  777. movq -8(%rax), %rbx
  778. .cfi_restore %rbx
  779. leaq (%rax), %rsp
  780. .cfi_def_cfa_register %rsp
  781. .Lmul_epilogue:
  782. ret
  783. .cfi_endproc
  784. .size rsaz_512_mul,.-rsaz_512_mul
  785. ___
  786. }
  787. {
  788. my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
  789. $code.=<<___;
  790. .globl rsaz_512_mul_gather4
  791. .type rsaz_512_mul_gather4,\@function,6
  792. .align 32
  793. rsaz_512_mul_gather4:
  794. .cfi_startproc
  795. push %rbx
  796. .cfi_push %rbx
  797. push %rbp
  798. .cfi_push %rbp
  799. push %r12
  800. .cfi_push %r12
  801. push %r13
  802. .cfi_push %r13
  803. push %r14
  804. .cfi_push %r14
  805. push %r15
  806. .cfi_push %r15
  807. subq \$`128+24+($win64?0xb0:0)`, %rsp
  808. .cfi_adjust_cfa_offset `128+24+($win64?0xb0:0)`
  809. ___
  810. $code.=<<___ if ($win64);
  811. movaps %xmm6,0xa0(%rsp)
  812. movaps %xmm7,0xb0(%rsp)
  813. movaps %xmm8,0xc0(%rsp)
  814. movaps %xmm9,0xd0(%rsp)
  815. movaps %xmm10,0xe0(%rsp)
  816. movaps %xmm11,0xf0(%rsp)
  817. movaps %xmm12,0x100(%rsp)
  818. movaps %xmm13,0x110(%rsp)
  819. movaps %xmm14,0x120(%rsp)
  820. movaps %xmm15,0x130(%rsp)
  821. ___
  822. $code.=<<___;
  823. .Lmul_gather4_body:
  824. movd $pwr,%xmm8
  825. movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
  826. movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
  827. pshufd \$0,%xmm8,%xmm8 # broadcast $power
  828. movdqa %xmm1,%xmm7
  829. movdqa %xmm1,%xmm2
  830. ___
  831. ########################################################################
  832. # calculate mask by comparing 0..15 to $power
  833. #
  834. for($i=0;$i<4;$i++) {
  835. $code.=<<___;
  836. paddd %xmm`$i`,%xmm`$i+1`
  837. pcmpeqd %xmm8,%xmm`$i`
  838. movdqa %xmm7,%xmm`$i+3`
  839. ___
  840. }
  841. for(;$i<7;$i++) {
  842. $code.=<<___;
  843. paddd %xmm`$i`,%xmm`$i+1`
  844. pcmpeqd %xmm8,%xmm`$i`
  845. ___
  846. }
  847. $code.=<<___;
  848. pcmpeqd %xmm8,%xmm7
  849. movdqa 16*0($bp),%xmm8
  850. movdqa 16*1($bp),%xmm9
  851. movdqa 16*2($bp),%xmm10
  852. movdqa 16*3($bp),%xmm11
  853. pand %xmm0,%xmm8
  854. movdqa 16*4($bp),%xmm12
  855. pand %xmm1,%xmm9
  856. movdqa 16*5($bp),%xmm13
  857. pand %xmm2,%xmm10
  858. movdqa 16*6($bp),%xmm14
  859. pand %xmm3,%xmm11
  860. movdqa 16*7($bp),%xmm15
  861. leaq 128($bp), %rbp
  862. pand %xmm4,%xmm12
  863. pand %xmm5,%xmm13
  864. pand %xmm6,%xmm14
  865. pand %xmm7,%xmm15
  866. por %xmm10,%xmm8
  867. por %xmm11,%xmm9
  868. por %xmm12,%xmm8
  869. por %xmm13,%xmm9
  870. por %xmm14,%xmm8
  871. por %xmm15,%xmm9
  872. por %xmm9,%xmm8
  873. pshufd \$0x4e,%xmm8,%xmm9
  874. por %xmm9,%xmm8
  875. ___
  876. $code.=<<___ if ($addx);
  877. movl \$0x80100,%r11d
  878. andl OPENSSL_ia32cap_P+8(%rip),%r11d
  879. cmpl \$0x80100,%r11d # check for MULX and ADO/CX
  880. je .Lmulx_gather
  881. ___
  882. $code.=<<___;
  883. movq %xmm8,%rbx
  884. movq $n0, 128(%rsp) # off-load arguments
  885. movq $out, 128+8(%rsp)
  886. movq $mod, 128+16(%rsp)
  887. movq ($ap), %rax
  888. movq 8($ap), %rcx
  889. mulq %rbx # 0 iteration
  890. movq %rax, (%rsp)
  891. movq %rcx, %rax
  892. movq %rdx, %r8
  893. mulq %rbx
  894. addq %rax, %r8
  895. movq 16($ap), %rax
  896. movq %rdx, %r9
  897. adcq \$0, %r9
  898. mulq %rbx
  899. addq %rax, %r9
  900. movq 24($ap), %rax
  901. movq %rdx, %r10
  902. adcq \$0, %r10
  903. mulq %rbx
  904. addq %rax, %r10
  905. movq 32($ap), %rax
  906. movq %rdx, %r11
  907. adcq \$0, %r11
  908. mulq %rbx
  909. addq %rax, %r11
  910. movq 40($ap), %rax
  911. movq %rdx, %r12
  912. adcq \$0, %r12
  913. mulq %rbx
  914. addq %rax, %r12
  915. movq 48($ap), %rax
  916. movq %rdx, %r13
  917. adcq \$0, %r13
  918. mulq %rbx
  919. addq %rax, %r13
  920. movq 56($ap), %rax
  921. movq %rdx, %r14
  922. adcq \$0, %r14
  923. mulq %rbx
  924. addq %rax, %r14
  925. movq ($ap), %rax
  926. movq %rdx, %r15
  927. adcq \$0, %r15
  928. leaq 8(%rsp), %rdi
  929. movl \$7, %ecx
  930. jmp .Loop_mul_gather
  931. .align 32
  932. .Loop_mul_gather:
  933. movdqa 16*0(%rbp),%xmm8
  934. movdqa 16*1(%rbp),%xmm9
  935. movdqa 16*2(%rbp),%xmm10
  936. movdqa 16*3(%rbp),%xmm11
  937. pand %xmm0,%xmm8
  938. movdqa 16*4(%rbp),%xmm12
  939. pand %xmm1,%xmm9
  940. movdqa 16*5(%rbp),%xmm13
  941. pand %xmm2,%xmm10
  942. movdqa 16*6(%rbp),%xmm14
  943. pand %xmm3,%xmm11
  944. movdqa 16*7(%rbp),%xmm15
  945. leaq 128(%rbp), %rbp
  946. pand %xmm4,%xmm12
  947. pand %xmm5,%xmm13
  948. pand %xmm6,%xmm14
  949. pand %xmm7,%xmm15
  950. por %xmm10,%xmm8
  951. por %xmm11,%xmm9
  952. por %xmm12,%xmm8
  953. por %xmm13,%xmm9
  954. por %xmm14,%xmm8
  955. por %xmm15,%xmm9
  956. por %xmm9,%xmm8
  957. pshufd \$0x4e,%xmm8,%xmm9
  958. por %xmm9,%xmm8
  959. movq %xmm8,%rbx
  960. mulq %rbx
  961. addq %rax, %r8
  962. movq 8($ap), %rax
  963. movq %r8, (%rdi)
  964. movq %rdx, %r8
  965. adcq \$0, %r8
  966. mulq %rbx
  967. addq %rax, %r9
  968. movq 16($ap), %rax
  969. adcq \$0, %rdx
  970. addq %r9, %r8
  971. movq %rdx, %r9
  972. adcq \$0, %r9
  973. mulq %rbx
  974. addq %rax, %r10
  975. movq 24($ap), %rax
  976. adcq \$0, %rdx
  977. addq %r10, %r9
  978. movq %rdx, %r10
  979. adcq \$0, %r10
  980. mulq %rbx
  981. addq %rax, %r11
  982. movq 32($ap), %rax
  983. adcq \$0, %rdx
  984. addq %r11, %r10
  985. movq %rdx, %r11
  986. adcq \$0, %r11
  987. mulq %rbx
  988. addq %rax, %r12
  989. movq 40($ap), %rax
  990. adcq \$0, %rdx
  991. addq %r12, %r11
  992. movq %rdx, %r12
  993. adcq \$0, %r12
  994. mulq %rbx
  995. addq %rax, %r13
  996. movq 48($ap), %rax
  997. adcq \$0, %rdx
  998. addq %r13, %r12
  999. movq %rdx, %r13
  1000. adcq \$0, %r13
  1001. mulq %rbx
  1002. addq %rax, %r14
  1003. movq 56($ap), %rax
  1004. adcq \$0, %rdx
  1005. addq %r14, %r13
  1006. movq %rdx, %r14
  1007. adcq \$0, %r14
  1008. mulq %rbx
  1009. addq %rax, %r15
  1010. movq ($ap), %rax
  1011. adcq \$0, %rdx
  1012. addq %r15, %r14
  1013. movq %rdx, %r15
  1014. adcq \$0, %r15
  1015. leaq 8(%rdi), %rdi
  1016. decl %ecx
  1017. jnz .Loop_mul_gather
  1018. movq %r8, (%rdi)
  1019. movq %r9, 8(%rdi)
  1020. movq %r10, 16(%rdi)
  1021. movq %r11, 24(%rdi)
  1022. movq %r12, 32(%rdi)
  1023. movq %r13, 40(%rdi)
  1024. movq %r14, 48(%rdi)
  1025. movq %r15, 56(%rdi)
  1026. movq 128+8(%rsp), $out
  1027. movq 128+16(%rsp), %rbp
  1028. movq (%rsp), %r8
  1029. movq 8(%rsp), %r9
  1030. movq 16(%rsp), %r10
  1031. movq 24(%rsp), %r11
  1032. movq 32(%rsp), %r12
  1033. movq 40(%rsp), %r13
  1034. movq 48(%rsp), %r14
  1035. movq 56(%rsp), %r15
  1036. call __rsaz_512_reduce
  1037. ___
  1038. $code.=<<___ if ($addx);
  1039. jmp .Lmul_gather_tail
  1040. .align 32
  1041. .Lmulx_gather:
  1042. movq %xmm8,%rdx
  1043. mov $n0, 128(%rsp) # off-load arguments
  1044. mov $out, 128+8(%rsp)
  1045. mov $mod, 128+16(%rsp)
  1046. mulx ($ap), %rbx, %r8 # 0 iteration
  1047. mov %rbx, (%rsp)
  1048. xor %edi, %edi # cf=0, of=0
  1049. mulx 8($ap), %rax, %r9
  1050. mulx 16($ap), %rbx, %r10
  1051. adcx %rax, %r8
  1052. mulx 24($ap), %rax, %r11
  1053. adcx %rbx, %r9
  1054. mulx 32($ap), %rbx, %r12
  1055. adcx %rax, %r10
  1056. mulx 40($ap), %rax, %r13
  1057. adcx %rbx, %r11
  1058. mulx 48($ap), %rbx, %r14
  1059. adcx %rax, %r12
  1060. mulx 56($ap), %rax, %r15
  1061. adcx %rbx, %r13
  1062. adcx %rax, %r14
  1063. .byte 0x67
  1064. mov %r8, %rbx
  1065. adcx %rdi, %r15 # %rdi is 0
  1066. mov \$-7, %rcx
  1067. jmp .Loop_mulx_gather
  1068. .align 32
  1069. .Loop_mulx_gather:
  1070. movdqa 16*0(%rbp),%xmm8
  1071. movdqa 16*1(%rbp),%xmm9
  1072. movdqa 16*2(%rbp),%xmm10
  1073. movdqa 16*3(%rbp),%xmm11
  1074. pand %xmm0,%xmm8
  1075. movdqa 16*4(%rbp),%xmm12
  1076. pand %xmm1,%xmm9
  1077. movdqa 16*5(%rbp),%xmm13
  1078. pand %xmm2,%xmm10
  1079. movdqa 16*6(%rbp),%xmm14
  1080. pand %xmm3,%xmm11
  1081. movdqa 16*7(%rbp),%xmm15
  1082. leaq 128(%rbp), %rbp
  1083. pand %xmm4,%xmm12
  1084. pand %xmm5,%xmm13
  1085. pand %xmm6,%xmm14
  1086. pand %xmm7,%xmm15
  1087. por %xmm10,%xmm8
  1088. por %xmm11,%xmm9
  1089. por %xmm12,%xmm8
  1090. por %xmm13,%xmm9
  1091. por %xmm14,%xmm8
  1092. por %xmm15,%xmm9
  1093. por %xmm9,%xmm8
  1094. pshufd \$0x4e,%xmm8,%xmm9
  1095. por %xmm9,%xmm8
  1096. movq %xmm8,%rdx
  1097. .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8
  1098. adcx %rax, %rbx
  1099. adox %r9, %r8
  1100. mulx 8($ap), %rax, %r9
  1101. adcx %rax, %r8
  1102. adox %r10, %r9
  1103. mulx 16($ap), %rax, %r10
  1104. adcx %rax, %r9
  1105. adox %r11, %r10
  1106. .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
  1107. adcx %rax, %r10
  1108. adox %r12, %r11
  1109. mulx 32($ap), %rax, %r12
  1110. adcx %rax, %r11
  1111. adox %r13, %r12
  1112. mulx 40($ap), %rax, %r13
  1113. adcx %rax, %r12
  1114. adox %r14, %r13
  1115. .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
  1116. adcx %rax, %r13
  1117. .byte 0x67
  1118. adox %r15, %r14
  1119. mulx 56($ap), %rax, %r15
  1120. mov %rbx, 64(%rsp,%rcx,8)
  1121. adcx %rax, %r14
  1122. adox %rdi, %r15
  1123. mov %r8, %rbx
  1124. adcx %rdi, %r15 # cf=0
  1125. inc %rcx # of=0
  1126. jnz .Loop_mulx_gather
  1127. mov %r8, 64(%rsp)
  1128. mov %r9, 64+8(%rsp)
  1129. mov %r10, 64+16(%rsp)
  1130. mov %r11, 64+24(%rsp)
  1131. mov %r12, 64+32(%rsp)
  1132. mov %r13, 64+40(%rsp)
  1133. mov %r14, 64+48(%rsp)
  1134. mov %r15, 64+56(%rsp)
  1135. mov 128(%rsp), %rdx # pull arguments
  1136. mov 128+8(%rsp), $out
  1137. mov 128+16(%rsp), %rbp
  1138. mov (%rsp), %r8
  1139. mov 8(%rsp), %r9
  1140. mov 16(%rsp), %r10
  1141. mov 24(%rsp), %r11
  1142. mov 32(%rsp), %r12
  1143. mov 40(%rsp), %r13
  1144. mov 48(%rsp), %r14
  1145. mov 56(%rsp), %r15
  1146. call __rsaz_512_reducex
  1147. .Lmul_gather_tail:
  1148. ___
  1149. $code.=<<___;
  1150. addq 64(%rsp), %r8
  1151. adcq 72(%rsp), %r9
  1152. adcq 80(%rsp), %r10
  1153. adcq 88(%rsp), %r11
  1154. adcq 96(%rsp), %r12
  1155. adcq 104(%rsp), %r13
  1156. adcq 112(%rsp), %r14
  1157. adcq 120(%rsp), %r15
  1158. sbbq %rcx, %rcx
  1159. call __rsaz_512_subtract
  1160. leaq 128+24+48(%rsp), %rax
  1161. ___
  1162. $code.=<<___ if ($win64);
  1163. movaps 0xa0-0xc8(%rax),%xmm6
  1164. movaps 0xb0-0xc8(%rax),%xmm7
  1165. movaps 0xc0-0xc8(%rax),%xmm8
  1166. movaps 0xd0-0xc8(%rax),%xmm9
  1167. movaps 0xe0-0xc8(%rax),%xmm10
  1168. movaps 0xf0-0xc8(%rax),%xmm11
  1169. movaps 0x100-0xc8(%rax),%xmm12
  1170. movaps 0x110-0xc8(%rax),%xmm13
  1171. movaps 0x120-0xc8(%rax),%xmm14
  1172. movaps 0x130-0xc8(%rax),%xmm15
  1173. lea 0xb0(%rax),%rax
  1174. ___
  1175. $code.=<<___;
  1176. .cfi_def_cfa %rax,8
  1177. movq -48(%rax), %r15
  1178. .cfi_restore %r15
  1179. movq -40(%rax), %r14
  1180. .cfi_restore %r14
  1181. movq -32(%rax), %r13
  1182. .cfi_restore %r13
  1183. movq -24(%rax), %r12
  1184. .cfi_restore %r12
  1185. movq -16(%rax), %rbp
  1186. .cfi_restore %rbp
  1187. movq -8(%rax), %rbx
  1188. .cfi_restore %rbx
  1189. leaq (%rax), %rsp
  1190. .cfi_def_cfa_register %rsp
  1191. .Lmul_gather4_epilogue:
  1192. ret
  1193. .cfi_endproc
  1194. .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
  1195. ___
  1196. }
  # rsaz_512_mul_scatter4(out, ap, mod, n0, tbl, pwr)
  #
  # Emits a six-argument routine that multiplies the 8-word value at ap by
  # the 8-word value currently stored at out (out is handed to the multiply
  # helper in %rbp as its second operand), reduces the 16-word product with
  # __rsaz_512_reduce or __rsaz_512_reducex, applies the masked modulus
  # subtraction of __rsaz_512_subtract (which also writes the result to out)
  # and finally scatters the eight result words into the table: word i is
  # stored at tbl + 8*pwr + 128*i.
  #
  # Stack frame (128+24 bytes below the six saved registers):
  #   0..127(%rsp)  16-word product written by the multiply helper
  #   128(%rsp)     n0, read back by the reduction step
  #
  # NOTE(review): $inp, used in the tail, is not declared in this scope; it
  # resolves to a variable defined outside the visible part of the file.
  # Presumably it names %rsi, which __rsaz_512_subtract documents as
  # preserved - confirm.
  1197. {
  1198. my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
  # Prologue: save the callee-saved GPRs, zero-extend pwr (a 32-bit mov of the
  # register onto itself), reserve the frame, fold the power into the table
  # pointer, then park the arguments where they survive the helper calls:
  # out in xmm0, mod in xmm1, the adjusted table pointer in xmm2 and n0 at
  # 128(%rsp). %rbp is pointed at out for the multiply helper.
  1199. $code.=<<___;
  1200. .globl rsaz_512_mul_scatter4
  1201. .type rsaz_512_mul_scatter4,\@function,6
  1202. .align 32
  1203. rsaz_512_mul_scatter4:
  1204. .cfi_startproc
  1205. push %rbx
  1206. .cfi_push %rbx
  1207. push %rbp
  1208. .cfi_push %rbp
  1209. push %r12
  1210. .cfi_push %r12
  1211. push %r13
  1212. .cfi_push %r13
  1213. push %r14
  1214. .cfi_push %r14
  1215. push %r15
  1216. .cfi_push %r15
  1217. mov $pwr, $pwr
  1218. subq \$128+24, %rsp
  1219. .cfi_adjust_cfa_offset 128+24
  1220. .Lmul_scatter4_body:
  1221. leaq ($tbl,$pwr,8), $tbl
  1222. movq $out, %xmm0 # off-load arguments
  1223. movq $mod, %xmm1
  1224. movq $tbl, %xmm2
  1225. movq $n0, 128(%rsp)
  1226. movq $out, %rbp
  1227. ___
  # Emitted only when $addx is set: run-time dispatch on the capability word
  # at OPENSSL_ia32cap_P+8; when both bits of 0x80100 are present control
  # goes to the MULX/ADCX/ADOX path at .Lmulx_scatter.
  1228. $code.=<<___ if ($addx);
  1229. movl \$0x80100,%r11d
  1230. andl OPENSSL_ia32cap_P+8(%rip),%r11d
  1231. cmpl \$0x80100,%r11d # check for MULX and ADO/CX
  1232. je .Lmulx_scatter
  1233. ___
  # Legacy path: the first multiplier word is passed in %rbx, the product is
  # left on the stack by __rsaz_512_mul, then out and mod are recovered from
  # xmm0/xmm1 and the low eight product words are loaded into %r8..%r15 for
  # __rsaz_512_reduce.
  1234. $code.=<<___;
  1235. movq ($out),%rbx # pass b[0]
  1236. call __rsaz_512_mul
  1237. movq %xmm0, $out
  1238. movq %xmm1, %rbp
  1239. movq (%rsp), %r8
  1240. movq 8(%rsp), %r9
  1241. movq 16(%rsp), %r10
  1242. movq 24(%rsp), %r11
  1243. movq 32(%rsp), %r12
  1244. movq 40(%rsp), %r13
  1245. movq 48(%rsp), %r14
  1246. movq 56(%rsp), %r15
  1247. call __rsaz_512_reduce
  1248. ___
  # MULX path (only when $addx is set). Same sequence as above, except that
  # the first multiplier word is passed in %rdx and n0 is loaded into %rdx
  # before the reduction call, because __rsaz_512_reducex does not fetch n0
  # itself at entry. The jmp at the top lets the legacy path skip this code.
  1249. $code.=<<___ if ($addx);
  1250. jmp .Lmul_scatter_tail
  1251. .align 32
  1252. .Lmulx_scatter:
  1253. movq ($out), %rdx # pass b[0]
  1254. call __rsaz_512_mulx
  1255. movq %xmm0, $out
  1256. movq %xmm1, %rbp
  1257. movq 128(%rsp), %rdx # pull $n0
  1258. movq (%rsp), %r8
  1259. movq 8(%rsp), %r9
  1260. movq 16(%rsp), %r10
  1261. movq 24(%rsp), %r11
  1262. movq 32(%rsp), %r12
  1263. movq 40(%rsp), %r13
  1264. movq 48(%rsp), %r14
  1265. movq 56(%rsp), %r15
  1266. call __rsaz_512_reducex
  1267. .Lmul_scatter_tail:
  1268. ___
  # Common tail: add the upper eight product words (64..127(%rsp)) to the
  # reduced value with a carry chain, turn the final carry into an all-ones
  # or all-zero mask in %rcx (sbbq), and let __rsaz_512_subtract apply the
  # masked subtraction. The table pointer is recovered from xmm2 before the
  # call and the eight result words are then stored 128 bytes apart.
  # Epilogue: %rax is set to the address just above the six saved registers,
  # they are reloaded from negative offsets and %rsp is restored from %rax.
  1269. $code.=<<___;
  1270. addq 64(%rsp), %r8
  1271. adcq 72(%rsp), %r9
  1272. adcq 80(%rsp), %r10
  1273. adcq 88(%rsp), %r11
  1274. adcq 96(%rsp), %r12
  1275. adcq 104(%rsp), %r13
  1276. adcq 112(%rsp), %r14
  1277. adcq 120(%rsp), %r15
  1278. movq %xmm2, $inp
  1279. sbbq %rcx, %rcx
  1280. call __rsaz_512_subtract
  1281. movq %r8, 128*0($inp) # scatter
  1282. movq %r9, 128*1($inp)
  1283. movq %r10, 128*2($inp)
  1284. movq %r11, 128*3($inp)
  1285. movq %r12, 128*4($inp)
  1286. movq %r13, 128*5($inp)
  1287. movq %r14, 128*6($inp)
  1288. movq %r15, 128*7($inp)
  1289. leaq 128+24+48(%rsp), %rax
  1290. .cfi_def_cfa %rax,8
  1291. movq -48(%rax), %r15
  1292. .cfi_restore %r15
  1293. movq -40(%rax), %r14
  1294. .cfi_restore %r14
  1295. movq -32(%rax), %r13
  1296. .cfi_restore %r13
  1297. movq -24(%rax), %r12
  1298. .cfi_restore %r12
  1299. movq -16(%rax), %rbp
  1300. .cfi_restore %rbp
  1301. movq -8(%rax), %rbx
  1302. .cfi_restore %rbx
  1303. leaq (%rax), %rsp
  1304. .cfi_def_cfa_register %rsp
  1305. .Lmul_scatter4_epilogue:
  1306. ret
  1307. .cfi_endproc
  1308. .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
  1309. ___
  1310. }
  # rsaz_512_mul_by_one(out, inp, mod, n0)
  #
  # Emits a four-argument routine that loads the eight 64-bit words at inp
  # into %r8..%r15, runs them through __rsaz_512_reduce (or
  # __rsaz_512_reducex on the MULX path) with mod in %rbp and n0 in the
  # 128(%rsp) slot, and stores the eight resulting words to out. No call to
  # __rsaz_512_subtract is made here, so the reduction output is stored
  # as is.
  1311. {
  1312. my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
  # Prologue: save the callee-saved GPRs and reserve the same 128+24 byte
  # frame the other entry points use.
  1313. $code.=<<___;
  1314. .globl rsaz_512_mul_by_one
  1315. .type rsaz_512_mul_by_one,\@function,4
  1316. .align 32
  1317. rsaz_512_mul_by_one:
  1318. .cfi_startproc
  1319. push %rbx
  1320. .cfi_push %rbx
  1321. push %rbp
  1322. .cfi_push %rbp
  1323. push %r12
  1324. .cfi_push %r12
  1325. push %r13
  1326. .cfi_push %r13
  1327. push %r14
  1328. .cfi_push %r14
  1329. push %r15
  1330. .cfi_push %r15
  1331. subq \$128+24, %rsp
  1332. .cfi_adjust_cfa_offset 128+24
  1333. .Lmul_by_one_body:
  1334. ___
  # With $addx set, the capability word is fetched early into %eax; it is
  # tested further down, after the argument loads.
  1335. $code.=<<___ if ($addx);
  1336. movl OPENSSL_ia32cap_P+8(%rip),%eax
  1337. ___
  # Move mod to %rbp and n0 to 128(%rsp) as the reduction helpers expect,
  # load the input words, and clear bytes 0..111 of the frame with seven
  # 16-byte stores of a zeroed xmm0.
  # NOTE(review): the reduction helpers visible in this file read only the
  # n0 slot from the stack, so the reason for clearing the frame is not
  # evident from this chunk - confirm before relying on it.
  1338. $code.=<<___;
  1339. movq $mod, %rbp # reassign argument
  1340. movq $n0, 128(%rsp)
  1341. movq ($inp), %r8
  1342. pxor %xmm0, %xmm0
  1343. movq 8($inp), %r9
  1344. movq 16($inp), %r10
  1345. movq 24($inp), %r11
  1346. movq 32($inp), %r12
  1347. movq 40($inp), %r13
  1348. movq 48($inp), %r14
  1349. movq 56($inp), %r15
  1350. movdqa %xmm0, (%rsp)
  1351. movdqa %xmm0, 16(%rsp)
  1352. movdqa %xmm0, 32(%rsp)
  1353. movdqa %xmm0, 48(%rsp)
  1354. movdqa %xmm0, 64(%rsp)
  1355. movdqa %xmm0, 80(%rsp)
  1356. movdqa %xmm0, 96(%rsp)
  1357. ___
  # Run-time dispatch (only when $addx is set): both bits of 0x80100 must be
  # present in the capability word to take the MULX path.
  1358. $code.=<<___ if ($addx);
  1359. andl \$0x80100,%eax
  1360. cmpl \$0x80100,%eax # check for MULX and ADO/CX
  1361. je .Lby_one_callx
  1362. ___
  # Legacy reduction.
  1363. $code.=<<___;
  1364. call __rsaz_512_reduce
  1365. ___
  # MULX reduction: n0 has to be placed in %rdx by the caller because
  # __rsaz_512_reducex does not load it at entry. The jmp lets the legacy
  # path skip over this code.
  1366. $code.=<<___ if ($addx);
  1367. jmp .Lby_one_tail
  1368. .align 32
  1369. .Lby_one_callx:
  1370. movq 128(%rsp), %rdx # pull $n0
  1371. call __rsaz_512_reducex
  1372. .Lby_one_tail:
  1373. ___
  # Store the eight result words to out (still in %rdi, which the reduction
  # helpers document as preserved), then unwind: %rax is set to the address
  # just above the six saved registers, they are reloaded from negative
  # offsets and %rsp is restored from %rax.
  1374. $code.=<<___;
  1375. movq %r8, ($out)
  1376. movq %r9, 8($out)
  1377. movq %r10, 16($out)
  1378. movq %r11, 24($out)
  1379. movq %r12, 32($out)
  1380. movq %r13, 40($out)
  1381. movq %r14, 48($out)
  1382. movq %r15, 56($out)
  1383. leaq 128+24+48(%rsp), %rax
  1384. .cfi_def_cfa %rax,8
  1385. movq -48(%rax), %r15
  1386. .cfi_restore %r15
  1387. movq -40(%rax), %r14
  1388. .cfi_restore %r14
  1389. movq -32(%rax), %r13
  1390. .cfi_restore %r13
  1391. movq -24(%rax), %r12
  1392. .cfi_restore %r12
  1393. movq -16(%rax), %rbp
  1394. .cfi_restore %rbp
  1395. movq -8(%rax), %rbx
  1396. .cfi_restore %rbx
  1397. leaq (%rax), %rsp
  1398. .cfi_def_cfa_register %rsp
  1399. .Lmul_by_one_epilogue:
  1400. ret
  1401. .cfi_endproc
  1402. .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
  1403. ___
  1404. }
  1405. { # __rsaz_512_reduce
  1406. #
  1407. # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
  1408. # output: %r8-%r15
  1409. # clobbers: everything except %rbp and %rdi
  #
  # How the emitted loop works (eight passes, counted down in %ecx):
  #   * %rbx holds the per-pass multiplier, the low 64 bits of %r8 times n0.
  #     It is computed before the loop for the first pass; inside the loop
  #     the next one is prepared in %rsi (n0 is loaded, then multiplied by
  #     the freshly produced %r8) and moved to %rbx right after the last
  #     mulq of the pass.
  #   * Each pass multiplies mod[0..7] by %rbx and folds the products into
  #     %r8..%r15 shifted down by one word: the new %r8 is built from the
  #     old %r9 and so on, and %r15 ends up with the last high half plus
  #     carry.
  #   * The low half of mod[0]*%rbx is never added to %r8 explicitly. negq
  #     sets CF exactly when %r8 is non-zero, and that carry is added to the
  #     high half instead. NOTE(review): this relies on n0 being chosen so
  #     that the low word of the sum is zero (the usual Montgomery constant)
  #     - confirm against the callers.
  #   * %rax is reloaded with the next modulus word right after each mulq,
  #     and with mod[0] at the end of a pass, so it is ready for the
  #     following mulq.
  #   * n0 is read at 128+8(%rsp); the extra 8 bytes account for the return
  #     address pushed by the call, i.e. this is the caller's 128(%rsp).
  1410. $code.=<<___;
  1411. .type __rsaz_512_reduce,\@abi-omnipotent
  1412. .align 32
  1413. __rsaz_512_reduce:
  1414. .cfi_startproc
  1415. movq %r8, %rbx
  1416. imulq 128+8(%rsp), %rbx
  1417. movq 0(%rbp), %rax
  1418. movl \$8, %ecx
  1419. jmp .Lreduction_loop
  1420. .align 32
  1421. .Lreduction_loop:
  1422. mulq %rbx
  1423. movq 8(%rbp), %rax
  1424. negq %r8
  1425. movq %rdx, %r8
  1426. adcq \$0, %r8
  1427. mulq %rbx
  1428. addq %rax, %r9
  1429. movq 16(%rbp), %rax
  1430. adcq \$0, %rdx
  1431. addq %r9, %r8
  1432. movq %rdx, %r9
  1433. adcq \$0, %r9
  1434. mulq %rbx
  1435. addq %rax, %r10
  1436. movq 24(%rbp), %rax
  1437. adcq \$0, %rdx
  1438. addq %r10, %r9
  1439. movq %rdx, %r10
  1440. adcq \$0, %r10
  1441. mulq %rbx
  1442. addq %rax, %r11
  1443. movq 32(%rbp), %rax
  1444. adcq \$0, %rdx
  1445. addq %r11, %r10
  1446. movq 128+8(%rsp), %rsi
  1447. #movq %rdx, %r11
  1448. #adcq \$0, %r11
  1449. adcq \$0, %rdx
  1450. movq %rdx, %r11
  1451. mulq %rbx
  1452. addq %rax, %r12
  1453. movq 40(%rbp), %rax
  1454. adcq \$0, %rdx
  1455. imulq %r8, %rsi
  1456. addq %r12, %r11
  1457. movq %rdx, %r12
  1458. adcq \$0, %r12
  1459. mulq %rbx
  1460. addq %rax, %r13
  1461. movq 48(%rbp), %rax
  1462. adcq \$0, %rdx
  1463. addq %r13, %r12
  1464. movq %rdx, %r13
  1465. adcq \$0, %r13
  1466. mulq %rbx
  1467. addq %rax, %r14
  1468. movq 56(%rbp), %rax
  1469. adcq \$0, %rdx
  1470. addq %r14, %r13
  1471. movq %rdx, %r14
  1472. adcq \$0, %r14
  1473. mulq %rbx
  1474. movq %rsi, %rbx
  1475. addq %rax, %r15
  1476. movq 0(%rbp), %rax
  1477. adcq \$0, %rdx
  1478. addq %r15, %r14
  1479. movq %rdx, %r15
  1480. adcq \$0, %r15
  1481. decl %ecx
  1482. jne .Lreduction_loop
  1483. ret
  1484. .cfi_endproc
  1485. .size __rsaz_512_reduce,.-__rsaz_512_reduce
  1486. ___
  1487. }
  1488. if ($addx) {
  1489. # __rsaz_512_reducex
  1490. #
  1491. # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
  1492. # output: %r8-%r15
  1493. # clobbers: everything except %rbp and %rdi
  #
  # Notes on the emitted code:
  #   * The load of n0 into %rdx at entry is commented out, so %rdx must
  #     already contain n0 when this routine is called; the callers in this
  #     file load it from their 128(%rsp) slot just before the call. The
  #     first multiplier is then %r8 times n0 (imulq).
  #   * %rsi is cleared with xorq, which also clears CF and OF; it serves as
  #     the zero operand that closes both carry chains at the end of a pass.
  #   * Each of the eight passes (counted down in %ecx) multiplies mod[0..7]
  #     by %rdx with mulx and accumulates through two independent carry
  #     chains: adcx (CF) adds the low product halves into the accumulator
  #     words, adox (OF) adds the previous accumulator words into the high
  #     product halves. The result window moves down one word per pass.
  #   * The multiplier for the next pass is produced in the middle of the
  #     pass: %rdx is saved in %rax, the new %r8 is multiplied by the n0
  #     slot at 128+8(%rsp) with mulx (low half to %rbx, high half written
  #     to %rdx and then overwritten), %rdx is restored from %rax, and %rbx
  #     is moved into %rdx after the last mulx of the pass.
  #   * 128+8(%rsp) is the caller's 128(%rsp): the extra 8 bytes account for
  #     the return address pushed by the call.
  #   * Two mulx instructions are emitted as raw .byte sequences; the
  #     trailing comment on each names the instruction it encodes.
  1494. $code.=<<___;
  1495. .type __rsaz_512_reducex,\@abi-omnipotent
  1496. .align 32
  1497. __rsaz_512_reducex:
  1498. .cfi_startproc
  1499. #movq 128+8(%rsp), %rdx # pull $n0
  1500. imulq %r8, %rdx
  1501. xorq %rsi, %rsi # cf=0,of=0
  1502. movl \$8, %ecx
  1503. jmp .Lreduction_loopx
  1504. .align 32
  1505. .Lreduction_loopx:
  1506. mov %r8, %rbx
  1507. mulx 0(%rbp), %rax, %r8
  1508. adcx %rbx, %rax
  1509. adox %r9, %r8
  1510. mulx 8(%rbp), %rax, %r9
  1511. adcx %rax, %r8
  1512. adox %r10, %r9
  1513. mulx 16(%rbp), %rbx, %r10
  1514. adcx %rbx, %r9
  1515. adox %r11, %r10
  1516. mulx 24(%rbp), %rbx, %r11
  1517. adcx %rbx, %r10
  1518. adox %r12, %r11
  1519. .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
  1520. mov %rdx, %rax
  1521. mov %r8, %rdx
  1522. adcx %rbx, %r11
  1523. adox %r13, %r12
  1524. mulx 128+8(%rsp), %rbx, %rdx
  1525. mov %rax, %rdx
  1526. mulx 40(%rbp), %rax, %r13
  1527. adcx %rax, %r12
  1528. adox %r14, %r13
  1529. .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
  1530. adcx %rax, %r13
  1531. adox %r15, %r14
  1532. mulx 56(%rbp), %rax, %r15
  1533. mov %rbx, %rdx
  1534. adcx %rax, %r14
  1535. adox %rsi, %r15 # %rsi is 0
  1536. adcx %rsi, %r15 # cf=0
  1537. decl %ecx # of=0
  1538. jne .Lreduction_loopx
  1539. ret
  1540. .cfi_endproc
  1541. .size __rsaz_512_reducex,.-__rsaz_512_reducex
  1542. ___
  1543. }
  1544. { # __rsaz_512_subtract
  1545. # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
  1546. # output:
  1547. # clobbers: everything but %rdi, %rsi and %rbp
  #
  # What the emitted code does:
  #   1. Stores %r8..%r15 to the eight words at out.
  #   2. Loads the eight modulus words and transforms them: negq on word 0,
  #      notq on words 1..7, each then ANDed with the mask in %rcx. Taken
  #      together neg/not form the two's complement of the modulus provided
  #      word 0 is non-zero (true for an odd modulus - NOTE(review): confirm
  #      that callers only pass such moduli).
  #   3. Adds the stored value to the masked words with one carry chain and
  #      writes the sum back to out.
  # With an all-ones mask this yields value - mod (modulo 2^512); with a
  # zero mask the value is left unchanged. The callers in this file derive
  # the mask with sbbq from the carry of their preceding addition.
  # On return %r8..%r15 hold the same eight words that were written to out.
  # $out and $mod are not declared in this scope; they come from outside the
  # visible part of the file (the header above gives them as %rdi and %rbp).
  1548. $code.=<<___;
  1549. .type __rsaz_512_subtract,\@abi-omnipotent
  1550. .align 32
  1551. __rsaz_512_subtract:
  1552. .cfi_startproc
  1553. movq %r8, ($out)
  1554. movq %r9, 8($out)
  1555. movq %r10, 16($out)
  1556. movq %r11, 24($out)
  1557. movq %r12, 32($out)
  1558. movq %r13, 40($out)
  1559. movq %r14, 48($out)
  1560. movq %r15, 56($out)
  1561. movq 0($mod), %r8
  1562. movq 8($mod), %r9
  1563. negq %r8
  1564. notq %r9
  1565. andq %rcx, %r8
  1566. movq 16($mod), %r10
  1567. andq %rcx, %r9
  1568. notq %r10
  1569. movq 24($mod), %r11
  1570. andq %rcx, %r10
  1571. notq %r11
  1572. movq 32($mod), %r12
  1573. andq %rcx, %r11
  1574. notq %r12
  1575. movq 40($mod), %r13
  1576. andq %rcx, %r12
  1577. notq %r13
  1578. movq 48($mod), %r14
  1579. andq %rcx, %r13
  1580. notq %r14
  1581. movq 56($mod), %r15
  1582. andq %rcx, %r14
  1583. notq %r15
  1584. andq %rcx, %r15
  1585. addq ($out), %r8
  1586. adcq 8($out), %r9
  1587. adcq 16($out), %r10
  1588. adcq 24($out), %r11
  1589. adcq 32($out), %r12
  1590. adcq 40($out), %r13
  1591. adcq 48($out), %r14
  1592. adcq 56($out), %r15
  1593. movq %r8, ($out)
  1594. movq %r9, 8($out)
  1595. movq %r10, 16($out)
  1596. movq %r11, 24($out)
  1597. movq %r12, 32($out)
  1598. movq %r13, 40($out)
  1599. movq %r14, 48($out)
  1600. movq %r15, 56($out)
  1601. ret
  1602. .cfi_endproc
  1603. .size __rsaz_512_subtract,.-__rsaz_512_subtract
  1604. ___
  1605. }
  1606. { # __rsaz_512_mul
  1607. #
  1608. # input: %rsi - ap, %rbp - bp
  1609. # output:
  1610. # clobbers: everything
  #
  # Register and stack usage of the emitted routine:
  #   %rbx      - current multiplier word; the first one must be preloaded by
  #               the caller, later ones are loaded from bp inside the loop
  #   %rdi      - write pointer into the result area; it starts at 8(%rsp),
  #               which is the caller's (%rsp) once the return address
  #               pushed by the call is accounted for
  #   %r8..%r15 - running upper eight words of the partial product
  #   %ecx      - loop counter
  # Output: a 16-word product stored at the caller's 0..127(%rsp).
  #
  # First the product of a[0..7] and the preloaded multiplier is formed: its
  # lowest word is stored and the remaining eight words stay in %r8..%r15.
  # bp is advanced by one word, then .Loop_mul runs seven times; each pass
  # loads the next multiplier word, adds a[0..7] times that word to the
  # window, stores the window's lowest word and advances bp and %rdi by one
  # word. The last eight words are stored after the loop. %rax is reloaded
  # with the next a[] word right after every mulq.
  1611. my ($ap,$bp) = ("%rsi","%rbp");
  1612. $code.=<<___;
  1613. .type __rsaz_512_mul,\@abi-omnipotent
  1614. .align 32
  1615. __rsaz_512_mul:
  1616. .cfi_startproc
  1617. leaq 8(%rsp), %rdi
  1618. movq ($ap), %rax
  1619. mulq %rbx
  1620. movq %rax, (%rdi)
  1621. movq 8($ap), %rax
  1622. movq %rdx, %r8
  1623. mulq %rbx
  1624. addq %rax, %r8
  1625. movq 16($ap), %rax
  1626. movq %rdx, %r9
  1627. adcq \$0, %r9
  1628. mulq %rbx
  1629. addq %rax, %r9
  1630. movq 24($ap), %rax
  1631. movq %rdx, %r10
  1632. adcq \$0, %r10
  1633. mulq %rbx
  1634. addq %rax, %r10
  1635. movq 32($ap), %rax
  1636. movq %rdx, %r11
  1637. adcq \$0, %r11
  1638. mulq %rbx
  1639. addq %rax, %r11
  1640. movq 40($ap), %rax
  1641. movq %rdx, %r12
  1642. adcq \$0, %r12
  1643. mulq %rbx
  1644. addq %rax, %r12
  1645. movq 48($ap), %rax
  1646. movq %rdx, %r13
  1647. adcq \$0, %r13
  1648. mulq %rbx
  1649. addq %rax, %r13
  1650. movq 56($ap), %rax
  1651. movq %rdx, %r14
  1652. adcq \$0, %r14
  1653. mulq %rbx
  1654. addq %rax, %r14
  1655. movq ($ap), %rax
  1656. movq %rdx, %r15
  1657. adcq \$0, %r15
  1658. leaq 8($bp), $bp
  1659. leaq 8(%rdi), %rdi
  1660. movl \$7, %ecx
  1661. jmp .Loop_mul
  1662. .align 32
  1663. .Loop_mul:
  1664. movq ($bp), %rbx
  1665. mulq %rbx
  1666. addq %rax, %r8
  1667. movq 8($ap), %rax
  1668. movq %r8, (%rdi)
  1669. movq %rdx, %r8
  1670. adcq \$0, %r8
  1671. mulq %rbx
  1672. addq %rax, %r9
  1673. movq 16($ap), %rax
  1674. adcq \$0, %rdx
  1675. addq %r9, %r8
  1676. movq %rdx, %r9
  1677. adcq \$0, %r9
  1678. mulq %rbx
  1679. addq %rax, %r10
  1680. movq 24($ap), %rax
  1681. adcq \$0, %rdx
  1682. addq %r10, %r9
  1683. movq %rdx, %r10
  1684. adcq \$0, %r10
  1685. mulq %rbx
  1686. addq %rax, %r11
  1687. movq 32($ap), %rax
  1688. adcq \$0, %rdx
  1689. addq %r11, %r10
  1690. movq %rdx, %r11
  1691. adcq \$0, %r11
  1692. mulq %rbx
  1693. addq %rax, %r12
  1694. movq 40($ap), %rax
  1695. adcq \$0, %rdx
  1696. addq %r12, %r11
  1697. movq %rdx, %r12
  1698. adcq \$0, %r12
  1699. mulq %rbx
  1700. addq %rax, %r13
  1701. movq 48($ap), %rax
  1702. adcq \$0, %rdx
  1703. addq %r13, %r12
  1704. movq %rdx, %r13
  1705. adcq \$0, %r13
  1706. mulq %rbx
  1707. addq %rax, %r14
  1708. movq 56($ap), %rax
  1709. adcq \$0, %rdx
  1710. addq %r14, %r13
  1711. movq %rdx, %r14
  1712. leaq 8($bp), $bp
  1713. adcq \$0, %r14
  1714. mulq %rbx
  1715. addq %rax, %r15
  1716. movq ($ap), %rax
  1717. adcq \$0, %rdx
  1718. addq %r15, %r14
  1719. movq %rdx, %r15
  1720. adcq \$0, %r15
  1721. leaq 8(%rdi), %rdi
  1722. decl %ecx
  1723. jnz .Loop_mul
  1724. movq %r8, (%rdi)
  1725. movq %r9, 8(%rdi)
  1726. movq %r10, 16(%rdi)
  1727. movq %r11, 24(%rdi)
  1728. movq %r12, 32(%rdi)
  1729. movq %r13, 40(%rdi)
  1730. movq %r14, 48(%rdi)
  1731. movq %r15, 56(%rdi)
  1732. ret
  1733. .cfi_endproc
  1734. .size __rsaz_512_mul,.-__rsaz_512_mul
  1735. ___
  1736. }
  1737. if ($addx) {
  1738. # __rsaz_512_mulx
  1739. #
  1740. # input: %rsi - ap, %rbp - bp
  1741. # output:
  1742. # clobbers: everything
  #
  # Output: a 16-word product stored at the caller's 0..127(%rsp). Offsets
  # from %rsp inside the routine include 8 bytes for the return address.
  #
  # Outline of the emitted routine:
  #   * %rdx must hold the first multiplier word on entry (mulx takes its
  #     implicit operand from %rdx). The first pass multiplies a[0..7] by it
  #     using a plain adc chain, stores the lowest product word at 8(%rsp)
  #     and leaves the other eight words in %r8..%r15; the second multiplier
  #     word is loaded from 8(bp) into %rdx on the way.
  #   * xor on $zero (%rdi) produces the zero register and clears CF and OF
  #     before the dual-carry-chain code starts.
  #   * .Loop_mulx runs six times with %rcx counting from -6 up to 0. Each
  #     pass consumes the multiplier already in %rdx, loads the next one
  #     from 64(bp,%rcx,8) (words 2 to 7 of bp) and stores one finished
  #     result word at 8+64-8(%rsp,%rcx,8) (result words 1 to 6). adcx (CF)
  #     and adox (OF) carry two independent addition chains.
  #   * The pass for the last multiplier word is unrolled after the loop; it
  #     stores result word 7 followed by the eight words left in %r8..%r15.
  #   * Several mulx instructions appear as raw .byte sequences; the trailing
  #     comment on each line names the instruction it encodes.
  1743. my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
  1744. $code.=<<___;
  1745. .type __rsaz_512_mulx,\@abi-omnipotent
  1746. .align 32
  1747. __rsaz_512_mulx:
  1748. .cfi_startproc
  1749. mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
  1750. mov \$-6, %rcx
  1751. mulx 8($ap), %rax, %r9
  1752. movq %rbx, 8(%rsp)
  1753. mulx 16($ap), %rbx, %r10
  1754. adc %rax, %r8
  1755. mulx 24($ap), %rax, %r11
  1756. adc %rbx, %r9
  1757. mulx 32($ap), %rbx, %r12
  1758. adc %rax, %r10
  1759. mulx 40($ap), %rax, %r13
  1760. adc %rbx, %r11
  1761. mulx 48($ap), %rbx, %r14
  1762. adc %rax, %r12
  1763. mulx 56($ap), %rax, %r15
  1764. mov 8($bp), %rdx
  1765. adc %rbx, %r13
  1766. adc %rax, %r14
  1767. adc \$0, %r15
  1768. xor $zero, $zero # cf=0,of=0
  1769. jmp .Loop_mulx
  1770. .align 32
  1771. .Loop_mulx:
  1772. movq %r8, %rbx
  1773. mulx ($ap), %rax, %r8
  1774. adcx %rax, %rbx
  1775. adox %r9, %r8
  1776. mulx 8($ap), %rax, %r9
  1777. adcx %rax, %r8
  1778. adox %r10, %r9
  1779. mulx 16($ap), %rax, %r10
  1780. adcx %rax, %r9
  1781. adox %r11, %r10
  1782. mulx 24($ap), %rax, %r11
  1783. adcx %rax, %r10
  1784. adox %r12, %r11
  1785. .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
  1786. adcx %rax, %r11
  1787. adox %r13, %r12
  1788. mulx 40($ap), %rax, %r13
  1789. adcx %rax, %r12
  1790. adox %r14, %r13
  1791. mulx 48($ap), %rax, %r14
  1792. adcx %rax, %r13
  1793. adox %r15, %r14
  1794. mulx 56($ap), %rax, %r15
  1795. movq 64($bp,%rcx,8), %rdx
  1796. movq %rbx, 8+64-8(%rsp,%rcx,8)
  1797. adcx %rax, %r14
  1798. adox $zero, %r15
  1799. adcx $zero, %r15 # cf=0
  1800. inc %rcx # of=0
  1801. jnz .Loop_mulx
  1802. movq %r8, %rbx
  1803. mulx ($ap), %rax, %r8
  1804. adcx %rax, %rbx
  1805. adox %r9, %r8
  1806. .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
  1807. adcx %rax, %r8
  1808. adox %r10, %r9
  1809. .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
  1810. adcx %rax, %r9
  1811. adox %r11, %r10
  1812. mulx 24($ap), %rax, %r11
  1813. adcx %rax, %r10
  1814. adox %r12, %r11
  1815. mulx 32($ap), %rax, %r12
  1816. adcx %rax, %r11
  1817. adox %r13, %r12
  1818. mulx 40($ap), %rax, %r13
  1819. adcx %rax, %r12
  1820. adox %r14, %r13
  1821. .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
  1822. adcx %rax, %r13
  1823. adox %r15, %r14
  1824. .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
  1825. adcx %rax, %r14
  1826. adox $zero, %r15
  1827. adcx $zero, %r15
  1828. mov %rbx, 8+64-8(%rsp)
  1829. mov %r8, 8+64(%rsp)
  1830. mov %r9, 8+64+8(%rsp)
  1831. mov %r10, 8+64+16(%rsp)
  1832. mov %r11, 8+64+24(%rsp)
  1833. mov %r12, 8+64+32(%rsp)
  1834. mov %r13, 8+64+40(%rsp)
  1835. mov %r14, 8+64+48(%rsp)
  1836. mov %r15, 8+64+56(%rsp)
  1837. ret
  1838. .cfi_endproc
  1839. .size __rsaz_512_mulx,.-__rsaz_512_mulx
  1840. ___
  1841. }
  # rsaz_512_scatter4(out, inp, power) and rsaz_512_gather4(out, inp, power)
  #
  # Both are emitted as abi-omnipotent, so the argument registers are picked
  # here by hand according to $win64.
  #
  # scatter4 copies eight consecutive 64-bit words from inp to
  # out + 8*power + 128*i for i = 0..7.
  #
  # gather4 does the reverse lookup: for each of the eight word positions it
  # reads the whole 128-byte row at inp + 128*i (sixteen 64-bit entries),
  # keeps only the entry whose index equals power by ANDing with masks
  # prepared up front, ORs the candidates together and stores the surviving
  # word to out + 8*i. Every row is read in full whatever power is, so the
  # addresses touched do not depend on power.
  1842. {
  1843. my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
  # scatter4 body, followed by the entry label of gather4. %r9d counts the
  # eight words; inp advances by 8 bytes and out by 128 bytes per word.
  1844. $code.=<<___;
  1845. .globl rsaz_512_scatter4
  1846. .type rsaz_512_scatter4,\@abi-omnipotent
  1847. .align 16
  1848. rsaz_512_scatter4:
  1849. .cfi_startproc
  1850. leaq ($out,$power,8), $out
  1851. movl \$8, %r9d
  1852. jmp .Loop_scatter
  1853. .align 16
  1854. .Loop_scatter:
  1855. movq ($inp), %rax
  1856. leaq 8($inp), $inp
  1857. movq %rax, ($out)
  1858. leaq 128($out), $out
  1859. decl %r9d
  1860. jnz .Loop_scatter
  1861. ret
  1862. .cfi_endproc
  1863. .size rsaz_512_scatter4,.-rsaz_512_scatter4
  1864. .globl rsaz_512_gather4
  1865. .type rsaz_512_gather4,\@abi-omnipotent
  1866. .align 16
  1867. rsaz_512_gather4:
  1868. .cfi_startproc
  1869. ___
  # Win64 only: reserve 0xa8 bytes and save xmm6..xmm15. The instructions
  # are hand-encoded as .byte sequences.
  # NOTE(review): presumably this pins their lengths to the prologue offsets
  # hard-coded in .LSEH_info_rsaz_512_gather4 - confirm.
  1870. $code.=<<___ if ($win64);
  1871. .LSEH_begin_rsaz_512_gather4:
  1872. .byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp
  1873. .byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp)
  1874. .byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp)
  1875. .byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp)
  1876. .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp)
  1877. .byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp)
  1878. .byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp)
  1879. .byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp)
  1880. .byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp)
  1881. .byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp)
  1882. .byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp)
  1883. ___
  # Mask setup: power is broadcast to all four dwords of xmm8. xmm0 starts
  # as the dword lanes 0,0,1,1 (.Linc) and xmm1, xmm2 and xmm7 as 2,2,2,2
  # (.Linc+16); xmm7 keeps that increment for the copies made in the loop
  # below.
  1884. $code.=<<___;
  1885. movd $power,%xmm8
  1886. movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
  1887. movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
  1888. pshufd \$0,%xmm8,%xmm8 # broadcast $power
  1889. movdqa %xmm1,%xmm7
  1890. movdqa %xmm1,%xmm2
  1891. ___
  1892. ########################################################################
  1893. # calculate mask by comparing 0..15 to $power
  1894. #
  # Each step adds the current counter register into the next one (which
  # still holds the 2,2,2,2 increment), so register k ends up holding the
  # indices 2k and 2k+1, each duplicated across the two dwords of a 64-bit
  # lane. The counter is then replaced by its comparison with the broadcast
  # power, giving an all-ones or all-zero 64-bit lane. The first loop also
  # seeds the register three places ahead with the increment from xmm7; the
  # second loop covers the remaining registers, which are already seeded.
  # The backtick expressions are evaluated when $code is post-processed at
  # the end of the file.
  1895. for($i=0;$i<4;$i++) {
  1896. $code.=<<___;
  1897. paddd %xmm`$i`,%xmm`$i+1`
  1898. pcmpeqd %xmm8,%xmm`$i`
  1899. movdqa %xmm7,%xmm`$i+3`
  1900. ___
  1901. }
  1902. for(;$i<7;$i++) {
  1903. $code.=<<___;
  1904. paddd %xmm`$i`,%xmm`$i+1`
  1905. pcmpeqd %xmm8,%xmm`$i`
  1906. ___
  1907. }
  # Finish the last mask (xmm7), then run the gather loop eight times: load
  # the eight 16-byte chunks of the current row into xmm8..xmm15, AND each
  # with its mask (xmm0..xmm7), OR everything into xmm8, fold the upper
  # 64-bit lane onto the lower one (pshufd 0x4e swaps the two lanes) and
  # store the low 64 bits to out.
  1908. $code.=<<___;
  1909. pcmpeqd %xmm8,%xmm7
  1910. movl \$8, %r9d
  1911. jmp .Loop_gather
  1912. .align 16
  1913. .Loop_gather:
  1914. movdqa 16*0($inp),%xmm8
  1915. movdqa 16*1($inp),%xmm9
  1916. movdqa 16*2($inp),%xmm10
  1917. movdqa 16*3($inp),%xmm11
  1918. pand %xmm0,%xmm8
  1919. movdqa 16*4($inp),%xmm12
  1920. pand %xmm1,%xmm9
  1921. movdqa 16*5($inp),%xmm13
  1922. pand %xmm2,%xmm10
  1923. movdqa 16*6($inp),%xmm14
  1924. pand %xmm3,%xmm11
  1925. movdqa 16*7($inp),%xmm15
  1926. leaq 128($inp), $inp
  1927. pand %xmm4,%xmm12
  1928. pand %xmm5,%xmm13
  1929. pand %xmm6,%xmm14
  1930. pand %xmm7,%xmm15
  1931. por %xmm10,%xmm8
  1932. por %xmm11,%xmm9
  1933. por %xmm12,%xmm8
  1934. por %xmm13,%xmm9
  1935. por %xmm14,%xmm8
  1936. por %xmm15,%xmm9
  1937. por %xmm9,%xmm8
  1938. pshufd \$0x4e,%xmm8,%xmm9
  1939. por %xmm9,%xmm8
  1940. movq %xmm8,($out)
  1941. leaq 8($out), $out
  1942. decl %r9d
  1943. jnz .Loop_gather
  1944. ___
  # Win64 only: restore xmm6..xmm15 and release the 0xa8-byte save area.
  1945. $code.=<<___ if ($win64);
  1946. movaps 0x00(%rsp),%xmm6
  1947. movaps 0x10(%rsp),%xmm7
  1948. movaps 0x20(%rsp),%xmm8
  1949. movaps 0x30(%rsp),%xmm9
  1950. movaps 0x40(%rsp),%xmm10
  1951. movaps 0x50(%rsp),%xmm11
  1952. movaps 0x60(%rsp),%xmm12
  1953. movaps 0x70(%rsp),%xmm13
  1954. movaps 0x80(%rsp),%xmm14
  1955. movaps 0x90(%rsp),%xmm15
  1956. add \$0xa8,%rsp
  1957. ___
  # Return, then the .Linc constant table used for the mask setup: the first
  # 16 bytes are the starting counters, the second 16 bytes the per-register
  # increment. The end label is emitted unconditionally although only the
  # Win64 .pdata table in this file refers to it.
  1958. $code.=<<___;
  1959. ret
  1960. .LSEH_end_rsaz_512_gather4:
  1961. .cfi_endproc
  1962. .size rsaz_512_gather4,.-rsaz_512_gather4
  1963. .align 64
  1964. .Linc:
  1965. .long 0,0, 1,1
  1966. .long 2,2, 2,2
  1967. ___
  1968. }
  1969. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  1970. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  #
  # Win64-only structured-exception support. The section emits:
  #   * se_handler, a language-specific handler shared by the five routines
  #     that use the 128+24 byte frame,
  #   * the .pdata table (begin/end/info triple per function), and
  #   * the .xdata unwind descriptors.
  #
  # se_handler flow, as emitted below:
  #   1. HandlerData[0] and HandlerData[1] are RVAs of the function's body
  #      and epilogue labels; they are rebased with disp->ImageBase.
  #   2. If context->Rip is below the body label, control goes straight to
  #      the common tail with %rax still holding context->Rax. If Rip is at
  #      or beyond the epilogue label, the common tail is entered with %rax
  #      holding context->Rsp.
  #   3. Otherwise the frame is live: %rax becomes Rsp+128+24+48, the address
  #      just above the six saved GPRs. When the epilogue label is that of
  #      rsaz_512_mul_gather4, %rax is advanced by another 0xb0 bytes and 20
  #      quadwords (160 bytes) are copied from the frame to offset 512 of
  #      the context. NOTE(review): presumably that is the saved xmm6..xmm15
  #      block and context->Xmm6 - confirm against the CONTEXT layout.
  #      rbx, rbp and r12..r15 are then read from below %rax and written
  #      into the context.
  #   4. Common tail: Rsp, Rsi and Rdi are updated in the context, the
  #      context is copied to disp->ContextRecord (154 quadwords),
  #      RtlVirtualUnwind is called and 1 (ExceptionContinueSearch) is
  #      returned.
  #
  # In .xdata the five frame-based routines name se_handler together with
  # their body/epilogue labels; rsaz_512_gather4 instead carries raw unwind
  # codes describing its stack allocation and xmm saves.
  1971. if ($win64) {
  1972. $rec="%rcx";
  1973. $frame="%rdx";
  1974. $context="%r8";
  1975. $disp="%r9";
  1976. $code.=<<___;
  1977. .extern __imp_RtlVirtualUnwind
  1978. .type se_handler,\@abi-omnipotent
  1979. .align 16
  1980. se_handler:
  1981. push %rsi
  1982. push %rdi
  1983. push %rbx
  1984. push %rbp
  1985. push %r12
  1986. push %r13
  1987. push %r14
  1988. push %r15
  1989. pushfq
  1990. sub \$64,%rsp
  1991. mov 120($context),%rax # pull context->Rax
  1992. mov 248($context),%rbx # pull context->Rip
  1993. mov 8($disp),%rsi # disp->ImageBase
  1994. mov 56($disp),%r11 # disp->HandlerData
  1995. mov 0(%r11),%r10d # HandlerData[0]
  1996. lea (%rsi,%r10),%r10 # end of prologue label
  1997. cmp %r10,%rbx # context->Rip<end of prologue label
  1998. jb .Lcommon_seh_tail
  1999. mov 152($context),%rax # pull context->Rsp
  2000. mov 4(%r11),%r10d # HandlerData[1]
  2001. lea (%rsi,%r10),%r10 # epilogue label
  2002. cmp %r10,%rbx # context->Rip>=epilogue label
  2003. jae .Lcommon_seh_tail
  2004. lea 128+24+48(%rax),%rax
  2005. lea .Lmul_gather4_epilogue(%rip),%rbx
  2006. cmp %r10,%rbx
  2007. jne .Lse_not_in_mul_gather4
  2008. lea 0xb0(%rax),%rax
  2009. lea -48-0xa8(%rax),%rsi
  2010. lea 512($context),%rdi
  2011. mov \$20,%ecx
  2012. .long 0xa548f3fc # cld; rep movsq
  2013. .Lse_not_in_mul_gather4:
  2014. mov -8(%rax),%rbx
  2015. mov -16(%rax),%rbp
  2016. mov -24(%rax),%r12
  2017. mov -32(%rax),%r13
  2018. mov -40(%rax),%r14
  2019. mov -48(%rax),%r15
  2020. mov %rbx,144($context) # restore context->Rbx
  2021. mov %rbp,160($context) # restore context->Rbp
  2022. mov %r12,216($context) # restore context->R12
  2023. mov %r13,224($context) # restore context->R13
  2024. mov %r14,232($context) # restore context->R14
  2025. mov %r15,240($context) # restore context->R15
  2026. .Lcommon_seh_tail:
  2027. mov 8(%rax),%rdi
  2028. mov 16(%rax),%rsi
  2029. mov %rax,152($context) # restore context->Rsp
  2030. mov %rsi,168($context) # restore context->Rsi
  2031. mov %rdi,176($context) # restore context->Rdi
  2032. mov 40($disp),%rdi # disp->ContextRecord
  2033. mov $context,%rsi # context
  2034. mov \$154,%ecx # sizeof(CONTEXT)
  2035. .long 0xa548f3fc # cld; rep movsq
  2036. mov $disp,%rsi
  2037. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  2038. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  2039. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  2040. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  2041. mov 40(%rsi),%r10 # disp->ContextRecord
  2042. lea 56(%rsi),%r11 # &disp->HandlerData
  2043. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  2044. mov %r10,32(%rsp) # arg5
  2045. mov %r11,40(%rsp) # arg6
  2046. mov %r12,48(%rsp) # arg7
  2047. mov %rcx,56(%rsp) # arg8, (NULL)
  2048. call *__imp_RtlVirtualUnwind(%rip)
  2049. mov \$1,%eax # ExceptionContinueSearch
  2050. add \$64,%rsp
  2051. popfq
  2052. pop %r15
  2053. pop %r14
  2054. pop %r13
  2055. pop %r12
  2056. pop %rbp
  2057. pop %rbx
  2058. pop %rdi
  2059. pop %rsi
  2060. ret
  2061. .size se_handler,.-se_handler
  2062. .section .pdata
  2063. .align 4
  2064. .rva .LSEH_begin_rsaz_512_sqr
  2065. .rva .LSEH_end_rsaz_512_sqr
  2066. .rva .LSEH_info_rsaz_512_sqr
  2067. .rva .LSEH_begin_rsaz_512_mul
  2068. .rva .LSEH_end_rsaz_512_mul
  2069. .rva .LSEH_info_rsaz_512_mul
  2070. .rva .LSEH_begin_rsaz_512_mul_gather4
  2071. .rva .LSEH_end_rsaz_512_mul_gather4
  2072. .rva .LSEH_info_rsaz_512_mul_gather4
  2073. .rva .LSEH_begin_rsaz_512_mul_scatter4
  2074. .rva .LSEH_end_rsaz_512_mul_scatter4
  2075. .rva .LSEH_info_rsaz_512_mul_scatter4
  2076. .rva .LSEH_begin_rsaz_512_mul_by_one
  2077. .rva .LSEH_end_rsaz_512_mul_by_one
  2078. .rva .LSEH_info_rsaz_512_mul_by_one
  2079. .rva .LSEH_begin_rsaz_512_gather4
  2080. .rva .LSEH_end_rsaz_512_gather4
  2081. .rva .LSEH_info_rsaz_512_gather4
  2082. .section .xdata
  2083. .align 8
  2084. .LSEH_info_rsaz_512_sqr:
  2085. .byte 9,0,0,0
  2086. .rva se_handler
  2087. .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
  2088. .LSEH_info_rsaz_512_mul:
  2089. .byte 9,0,0,0
  2090. .rva se_handler
  2091. .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
  2092. .LSEH_info_rsaz_512_mul_gather4:
  2093. .byte 9,0,0,0
  2094. .rva se_handler
  2095. .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
  2096. .LSEH_info_rsaz_512_mul_scatter4:
  2097. .byte 9,0,0,0
  2098. .rva se_handler
  2099. .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
  2100. .LSEH_info_rsaz_512_mul_by_one:
  2101. .byte 9,0,0,0
  2102. .rva se_handler
  2103. .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
  2104. .LSEH_info_rsaz_512_gather4:
  2105. .byte 0x01,0x46,0x16,0x00
  2106. .byte 0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
  2107. .byte 0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
  2108. .byte 0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
  2109. .byte 0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
  2110. .byte 0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
  2111. .byte 0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
  2112. .byte 0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
  2113. .byte 0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
  2114. .byte 0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
  2115. .byte 0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
  2116. .byte 0x07,0x01,0x15,0x00 # sub rsp,0xa8
  2117. ___
  2118. }
  # Post-process and emit the accumulated assembly text. Every
  # backtick-quoted fragment in $code is replaced by the result of
  # evaluating it as Perl; this is how register names written with embedded
  # arithmetic on the loop index in the gather4 mask loops become concrete
  # numbers. The text is then printed, and the close of STDOUT is checked so
  # that a failure to flush the output is reported instead of being lost.
  2119. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  2120. print $code;
  2121. close STDOUT or die "error closing STDOUT: $!";