x25519-x86_64.pl 25 KB


  1. #!/usr/bin/env perl
  2. # Copyright 2018 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # X25519 lower-level primitives for x86_64.
  17. #
  18. # February 2018.
  19. #
  20. # This module implements radix 2^51 multiplication and squaring, and
  21. # radix 2^64 multiplication, squaring, addition, subtraction and final
  22. # reduction. The latter radix is used on ADCX/ADOX-capable processors such
  23. # as Broadwell. On a related note, one should mention that there are
  24. # vector implementations that provide significantly better performance
  25. # on some processors(*), but they are large and overly complex. Which
  26. # in combination with them being effectively processor-specific makes
  27. # the undertaking hard to justify. The goal for this implementation
  28. # is rather versatility and simplicity [and ultimately formal
  29. # verification].
  30. #
  31. # (*) For example sandy2x should provide ~30% improvement on Sandy
  32. # Bridge, but only nominal ~5% on Haswell [and big loss on
  33. # Broadwell and successors].
  34. #
  35. ######################################################################
  36. # Improvement coefficients:
  37. #
  38. # amd64-51(*) gcc-5.x(**)
  39. #
  40. # P4 +22% +40%
  41. # Sandy Bridge -3% +11%
  42. # Haswell -1% +13%
  43. # Broadwell(***) +30% +35%
  44. # Skylake(***) +33% +47%
  45. # Silvermont +20% +26%
  46. # Goldmont +40% +50%
  47. # Bulldozer +20% +9%
  48. # Ryzen(***) +43% +40%
  49. # VIA +170% +120%
  50. #
  51. # (*) amd64-51 is popular assembly implementation with 2^51 radix,
  52. # only multiplication and squaring subroutines were linked
  53. # for comparison, but not complete ladder step; gain on most
  54. # processors is because this module refrains from shld, and
  55. # minor regression on others is because this does result in
  56. # higher instruction count;
  57. # (**) compiler is free to inline functions, in assembly one would
  58. # need to implement ladder step to do that, and it will improve
  59. # performance by several percent;
  60. # (***) ADCX/ADOX result for 2^64 radix, there is no corresponding
  61. # C implementation, so that comparison is always against
  62. # 2^51 radix;
  63. # $output is the last argument if it looks like a file (it has an extension)
  64. # $flavour is the first argument if it doesn't look like a file
  65. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  66. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  67. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); # Win64 output additionally gets the SEH handlers/tables emitted at the end
  68. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's own path
  69. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  70. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  71. die "can't locate x86_64-xlate.pl";
  72. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" # all generated text is piped through the translator script
  73. or die "can't call $xlate: $!";
  74. *STDOUT=*OUT;
  # $addx ends up true when the toolchain is recent enough for the
  # MULX/ADCX/ADOX code emitted under "if ($addx)" further down. The probes
  # run in order and stop at the first one that succeeds.
  75. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` # ask the assembler behind CC for its version banner
  76. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  77. $addx = ($1>=2.23);
  78. }
  79. if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  80. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  81. $addx = ($1>=2.10);
  82. }
  83. if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  84. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  85. $addx = ($1>=12);
  86. }
  # "clang" is intentionally not anchored to the start of the output: vendor
  # builds prefix the banner (e.g. "Apple clang version ..."), and an
  # anchored match would silently leave $addx off for them.
  87. if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
  88. my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
  89. $addx = ($ver>=3.03);
  90. }
  # x25519_fe51_mul: radix 2^51 multiplication h = f*g. In the body %rdi is
  # the output pointer, %rsi points at f and %rdx at g (five 64-bit limbs
  # each). The five 128-bit column sums are accumulated in %rbx:%rcx (h0),
  # %r8:%r9 (h1), %r10:%r11 (h2), %r12:%r13 (h3) and %r14:%r15 (h4), then
  # control jumps to the .Lreduce51 tail that lives in x25519_fe51_sqr.
  91. $code.=<<___;
  92. .text
  93. .globl x25519_fe51_mul
  94. .type x25519_fe51_mul,\@function,3
  95. .align 32
  96. x25519_fe51_mul:
  97. .cfi_startproc
  98. push %rbp
  99. .cfi_push %rbp
  100. push %rbx
  101. .cfi_push %rbx
  102. push %r12
  103. .cfi_push %r12
  104. push %r13
  105. .cfi_push %r13
  106. push %r14
  107. .cfi_push %r14
  108. push %r15
  109. .cfi_push %r15
  110. lea -8*5(%rsp),%rsp # scratch: g[0..2] at 8*0..8*2, 1st argument at 8*4
  111. .cfi_adjust_cfa_offset 40
  112. .Lfe51_mul_body:
  113. mov 8*0(%rsi),%rax # f[0]
  114. mov 8*0(%rdx),%r11 # load g[0-4]
  115. mov 8*1(%rdx),%r12
  116. mov 8*2(%rdx),%r13
  117. mov 8*3(%rdx),%rbp
  118. mov 8*4(%rdx),%r14
  119. mov %rdi,8*4(%rsp) # offload 1st argument
  120. mov %rax,%rdi # keep f[0] while mulq clobbers %rax
  121. mulq %r11 # f[0]*g[0]
  122. mov %r11,8*0(%rsp) # offload g[0]
  123. mov %rax,%rbx # %rbx:%rcx = h0
  124. mov %rdi,%rax
  125. mov %rdx,%rcx
  126. mulq %r12 # f[0]*g[1]
  127. mov %r12,8*1(%rsp) # offload g[1]
  128. mov %rax,%r8 # %r8:%r9 = h1
  129. mov %rdi,%rax
  130. lea (%r14,%r14,8),%r15 # g[4]*9
  131. mov %rdx,%r9
  132. mulq %r13 # f[0]*g[2]
  133. mov %r13,8*2(%rsp) # offload g[2]
  134. mov %rax,%r10 # %r10:%r11 = h2
  135. mov %rdi,%rax
  136. lea (%r14,%r15,2),%rdi # g[4]*19
  137. mov %rdx,%r11
  138. mulq %rbp # f[0]*g[3]
  139. mov %rax,%r12 # %r12:%r13 = h3
  140. mov 8*0(%rsi),%rax # f[0]
  141. mov %rdx,%r13
  142. mulq %r14 # f[0]*g[4]
  143. mov %rax,%r14 # %r14:%r15 = h4
  144. mov 8*1(%rsi),%rax # f[1]
  145. mov %rdx,%r15
  146. mulq %rdi # f[1]*g[4]*19
  147. add %rax,%rbx
  148. mov 8*2(%rsi),%rax # f[2]
  149. adc %rdx,%rcx
  150. mulq %rdi # f[2]*g[4]*19
  151. add %rax,%r8
  152. mov 8*3(%rsi),%rax # f[3]
  153. adc %rdx,%r9
  154. mulq %rdi # f[3]*g[4]*19
  155. add %rax,%r10
  156. mov 8*4(%rsi),%rax # f[4]
  157. adc %rdx,%r11
  158. mulq %rdi # f[4]*g[4]*19
  159. imulq \$19,%rbp,%rdi # g[3]*19
  160. add %rax,%r12
  161. mov 8*1(%rsi),%rax # f[1]
  162. adc %rdx,%r13
  163. mulq %rbp # f[1]*g[3]
  164. mov 8*2(%rsp),%rbp # g[2]
  165. add %rax,%r14
  166. mov 8*2(%rsi),%rax # f[2]
  167. adc %rdx,%r15
  168. mulq %rdi # f[2]*g[3]*19
  169. add %rax,%rbx
  170. mov 8*3(%rsi),%rax # f[3]
  171. adc %rdx,%rcx
  172. mulq %rdi # f[3]*g[3]*19
  173. add %rax,%r8
  174. mov 8*4(%rsi),%rax # f[4]
  175. adc %rdx,%r9
  176. mulq %rdi # f[4]*g[3]*19
  177. imulq \$19,%rbp,%rdi # g[2]*19
  178. add %rax,%r10
  179. mov 8*1(%rsi),%rax # f[1]
  180. adc %rdx,%r11
  181. mulq %rbp # f[1]*g[2]
  182. add %rax,%r12
  183. mov 8*2(%rsi),%rax # f[2]
  184. adc %rdx,%r13
  185. mulq %rbp # f[2]*g[2]
  186. mov 8*1(%rsp),%rbp # g[1]
  187. add %rax,%r14
  188. mov 8*3(%rsi),%rax # f[3]
  189. adc %rdx,%r15
  190. mulq %rdi # f[3]*g[2]*19
  191. add %rax,%rbx
  192. mov 8*4(%rsi),%rax # f[4]
  193. adc %rdx,%rcx
  194. mulq %rdi # f[4]*g[2]*19
  195. add %rax,%r8
  196. mov 8*1(%rsi),%rax # f[1]
  197. adc %rdx,%r9
  198. mulq %rbp # f[1]*g[1]
  199. imulq \$19,%rbp,%rdi # g[1]*19
  200. add %rax,%r10
  201. mov 8*2(%rsi),%rax # f[2]
  202. adc %rdx,%r11
  203. mulq %rbp # f[2]*g[1]
  204. add %rax,%r12
  205. mov 8*3(%rsi),%rax # f[3]
  206. adc %rdx,%r13
  207. mulq %rbp # f[3]*g[1]
  208. mov 8*0(%rsp),%rbp # g[0]
  209. add %rax,%r14
  210. mov 8*4(%rsi),%rax # f[4]
  211. adc %rdx,%r15
  212. mulq %rdi # f[4]*g[1]*19
  213. add %rax,%rbx
  214. mov 8*1(%rsi),%rax # f[1]
  215. adc %rdx,%rcx
  216. mul %rbp # f[1]*g[0]
  217. add %rax,%r8
  218. mov 8*2(%rsi),%rax # f[2]
  219. adc %rdx,%r9
  220. mul %rbp # f[2]*g[0]
  221. add %rax,%r10
  222. mov 8*3(%rsi),%rax # f[3]
  223. adc %rdx,%r11
  224. mul %rbp # f[3]*g[0]
  225. add %rax,%r12
  226. mov 8*4(%rsi),%rax # f[4]
  227. adc %rdx,%r13
  228. mulq %rbp # f[4]*g[0]
  229. add %rax,%r14
  230. adc %rdx,%r15
  231. mov 8*4(%rsp),%rdi # restore 1st argument
  232. jmp .Lreduce51 # shared reduce/store/epilogue tail inside x25519_fe51_sqr
  233. .Lfe51_mul_epilogue:
  234. .cfi_endproc
  235. .size x25519_fe51_mul,.-x25519_fe51_mul
  # x25519_fe51_sqr: radix 2^51 squaring h = g*g. In the body %rdi is the
  # output pointer and %rsi points at g (five 64-bit limbs). Column sums use
  # the same register pairs as x25519_fe51_mul (h0 = %rbx:%rcx, h1 = %r8:%r9,
  # h2 = %r10:%r11, h3 = %r12:%r13, h4 = %r14:%r15). The .Lreduce51 tail below
  # is also entered by jmp from x25519_fe51_mul and x25519_fe51_mul121666,
  # which build an identical 88-byte frame.
  236. .globl x25519_fe51_sqr
  237. .type x25519_fe51_sqr,\@function,2
  238. .align 32
  239. x25519_fe51_sqr:
  240. .cfi_startproc
  241. push %rbp
  242. .cfi_push %rbp
  243. push %rbx
  244. .cfi_push %rbx
  245. push %r12
  246. .cfi_push %r12
  247. push %r13
  248. .cfi_push %r13
  249. push %r14
  250. .cfi_push %r14
  251. push %r15
  252. .cfi_push %r15
  253. lea -8*5(%rsp),%rsp # scratch: g[2] at 8*0, 1st argument at 8*4
  254. .cfi_adjust_cfa_offset 40
  255. .Lfe51_sqr_body:
  256. mov 8*0(%rsi),%rax # g[0]
  257. mov 8*2(%rsi),%r15 # g[2]
  258. mov 8*4(%rsi),%rbp # g[4]
  259. mov %rdi,8*4(%rsp) # offload 1st argument
  260. lea (%rax,%rax),%r14 # 2*g[0]
  261. mulq %rax # g[0]*g[0]
  262. mov %rax,%rbx
  263. mov 8*1(%rsi),%rax # g[1]
  264. mov %rdx,%rcx
  265. mulq %r14 # 2*g[0]*g[1]
  266. mov %rax,%r8
  267. mov %r15,%rax
  268. mov %r15,8*0(%rsp) # offload g[2]
  269. mov %rdx,%r9
  270. mulq %r14 # 2*g[0]*g[2]
  271. mov %rax,%r10
  272. mov 8*3(%rsi),%rax # g[3]
  273. mov %rdx,%r11
  274. imulq \$19,%rbp,%rdi # g[4]*19
  275. mulq %r14 # 2*g[0]*g[3]
  276. mov %rax,%r12
  277. mov %rbp,%rax
  278. mov %rdx,%r13
  279. mulq %r14 # 2*g[0]*g[4]
  280. mov %rax,%r14
  281. mov %rbp,%rax
  282. mov %rdx,%r15
  283. mulq %rdi # g[4]*g[4]*19
  284. add %rax,%r12
  285. mov 8*1(%rsi),%rax # g[1]
  286. adc %rdx,%r13
  287. mov 8*3(%rsi),%rsi # g[3]
  288. lea (%rax,%rax),%rbp # 2*g[1]
  289. mulq %rax # g[1]*g[1]
  290. add %rax,%r10
  291. mov 8*0(%rsp),%rax # g[2]
  292. adc %rdx,%r11
  293. mulq %rbp # 2*g[1]*g[2]
  294. add %rax,%r12
  295. mov %rbp,%rax
  296. adc %rdx,%r13
  297. mulq %rsi # 2*g[1]*g[3]
  298. add %rax,%r14
  299. mov %rbp,%rax
  300. adc %rdx,%r15
  301. imulq \$19,%rsi,%rbp # g[3]*19
  302. mulq %rdi # 2*g[1]*g[4]*19
  303. add %rax,%rbx
  304. lea (%rsi,%rsi),%rax # 2*g[3]
  305. adc %rdx,%rcx
  306. mulq %rdi # 2*g[3]*g[4]*19
  307. add %rax,%r10
  308. mov %rsi,%rax
  309. adc %rdx,%r11
  310. mulq %rbp # g[3]*g[3]*19
  311. add %rax,%r8
  312. mov 8*0(%rsp),%rax # g[2]
  313. adc %rdx,%r9
  314. lea (%rax,%rax),%rsi # 2*g[2]
  315. mulq %rax # g[2]*g[2]
  316. add %rax,%r14
  317. mov %rbp,%rax
  318. adc %rdx,%r15
  319. mulq %rsi # 2*g[2]*g[3]*19
  320. add %rax,%rbx
  321. mov %rsi,%rax
  322. adc %rdx,%rcx
  323. mulq %rdi # 2*g[2]*g[4]*19
  324. add %rax,%r8
  325. adc %rdx,%r9
  326. mov 8*4(%rsp),%rdi # restore 1st argument
  327. jmp .Lreduce51
  328. .align 32
  329. .Lreduce51:
  330. mov \$0x7ffffffffffff,%rbp # mask = 2^51-1
  331. mov %r10,%rdx
  332. shr \$51,%r10
  333. shl \$13,%r11
  334. and %rbp,%rdx # %rdx = g2 = h2 & mask
  335. or %r10,%r11 # h2>>51
  336. add %r11,%r12
  337. adc \$0,%r13 # h3 += h2>>51
  338. mov %rbx,%rax
  339. shr \$51,%rbx
  340. shl \$13,%rcx
  341. and %rbp,%rax # %rax = g0 = h0 & mask
  342. or %rbx,%rcx # h0>>51
  343. add %rcx,%r8 # h1 += h0>>51
  344. adc \$0,%r9
  345. mov %r12,%rbx
  346. shr \$51,%r12
  347. shl \$13,%r13
  348. and %rbp,%rbx # %rbx = g3 = h3 & mask
  349. or %r12,%r13 # h3>>51
  350. add %r13,%r14 # h4 += h3>>51
  351. adc \$0,%r15
  352. mov %r8,%rcx
  353. shr \$51,%r8
  354. shl \$13,%r9
  355. and %rbp,%rcx # %rcx = g1 = h1 & mask
  356. or %r8,%r9 # h1>>51
  357. add %r9,%rdx # g2 += h1>>51
  358. mov %r14,%r10
  359. shr \$51,%r14
  360. shl \$13,%r15
  361. and %rbp,%r10 # %r10 = g4 = h4 & mask
  362. or %r14,%r15 # h4>>51
  363. lea (%r15,%r15,8),%r14 # (h4>>51)*9
  364. lea (%r15,%r14,2),%r15 # (h4>>51)*19
  365. add %r15,%rax # g0 += (h4>>51)*19
  366. mov %rdx,%r8
  367. and %rbp,%rdx # g2 &= mask
  368. shr \$51,%r8
  369. add %r8,%rbx # g3 += g2>>51
  370. mov %rax,%r9
  371. and %rbp,%rax # g0 &= mask
  372. shr \$51,%r9
  373. add %r9,%rcx # g1 += g0>>51
  374. mov %rax,8*0(%rdi) # save the result
  375. mov %rcx,8*1(%rdi)
  376. mov %rdx,8*2(%rdi)
  377. mov %rbx,8*3(%rdi)
  378. mov %r10,8*4(%rdi)
  379. mov 8*5(%rsp),%r15
  380. .cfi_restore %r15
  381. mov 8*6(%rsp),%r14
  382. .cfi_restore %r14
  383. mov 8*7(%rsp),%r13
  384. .cfi_restore %r13
  385. mov 8*8(%rsp),%r12
  386. .cfi_restore %r12
  387. mov 8*9(%rsp),%rbx
  388. .cfi_restore %rbx
  389. mov 8*10(%rsp),%rbp
  390. .cfi_restore %rbp
  391. lea 8*11(%rsp),%rsp
  392. .cfi_adjust_cfa_offset -88 # %rsp moved up by 88 (6 pushes + 40 scratch), so the CFA offset shrinks
  393. .Lfe51_sqr_epilogue:
  394. ret
  395. .cfi_endproc
  396. .size x25519_fe51_sqr,.-x25519_fe51_sqr
  # x25519_fe51_mul121666: radix 2^51 multiplication of the five limbs at
  # %rsi by the constant 121666. Each 128-bit product lands in the register
  # pair .Lreduce51 expects for that limb; %rdi is never modified here, so it
  # still holds the 1st argument when the shared tail stores the result.
  397. .globl x25519_fe51_mul121666
  398. .type x25519_fe51_mul121666,\@function,2
  399. .align 32
  400. x25519_fe51_mul121666:
  401. .cfi_startproc
  402. push %rbp
  403. .cfi_push %rbp
  404. push %rbx
  405. .cfi_push %rbx
  406. push %r12
  407. .cfi_push %r12
  408. push %r13
  409. .cfi_push %r13
  410. push %r14
  411. .cfi_push %r14
  412. push %r15
  413. .cfi_push %r15
  414. lea -8*5(%rsp),%rsp # same 88-byte frame as the other fe51 routines, required by the shared epilogue
  415. .cfi_adjust_cfa_offset 40
  416. .Lfe51_mul121666_body:
  417. mov \$121666,%eax # reloaded before every mulq, which overwrites %rax
  418. mulq 8*0(%rsi)
  419. mov %rax,%rbx # %rbx:%rcx = h0
  420. mov \$121666,%eax
  421. mov %rdx,%rcx
  422. mulq 8*1(%rsi)
  423. mov %rax,%r8 # %r8:%r9 = h1
  424. mov \$121666,%eax
  425. mov %rdx,%r9
  426. mulq 8*2(%rsi)
  427. mov %rax,%r10 # %r10:%r11 = h2
  428. mov \$121666,%eax
  429. mov %rdx,%r11
  430. mulq 8*3(%rsi)
  431. mov %rax,%r12 # %r12:%r13 = h3
  432. mov \$121666,%eax
  433. mov %rdx,%r13
  434. mulq 8*4(%rsi)
  435. mov %rax,%r14 # %r14:%r15 = h4
  436. mov %rdx,%r15
  437. jmp .Lreduce51 # shared reduce/store/epilogue tail inside x25519_fe51_sqr
  438. .Lfe51_mul121666_epilogue:
  439. .cfi_endproc
  440. .size x25519_fe51_mul121666,.-x25519_fe51_mul121666
  441. ___
  442. ########################################################################
  443. # Base 2^64 subroutines modulo 2*(2^255-19)
  444. #
  # ADX-capable toolchain: emit the real radix 2^64 routines. The eight
  # accumulators acc0..acc7 are mapped onto %r8..%r15.
  445. if ($addx) {
  446. my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = map("%r$_",(8..15));
  447. $code.=<<___;
  448. .extern OPENSSL_ia32cap_P
  449. .globl x25519_fe64_eligible
  450. .type x25519_fe64_eligible,\@abi-omnipotent
  451. .align 32
  452. x25519_fe64_eligible:
  453. .cfi_startproc
  454. mov OPENSSL_ia32cap_P+8(%rip),%ecx # third 32-bit word of the capability vector
  455. xor %eax,%eax # default return value: 0
  456. and \$0x80100,%ecx # keep bits 8 and 19; NOTE(review): presumably BMI2 and ADX, confirm against the OPENSSL_ia32cap documentation
  457. cmp \$0x80100,%ecx
  458. cmove %ecx,%eax # non-zero result only when both bits are set
  459. ret
  460. .cfi_endproc
  461. .size x25519_fe64_eligible,.-x25519_fe64_eligible
  # x25519_fe64_mul: radix 2^64 multiplication. In the body %rsi points at a
  # and the 3rd argument at b (four 64-bit limbs each); the destination
  # pointer is pushed on the stack. The 512-bit product is built in
  # acc0..acc7 using two independent carry chains (adcx uses CF, adox uses
  # OF), then .Lreduce64 inside x25519_fe64_sqr folds and stores it.
  462. .globl x25519_fe64_mul
  463. .type x25519_fe64_mul,\@function,3
  464. .align 32
  465. x25519_fe64_mul:
  466. .cfi_startproc
  467. push %rbp
  468. .cfi_push %rbp
  469. push %rbx
  470. .cfi_push %rbx
  471. push %r12
  472. .cfi_push %r12
  473. push %r13
  474. .cfi_push %r13
  475. push %r14
  476. .cfi_push %r14
  477. push %r15
  478. .cfi_push %r15
  479. push %rdi # offload dst
  480. .cfi_push %rdi
  481. lea -8*2(%rsp),%rsp # scratch; (%rsp) later holds b[2]
  482. .cfi_adjust_cfa_offset 16
  483. .Lfe64_mul_body:
  484. mov %rdx,%rax # free %rdx, the implicit mulx operand
  485. mov 8*0(%rdx),%rbp # b[0]
  486. mov 8*0(%rsi),%rdx # a[0]
  487. mov 8*1(%rax),%rcx # b[1]
  488. mov 8*2(%rax),$acc6 # b[2]
  489. mov 8*3(%rax),$acc7 # b[3]
  490. mulx %rbp,$acc0,%rax # a[0]*b[0]
  491. xor %edi,%edi # cf=0,of=0
  492. mulx %rcx,$acc1,%rbx # a[0]*b[1]
  493. adcx %rax,$acc1
  494. mulx $acc6,$acc2,%rax # a[0]*b[2]
  495. adcx %rbx,$acc2
  496. mulx $acc7,$acc3,$acc4 # a[0]*b[3]
  497. mov 8*1(%rsi),%rdx # a[1]
  498. adcx %rax,$acc3
  499. mov $acc6,(%rsp) # offload b[2]
  500. adcx %rdi,$acc4 # cf=0
  501. mulx %rbp,%rax,%rbx # a[1]*b[0]
  502. adox %rax,$acc1
  503. adcx %rbx,$acc2
  504. mulx %rcx,%rax,%rbx # a[1]*b[1]
  505. adox %rax,$acc2
  506. adcx %rbx,$acc3
  507. mulx $acc6,%rax,%rbx # a[1]*b[2]
  508. adox %rax,$acc3
  509. adcx %rbx,$acc4
  510. mulx $acc7,%rax,$acc5 # a[1]*b[3]
  511. mov 8*2(%rsi),%rdx # a[2]
  512. adox %rax,$acc4
  513. adcx %rdi,$acc5 # cf=0
  514. adox %rdi,$acc5 # of=0
  515. mulx %rbp,%rax,%rbx # a[2]*b[0]
  516. adcx %rax,$acc2
  517. adox %rbx,$acc3
  518. mulx %rcx,%rax,%rbx # a[2]*b[1]
  519. adcx %rax,$acc3
  520. adox %rbx,$acc4
  521. mulx $acc6,%rax,%rbx # a[2]*b[2]
  522. adcx %rax,$acc4
  523. adox %rbx,$acc5
  524. mulx $acc7,%rax,$acc6 # a[2]*b[3]
  525. mov 8*3(%rsi),%rdx # a[3]
  526. adcx %rax,$acc5
  527. adox %rdi,$acc6 # of=0
  528. adcx %rdi,$acc6 # cf=0
  529. mulx %rbp,%rax,%rbx # a[3]*b[0]
  530. adox %rax,$acc3
  531. adcx %rbx,$acc4
  532. mulx %rcx,%rax,%rbx # a[3]*b[1]
  533. adox %rax,$acc4
  534. adcx %rbx,$acc5
  535. mulx (%rsp),%rax,%rbx # a[3]*b[2]
  536. adox %rax,$acc5
  537. adcx %rbx,$acc6
  538. mulx $acc7,%rax,$acc7 # a[3]*b[3]
  539. mov \$38,%edx # multiplier used by .Lreduce64
  540. adox %rax,$acc6
  541. adcx %rdi,$acc7 # cf=0
  542. adox %rdi,$acc7 # of=0
  543. jmp .Lreduce64 # shared reduce/store/epilogue tail inside x25519_fe64_sqr
  544. .Lfe64_mul_epilogue:
  545. .cfi_endproc
  546. .size x25519_fe64_mul,.-x25519_fe64_mul
  # x25519_fe64_sqr: radix 2^64 squaring. In the body %rsi points at a (four
  # 64-bit limbs); the destination pointer is pushed on the stack. The
  # off-diagonal products are summed once, doubled, and the squares added on
  # top, giving a 512-bit value in acc0..acc7. The .Lreduce64 tail below is
  # also entered by jmp from x25519_fe64_mul, which builds an identical
  # 72-byte frame; it expects %rdx = 38, %rdi = 0 and CF = OF = 0 on entry.
  547. .globl x25519_fe64_sqr
  548. .type x25519_fe64_sqr,\@function,2
  549. .align 32
  550. x25519_fe64_sqr:
  551. .cfi_startproc
  552. push %rbp
  553. .cfi_push %rbp
  554. push %rbx
  555. .cfi_push %rbx
  556. push %r12
  557. .cfi_push %r12
  558. push %r13
  559. .cfi_push %r13
  560. push %r14
  561. .cfi_push %r14
  562. push %r15
  563. .cfi_push %r15
  564. push %rdi # offload dst
  565. .cfi_push %rdi
  566. lea -8*2(%rsp),%rsp # keeps the frame layout identical to x25519_fe64_mul
  567. .cfi_adjust_cfa_offset 16
  568. .Lfe64_sqr_body:
  569. mov 8*0(%rsi),%rdx # a[0]
  570. mov 8*1(%rsi),%rcx # a[1]
  571. mov 8*2(%rsi),%rbp # a[2]
  572. mov 8*3(%rsi),%rsi # a[3]
  573. ################################################################
  574. mulx %rdx,$acc0,$acc7 # a[0]*a[0]
  575. mulx %rcx,$acc1,%rax # a[0]*a[1]
  576. xor %edi,%edi # cf=0,of=0
  577. mulx %rbp,$acc2,%rbx # a[0]*a[2]
  578. adcx %rax,$acc2
  579. mulx %rsi,$acc3,$acc4 # a[0]*a[3]
  580. mov %rcx,%rdx # a[1]
  581. adcx %rbx,$acc3
  582. adcx %rdi,$acc4 # cf=0
  583. ################################################################
  584. mulx %rbp,%rax,%rbx # a[1]*a[2]
  585. adox %rax,$acc3
  586. adcx %rbx,$acc4
  587. mulx %rsi,%rax,$acc5 # a[1]*a[3]
  588. mov %rbp,%rdx # a[2]
  589. adox %rax,$acc4
  590. adcx %rdi,$acc5
  591. ################################################################
  592. mulx %rsi,%rax,$acc6 # a[2]*a[3]
  593. mov %rcx,%rdx # a[1]
  594. adox %rax,$acc5
  595. adcx %rdi,$acc6 # cf=0
  596. adox %rdi,$acc6 # of=0
  597. adcx $acc1,$acc1 # acc1:6<<1
  598. adox $acc7,$acc1
  599. adcx $acc2,$acc2
  600. mulx %rdx,%rax,%rbx # a[1]*a[1]
  601. mov %rbp,%rdx # a[2]
  602. adcx $acc3,$acc3
  603. adox %rax,$acc2
  604. adcx $acc4,$acc4
  605. adox %rbx,$acc3
  606. mulx %rdx,%rax,%rbx # a[2]*a[2]
  607. mov %rsi,%rdx # a[3]
  608. adcx $acc5,$acc5
  609. adox %rax,$acc4
  610. adcx $acc6,$acc6
  611. adox %rbx,$acc5
  612. mulx %rdx,%rax,$acc7 # a[3]*a[3]
  613. mov \$38,%edx # multiplier used by .Lreduce64
  614. adox %rax,$acc6
  615. adcx %rdi,$acc7 # cf=0
  616. adox %rdi,$acc7 # of=0
  617. jmp .Lreduce64
  618. .align 32
  619. .Lreduce64:
  620. mulx $acc4,%rax,%rbx # fold the upper half in: acc0:3 += 38*acc4:7, since 2^256 = 38 mod 2^255-19
  621. adcx %rax,$acc0
  622. adox %rbx,$acc1
  623. mulx $acc5,%rax,%rbx
  624. adcx %rax,$acc1
  625. adox %rbx,$acc2
  626. mulx $acc6,%rax,%rbx
  627. adcx %rax,$acc2
  628. adox %rbx,$acc3
  629. mulx $acc7,%rax,$acc4
  630. adcx %rax,$acc3
  631. adox %rdi,$acc4
  632. adcx %rdi,$acc4 # acc4 = overflow word of the fold
  633. mov 8*2(%rsp),%rdi # restore dst
  634. imulq %rdx,$acc4 # overflow word * 38
  635. add $acc4,$acc0
  636. adc \$0,$acc1
  637. adc \$0,$acc2
  638. adc \$0,$acc3
  639. sbb %rax,%rax # cf -> mask
  640. and \$38,%rax
  641. add %rax,$acc0 # one more 38 if the previous addition carried out
  642. mov $acc1,8*1(%rdi)
  643. mov $acc2,8*2(%rdi)
  644. mov $acc3,8*3(%rdi)
  645. mov $acc0,8*0(%rdi)
  646. mov 8*3(%rsp),%r15
  647. .cfi_restore %r15
  648. mov 8*4(%rsp),%r14
  649. .cfi_restore %r14
  650. mov 8*5(%rsp),%r13
  651. .cfi_restore %r13
  652. mov 8*6(%rsp),%r12
  653. .cfi_restore %r12
  654. mov 8*7(%rsp),%rbx
  655. .cfi_restore %rbx
  656. mov 8*8(%rsp),%rbp
  657. .cfi_restore %rbp
  658. lea 8*9(%rsp),%rsp
  659. .cfi_adjust_cfa_offset -72 # %rsp moved up by 72 (7 pushes + 16 scratch), so the CFA offset shrinks
  660. .Lfe64_sqr_epilogue:
  661. ret
  662. .cfi_endproc
  663. .size x25519_fe64_sqr,.-x25519_fe64_sqr
  # x25519_fe64_mul121666: radix 2^64 multiplication of the four limbs at
  # %rsi by the constant 121666, result stored to %rdi. Uses only volatile
  # registers, so there is no stack frame.
  664. .globl x25519_fe64_mul121666
  665. .type x25519_fe64_mul121666,\@function,2
  666. .align 32
  667. x25519_fe64_mul121666:
  668. .Lfe64_mul121666_body:
  669. .cfi_startproc
  670. mov \$121666,%edx # implicit mulx multiplier
  671. mulx 8*0(%rsi),$acc0,%rcx
  672. mulx 8*1(%rsi),$acc1,%rax
  673. add %rcx,$acc1
  674. mulx 8*2(%rsi),$acc2,%rcx
  675. adc %rax,$acc2
  676. mulx 8*3(%rsi),$acc3,%rax
  677. adc %rcx,$acc3
  678. adc \$0,%rax # %rax = fifth (overflow) word of the product
  679. imulq \$38,%rax,%rax # fold it back: 2^256 = 38 mod 2^255-19
  680. add %rax,$acc0
  681. adc \$0,$acc1
  682. adc \$0,$acc2
  683. adc \$0,$acc3
  684. sbb %rax,%rax # cf -> mask
  685. and \$38,%rax
  686. add %rax,$acc0 # one more 38 if the previous addition carried out
  687. mov $acc1,8*1(%rdi)
  688. mov $acc2,8*2(%rdi)
  689. mov $acc3,8*3(%rdi)
  690. mov $acc0,8*0(%rdi)
  691. .Lfe64_mul121666_epilogue:
  692. ret
  693. .cfi_endproc
  694. .size x25519_fe64_mul121666,.-x25519_fe64_mul121666
  # x25519_fe64_add: radix 2^64 addition of the four-limb values at %rsi and
  # %rdx, result stored to %rdi. A carry out of bit 255 is folded back in as
  # +38 (2^256 = 38 mod 2^255-19), twice, because the first fold can itself
  # carry.
  695. .globl x25519_fe64_add
  696. .type x25519_fe64_add,\@function,3
  697. .align 32
  698. x25519_fe64_add:
  699. .Lfe64_add_body:
  700. .cfi_startproc
  701. mov 8*0(%rsi),$acc0
  702. mov 8*1(%rsi),$acc1
  703. mov 8*2(%rsi),$acc2
  704. mov 8*3(%rsi),$acc3
  705. add 8*0(%rdx),$acc0
  706. adc 8*1(%rdx),$acc1
  707. adc 8*2(%rdx),$acc2
  708. adc 8*3(%rdx),$acc3
  709. sbb %rax,%rax # cf -> mask
  710. and \$38,%rax
  711. add %rax,$acc0 # first fold of the carry
  712. adc \$0,$acc1
  713. adc \$0,$acc2
  714. mov $acc1,8*1(%rdi)
  715. adc \$0,$acc3
  716. mov $acc2,8*2(%rdi)
  717. sbb %rax,%rax # cf -> mask
  718. mov $acc3,8*3(%rdi)
  719. and \$38,%rax
  720. add %rax,$acc0 # second fold, only the lowest limb is touched
  721. mov $acc0,8*0(%rdi)
  722. .Lfe64_add_epilogue:
  723. ret
  724. .cfi_endproc
  725. .size x25519_fe64_add,.-x25519_fe64_add
  # x25519_fe64_sub: radix 2^64 subtraction, the four-limb value at %rsi
  # minus the one at %rdx, result stored to %rdi. A borrow out of the top
  # limb is compensated by subtracting 38 (2^256 = 38 mod 2^255-19), twice,
  # because the first compensation can itself borrow.
  726. .globl x25519_fe64_sub
  727. .type x25519_fe64_sub,\@function,3
  728. .align 32
  729. x25519_fe64_sub:
  730. .Lfe64_sub_body:
  731. .cfi_startproc
  732. mov 8*0(%rsi),$acc0
  733. mov 8*1(%rsi),$acc1
  734. mov 8*2(%rsi),$acc2
  735. mov 8*3(%rsi),$acc3
  736. sub 8*0(%rdx),$acc0
  737. sbb 8*1(%rdx),$acc1
  738. sbb 8*2(%rdx),$acc2
  739. sbb 8*3(%rdx),$acc3
  740. sbb %rax,%rax # cf -> mask
  741. and \$38,%rax
  742. sub %rax,$acc0 # first compensation for the borrow
  743. sbb \$0,$acc1
  744. sbb \$0,$acc2
  745. mov $acc1,8*1(%rdi)
  746. sbb \$0,$acc3
  747. mov $acc2,8*2(%rdi)
  748. sbb %rax,%rax # cf -> mask
  749. mov $acc3,8*3(%rdi)
  750. and \$38,%rax
  751. sub %rax,$acc0 # second compensation, only the lowest limb is touched
  752. mov $acc0,8*0(%rdi)
  753. .Lfe64_sub_epilogue:
  754. ret
  755. .cfi_endproc
  756. .size x25519_fe64_sub,.-x25519_fe64_sub
  # x25519_fe64_tobytes: final reduction modulo 2^255-19 of the four-limb
  # value at %rsi, result stored to %rdi. Bit 255 is folded in as 19, then 19
  # is added speculatively and taken back unless the sum reached 2^255.
  757. .globl x25519_fe64_tobytes
  758. .type x25519_fe64_tobytes,\@function,2
  759. .align 32
  760. x25519_fe64_tobytes:
  761. .Lfe64_to_body:
  762. .cfi_startproc
  763. mov 8*0(%rsi),$acc0
  764. mov 8*1(%rsi),$acc1
  765. mov 8*2(%rsi),$acc2
  766. mov 8*3(%rsi),$acc3
  767. ################################# reduction modulo 2^255-19
  768. lea ($acc3,$acc3),%rax
  769. sar \$63,$acc3 # most significant bit -> mask
  770. shr \$1,%rax # most significant bit cleared
  771. and \$19,$acc3 # 19 if bit 255 was set, else 0
  772. add \$19,$acc3 # compare to modulus in the same go
  773. add $acc3,$acc0
  774. adc \$0,$acc1
  775. adc \$0,$acc2
  776. adc \$0,%rax
  777. lea (%rax,%rax),$acc3
  778. sar \$63,%rax # most significant bit -> mask
  779. shr \$1,$acc3 # most significant bit cleared
  780. not %rax # mask is now set when bit 255 of the sum is clear
  781. and \$19,%rax
  782. sub %rax,$acc0 # take the speculative 19 back if the sum stayed below 2^255
  783. sbb \$0,$acc1
  784. sbb \$0,$acc2
  785. sbb \$0,$acc3
  786. mov $acc0,8*0(%rdi)
  787. mov $acc1,8*1(%rdi)
  788. mov $acc2,8*2(%rdi)
  789. mov $acc3,8*3(%rdi)
  790. .Lfe64_to_epilogue:
  791. ret
  792. .cfi_endproc
  793. .size x25519_fe64_tobytes,.-x25519_fe64_tobytes
  794. ___
  795. } else {
  # Toolchain cannot assemble ADX code: x25519_fe64_eligible always returns
  # 0 and every other fe64 entry point is an alias for a single ud2 trap.
  # NOTE(review): presumably callers consult x25519_fe64_eligible before
  # calling any fe64 routine -- confirm at the call sites.
  796. $code.=<<___;
  797. .globl x25519_fe64_eligible
  798. .type x25519_fe64_eligible,\@abi-omnipotent
  799. .align 32
  800. x25519_fe64_eligible:
  801. .cfi_startproc
  802. xor %eax,%eax # always report "not eligible"
  803. ret
  804. .cfi_endproc
  805. .size x25519_fe64_eligible,.-x25519_fe64_eligible
  806. .globl x25519_fe64_mul
  807. .type x25519_fe64_mul,\@abi-omnipotent
  808. .globl x25519_fe64_sqr
  809. .globl x25519_fe64_mul121666
  810. .globl x25519_fe64_add
  811. .globl x25519_fe64_sub
  812. .globl x25519_fe64_tobytes
  813. x25519_fe64_mul:
  814. x25519_fe64_sqr:
  815. x25519_fe64_mul121666:
  816. x25519_fe64_add:
  817. x25519_fe64_sub:
  818. x25519_fe64_tobytes:
  819. .cfi_startproc
  820. .byte 0x0f,0x0b # ud2
  821. ret
  822. .cfi_endproc
  823. .size x25519_fe64_mul,.-x25519_fe64_mul
  824. ___
  825. }
  # Identification string embedded in the generated module.
  826. $code.=<<___;
  827. .asciz "X25519 primitives for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  828. ___
  829. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  830. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  # Win64 only. short_handler is the handler named in the .xdata entries of
  # the frameless fe64 routines (mul121666, add, sub, tobytes): it only picks
  # the frame base and lets .Lcommon_seh_tail (in full_handler) do the rest.
  831. if ($win64) {
  832. $rec="%rcx";
  833. $frame="%rdx";
  834. $context="%r8";
  835. $disp="%r9";
  836. $code.=<<___;
  837. .extern __imp_RtlVirtualUnwind
  838. .type short_handler,\@abi-omnipotent
  839. .align 16
  840. short_handler:
  841. push %rsi
  842. push %rdi
  843. push %rbx
  844. push %rbp
  845. push %r12
  846. push %r13
  847. push %r14
  848. push %r15
  849. pushfq
  850. sub \$64,%rsp
  851. mov 120($context),%rax # pull context->Rax
  852. mov 248($context),%rbx # pull context->Rip
  853. mov 8($disp),%rsi # disp->ImageBase
  854. mov 56($disp),%r11 # disp->HandlerData
  855. mov 0(%r11),%r10d # HandlerData[0]
  856. lea (%rsi,%r10),%r10 # end of prologue label
  857. cmp %r10,%rbx # context->Rip<end of prologue label
  858. jb .Lcommon_seh_tail # still in the prologue: context->Rax is used as the frame base; NOTE(review): presumably the translator-generated Win64 prologue leaves the entry stack pointer in %rax, confirm in x86_64-xlate.pl
  859. mov 152($context),%rax # pull context->Rsp
  860. jmp .Lcommon_seh_tail
  861. .size short_handler,.-short_handler
  # full_handler: handler named in the .xdata entries of the routines that
  # save %rbp, %rbx and %r12-%r15. Between the body label (HandlerData[0])
  # and the epilogue label (HandlerData[1]) it reloads those six registers
  # from the frame, whose size is HandlerData[2], into the CONTEXT record.
  # .Lcommon_seh_tail is shared with short_handler: it restores Rsp/Rsi/Rdi in
  # the CONTEXT, copies it to disp->ContextRecord, calls RtlVirtualUnwind and
  # returns ExceptionContinueSearch.
  862. .type full_handler,\@abi-omnipotent
  863. .align 16
  864. full_handler:
  865. push %rsi
  866. push %rdi
  867. push %rbx
  868. push %rbp
  869. push %r12
  870. push %r13
  871. push %r14
  872. push %r15
  873. pushfq
  874. sub \$64,%rsp
  875. mov 120($context),%rax # pull context->Rax
  876. mov 248($context),%rbx # pull context->Rip
  877. mov 8($disp),%rsi # disp->ImageBase
  878. mov 56($disp),%r11 # disp->HandlerData
  879. mov 0(%r11),%r10d # HandlerData[0]
  880. lea (%rsi,%r10),%r10 # end of prologue label
  881. cmp %r10,%rbx # context->Rip<end of prologue label
  882. jb .Lcommon_seh_tail
  883. mov 152($context),%rax # pull context->Rsp
  884. mov 4(%r11),%r10d # HandlerData[1]
  885. lea (%rsi,%r10),%r10 # epilogue label
  886. cmp %r10,%rbx # context->Rip>=epilogue label
  887. jae .Lcommon_seh_tail
  888. mov 8(%r11),%r10d # HandlerData[2]
  889. lea (%rax,%r10),%rax # frame size added: %rax now points just above the saved registers
  890. mov -8(%rax),%rbp
  891. mov -16(%rax),%rbx
  892. mov -24(%rax),%r12
  893. mov -32(%rax),%r13
  894. mov -40(%rax),%r14
  895. mov -48(%rax),%r15
  896. mov %rbx,144($context) # restore context->Rbx
  897. mov %rbp,160($context) # restore context->Rbp
  898. mov %r12,216($context) # restore context->R12
  899. mov %r13,224($context) # restore context->R13
  900. mov %r14,232($context) # restore context->R14
  901. mov %r15,240($context) # restore context->R15
  902. .Lcommon_seh_tail:
  903. mov 8(%rax),%rdi # NOTE(review): presumably the caller's %rdi/%rsi were stored here by the translator-generated Win64 prologue, confirm in x86_64-xlate.pl
  904. mov 16(%rax),%rsi
  905. mov %rax,152($context) # restore context->Rsp
  906. mov %rsi,168($context) # restore context->Rsi
  907. mov %rdi,176($context) # restore context->Rdi
  908. mov 40($disp),%rdi # disp->ContextRecord
  909. mov $context,%rsi # context
  910. mov \$154,%ecx # sizeof(CONTEXT) in 8-byte units, the count for rep movsq
  911. .long 0xa548f3fc # cld; rep movsq
  912. mov $disp,%rsi
  913. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  914. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  915. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  916. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  917. mov 40(%rsi),%r10 # disp->ContextRecord
  918. lea 56(%rsi),%r11 # &disp->HandlerData
  919. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  920. mov %r10,32(%rsp) # arg5
  921. mov %r11,40(%rsp) # arg6
  922. mov %r12,48(%rsp) # arg7
  923. mov %rcx,56(%rsp) # arg8, (NULL)
  924. call *__imp_RtlVirtualUnwind(%rip)
  925. mov \$1,%eax # ExceptionContinueSearch
  926. add \$64,%rsp
  927. popfq
  928. pop %r15
  929. pop %r14
  930. pop %r13
  931. pop %r12
  932. pop %rbp
  933. pop %rbx
  934. pop %rdi
  935. pop %rsi
  936. ret
  937. .size full_handler,.-full_handler
  # .pdata: one (begin, end, unwind-info) RVA triple per routine. The fe64
  # triples are emitted only when the real fe64 code was generated.
  938. .section .pdata
  939. .align 4
  940. .rva .LSEH_begin_x25519_fe51_mul
  941. .rva .LSEH_end_x25519_fe51_mul
  942. .rva .LSEH_info_x25519_fe51_mul
  943. .rva .LSEH_begin_x25519_fe51_sqr
  944. .rva .LSEH_end_x25519_fe51_sqr
  945. .rva .LSEH_info_x25519_fe51_sqr
  946. .rva .LSEH_begin_x25519_fe51_mul121666
  947. .rva .LSEH_end_x25519_fe51_mul121666
  948. .rva .LSEH_info_x25519_fe51_mul121666
  949. ___
  950. $code.=<<___ if ($addx); # the stub variants of the fe64 routines get no unwind entries
  951. .rva .LSEH_begin_x25519_fe64_mul
  952. .rva .LSEH_end_x25519_fe64_mul
  953. .rva .LSEH_info_x25519_fe64_mul
  954. .rva .LSEH_begin_x25519_fe64_sqr
  955. .rva .LSEH_end_x25519_fe64_sqr
  956. .rva .LSEH_info_x25519_fe64_sqr
  957. .rva .LSEH_begin_x25519_fe64_mul121666
  958. .rva .LSEH_end_x25519_fe64_mul121666
  959. .rva .LSEH_info_x25519_fe64_mul121666
  960. .rva .LSEH_begin_x25519_fe64_add
  961. .rva .LSEH_end_x25519_fe64_add
  962. .rva .LSEH_info_x25519_fe64_add
  963. .rva .LSEH_begin_x25519_fe64_sub
  964. .rva .LSEH_end_x25519_fe64_sub
  965. .rva .LSEH_info_x25519_fe64_sub
  966. .rva .LSEH_begin_x25519_fe64_tobytes
  967. .rva .LSEH_end_x25519_fe64_tobytes
  968. .rva .LSEH_info_x25519_fe64_tobytes
  969. ___
  # .xdata: per-routine unwind info. Each entry names its handler and gives
  # the HandlerData the handler reads: body label, epilogue label and, for
  # full_handler, the frame size (88 for the fe51 routines, 72 for fe64).
  970. $code.=<<___;
  971. .section .xdata
  972. .align 8
  973. .LSEH_info_x25519_fe51_mul:
  974. .byte 9,0,0,0 # NOTE(review): presumably UNWIND_INFO version 1 with the exception-handler flag and no unwind codes, confirm against the UNWIND_INFO layout
  975. .rva full_handler
  976. .rva .Lfe51_mul_body,.Lfe51_mul_epilogue # HandlerData[]
  977. .long 88,0 # HandlerData[2] = frame size; the second value is not read by the handlers in this file
  978. .LSEH_info_x25519_fe51_sqr:
  979. .byte 9,0,0,0
  980. .rva full_handler
  981. .rva .Lfe51_sqr_body,.Lfe51_sqr_epilogue # HandlerData[]
  982. .long 88,0
  983. .LSEH_info_x25519_fe51_mul121666:
  984. .byte 9,0,0,0
  985. .rva full_handler
  986. .rva .Lfe51_mul121666_body,.Lfe51_mul121666_epilogue # HandlerData[]
  987. .long 88,0
  988. ___
  989. $code.=<<___ if ($addx); # matches the conditional fe64 entries in .pdata
  990. .LSEH_info_x25519_fe64_mul:
  991. .byte 9,0,0,0
  992. .rva full_handler
  993. .rva .Lfe64_mul_body,.Lfe64_mul_epilogue # HandlerData[]
  994. .long 72,0
  995. .LSEH_info_x25519_fe64_sqr:
  996. .byte 9,0,0,0
  997. .rva full_handler
  998. .rva .Lfe64_sqr_body,.Lfe64_sqr_epilogue # HandlerData[]
  999. .long 72,0
  1000. .LSEH_info_x25519_fe64_mul121666:
  1001. .byte 9,0,0,0
  1002. .rva short_handler
  1003. .rva .Lfe64_mul121666_body,.Lfe64_mul121666_epilogue # HandlerData[]
  1004. .LSEH_info_x25519_fe64_add:
  1005. .byte 9,0,0,0
  1006. .rva short_handler
  1007. .rva .Lfe64_add_body,.Lfe64_add_epilogue # HandlerData[]
  1008. .LSEH_info_x25519_fe64_sub:
  1009. .byte 9,0,0,0
  1010. .rva short_handler
  1011. .rva .Lfe64_sub_body,.Lfe64_sub_epilogue # HandlerData[]
  1012. .LSEH_info_x25519_fe64_tobytes:
  1013. .byte 9,0,0,0
  1014. .rva short_handler
  1015. .rva .Lfe64_to_body,.Lfe64_to_epilogue # HandlerData[]
  1016. ___
  1017. }
  1018. $code =~ s/\`([^\`]*)\`/eval $1/gem; # replace every backtick-quoted span in the generated text with the result of evaluating it as Perl
  1019. print $code; # STDOUT was aliased to the translator pipe above
  1020. close STDOUT or die "error closing STDOUT"; # closing the pipe is where a failure of the translator process surfaces