#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives a modest
# 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more
# than twice as fast. The most common case, rsa1024 sign, is improved
# by a respectable 50%. It remains to be seen whether loop unrolling and
# a dedicated squaring routine can provide further improvement...

# July 2011.
#
# Add dedicated squaring procedure. Performance improvement varies
# from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule inner loops in such a manner that they
# "fall through" for input lengths of 8, which is critical for
# 1024-bit RSA *sign*. Average performance improvement in comparison
# to the *initial* version of this module from 2005 is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# June 2013.
#
# Optimize reduction in squaring procedure and improve 1024+-bit RSA
# sign performance by 10-16% on Intel Sandy Bridge and later
# (virtually same on non-Intel processors).

# August 2013.
#
# Add MULX/ADOX/ADCX code path.

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
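
# Probe the toolchain for ADCX/ADOX (and MULX) support; a new enough GNU as,
# nasm, MASM or clang sets $addx, which enables the bn_mulx4x_mont and
# bn_sqrx8x code paths below.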
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
    $addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
    $addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
    my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
    $addx = ($ver>=3.03);
}
# int bn_mul_mont(
$rp="%rdi"; # BN_ULONG *rp,
$ap="%rsi"; # const BN_ULONG *ap,
$bp="%rdx"; # const BN_ULONG *bp,
$np="%rcx"; # const BN_ULONG *np,
$n0="%r8"; # const BN_ULONG *n0,
$num="%r9"; # int num);
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";
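
# For reference, the value the routine computes, as whole-number Perl
# (a sketch only, with hypothetical names; the assembly below works
# limb by limb and, unlike this sketch, in constant time):
#
#   use Math::BigInt;
#   sub ref_mul_mont {                   # rp = ap*bp*R^-1 mod np
#       my ($ap, $bp, $np, $num) = @_;   # Math::BigInt values
#       my $R = Math::BigInt->new(1)->blsft(64*$num);   # R = 2^(64*num)
#       return ($ap * $bp * $R->bmodinv($np)) % $np;
#   }
#
# n0 is the caller-supplied -np[0]^-1 mod 2^64 used by the word-by-word
# reduction below; it does not appear in the whole-number formula.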
$code=<<___;
.text
.extern OPENSSL_ia32cap_P
.globl bn_mul_mont
.type bn_mul_mont,\@function,6
.align 16
bn_mul_mont:
.cfi_startproc
mov ${num}d,${num}d
mov %rsp,%rax
.cfi_def_cfa_register %rax
test \$3,${num}d
jnz .Lmul_enter
cmp \$8,${num}d
jb .Lmul_enter
___
$code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%r11d
___
$code.=<<___;
cmp $ap,$bp
jne .Lmul4x_enter
test \$7,${num}d
jz .Lsqr8x_enter
jmp .Lmul4x_enter
.align 16
.Lmul_enter:
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
neg $num
mov %rsp,%r11
lea -16(%rsp,$num,8),%r10 # future alloca(8*(num+2))
neg $num # restore $num
and \$-1024,%r10 # minimize TLB usage
# An OS-agnostic version of __chkstk.
#
# Some OSes (Windows) insist on the stack being "wired" to
# physical memory in a strictly sequential manner, i.e. if a stack
# allocation spans two pages, then a reference to the farthest one can
# be punished with a SEGV. But page walking does good even on
# other OSes, because it guarantees that a rogue thread hits
# the guard page before it can damage an innocent one...
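#
# In effect (an illustration only, not generated code):
#
#   (void)*(volatile long *)%rsp;        /* touch the starting page     */
#   while (%rsp > %r10) {                /* %r10 = target stack pointer */
#       %rsp -= 4096;                    /* descend one page...         */
#       (void)*(volatile long *)%rsp;    /* ...and touch it             */
#   }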
sub %r10,%r11
and \$-4096,%r11
lea (%r10,%r11),%rsp
mov (%rsp),%r11
cmp %r10,%rsp
ja .Lmul_page_walk
jmp .Lmul_page_walk_done
.align 16
.Lmul_page_walk:
lea -4096(%rsp),%rsp
mov (%rsp),%r11
cmp %r10,%rsp
ja .Lmul_page_walk
.Lmul_page_walk_done:
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8
.Lmul_body:
mov $bp,%r12 # reassign $bp
___
$bp="%r12";
$code.=<<___;
mov ($n0),$n0 # pull n0[0] value
mov ($bp),$m0 # m0=bp[0]
mov ($ap),%rax
xor $i,$i # i=0
xor $j,$j # j=0
mov $n0,$m1
mulq $m0 # ap[0]*bp[0]
mov %rax,$lo0
mov ($np),%rax
imulq $lo0,$m1 # "tp[0]"*n0
mov %rdx,$hi0
mulq $m1 # np[0]*m1
add %rax,$lo0 # discarded
mov 8($ap),%rax
adc \$0,%rdx
mov %rdx,$hi1
lea 1($j),$j # j++
jmp .L1st_enter
.align 16
.L1st:
add %rax,$hi1
mov ($ap,$j,8),%rax
adc \$0,%rdx
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
mov $lo0,$hi0
adc \$0,%rdx
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
.L1st_enter:
mulq $m0 # ap[j]*bp[0]
add %rax,$hi0
mov ($np,$j,8),%rax
adc \$0,%rdx
lea 1($j),$j # j++
mov %rdx,$lo0
mulq $m1 # np[j]*m1
cmp $num,$j
jne .L1st
add %rax,$hi1
mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
mov $lo0,$hi0
xor %rdx,%rdx
add $hi0,$hi1
adc \$0,%rdx
mov $hi1,-8(%rsp,$num,8)
mov %rdx,(%rsp,$num,8) # store upmost overflow bit
lea 1($i),$i # i++
jmp .Louter
.align 16
.Louter:
mov ($bp,$i,8),$m0 # m0=bp[i]
xor $j,$j # j=0
mov $n0,$m1
mov (%rsp),$lo0
mulq $m0 # ap[0]*bp[i]
add %rax,$lo0 # ap[0]*bp[i]+tp[0]
mov ($np),%rax
adc \$0,%rdx
imulq $lo0,$m1 # tp[0]*n0
mov %rdx,$hi0
mulq $m1 # np[0]*m1
add %rax,$lo0 # discarded
mov 8($ap),%rax
adc \$0,%rdx
mov 8(%rsp),$lo0 # tp[1]
mov %rdx,$hi1
lea 1($j),$j # j++
jmp .Linner_enter
.align 16
.Linner:
add %rax,$hi1
mov ($ap,$j,8),%rax
adc \$0,%rdx
add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
mov (%rsp,$j,8),$lo0
adc \$0,%rdx
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
.Linner_enter:
mulq $m0 # ap[j]*bp[i]
add %rax,$hi0
mov ($np,$j,8),%rax
adc \$0,%rdx
add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
mov %rdx,$hi0
adc \$0,$hi0
lea 1($j),$j # j++
mulq $m1 # np[j]*m1
cmp $num,$j
jne .Linner
add %rax,$hi1
mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
mov (%rsp,$j,8),$lo0
adc \$0,%rdx
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
xor %rdx,%rdx
add $hi0,$hi1
adc \$0,%rdx
add $lo0,$hi1 # pull upmost overflow bit
adc \$0,%rdx
mov $hi1,-8(%rsp,$num,8)
mov %rdx,(%rsp,$num,8) # store upmost overflow bit
lea 1($i),$i # i++
cmp $num,$i
jb .Louter
xor $i,$i # i=0 and clear CF!
mov (%rsp),%rax # tp[0]
lea (%rsp),$ap # borrow ap for tp
mov $num,$j # j=num
jmp .Lsub
.align 16
.Lsub: sbb ($np,$i,8),%rax
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
mov 8($ap,$i,8),%rax # tp[i+1]
lea 1($i),$i # i++
dec $j # doesn't affect CF!
jnz .Lsub
sbb \$0,%rax # handle upmost overflow bit
xor $i,$i
and %rax,$ap
not %rax
mov $rp,$np
and %rax,$np
mov $num,$j # j=num
or $np,$ap # ap=borrow?tp:rp
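# The and/not/or sequence above is a branchless select (sketch):
#   mask = (tp - np borrowed) ? ~0 : 0   # derived from the final sbb
#   ap   = (tp & mask) | (rp & ~mask)
# so the copy loop below reads tp back when the subtraction borrowed
# (tp was already fully reduced) and rp (= tp-np) otherwise, without
# a data-dependent branch.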
.align 16
.Lcopy: # copy or in-place refresh
mov ($ap,$i,8),%rax
mov $i,(%rsp,$i,8) # zap temporary vector
mov %rax,($rp,$i,8) # rp[i]=tp[i]
lea 1($i),$i
sub \$1,$j
jnz .Lcopy
mov 8(%rsp,$num,8),%rsi # restore %rsp
.cfi_def_cfa %rsi,8
mov \$1,%rax
mov -48(%rsi),%r15
.cfi_restore %r15
mov -40(%rsi),%r14
.cfi_restore %r14
mov -32(%rsi),%r13
.cfi_restore %r13
mov -24(%rsi),%r12
.cfi_restore %r12
mov -16(%rsi),%rbp
.cfi_restore %rbp
mov -8(%rsi),%rbx
.cfi_restore %rbx
lea (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lmul_epilogue:
ret
.cfi_endproc
.size bn_mul_mont,.-bn_mul_mont
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type bn_mul4x_mont,\@function,6
.align 16
bn_mul4x_mont:
.cfi_startproc
mov ${num}d,${num}d
mov %rsp,%rax
.cfi_def_cfa_register %rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
and \$0x80100,%r11d
cmp \$0x80100,%r11d
je .Lmulx4x_enter
___
$code.=<<___;
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
neg $num
mov %rsp,%r11
lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4))
neg $num # restore
and \$-1024,%r10 # minimize TLB usage
sub %r10,%r11
and \$-4096,%r11
lea (%r10,%r11),%rsp
mov (%rsp),%r11
cmp %r10,%rsp
ja .Lmul4x_page_walk
jmp .Lmul4x_page_walk_done
.Lmul4x_page_walk:
lea -4096(%rsp),%rsp
mov (%rsp),%r11
cmp %r10,%rsp
ja .Lmul4x_page_walk
.Lmul4x_page_walk_done:
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8
.Lmul4x_body:
mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
mov %rdx,%r12 # reassign $bp
___
$bp="%r12";
$code.=<<___;
mov ($n0),$n0 # pull n0[0] value
mov ($bp),$m0 # m0=bp[0]
mov ($ap),%rax
xor $i,$i # i=0
xor $j,$j # j=0
mov $n0,$m1
mulq $m0 # ap[0]*bp[0]
mov %rax,$A[0]
mov ($np),%rax
imulq $A[0],$m1 # "tp[0]"*n0
mov %rdx,$A[1]
mulq $m1 # np[0]*m1
add %rax,$A[0] # discarded
mov 8($ap),%rax
adc \$0,%rdx
mov %rdx,$N[1]
mulq $m0
add %rax,$A[1]
mov 8($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1
add %rax,$N[1]
mov 16($ap),%rax
adc \$0,%rdx
add $A[1],$N[1]
lea 4($j),$j # j++
adc \$0,%rdx
mov $N[1],(%rsp)
mov %rdx,$N[0]
jmp .L1st4x
.align 16
.L1st4x:
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
mov -16($np,$j,8),%rax
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov -8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
mov -8($np,$j,8),%rax
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov ($ap,$j,8),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
mov ($np,$j,8),%rax
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov 8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[0],-8(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
mov 8($np,$j,8),%rax
adc \$0,%rdx
lea 4($j),$j # j++
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov -16($ap,$j,8),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[1],-32(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
cmp $num,$j
jb .L1st4x
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
mov -16($np,$j,8),%rax
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov -8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
mov -8($np,$j,8),%rax
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
xor $N[1],$N[1]
add $A[0],$N[0]
adc \$0,$N[1]
mov $N[0],-8(%rsp,$j,8)
mov $N[1],(%rsp,$j,8) # store upmost overflow bit
lea 1($i),$i # i++
.align 4
.Louter4x:
mov ($bp,$i,8),$m0 # m0=bp[i]
xor $j,$j # j=0
mov (%rsp),$A[0]
mov $n0,$m1
mulq $m0 # ap[0]*bp[i]
add %rax,$A[0] # ap[0]*bp[i]+tp[0]
mov ($np),%rax
adc \$0,%rdx
imulq $A[0],$m1 # tp[0]*n0
mov %rdx,$A[1]
mulq $m1 # np[0]*m1
add %rax,$A[0] # "$N[0]", discarded
mov 8($ap),%rax
adc \$0,%rdx
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov 8($np),%rax
adc \$0,%rdx
add 8(%rsp),$A[1] # +tp[1]
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov 16($ap),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
lea 4($j),$j # j+=4
adc \$0,%rdx
mov $N[1],(%rsp) # tp[j-1]
mov %rdx,$N[0]
jmp .Linner4x
.align 16
.Linner4x:
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
mov -16($np,$j,8),%rax
adc \$0,%rdx
add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov -8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov -8($np,$j,8),%rax
adc \$0,%rdx
add -8(%rsp,$j,8),$A[1]
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov ($ap,$j,8),%rax
adc \$0,%rdx
add $A[1],$N[1]
adc \$0,%rdx
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
mov ($np,$j,8),%rax
adc \$0,%rdx
add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov 8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
mov $N[0],-8(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov 8($np,$j,8),%rax
adc \$0,%rdx
add 8(%rsp,$j,8),$A[1]
adc \$0,%rdx
lea 4($j),$j # j++
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov -16($ap,$j,8),%rax
adc \$0,%rdx
add $A[1],$N[1]
adc \$0,%rdx
mov $N[1],-32(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
cmp $num,$j
jb .Linner4x
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
mov -16($np,$j,8),%rax
adc \$0,%rdx
add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov -8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov -8($np,$j,8),%rax
adc \$0,%rdx
add -8(%rsp,$j,8),$A[1]
adc \$0,%rdx
lea 1($i),$i # i++
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $A[1],$N[1]
adc \$0,%rdx
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
xor $N[1],$N[1]
add $A[0],$N[0]
adc \$0,$N[1]
add (%rsp,$num,8),$N[0] # pull upmost overflow bit
adc \$0,$N[1]
mov $N[0],-8(%rsp,$j,8)
mov $N[1],(%rsp,$j,8) # store upmost overflow bit
cmp $num,$i
jb .Louter4x
___
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
mov 16(%rsp,$num,8),$rp # restore $rp
lea -4($num),$j
mov 0(%rsp),@ri[0] # tp[0]
pxor %xmm0,%xmm0
mov 8(%rsp),@ri[1] # tp[1]
shr \$2,$j # j=num/4-1
lea (%rsp),$ap # borrow ap for tp
xor $i,$i # i=0 and clear CF!
sub 0($np),@ri[0]
mov 16($ap),@ri[2] # tp[2]
mov 24($ap),@ri[3] # tp[3]
sbb 8($np),@ri[1]
jmp .Lsub4x
.align 16
.Lsub4x:
mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
sbb 16($np,$i,8),@ri[2]
mov 32($ap,$i,8),@ri[0] # tp[i+1]
mov 40($ap,$i,8),@ri[1]
sbb 24($np,$i,8),@ri[3]
mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
sbb 32($np,$i,8),@ri[0]
mov 48($ap,$i,8),@ri[2]
mov 56($ap,$i,8),@ri[3]
sbb 40($np,$i,8),@ri[1]
lea 4($i),$i # i++
dec $j # doesn't affect CF!
jnz .Lsub4x
mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
mov 32($ap,$i,8),@ri[0] # load overflow bit
sbb 16($np,$i,8),@ri[2]
mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
sbb 24($np,$i,8),@ri[3]
mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
sbb \$0,@ri[0] # handle upmost overflow bit
mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
xor $i,$i # i=0
and @ri[0],$ap
not @ri[0]
mov $rp,$np
and @ri[0],$np
lea -4($num),$j
or $np,$ap # ap=borrow?tp:rp
shr \$2,$j # j=num/4-1
movdqu ($ap),%xmm1
movdqa %xmm0,(%rsp)
movdqu %xmm1,($rp)
jmp .Lcopy4x
.align 16
.Lcopy4x: # copy or in-place refresh
movdqu 16($ap,$i),%xmm2
movdqu 32($ap,$i),%xmm1
movdqa %xmm0,16(%rsp,$i)
movdqu %xmm2,16($rp,$i)
movdqa %xmm0,32(%rsp,$i)
movdqu %xmm1,32($rp,$i)
lea 32($i),$i
dec $j
jnz .Lcopy4x
movdqu 16($ap,$i),%xmm2
movdqa %xmm0,16(%rsp,$i)
movdqu %xmm2,16($rp,$i)
___
}
$code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
.cfi_def_cfa %rsi, 8
mov \$1,%rax
mov -48(%rsi),%r15
.cfi_restore %r15
mov -40(%rsi),%r14
.cfi_restore %r14
mov -32(%rsi),%r13
.cfi_restore %r13
mov -24(%rsi),%r12
.cfi_restore %r12
mov -16(%rsi),%rbp
.cfi_restore %rbp
mov -8(%rsi),%rbx
.cfi_restore %rbx
lea (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lmul4x_epilogue:
ret
.cfi_endproc
.size bn_mul4x_mont,.-bn_mul4x_mont
___
}}}
{{{
######################################################################
# void bn_sqr8x_mont(
my $rptr="%rdi"; # const BN_ULONG *rptr,
my $aptr="%rsi"; # const BN_ULONG *aptr,
my $bptr="%rdx"; # not used
my $nptr="%rcx"; # const BN_ULONG *nptr,
my $n0 ="%r8"; # const BN_ULONG *n0);
my $num ="%r9"; # int num, has to be divisible by 8
my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
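
# Reached from bn_mul_mont when bp == ap (and num is a multiple of 8),
# so the result is the Montgomery square rp[] = ap[]^2 * R^-1 mod np[],
# R = 2^(64*num). The multiply-and-reduce itself is delegated to
# bn_sqr8x_internal/bn_sqrx8x_internal; this wrapper sets up the stack
# frame and performs the final conditional subtraction of the modulus.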
$code.=<<___ if ($addx);
.extern bn_sqrx8x_internal # see x86_64-mont5 module
___
$code.=<<___;
.extern bn_sqr8x_internal # see x86_64-mont5 module
.type bn_sqr8x_mont,\@function,6
.align 32
bn_sqr8x_mont:
.cfi_startproc
mov %rsp,%rax
.cfi_def_cfa_register %rax
.Lsqr8x_enter:
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
.Lsqr8x_prologue:
mov ${num}d,%r10d
shl \$3,${num}d # convert $num to bytes
shl \$3+2,%r10 # 4*$num
neg $num
##############################################################
# Ensure that the stack frame doesn't alias with $aptr modulo
# 4096. This is done to allow the memory disambiguation logic
# to do its job.
#
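# (4K aliasing: a store to the frame and a later load from $aptr that
# share the low 12 address bits may be treated as overlapping by the
# speculative disambiguation hardware and get serialized; offsetting
# the frame avoids those false dependencies.)
#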
lea -64(%rsp,$num,2),%r11
mov %rsp,%rbp
mov ($n0),$n0 # *n0
sub $aptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lsqr8x_sp_alt
sub %r11,%rbp # align with $aptr
lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num)
jmp .Lsqr8x_sp_done
.align 32
.Lsqr8x_sp_alt:
lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
sub %r11,%rbp
.Lsqr8x_sp_done:
and \$-64,%rbp
mov %rsp,%r11
sub %rbp,%r11
and \$-4096,%r11
lea (%rbp,%r11),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lsqr8x_page_walk
jmp .Lsqr8x_page_walk_done
.align 16
.Lsqr8x_page_walk:
lea -4096(%rsp),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lsqr8x_page_walk
.Lsqr8x_page_walk_done:
mov $num,%r10
neg $num
mov $n0, 32(%rsp)
mov %rax, 40(%rsp) # save original %rsp
.cfi_cfa_expression %rsp+40,deref,+8
.Lsqr8x_body:
movq $nptr, %xmm2 # save pointer to modulus
pxor %xmm0,%xmm0
movq $rptr,%xmm1 # save $rptr
movq %r10, %xmm3 # -$num
___
$code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%eax
and \$0x80100,%eax
cmp \$0x80100,%eax
jne .Lsqr8x_nox
call bn_sqrx8x_internal # see x86_64-mont5 module
# %rax top-most carry
# %rbp nptr
# %rcx -8*num
# %r8 end of tp[2*num]
lea (%r8,%rcx),%rbx
mov %rcx,$num
mov %rcx,%rdx
movq %xmm1,$rptr
sar \$3+2,%rcx # %cf=0
jmp .Lsqr8x_sub
.align 32
.Lsqr8x_nox:
___
$code.=<<___;
call bn_sqr8x_internal # see x86_64-mont5 module
# %rax top-most carry
# %rbp nptr
# %r8 -8*num
# %rdi end of tp[2*num]
lea (%rdi,$num),%rbx
mov $num,%rcx
mov $num,%rdx
movq %xmm1,$rptr
sar \$3+2,%rcx # %cf=0
jmp .Lsqr8x_sub
.align 32
.Lsqr8x_sub:
mov 8*0(%rbx),%r12
mov 8*1(%rbx),%r13
mov 8*2(%rbx),%r14
mov 8*3(%rbx),%r15
lea 8*4(%rbx),%rbx
sbb 8*0(%rbp),%r12
sbb 8*1(%rbp),%r13
sbb 8*2(%rbp),%r14
sbb 8*3(%rbp),%r15
lea 8*4(%rbp),%rbp
mov %r12,8*0($rptr)
mov %r13,8*1($rptr)
mov %r14,8*2($rptr)
mov %r15,8*3($rptr)
lea 8*4($rptr),$rptr
inc %rcx # preserves %cf
jnz .Lsqr8x_sub
sbb \$0,%rax # top-most carry
lea (%rbx,$num),%rbx # rewind
lea ($rptr,$num),$rptr # rewind
movq %rax,%xmm1
pxor %xmm0,%xmm0
pshufd \$0,%xmm1,%xmm1
mov 40(%rsp),%rsi # restore %rsp
.cfi_def_cfa %rsi,8
jmp .Lsqr8x_cond_copy
.align 32
.Lsqr8x_cond_copy:
movdqa 16*0(%rbx),%xmm2
movdqa 16*1(%rbx),%xmm3
lea 16*2(%rbx),%rbx
movdqu 16*0($rptr),%xmm4
movdqu 16*1($rptr),%xmm5
lea 16*2($rptr),$rptr
movdqa %xmm0,-16*2(%rbx) # zero tp
movdqa %xmm0,-16*1(%rbx)
movdqa %xmm0,-16*2(%rbx,%rdx)
movdqa %xmm0,-16*1(%rbx,%rdx)
pcmpeqd %xmm1,%xmm0
pand %xmm1,%xmm2
pand %xmm1,%xmm3
pand %xmm0,%xmm4
pand %xmm0,%xmm5
pxor %xmm0,%xmm0
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqu %xmm4,-16*2($rptr)
movdqu %xmm5,-16*1($rptr)
add \$32,$num
jnz .Lsqr8x_cond_copy
mov \$1,%rax
mov -48(%rsi),%r15
.cfi_restore %r15
mov -40(%rsi),%r14
.cfi_restore %r14
mov -32(%rsi),%r13
.cfi_restore %r13
mov -24(%rsi),%r12
.cfi_restore %r12
mov -16(%rsi),%rbp
.cfi_restore %rbp
mov -8(%rsi),%rbx
.cfi_restore %rbx
lea (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lsqr8x_epilogue:
ret
.cfi_endproc
.size bn_sqr8x_mont,.-bn_sqr8x_mont
___
}}}
if ($addx) {{{
my $bp="%rdx"; # original value
$code.=<<___;
.type bn_mulx4x_mont,\@function,6
.align 32
bn_mulx4x_mont:
.cfi_startproc
mov %rsp,%rax
.cfi_def_cfa_register %rax
.Lmulx4x_enter:
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
.Lmulx4x_prologue:
shl \$3,${num}d # convert $num to bytes
xor %r10,%r10
sub $num,%r10 # -$num
mov ($n0),$n0 # *n0
lea -72(%rsp,%r10),%rbp # future alloca(frame+$num+8)
and \$-128,%rbp
mov %rsp,%r11
sub %rbp,%r11
and \$-4096,%r11
lea (%rbp,%r11),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lmulx4x_page_walk
jmp .Lmulx4x_page_walk_done
.align 16
.Lmulx4x_page_walk:
lea -4096(%rsp),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lmulx4x_page_walk
.Lmulx4x_page_walk_done:
lea ($bp,$num),%r10
##############################################################
# Stack layout
# +0 num
# +8 off-loaded &b[i]
# +16 end of b[num]
# +24 saved n0
# +32 saved rp
# +40 saved %rsp
# +48 inner counter
# +56
# +64 tmp[num+1]
#
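# Viewed as a C struct this is, roughly (an illustration only -- the
# code addresses these slots by literal offsets, no such struct exists):
#
#   struct mulx4x_frame {
#       uint64_t  num;         /* +0                     */
#       uint64_t *bptr;        /* +8   off-loaded &b[i]  */
#       uint64_t *b_end;       /* +16  end of b[num]     */
#       uint64_t  n0;          /* +24  saved n0          */
#       uint64_t *rp;          /* +32  saved rp          */
#       uint64_t *saved_rsp;   /* +40  saved %rsp        */
#       uint64_t  counter;     /* +48  inner counter     */
#       uint64_t  pad;         /* +56                    */
#       uint64_t  tmp[];       /* +64  tmp[num+1]        */
#   };
#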
mov $num,0(%rsp) # save $num
shr \$5,$num
mov %r10,16(%rsp) # end of b[num]
sub \$1,$num
mov $n0, 24(%rsp) # save *n0
mov $rp, 32(%rsp) # save $rp
mov %rax,40(%rsp) # save original %rsp
.cfi_cfa_expression %rsp+40,deref,+8
mov $num,48(%rsp) # inner counter
jmp .Lmulx4x_body
.align 32
.Lmulx4x_body:
___
my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)=
("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
my $rptr=$bptr;
$code.=<<___;
lea 8($bp),$bptr
mov ($bp),%rdx # b[0], $bp==%rdx actually
lea 64+32(%rsp),$tptr
mov %rdx,$bi
mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
mulx 1*8($aptr),%r11,%r14 # a[1]*b[0]
add %rax,%r11
mov $bptr,8(%rsp) # off-load &b[i]
mulx 2*8($aptr),%r12,%r13 # ...
adc %r14,%r12
adc \$0,%r13
mov $mi,$bptr # borrow $bptr
imulq 24(%rsp),$mi # "t[0]"*n0
xor $zero,$zero # cf=0, of=0
mulx 3*8($aptr),%rax,%r14
mov $mi,%rdx
lea 4*8($aptr),$aptr
adcx %rax,%r13
adcx $zero,%r14 # cf=0
mulx 0*8($nptr),%rax,%r10
adcx %rax,$bptr # discarded
adox %r11,%r10
mulx 1*8($nptr),%rax,%r11
adcx %rax,%r10
adox %r12,%r11
.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12
mov 48(%rsp),$bptr # counter value
mov %r10,-4*8($tptr)
adcx %rax,%r11
adox %r13,%r12
mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
mov %r11,-3*8($tptr)
adcx %rax,%r12
adox $zero,%r15 # of=0
lea 4*8($nptr),$nptr
mov %r12,-2*8($tptr)
jmp .Lmulx4x_1st
.align 32
.Lmulx4x_1st:
adcx $zero,%r15 # cf=0, modulo-scheduled
mulx 0*8($aptr),%r10,%rax # a[4]*b[0]
adcx %r14,%r10
mulx 1*8($aptr),%r11,%r14 # a[5]*b[0]
adcx %rax,%r11
mulx 2*8($aptr),%r12,%rax # ...
adcx %r14,%r12
mulx 3*8($aptr),%r13,%r14
.byte 0x67,0x67
mov $mi,%rdx
adcx %rax,%r13
adcx $zero,%r14 # cf=0
lea 4*8($aptr),$aptr
lea 4*8($tptr),$tptr
adox %r15,%r10
mulx 0*8($nptr),%rax,%r15
adcx %rax,%r10
adox %r15,%r11
mulx 1*8($nptr),%rax,%r15
adcx %rax,%r11
adox %r15,%r12
mulx 2*8($nptr),%rax,%r15
mov %r10,-5*8($tptr)
adcx %rax,%r12
mov %r11,-4*8($tptr)
adox %r15,%r13
mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
mov %r12,-3*8($tptr)
adcx %rax,%r13
adox $zero,%r15
lea 4*8($nptr),$nptr
mov %r13,-2*8($tptr)
dec $bptr # of=0, pass cf
jnz .Lmulx4x_1st
mov 0(%rsp),$num # load num
mov 8(%rsp),$bptr # re-load &b[i]
adc $zero,%r15 # modulo-scheduled
add %r15,%r14
sbb %r15,%r15 # top-most carry
mov %r14,-1*8($tptr)
jmp .Lmulx4x_outer
.align 32
.Lmulx4x_outer:
mov ($bptr),%rdx # b[i]
lea 8($bptr),$bptr # b++
sub $num,$aptr # rewind $aptr
mov %r15,($tptr) # save top-most carry
lea 64+4*8(%rsp),$tptr
sub $num,$nptr # rewind $nptr
mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0
mov %rdx,$bi
mulx 1*8($aptr),%r14,%r12 # a[1]*b[i]
adox -4*8($tptr),$mi
adcx %r14,%r11
mulx 2*8($aptr),%r15,%r13 # ...
adox -3*8($tptr),%r11
adcx %r15,%r12
adox -2*8($tptr),%r12
adcx $zero,%r13
adox $zero,%r13
mov $bptr,8(%rsp) # off-load &b[i]
mov $mi,%r15
imulq 24(%rsp),$mi # "t[0]"*n0
xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0
mulx 3*8($aptr),%rax,%r14
mov $mi,%rdx
adcx %rax,%r13
adox -1*8($tptr),%r13
adcx $zero,%r14
lea 4*8($aptr),$aptr
adox $zero,%r14
mulx 0*8($nptr),%rax,%r10
adcx %rax,%r15 # discarded
adox %r11,%r10
mulx 1*8($nptr),%rax,%r11
adcx %rax,%r10
adox %r12,%r11
mulx 2*8($nptr),%rax,%r12
mov %r10,-4*8($tptr)
adcx %rax,%r11
adox %r13,%r12
mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
mov %r11,-3*8($tptr)
lea 4*8($nptr),$nptr
adcx %rax,%r12
adox $zero,%r15 # of=0
mov 48(%rsp),$bptr # counter value
mov %r12,-2*8($tptr)
jmp .Lmulx4x_inner
.align 32
.Lmulx4x_inner:
mulx 0*8($aptr),%r10,%rax # a[4]*b[i]
adcx $zero,%r15 # cf=0, modulo-scheduled
adox %r14,%r10
mulx 1*8($aptr),%r11,%r14 # a[5]*b[i]
adcx 0*8($tptr),%r10
adox %rax,%r11
mulx 2*8($aptr),%r12,%rax # ...
adcx 1*8($tptr),%r11
adox %r14,%r12
mulx 3*8($aptr),%r13,%r14
mov $mi,%rdx
adcx 2*8($tptr),%r12
adox %rax,%r13
adcx 3*8($tptr),%r13
adox $zero,%r14 # of=0
lea 4*8($aptr),$aptr
lea 4*8($tptr),$tptr
adcx $zero,%r14 # cf=0
adox %r15,%r10
mulx 0*8($nptr),%rax,%r15
adcx %rax,%r10
adox %r15,%r11
mulx 1*8($nptr),%rax,%r15
adcx %rax,%r11
adox %r15,%r12
mulx 2*8($nptr),%rax,%r15
mov %r10,-5*8($tptr)
adcx %rax,%r12
adox %r15,%r13
mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
mov %r11,-4*8($tptr)
mov %r12,-3*8($tptr)
adcx %rax,%r13
adox $zero,%r15
lea 4*8($nptr),$nptr
mov %r13,-2*8($tptr)
dec $bptr # of=0, pass cf
jnz .Lmulx4x_inner
mov 0(%rsp),$num # load num
mov 8(%rsp),$bptr # re-load &b[i]
adc $zero,%r15 # modulo-scheduled
sub 0*8($tptr),$zero # pull top-most carry
adc %r15,%r14
sbb %r15,%r15 # top-most carry
mov %r14,-1*8($tptr)
cmp 16(%rsp),$bptr
jne .Lmulx4x_outer
lea 64(%rsp),$tptr
sub $num,$nptr # rewind $nptr
neg %r15
mov $num,%rdx
shr \$3+2,$num # %cf=0
mov 32(%rsp),$rptr # restore rp
jmp .Lmulx4x_sub
.align 32
.Lmulx4x_sub:
mov 8*0($tptr),%r11
mov 8*1($tptr),%r12
mov 8*2($tptr),%r13
mov 8*3($tptr),%r14
lea 8*4($tptr),$tptr
sbb 8*0($nptr),%r11
sbb 8*1($nptr),%r12
sbb 8*2($nptr),%r13
sbb 8*3($nptr),%r14
lea 8*4($nptr),$nptr
mov %r11,8*0($rptr)
mov %r12,8*1($rptr)
mov %r13,8*2($rptr)
mov %r14,8*3($rptr)
lea 8*4($rptr),$rptr
dec $num # preserves %cf
jnz .Lmulx4x_sub
sbb \$0,%r15 # top-most carry
lea 64(%rsp),$tptr
sub %rdx,$rptr # rewind
movq %r15,%xmm1
pxor %xmm0,%xmm0
pshufd \$0,%xmm1,%xmm1
mov 40(%rsp),%rsi # restore %rsp
.cfi_def_cfa %rsi,8
jmp .Lmulx4x_cond_copy
.align 32
.Lmulx4x_cond_copy:
movdqa 16*0($tptr),%xmm2
movdqa 16*1($tptr),%xmm3
lea 16*2($tptr),$tptr
movdqu 16*0($rptr),%xmm4
movdqu 16*1($rptr),%xmm5
lea 16*2($rptr),$rptr
movdqa %xmm0,-16*2($tptr) # zero tp
movdqa %xmm0,-16*1($tptr)
pcmpeqd %xmm1,%xmm0
pand %xmm1,%xmm2
pand %xmm1,%xmm3
pand %xmm0,%xmm4
pand %xmm0,%xmm5
pxor %xmm0,%xmm0
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqu %xmm4,-16*2($rptr)
movdqu %xmm5,-16*1($rptr)
sub \$32,%rdx
jnz .Lmulx4x_cond_copy
mov %rdx,($tptr)
mov \$1,%rax
mov -48(%rsi),%r15
.cfi_restore %r15
mov -40(%rsi),%r14
.cfi_restore %r14
mov -32(%rsi),%r13
.cfi_restore %r13
mov -24(%rsi),%r12
.cfi_restore %r12
mov -16(%rsi),%rbp
.cfi_restore %rbp
mov -8(%rsi),%rbx
.cfi_restore %rbx
lea (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lmulx4x_epilogue:
ret
.cfi_endproc
.size bn_mulx4x_mont,.-bn_mulx4x_mont
___
}}}
$code.=<<___;
.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 16
___
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type mul_handler,\@abi-omnipotent
.align 16
mul_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # end of prologue label
cmp %r10,%rbx # context->Rip<end of prologue label
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
jmp .Lcommon_pop_regs
.size mul_handler,.-mul_handler
.type sqr_handler,\@abi-omnipotent
.align 16
sqr_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # end of prologue label
cmp %r10,%rbx # context->Rip<.Lsqr_prologue
jb .Lcommon_seh_tail
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # body label
cmp %r10,%rbx # context->Rip<.Lsqr_body
jb .Lcommon_pop_regs
mov 152($context),%rax # pull context->Rsp
mov 8(%r11),%r10d # HandlerData[2]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
jae .Lcommon_seh_tail
mov 40(%rax),%rax # pull saved stack pointer
.Lcommon_pop_regs:
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT) in quadwords
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size sqr_handler,.-sqr_handler
.section .pdata
.align 4
.rva .LSEH_begin_bn_mul_mont
.rva .LSEH_end_bn_mul_mont
.rva .LSEH_info_bn_mul_mont
.rva .LSEH_begin_bn_mul4x_mont
.rva .LSEH_end_bn_mul4x_mont
.rva .LSEH_info_bn_mul4x_mont
.rva .LSEH_begin_bn_sqr8x_mont
.rva .LSEH_end_bn_sqr8x_mont
.rva .LSEH_info_bn_sqr8x_mont
___
$code.=<<___ if ($addx);
.rva .LSEH_begin_bn_mulx4x_mont
.rva .LSEH_end_bn_mulx4x_mont
.rva .LSEH_info_bn_mulx4x_mont
___
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_bn_mul_mont:
.byte 9,0,0,0
.rva mul_handler
.rva .Lmul_body,.Lmul_epilogue # HandlerData[]
.LSEH_info_bn_mul4x_mont:
.byte 9,0,0,0
.rva mul_handler
.rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
.LSEH_info_bn_sqr8x_mont:
.byte 9,0,0,0
.rva sqr_handler
.rva .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[]
.align 8
___
$code.=<<___ if ($addx);
.LSEH_info_bn_mulx4x_mont:
.byte 9,0,0,0
.rva sqr_handler
.rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
.align 8
___
}

print $code;
close STDOUT;