#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to powers table computed in BN_mod_exp_mont_consttime.
# In addition subroutine that scatters elements of the powers table
# is implemented, so that scatter-/gathering can be tuned without
# bn_exp.c modifications.

# Command line: first argument is either the assembler "flavour"
# (elf, macosx, mingw64, nasm, ...) or, when it contains a dot, the
# output file name; optional second argument is the output file.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 targets need SEH unwind data and preserve xmm6/xmm7.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator relative to this script's directory,
# falling back to the in-tree crypto/perlasm location.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# All generated code is piped through the flavour-specific translator.
open STDOUT,"| $^X $xlate $flavour $output";
  26. # int bn_mul_mont_gather5(
  27. $rp="%rdi"; # BN_ULONG *rp,
  28. $ap="%rsi"; # const BN_ULONG *ap,
  29. $bp="%rdx"; # const BN_ULONG *bp,
  30. $np="%rcx"; # const BN_ULONG *np,
  31. $n0="%r8"; # const BN_ULONG *n0,
  32. $num="%r9"; # int num,
  33. # int idx); # 0 to 2^5-1, "index" in $bp holding
  34. # pre-computed powers of a', interlaced
  35. # in such manner that b[0] is $bp[idx],
  36. # b[1] is [2^5+idx], etc.
  37. $lo0="%r10";
  38. $hi0="%r11";
  39. $hi1="%r13";
  40. $i="%r14";
  41. $j="%r15";
  42. $m0="%rbx";
  43. $m1="%rbp";
  44. $code=<<___;
  45. .text
  46. .globl bn_mul_mont_gather5
  47. .type bn_mul_mont_gather5,\@function,6
  48. .align 64
  49. bn_mul_mont_gather5:
  50. test \$3,${num}d
  51. jnz .Lmul_enter
  52. cmp \$8,${num}d
  53. jb .Lmul_enter
  54. jmp .Lmul4x_enter
  55. .align 16
  56. .Lmul_enter:
  57. mov ${num}d,${num}d
  58. mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
  59. push %rbx
  60. push %rbp
  61. push %r12
  62. push %r13
  63. push %r14
  64. push %r15
  65. ___
  66. $code.=<<___ if ($win64);
  67. lea -0x28(%rsp),%rsp
  68. movaps %xmm6,(%rsp)
  69. movaps %xmm7,0x10(%rsp)
  70. .Lmul_alloca:
  71. ___
  72. $code.=<<___;
  73. mov %rsp,%rax
  74. lea 2($num),%r11
  75. neg %r11
  76. lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
  77. and \$-1024,%rsp # minimize TLB usage
  78. mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
  79. .Lmul_body:
  80. mov $bp,%r12 # reassign $bp
  81. ___
  82. $bp="%r12";
  83. $STRIDE=2**5*8; # 5 is "window size"
  84. $N=$STRIDE/4; # should match cache line size
  85. $code.=<<___;
  86. mov %r10,%r11
  87. shr \$`log($N/8)/log(2)`,%r10
  88. and \$`$N/8-1`,%r11
  89. not %r10
  90. lea .Lmagic_masks(%rip),%rax
  91. and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
  92. lea 96($bp,%r11,8),$bp # pointer within 1st cache line
  93. movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
  94. movq 8(%rax,%r10,8),%xmm5 # cache line contains element
  95. movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
  96. movq 24(%rax,%r10,8),%xmm7
  97. movq `0*$STRIDE/4-96`($bp),%xmm0
  98. movq `1*$STRIDE/4-96`($bp),%xmm1
  99. pand %xmm4,%xmm0
  100. movq `2*$STRIDE/4-96`($bp),%xmm2
  101. pand %xmm5,%xmm1
  102. movq `3*$STRIDE/4-96`($bp),%xmm3
  103. pand %xmm6,%xmm2
  104. por %xmm1,%xmm0
  105. pand %xmm7,%xmm3
  106. por %xmm2,%xmm0
  107. lea $STRIDE($bp),$bp
  108. por %xmm3,%xmm0
  109. movq %xmm0,$m0 # m0=bp[0]
  110. mov ($n0),$n0 # pull n0[0] value
  111. mov ($ap),%rax
  112. xor $i,$i # i=0
  113. xor $j,$j # j=0
  114. movq `0*$STRIDE/4-96`($bp),%xmm0
  115. movq `1*$STRIDE/4-96`($bp),%xmm1
  116. pand %xmm4,%xmm0
  117. movq `2*$STRIDE/4-96`($bp),%xmm2
  118. pand %xmm5,%xmm1
  119. mov $n0,$m1
  120. mulq $m0 # ap[0]*bp[0]
  121. mov %rax,$lo0
  122. mov ($np),%rax
  123. movq `3*$STRIDE/4-96`($bp),%xmm3
  124. pand %xmm6,%xmm2
  125. por %xmm1,%xmm0
  126. pand %xmm7,%xmm3
  127. imulq $lo0,$m1 # "tp[0]"*n0
  128. mov %rdx,$hi0
  129. por %xmm2,%xmm0
  130. lea $STRIDE($bp),$bp
  131. por %xmm3,%xmm0
  132. mulq $m1 # np[0]*m1
  133. add %rax,$lo0 # discarded
  134. mov 8($ap),%rax
  135. adc \$0,%rdx
  136. mov %rdx,$hi1
  137. lea 1($j),$j # j++
  138. jmp .L1st_enter
  139. .align 16
  140. .L1st:
  141. add %rax,$hi1
  142. mov ($ap,$j,8),%rax
  143. adc \$0,%rdx
  144. add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
  145. mov $lo0,$hi0
  146. adc \$0,%rdx
  147. mov $hi1,-16(%rsp,$j,8) # tp[j-1]
  148. mov %rdx,$hi1
  149. .L1st_enter:
  150. mulq $m0 # ap[j]*bp[0]
  151. add %rax,$hi0
  152. mov ($np,$j,8),%rax
  153. adc \$0,%rdx
  154. lea 1($j),$j # j++
  155. mov %rdx,$lo0
  156. mulq $m1 # np[j]*m1
  157. cmp $num,$j
  158. jne .L1st
  159. movq %xmm0,$m0 # bp[1]
  160. add %rax,$hi1
  161. mov ($ap),%rax # ap[0]
  162. adc \$0,%rdx
  163. add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
  164. adc \$0,%rdx
  165. mov $hi1,-16(%rsp,$j,8) # tp[j-1]
  166. mov %rdx,$hi1
  167. mov $lo0,$hi0
  168. xor %rdx,%rdx
  169. add $hi0,$hi1
  170. adc \$0,%rdx
  171. mov $hi1,-8(%rsp,$num,8)
  172. mov %rdx,(%rsp,$num,8) # store upmost overflow bit
  173. lea 1($i),$i # i++
  174. jmp .Louter
  175. .align 16
  176. .Louter:
  177. xor $j,$j # j=0
  178. mov $n0,$m1
  179. mov (%rsp),$lo0
  180. movq `0*$STRIDE/4-96`($bp),%xmm0
  181. movq `1*$STRIDE/4-96`($bp),%xmm1
  182. pand %xmm4,%xmm0
  183. movq `2*$STRIDE/4-96`($bp),%xmm2
  184. pand %xmm5,%xmm1
  185. mulq $m0 # ap[0]*bp[i]
  186. add %rax,$lo0 # ap[0]*bp[i]+tp[0]
  187. mov ($np),%rax
  188. adc \$0,%rdx
  189. movq `3*$STRIDE/4-96`($bp),%xmm3
  190. pand %xmm6,%xmm2
  191. por %xmm1,%xmm0
  192. pand %xmm7,%xmm3
  193. imulq $lo0,$m1 # tp[0]*n0
  194. mov %rdx,$hi0
  195. por %xmm2,%xmm0
  196. lea $STRIDE($bp),$bp
  197. por %xmm3,%xmm0
  198. mulq $m1 # np[0]*m1
  199. add %rax,$lo0 # discarded
  200. mov 8($ap),%rax
  201. adc \$0,%rdx
  202. mov 8(%rsp),$lo0 # tp[1]
  203. mov %rdx,$hi1
  204. lea 1($j),$j # j++
  205. jmp .Linner_enter
  206. .align 16
  207. .Linner:
  208. add %rax,$hi1
  209. mov ($ap,$j,8),%rax
  210. adc \$0,%rdx
  211. add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
  212. mov (%rsp,$j,8),$lo0
  213. adc \$0,%rdx
  214. mov $hi1,-16(%rsp,$j,8) # tp[j-1]
  215. mov %rdx,$hi1
  216. .Linner_enter:
  217. mulq $m0 # ap[j]*bp[i]
  218. add %rax,$hi0
  219. mov ($np,$j,8),%rax
  220. adc \$0,%rdx
  221. add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
  222. mov %rdx,$hi0
  223. adc \$0,$hi0
  224. lea 1($j),$j # j++
  225. mulq $m1 # np[j]*m1
  226. cmp $num,$j
  227. jne .Linner
  228. movq %xmm0,$m0 # bp[i+1]
  229. add %rax,$hi1
  230. mov ($ap),%rax # ap[0]
  231. adc \$0,%rdx
  232. add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
  233. mov (%rsp,$j,8),$lo0
  234. adc \$0,%rdx
  235. mov $hi1,-16(%rsp,$j,8) # tp[j-1]
  236. mov %rdx,$hi1
  237. xor %rdx,%rdx
  238. add $hi0,$hi1
  239. adc \$0,%rdx
  240. add $lo0,$hi1 # pull upmost overflow bit
  241. adc \$0,%rdx
  242. mov $hi1,-8(%rsp,$num,8)
  243. mov %rdx,(%rsp,$num,8) # store upmost overflow bit
  244. lea 1($i),$i # i++
  245. cmp $num,$i
  246. jl .Louter
  247. xor $i,$i # i=0 and clear CF!
  248. mov (%rsp),%rax # tp[0]
  249. lea (%rsp),$ap # borrow ap for tp
  250. mov $num,$j # j=num
  251. jmp .Lsub
  252. .align 16
  253. .Lsub: sbb ($np,$i,8),%rax
  254. mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
  255. mov 8($ap,$i,8),%rax # tp[i+1]
  256. lea 1($i),$i # i++
  257. dec $j # doesnn't affect CF!
  258. jnz .Lsub
  259. sbb \$0,%rax # handle upmost overflow bit
  260. xor $i,$i
  261. and %rax,$ap
  262. not %rax
  263. mov $rp,$np
  264. and %rax,$np
  265. mov $num,$j # j=num
  266. or $np,$ap # ap=borrow?tp:rp
  267. .align 16
  268. .Lcopy: # copy or in-place refresh
  269. mov ($ap,$i,8),%rax
  270. mov $i,(%rsp,$i,8) # zap temporary vector
  271. mov %rax,($rp,$i,8) # rp[i]=tp[i]
  272. lea 1($i),$i
  273. sub \$1,$j
  274. jnz .Lcopy
  275. mov 8(%rsp,$num,8),%rsi # restore %rsp
  276. mov \$1,%rax
  277. ___
  278. $code.=<<___ if ($win64);
  279. movaps (%rsi),%xmm6
  280. movaps 0x10(%rsi),%xmm7
  281. lea 0x28(%rsi),%rsi
  282. ___
  283. $code.=<<___;
  284. mov (%rsi),%r15
  285. mov 8(%rsi),%r14
  286. mov 16(%rsi),%r13
  287. mov 24(%rsi),%r12
  288. mov 32(%rsi),%rbp
  289. mov 40(%rsi),%rbx
  290. lea 48(%rsi),%rsp
  291. .Lmul_epilogue:
  292. ret
  293. .size bn_mul_mont_gather5,.-bn_mul_mont_gather5
  294. ___
  295. {{{
  296. my @A=("%r10","%r11");
  297. my @N=("%r13","%rdi");
  298. $code.=<<___;
  299. .type bn_mul4x_mont_gather5,\@function,6
  300. .align 16
  301. bn_mul4x_mont_gather5:
  302. .Lmul4x_enter:
  303. mov ${num}d,${num}d
  304. mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
  305. push %rbx
  306. push %rbp
  307. push %r12
  308. push %r13
  309. push %r14
  310. push %r15
  311. ___
  312. $code.=<<___ if ($win64);
  313. lea -0x28(%rsp),%rsp
  314. movaps %xmm6,(%rsp)
  315. movaps %xmm7,0x10(%rsp)
  316. .Lmul4x_alloca:
  317. ___
  318. $code.=<<___;
  319. mov %rsp,%rax
  320. lea 4($num),%r11
  321. neg %r11
  322. lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4))
  323. and \$-1024,%rsp # minimize TLB usage
  324. mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
  325. .Lmul4x_body:
  326. mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
  327. mov %rdx,%r12 # reassign $bp
  328. ___
  329. $bp="%r12";
  330. $STRIDE=2**5*8; # 5 is "window size"
  331. $N=$STRIDE/4; # should match cache line size
  332. $code.=<<___;
  333. mov %r10,%r11
  334. shr \$`log($N/8)/log(2)`,%r10
  335. and \$`$N/8-1`,%r11
  336. not %r10
  337. lea .Lmagic_masks(%rip),%rax
  338. and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
  339. lea 96($bp,%r11,8),$bp # pointer within 1st cache line
  340. movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
  341. movq 8(%rax,%r10,8),%xmm5 # cache line contains element
  342. movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
  343. movq 24(%rax,%r10,8),%xmm7
  344. movq `0*$STRIDE/4-96`($bp),%xmm0
  345. movq `1*$STRIDE/4-96`($bp),%xmm1
  346. pand %xmm4,%xmm0
  347. movq `2*$STRIDE/4-96`($bp),%xmm2
  348. pand %xmm5,%xmm1
  349. movq `3*$STRIDE/4-96`($bp),%xmm3
  350. pand %xmm6,%xmm2
  351. por %xmm1,%xmm0
  352. pand %xmm7,%xmm3
  353. por %xmm2,%xmm0
  354. lea $STRIDE($bp),$bp
  355. por %xmm3,%xmm0
  356. movq %xmm0,$m0 # m0=bp[0]
  357. mov ($n0),$n0 # pull n0[0] value
  358. mov ($ap),%rax
  359. xor $i,$i # i=0
  360. xor $j,$j # j=0
  361. movq `0*$STRIDE/4-96`($bp),%xmm0
  362. movq `1*$STRIDE/4-96`($bp),%xmm1
  363. pand %xmm4,%xmm0
  364. movq `2*$STRIDE/4-96`($bp),%xmm2
  365. pand %xmm5,%xmm1
  366. mov $n0,$m1
  367. mulq $m0 # ap[0]*bp[0]
  368. mov %rax,$A[0]
  369. mov ($np),%rax
  370. movq `3*$STRIDE/4-96`($bp),%xmm3
  371. pand %xmm6,%xmm2
  372. por %xmm1,%xmm0
  373. pand %xmm7,%xmm3
  374. imulq $A[0],$m1 # "tp[0]"*n0
  375. mov %rdx,$A[1]
  376. por %xmm2,%xmm0
  377. lea $STRIDE($bp),$bp
  378. por %xmm3,%xmm0
  379. mulq $m1 # np[0]*m1
  380. add %rax,$A[0] # discarded
  381. mov 8($ap),%rax
  382. adc \$0,%rdx
  383. mov %rdx,$N[1]
  384. mulq $m0
  385. add %rax,$A[1]
  386. mov 8($np),%rax
  387. adc \$0,%rdx
  388. mov %rdx,$A[0]
  389. mulq $m1
  390. add %rax,$N[1]
  391. mov 16($ap),%rax
  392. adc \$0,%rdx
  393. add $A[1],$N[1]
  394. lea 4($j),$j # j++
  395. adc \$0,%rdx
  396. mov $N[1],(%rsp)
  397. mov %rdx,$N[0]
  398. jmp .L1st4x
  399. .align 16
  400. .L1st4x:
  401. mulq $m0 # ap[j]*bp[0]
  402. add %rax,$A[0]
  403. mov -16($np,$j,8),%rax
  404. adc \$0,%rdx
  405. mov %rdx,$A[1]
  406. mulq $m1 # np[j]*m1
  407. add %rax,$N[0]
  408. mov -8($ap,$j,8),%rax
  409. adc \$0,%rdx
  410. add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
  411. adc \$0,%rdx
  412. mov $N[0],-24(%rsp,$j,8) # tp[j-1]
  413. mov %rdx,$N[1]
  414. mulq $m0 # ap[j]*bp[0]
  415. add %rax,$A[1]
  416. mov -8($np,$j,8),%rax
  417. adc \$0,%rdx
  418. mov %rdx,$A[0]
  419. mulq $m1 # np[j]*m1
  420. add %rax,$N[1]
  421. mov ($ap,$j,8),%rax
  422. adc \$0,%rdx
  423. add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
  424. adc \$0,%rdx
  425. mov $N[1],-16(%rsp,$j,8) # tp[j-1]
  426. mov %rdx,$N[0]
  427. mulq $m0 # ap[j]*bp[0]
  428. add %rax,$A[0]
  429. mov ($np,$j,8),%rax
  430. adc \$0,%rdx
  431. mov %rdx,$A[1]
  432. mulq $m1 # np[j]*m1
  433. add %rax,$N[0]
  434. mov 8($ap,$j,8),%rax
  435. adc \$0,%rdx
  436. add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
  437. adc \$0,%rdx
  438. mov $N[0],-8(%rsp,$j,8) # tp[j-1]
  439. mov %rdx,$N[1]
  440. mulq $m0 # ap[j]*bp[0]
  441. add %rax,$A[1]
  442. mov 8($np,$j,8),%rax
  443. adc \$0,%rdx
  444. lea 4($j),$j # j++
  445. mov %rdx,$A[0]
  446. mulq $m1 # np[j]*m1
  447. add %rax,$N[1]
  448. mov -16($ap,$j,8),%rax
  449. adc \$0,%rdx
  450. add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
  451. adc \$0,%rdx
  452. mov $N[1],-32(%rsp,$j,8) # tp[j-1]
  453. mov %rdx,$N[0]
  454. cmp $num,$j
  455. jl .L1st4x
  456. mulq $m0 # ap[j]*bp[0]
  457. add %rax,$A[0]
  458. mov -16($np,$j,8),%rax
  459. adc \$0,%rdx
  460. mov %rdx,$A[1]
  461. mulq $m1 # np[j]*m1
  462. add %rax,$N[0]
  463. mov -8($ap,$j,8),%rax
  464. adc \$0,%rdx
  465. add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
  466. adc \$0,%rdx
  467. mov $N[0],-24(%rsp,$j,8) # tp[j-1]
  468. mov %rdx,$N[1]
  469. mulq $m0 # ap[j]*bp[0]
  470. add %rax,$A[1]
  471. mov -8($np,$j,8),%rax
  472. adc \$0,%rdx
  473. mov %rdx,$A[0]
  474. mulq $m1 # np[j]*m1
  475. add %rax,$N[1]
  476. mov ($ap),%rax # ap[0]
  477. adc \$0,%rdx
  478. add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
  479. adc \$0,%rdx
  480. mov $N[1],-16(%rsp,$j,8) # tp[j-1]
  481. mov %rdx,$N[0]
  482. movq %xmm0,$m0 # bp[1]
  483. xor $N[1],$N[1]
  484. add $A[0],$N[0]
  485. adc \$0,$N[1]
  486. mov $N[0],-8(%rsp,$j,8)
  487. mov $N[1],(%rsp,$j,8) # store upmost overflow bit
  488. lea 1($i),$i # i++
  489. .align 4
  490. .Louter4x:
  491. xor $j,$j # j=0
  492. movq `0*$STRIDE/4-96`($bp),%xmm0
  493. movq `1*$STRIDE/4-96`($bp),%xmm1
  494. pand %xmm4,%xmm0
  495. movq `2*$STRIDE/4-96`($bp),%xmm2
  496. pand %xmm5,%xmm1
  497. mov (%rsp),$A[0]
  498. mov $n0,$m1
  499. mulq $m0 # ap[0]*bp[i]
  500. add %rax,$A[0] # ap[0]*bp[i]+tp[0]
  501. mov ($np),%rax
  502. adc \$0,%rdx
  503. movq `3*$STRIDE/4-96`($bp),%xmm3
  504. pand %xmm6,%xmm2
  505. por %xmm1,%xmm0
  506. pand %xmm7,%xmm3
  507. imulq $A[0],$m1 # tp[0]*n0
  508. mov %rdx,$A[1]
  509. por %xmm2,%xmm0
  510. lea $STRIDE($bp),$bp
  511. por %xmm3,%xmm0
  512. mulq $m1 # np[0]*m1
  513. add %rax,$A[0] # "$N[0]", discarded
  514. mov 8($ap),%rax
  515. adc \$0,%rdx
  516. mov %rdx,$N[1]
  517. mulq $m0 # ap[j]*bp[i]
  518. add %rax,$A[1]
  519. mov 8($np),%rax
  520. adc \$0,%rdx
  521. add 8(%rsp),$A[1] # +tp[1]
  522. adc \$0,%rdx
  523. mov %rdx,$A[0]
  524. mulq $m1 # np[j]*m1
  525. add %rax,$N[1]
  526. mov 16($ap),%rax
  527. adc \$0,%rdx
  528. add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
  529. lea 4($j),$j # j+=2
  530. adc \$0,%rdx
  531. mov $N[1],(%rsp) # tp[j-1]
  532. mov %rdx,$N[0]
  533. jmp .Linner4x
  534. .align 16
  535. .Linner4x:
  536. mulq $m0 # ap[j]*bp[i]
  537. add %rax,$A[0]
  538. mov -16($np,$j,8),%rax
  539. adc \$0,%rdx
  540. add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
  541. adc \$0,%rdx
  542. mov %rdx,$A[1]
  543. mulq $m1 # np[j]*m1
  544. add %rax,$N[0]
  545. mov -8($ap,$j,8),%rax
  546. adc \$0,%rdx
  547. add $A[0],$N[0]
  548. adc \$0,%rdx
  549. mov $N[0],-24(%rsp,$j,8) # tp[j-1]
  550. mov %rdx,$N[1]
  551. mulq $m0 # ap[j]*bp[i]
  552. add %rax,$A[1]
  553. mov -8($np,$j,8),%rax
  554. adc \$0,%rdx
  555. add -8(%rsp,$j,8),$A[1]
  556. adc \$0,%rdx
  557. mov %rdx,$A[0]
  558. mulq $m1 # np[j]*m1
  559. add %rax,$N[1]
  560. mov ($ap,$j,8),%rax
  561. adc \$0,%rdx
  562. add $A[1],$N[1]
  563. adc \$0,%rdx
  564. mov $N[1],-16(%rsp,$j,8) # tp[j-1]
  565. mov %rdx,$N[0]
  566. mulq $m0 # ap[j]*bp[i]
  567. add %rax,$A[0]
  568. mov ($np,$j,8),%rax
  569. adc \$0,%rdx
  570. add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
  571. adc \$0,%rdx
  572. mov %rdx,$A[1]
  573. mulq $m1 # np[j]*m1
  574. add %rax,$N[0]
  575. mov 8($ap,$j,8),%rax
  576. adc \$0,%rdx
  577. add $A[0],$N[0]
  578. adc \$0,%rdx
  579. mov $N[0],-8(%rsp,$j,8) # tp[j-1]
  580. mov %rdx,$N[1]
  581. mulq $m0 # ap[j]*bp[i]
  582. add %rax,$A[1]
  583. mov 8($np,$j,8),%rax
  584. adc \$0,%rdx
  585. add 8(%rsp,$j,8),$A[1]
  586. adc \$0,%rdx
  587. lea 4($j),$j # j++
  588. mov %rdx,$A[0]
  589. mulq $m1 # np[j]*m1
  590. add %rax,$N[1]
  591. mov -16($ap,$j,8),%rax
  592. adc \$0,%rdx
  593. add $A[1],$N[1]
  594. adc \$0,%rdx
  595. mov $N[1],-32(%rsp,$j,8) # tp[j-1]
  596. mov %rdx,$N[0]
  597. cmp $num,$j
  598. jl .Linner4x
  599. mulq $m0 # ap[j]*bp[i]
  600. add %rax,$A[0]
  601. mov -16($np,$j,8),%rax
  602. adc \$0,%rdx
  603. add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
  604. adc \$0,%rdx
  605. mov %rdx,$A[1]
  606. mulq $m1 # np[j]*m1
  607. add %rax,$N[0]
  608. mov -8($ap,$j,8),%rax
  609. adc \$0,%rdx
  610. add $A[0],$N[0]
  611. adc \$0,%rdx
  612. mov $N[0],-24(%rsp,$j,8) # tp[j-1]
  613. mov %rdx,$N[1]
  614. mulq $m0 # ap[j]*bp[i]
  615. add %rax,$A[1]
  616. mov -8($np,$j,8),%rax
  617. adc \$0,%rdx
  618. add -8(%rsp,$j,8),$A[1]
  619. adc \$0,%rdx
  620. lea 1($i),$i # i++
  621. mov %rdx,$A[0]
  622. mulq $m1 # np[j]*m1
  623. add %rax,$N[1]
  624. mov ($ap),%rax # ap[0]
  625. adc \$0,%rdx
  626. add $A[1],$N[1]
  627. adc \$0,%rdx
  628. mov $N[1],-16(%rsp,$j,8) # tp[j-1]
  629. mov %rdx,$N[0]
  630. movq %xmm0,$m0 # bp[i+1]
  631. xor $N[1],$N[1]
  632. add $A[0],$N[0]
  633. adc \$0,$N[1]
  634. add (%rsp,$num,8),$N[0] # pull upmost overflow bit
  635. adc \$0,$N[1]
  636. mov $N[0],-8(%rsp,$j,8)
  637. mov $N[1],(%rsp,$j,8) # store upmost overflow bit
  638. cmp $num,$i
  639. jl .Louter4x
  640. ___
  641. {
  642. my @ri=("%rax","%rdx",$m0,$m1);
  643. $code.=<<___;
  644. mov 16(%rsp,$num,8),$rp # restore $rp
  645. mov 0(%rsp),@ri[0] # tp[0]
  646. pxor %xmm0,%xmm0
  647. mov 8(%rsp),@ri[1] # tp[1]
  648. shr \$2,$num # num/=4
  649. lea (%rsp),$ap # borrow ap for tp
  650. xor $i,$i # i=0 and clear CF!
  651. sub 0($np),@ri[0]
  652. mov 16($ap),@ri[2] # tp[2]
  653. mov 24($ap),@ri[3] # tp[3]
  654. sbb 8($np),@ri[1]
  655. lea -1($num),$j # j=num/4-1
  656. jmp .Lsub4x
  657. .align 16
  658. .Lsub4x:
  659. mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
  660. mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
  661. sbb 16($np,$i,8),@ri[2]
  662. mov 32($ap,$i,8),@ri[0] # tp[i+1]
  663. mov 40($ap,$i,8),@ri[1]
  664. sbb 24($np,$i,8),@ri[3]
  665. mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
  666. mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
  667. sbb 32($np,$i,8),@ri[0]
  668. mov 48($ap,$i,8),@ri[2]
  669. mov 56($ap,$i,8),@ri[3]
  670. sbb 40($np,$i,8),@ri[1]
  671. lea 4($i),$i # i++
  672. dec $j # doesnn't affect CF!
  673. jnz .Lsub4x
  674. mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
  675. mov 32($ap,$i,8),@ri[0] # load overflow bit
  676. sbb 16($np,$i,8),@ri[2]
  677. mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
  678. sbb 24($np,$i,8),@ri[3]
  679. mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
  680. sbb \$0,@ri[0] # handle upmost overflow bit
  681. mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
  682. xor $i,$i # i=0
  683. and @ri[0],$ap
  684. not @ri[0]
  685. mov $rp,$np
  686. and @ri[0],$np
  687. lea -1($num),$j
  688. or $np,$ap # ap=borrow?tp:rp
  689. movdqu ($ap),%xmm1
  690. movdqa %xmm0,(%rsp)
  691. movdqu %xmm1,($rp)
  692. jmp .Lcopy4x
  693. .align 16
  694. .Lcopy4x: # copy or in-place refresh
  695. movdqu 16($ap,$i),%xmm2
  696. movdqu 32($ap,$i),%xmm1
  697. movdqa %xmm0,16(%rsp,$i)
  698. movdqu %xmm2,16($rp,$i)
  699. movdqa %xmm0,32(%rsp,$i)
  700. movdqu %xmm1,32($rp,$i)
  701. lea 32($i),$i
  702. dec $j
  703. jnz .Lcopy4x
  704. shl \$2,$num
  705. movdqu 16($ap,$i),%xmm2
  706. movdqa %xmm0,16(%rsp,$i)
  707. movdqu %xmm2,16($rp,$i)
  708. ___
  709. }
  710. $code.=<<___;
  711. mov 8(%rsp,$num,8),%rsi # restore %rsp
  712. mov \$1,%rax
  713. ___
  714. $code.=<<___ if ($win64);
  715. movaps (%rsi),%xmm6
  716. movaps 0x10(%rsi),%xmm7
  717. lea 0x28(%rsi),%rsi
  718. ___
  719. $code.=<<___;
  720. mov (%rsi),%r15
  721. mov 8(%rsi),%r14
  722. mov 16(%rsi),%r13
  723. mov 24(%rsi),%r12
  724. mov 32(%rsi),%rbp
  725. mov 40(%rsi),%rbx
  726. lea 48(%rsi),%rsp
  727. .Lmul4x_epilogue:
  728. ret
  729. .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
  730. ___
  731. }}}
  732. {
  733. my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
  734. ("%rdi","%rsi","%rdx","%rcx"); # Unix order
  735. $code.=<<___;
  736. .globl bn_scatter5
  737. .type bn_scatter5,\@abi-omnipotent
  738. .align 16
  739. bn_scatter5:
  740. lea ($tbl,$idx,8),$tbl
  741. .Lscatter:
  742. mov ($inp),%rax
  743. lea 8($inp),$inp
  744. mov %rax,($tbl)
  745. lea 32*8($tbl),$tbl
  746. sub \$1,$num
  747. jnz .Lscatter
  748. ret
  749. .size bn_scatter5,.-bn_scatter5
  750. ___
  751. }
  752. $code.=<<___;
  753. .align 64
  754. .Lmagic_masks:
  755. .long 0,0, 0,0, 0,0, -1,-1
  756. .long 0,0, 0,0, 0,0, 0,0
  757. .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  758. ___
  759. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  760. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  761. if ($win64) {
  762. $rec="%rcx";
  763. $frame="%rdx";
  764. $context="%r8";
  765. $disp="%r9";
  766. $code.=<<___;
  767. .extern __imp_RtlVirtualUnwind
  768. .type mul_handler,\@abi-omnipotent
  769. .align 16
  770. mul_handler:
  771. push %rsi
  772. push %rdi
  773. push %rbx
  774. push %rbp
  775. push %r12
  776. push %r13
  777. push %r14
  778. push %r15
  779. pushfq
  780. sub \$64,%rsp
  781. mov 120($context),%rax # pull context->Rax
  782. mov 248($context),%rbx # pull context->Rip
  783. mov 8($disp),%rsi # disp->ImageBase
  784. mov 56($disp),%r11 # disp->HandlerData
  785. mov 0(%r11),%r10d # HandlerData[0]
  786. lea (%rsi,%r10),%r10 # end of prologue label
  787. cmp %r10,%rbx # context->Rip<end of prologue label
  788. jb .Lcommon_seh_tail
  789. lea `40+48`(%rax),%rax
  790. mov 4(%r11),%r10d # HandlerData[1]
  791. lea (%rsi,%r10),%r10 # end of alloca label
  792. cmp %r10,%rbx # context->Rip<end of alloca label
  793. jb .Lcommon_seh_tail
  794. mov 152($context),%rax # pull context->Rsp
  795. mov 8(%r11),%r10d # HandlerData[2]
  796. lea (%rsi,%r10),%r10 # epilogue label
  797. cmp %r10,%rbx # context->Rip>=epilogue label
  798. jae .Lcommon_seh_tail
  799. mov 192($context),%r10 # pull $num
  800. mov 8(%rax,%r10,8),%rax # pull saved stack pointer
  801. movaps (%rax),%xmm0
  802. movaps 16(%rax),%xmm1
  803. lea `40+48`(%rax),%rax
  804. mov -8(%rax),%rbx
  805. mov -16(%rax),%rbp
  806. mov -24(%rax),%r12
  807. mov -32(%rax),%r13
  808. mov -40(%rax),%r14
  809. mov -48(%rax),%r15
  810. mov %rbx,144($context) # restore context->Rbx
  811. mov %rbp,160($context) # restore context->Rbp
  812. mov %r12,216($context) # restore context->R12
  813. mov %r13,224($context) # restore context->R13
  814. mov %r14,232($context) # restore context->R14
  815. mov %r15,240($context) # restore context->R15
  816. movups %xmm0,512($context) # restore context->Xmm6
  817. movups %xmm1,528($context) # restore context->Xmm7
  818. .Lcommon_seh_tail:
  819. mov 8(%rax),%rdi
  820. mov 16(%rax),%rsi
  821. mov %rax,152($context) # restore context->Rsp
  822. mov %rsi,168($context) # restore context->Rsi
  823. mov %rdi,176($context) # restore context->Rdi
  824. mov 40($disp),%rdi # disp->ContextRecord
  825. mov $context,%rsi # context
  826. mov \$154,%ecx # sizeof(CONTEXT)
  827. .long 0xa548f3fc # cld; rep movsq
  828. mov $disp,%rsi
  829. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  830. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  831. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  832. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  833. mov 40(%rsi),%r10 # disp->ContextRecord
  834. lea 56(%rsi),%r11 # &disp->HandlerData
  835. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  836. mov %r10,32(%rsp) # arg5
  837. mov %r11,40(%rsp) # arg6
  838. mov %r12,48(%rsp) # arg7
  839. mov %rcx,56(%rsp) # arg8, (NULL)
  840. call *__imp_RtlVirtualUnwind(%rip)
  841. mov \$1,%eax # ExceptionContinueSearch
  842. add \$64,%rsp
  843. popfq
  844. pop %r15
  845. pop %r14
  846. pop %r13
  847. pop %r12
  848. pop %rbp
  849. pop %rbx
  850. pop %rdi
  851. pop %rsi
  852. ret
  853. .size mul_handler,.-mul_handler
  854. .section .pdata
  855. .align 4
  856. .rva .LSEH_begin_bn_mul_mont_gather5
  857. .rva .LSEH_end_bn_mul_mont_gather5
  858. .rva .LSEH_info_bn_mul_mont_gather5
  859. .rva .LSEH_begin_bn_mul4x_mont_gather5
  860. .rva .LSEH_end_bn_mul4x_mont_gather5
  861. .rva .LSEH_info_bn_mul4x_mont_gather5
  862. .section .xdata
  863. .align 8
  864. .LSEH_info_bn_mul_mont_gather5:
  865. .byte 9,0,0,0
  866. .rva mul_handler
  867. .rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[]
  868. .align 8
  869. .LSEH_info_bn_mul4x_mont_gather5:
  870. .byte 9,0,0,0
  871. .rva mul_handler
  872. .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
  873. .align 8
  874. ___
  875. }
  876. $code =~ s/\`([^\`]*)\`/eval($1)/gem;
  877. print $code;
  878. close STDOUT;