x86_64-mont5.pl 22 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070
  1. #!/usr/bin/env perl
  2. # ====================================================================
  3. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  4. # project. The module is, however, dual licensed under OpenSSL and
  5. # CRYPTOGAMS licenses depending on where you obtain it. For further
  6. # details see http://www.openssl.org/~appro/cryptogams/.
  7. # ====================================================================
  8. # August 2011.
  9. #
  10. # Companion to x86_64-mont.pl that optimizes cache-timing attack
  11. # countermeasures. The subroutines are produced by replacing bp[i]
  12. # references in their x86_64-mont.pl counterparts with cache-neutral
  13. # references to powers table computed in BN_mod_exp_mont_consttime.
  14. # In addition subroutine that scatters elements of the powers table
  15. # is implemented, so that scatter-/gathering can be tuned without
  16. # bn_exp.c modifications.
  17. $flavour = shift;
  18. $output = shift;
# A single dotted argument is the output path; flavour stays unset.
  19. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 targets need SEH unwind data and read the 7th argument from a
# different stack slot (see the `($win64?56:8)` loads below).
  20. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the x86_64-xlate.pl translator next to this script or under
# ../../perlasm/, then pipe all generated code through it to STDOUT.
  21. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  22. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  23. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  24. die "can't locate x86_64-xlate.pl";
  25. open STDOUT,"| $^X $xlate $flavour $output";
  26. # int bn_mul_mont_gather5(
# Argument registers follow the SysV AMD64 order; on Win64 the xlate
# translator inserts the marshalling from rcx/rdx/r8/r9.
  27. $rp="%rdi"; # BN_ULONG *rp,
  28. $ap="%rsi"; # const BN_ULONG *ap,
  29. $bp="%rdx"; # const BN_ULONG *bp,
  30. $np="%rcx"; # const BN_ULONG *np,
  31. $n0="%r8"; # const BN_ULONG *n0,
  32. $num="%r9"; # int num,
  33. # int idx); # 0 to 2^5-1, "index" in $bp holding
  34. # pre-computed powers of a', interlaced
  35. # in such manner that b[0] is $bp[idx],
  36. # b[1] is [2^5+idx], etc.
# Scratch registers for the main Montgomery loops: lo0/hi0 carry the
# ap[j]*bp[i] chain, hi1 the np[j]*m1 chain; i/j are loop counters;
# m0 holds the gathered bp word, m1 the per-iteration Montgomery factor.
  37. $lo0="%r10";
  38. $hi0="%r11";
  39. $hi1="%r13";
  40. $i="%r14";
  41. $j="%r15";
  42. $m0="%rbx";
  43. $m1="%rbp";
  44. $code=<<___;
  45. .text
  46. .globl bn_mul_mont_gather5
  47. .type bn_mul_mont_gather5,\@function,6
  48. .align 64
  49. bn_mul_mont_gather5:
# Dispatch: sizes divisible by 4 and at least 8 take the 4-way
# unrolled path; everything else falls through to the 1-way code.
  50. test \$3,${num}d
  51. jnz .Lmul_enter
  52. cmp \$8,${num}d
  53. jb .Lmul_enter
  54. jmp .Lmul4x_enter
  55. .align 16
  56. .Lmul_enter:
  57. mov ${num}d,${num}d
  58. mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
  59. push %rbx
  60. push %rbp
  61. push %r12
  62. push %r13
  63. push %r14
  64. push %r15
  65. ___
# Win64 only: xmm6/xmm7 are callee-saved and are used as gather masks
# below, so spill them; they are restored from the saved frame pointer
# in the epilogue.
  66. $code.=<<___ if ($win64);
  67. lea -0x28(%rsp),%rsp
  68. movaps %xmm6,(%rsp)
  69. movaps %xmm7,0x10(%rsp)
  70. .Lmul_alloca:
  71. ___
  72. $code.=<<___;
  73. mov %rsp,%rax
  74. lea 2($num),%r11
  75. neg %r11
  76. lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
  77. and \$-1024,%rsp # minimize TLB usage
  78. mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
  79. .Lmul_body:
  80. mov $bp,%r12 # reassign $bp
  81. ___
  82. $bp="%r12";
  83. $STRIDE=2**5*8; # 5 is "window size"
  84. $N=$STRIDE/4; # should match cache line size
  85. $code.=<<___;
# Cache-neutral gather setup: split the power index into a cache-line
# number (r10) and an offset within the line (r11), then load four
# 64-bit masks of which exactly one is all-ones.  Every gather below
# touches all four candidate cache lines and masks out all but the
# requested element, so the memory access pattern is independent of
# the secret index.
  86. mov %r10,%r11
  87. shr \$`log($N/8)/log(2)`,%r10
  88. and \$`$N/8-1`,%r11
  89. not %r10
  90. lea .Lmagic_masks(%rip),%rax
  91. and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
  92. lea 96($bp,%r11,8),$bp # pointer within 1st cache line
  93. movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
  94. movq 8(%rax,%r10,8),%xmm5 # cache line contains element
  95. movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
  96. movq 24(%rax,%r10,8),%xmm7
  97. movq `0*$STRIDE/4-96`($bp),%xmm0
  98. movq `1*$STRIDE/4-96`($bp),%xmm1
  99. pand %xmm4,%xmm0
  100. movq `2*$STRIDE/4-96`($bp),%xmm2
  101. pand %xmm5,%xmm1
  102. movq `3*$STRIDE/4-96`($bp),%xmm3
  103. pand %xmm6,%xmm2
  104. por %xmm1,%xmm0
  105. pand %xmm7,%xmm3
  106. por %xmm2,%xmm0
  107. lea $STRIDE($bp),$bp
  108. por %xmm3,%xmm0
  109. movq %xmm0,$m0 # m0=bp[0]
  110. mov ($n0),$n0 # pull n0[0] value
  111. mov ($ap),%rax
  112. xor $i,$i # i=0
  113. xor $j,$j # j=0
# First outer iteration (i=0); the gather of bp[1] is interleaved with
# the multiply to hide its latency.
  114. movq `0*$STRIDE/4-96`($bp),%xmm0
  115. movq `1*$STRIDE/4-96`($bp),%xmm1
  116. pand %xmm4,%xmm0
  117. movq `2*$STRIDE/4-96`($bp),%xmm2
  118. pand %xmm5,%xmm1
  119. mov $n0,$m1
  120. mulq $m0 # ap[0]*bp[0]
  121. mov %rax,$lo0
  122. mov ($np),%rax
  123. movq `3*$STRIDE/4-96`($bp),%xmm3
  124. pand %xmm6,%xmm2
  125. por %xmm1,%xmm0
  126. pand %xmm7,%xmm3
  127. imulq $lo0,$m1 # "tp[0]"*n0
  128. mov %rdx,$hi0
  129. por %xmm2,%xmm0
  130. lea $STRIDE($bp),$bp
  131. por %xmm3,%xmm0
  132. mulq $m1 # np[0]*m1
  133. add %rax,$lo0 # discarded
  134. mov 8($ap),%rax
  135. adc \$0,%rdx
  136. mov %rdx,$hi1
  137. lea 1($j),$j # j++
  138. jmp .L1st_enter
  139. .align 16
  140. .L1st:
  141. add %rax,$hi1
  142. mov ($ap,$j,8),%rax
  143. adc \$0,%rdx
  144. add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
  145. mov $lo0,$hi0
  146. adc \$0,%rdx
  147. mov $hi1,-16(%rsp,$j,8) # tp[j-1]
  148. mov %rdx,$hi1
  149. .L1st_enter:
  150. mulq $m0 # ap[j]*bp[0]
  151. add %rax,$hi0
  152. mov ($np,$j,8),%rax
  153. adc \$0,%rdx
  154. lea 1($j),$j # j++
  155. mov %rdx,$lo0
  156. mulq $m1 # np[j]*m1
  157. cmp $num,$j
  158. jne .L1st
  159. movq %xmm0,$m0 # bp[1]
  160. add %rax,$hi1
  161. mov ($ap),%rax # ap[0]
  162. adc \$0,%rdx
  163. add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
  164. adc \$0,%rdx
  165. mov $hi1,-16(%rsp,$j,8) # tp[j-1]
  166. mov %rdx,$hi1
  167. mov $lo0,$hi0
  168. xor %rdx,%rdx
  169. add $hi0,$hi1
  170. adc \$0,%rdx
  171. mov $hi1,-8(%rsp,$num,8)
  172. mov %rdx,(%rsp,$num,8) # store upmost overflow bit
  173. lea 1($i),$i # i++
  174. jmp .Louter
  175. .align 16
# Outer loop over the remaining bp words; tp accumulates the running
# Montgomery sum, one extra word at tp[num] holds the carry.
  176. .Louter:
  177. xor $j,$j # j=0
  178. mov $n0,$m1
  179. mov (%rsp),$lo0
  180. movq `0*$STRIDE/4-96`($bp),%xmm0
  181. movq `1*$STRIDE/4-96`($bp),%xmm1
  182. pand %xmm4,%xmm0
  183. movq `2*$STRIDE/4-96`($bp),%xmm2
  184. pand %xmm5,%xmm1
  185. mulq $m0 # ap[0]*bp[i]
  186. add %rax,$lo0 # ap[0]*bp[i]+tp[0]
  187. mov ($np),%rax
  188. adc \$0,%rdx
  189. movq `3*$STRIDE/4-96`($bp),%xmm3
  190. pand %xmm6,%xmm2
  191. por %xmm1,%xmm0
  192. pand %xmm7,%xmm3
  193. imulq $lo0,$m1 # tp[0]*n0
  194. mov %rdx,$hi0
  195. por %xmm2,%xmm0
  196. lea $STRIDE($bp),$bp
  197. por %xmm3,%xmm0
  198. mulq $m1 # np[0]*m1
  199. add %rax,$lo0 # discarded
  200. mov 8($ap),%rax
  201. adc \$0,%rdx
  202. mov 8(%rsp),$lo0 # tp[1]
  203. mov %rdx,$hi1
  204. lea 1($j),$j # j++
  205. jmp .Linner_enter
  206. .align 16
  207. .Linner:
  208. add %rax,$hi1
  209. mov ($ap,$j,8),%rax
  210. adc \$0,%rdx
  211. add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
  212. mov (%rsp,$j,8),$lo0
  213. adc \$0,%rdx
  214. mov $hi1,-16(%rsp,$j,8) # tp[j-1]
  215. mov %rdx,$hi1
  216. .Linner_enter:
  217. mulq $m0 # ap[j]*bp[i]
  218. add %rax,$hi0
  219. mov ($np,$j,8),%rax
  220. adc \$0,%rdx
  221. add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
  222. mov %rdx,$hi0
  223. adc \$0,$hi0
  224. lea 1($j),$j # j++
  225. mulq $m1 # np[j]*m1
  226. cmp $num,$j
  227. jne .Linner
  228. movq %xmm0,$m0 # bp[i+1]
  229. add %rax,$hi1
  230. mov ($ap),%rax # ap[0]
  231. adc \$0,%rdx
  232. add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
  233. mov (%rsp,$j,8),$lo0
  234. adc \$0,%rdx
  235. mov $hi1,-16(%rsp,$j,8) # tp[j-1]
  236. mov %rdx,$hi1
  237. xor %rdx,%rdx
  238. add $hi0,$hi1
  239. adc \$0,%rdx
  240. add $lo0,$hi1 # pull upmost overflow bit
  241. adc \$0,%rdx
  242. mov $hi1,-8(%rsp,$num,8)
  243. mov %rdx,(%rsp,$num,8) # store upmost overflow bit
  244. lea 1($i),$i # i++
  245. cmp $num,$i
  246. jl .Louter
# Final reduction: conditionally subtract the modulus (tp - np), then
# select tp or rp via mask arithmetic rather than a data-dependent
# branch, and copy the result out while zeroing the temporary vector.
  247. xor $i,$i # i=0 and clear CF!
  248. mov (%rsp),%rax # tp[0]
  249. lea (%rsp),$ap # borrow ap for tp
  250. mov $num,$j # j=num
  251. jmp .Lsub
  252. .align 16
  253. .Lsub: sbb ($np,$i,8),%rax
  254. mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
  255. mov 8($ap,$i,8),%rax # tp[i+1]
  256. lea 1($i),$i # i++
  257. dec $j # doesnn't affect CF!
  258. jnz .Lsub
  259. sbb \$0,%rax # handle upmost overflow bit
  260. xor $i,$i
  261. and %rax,$ap
  262. not %rax
  263. mov $rp,$np
  264. and %rax,$np
  265. mov $num,$j # j=num
  266. or $np,$ap # ap=borrow?tp:rp
  267. .align 16
  268. .Lcopy: # copy or in-place refresh
  269. mov ($ap,$i,8),%rax
  270. mov $i,(%rsp,$i,8) # zap temporary vector
  271. mov %rax,($rp,$i,8) # rp[i]=tp[i]
  272. lea 1($i),$i
  273. sub \$1,$j
  274. jnz .Lcopy
  275. mov 8(%rsp,$num,8),%rsi # restore %rsp
  276. mov \$1,%rax
  277. ___
# Win64 only: reload the callee-saved xmm registers spilled in the
# prologue before unwinding the GPR frame.
  278. $code.=<<___ if ($win64);
  279. movaps (%rsi),%xmm6
  280. movaps 0x10(%rsi),%xmm7
  281. lea 0x28(%rsi),%rsi
  282. ___
  283. $code.=<<___;
  284. mov (%rsi),%r15
  285. mov 8(%rsi),%r14
  286. mov 16(%rsi),%r13
  287. mov 24(%rsi),%r12
  288. mov 32(%rsi),%rbp
  289. mov 40(%rsi),%rbx
  290. lea 48(%rsi),%rsp
  291. .Lmul_epilogue:
  292. ret
  293. .size bn_mul_mont_gather5,.-bn_mul_mont_gather5
  294. ___
  295. {{{
# 4-way unrolled variant: A[] carries the ap*bp partial products,
# N[] the np*m1 chain; the two alternate so each unrolled step reads
# the value the previous step produced.
  296. my @A=("%r10","%r11");
  297. my @N=("%r13","%rdi");
  298. $code.=<<___;
  299. .type bn_mul4x_mont_gather5,\@function,6
  300. .align 16
  301. bn_mul4x_mont_gather5:
  302. .Lmul4x_enter:
  303. mov ${num}d,${num}d
  304. mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
  305. push %rbx
  306. push %rbp
  307. push %r12
  308. push %r13
  309. push %r14
  310. push %r15
  311. ___
  312. $code.=<<___ if ($win64);
  313. lea -0x28(%rsp),%rsp
  314. movaps %xmm6,(%rsp)
  315. movaps %xmm7,0x10(%rsp)
  316. .Lmul4x_alloca:
  317. ___
  318. $code.=<<___;
  319. mov %rsp,%rax
  320. lea 4($num),%r11
  321. neg %r11
  322. lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4))
  323. and \$-1024,%rsp # minimize TLB usage
  324. mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
  325. .Lmul4x_body:
  326. mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
  327. mov %rdx,%r12 # reassign $bp
  328. ___
  329. $bp="%r12";
  330. $STRIDE=2**5*8; # 5 is "window size"
  331. $N=$STRIDE/4; # should match cache line size
  332. $code.=<<___;
# Same cache-neutral mask/gather setup as in the 1-way routine.
  333. mov %r10,%r11
  334. shr \$`log($N/8)/log(2)`,%r10
  335. and \$`$N/8-1`,%r11
  336. not %r10
  337. lea .Lmagic_masks(%rip),%rax
  338. and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
  339. lea 96($bp,%r11,8),$bp # pointer within 1st cache line
  340. movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
  341. movq 8(%rax,%r10,8),%xmm5 # cache line contains element
  342. movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
  343. movq 24(%rax,%r10,8),%xmm7
  344. movq `0*$STRIDE/4-96`($bp),%xmm0
  345. movq `1*$STRIDE/4-96`($bp),%xmm1
  346. pand %xmm4,%xmm0
  347. movq `2*$STRIDE/4-96`($bp),%xmm2
  348. pand %xmm5,%xmm1
  349. movq `3*$STRIDE/4-96`($bp),%xmm3
  350. pand %xmm6,%xmm2
  351. por %xmm1,%xmm0
  352. pand %xmm7,%xmm3
  353. por %xmm2,%xmm0
  354. lea $STRIDE($bp),$bp
  355. por %xmm3,%xmm0
  356. movq %xmm0,$m0 # m0=bp[0]
  357. mov ($n0),$n0 # pull n0[0] value
  358. mov ($ap),%rax
  359. xor $i,$i # i=0
  360. xor $j,$j # j=0
  361. movq `0*$STRIDE/4-96`($bp),%xmm0
  362. movq `1*$STRIDE/4-96`($bp),%xmm1
  363. pand %xmm4,%xmm0
  364. movq `2*$STRIDE/4-96`($bp),%xmm2
  365. pand %xmm5,%xmm1
  366. mov $n0,$m1
  367. mulq $m0 # ap[0]*bp[0]
  368. mov %rax,$A[0]
  369. mov ($np),%rax
  370. movq `3*$STRIDE/4-96`($bp),%xmm3
  371. pand %xmm6,%xmm2
  372. por %xmm1,%xmm0
  373. pand %xmm7,%xmm3
  374. imulq $A[0],$m1 # "tp[0]"*n0
  375. mov %rdx,$A[1]
  376. por %xmm2,%xmm0
  377. lea $STRIDE($bp),$bp
  378. por %xmm3,%xmm0
  379. mulq $m1 # np[0]*m1
  380. add %rax,$A[0] # discarded
  381. mov 8($ap),%rax
  382. adc \$0,%rdx
  383. mov %rdx,$N[1]
  384. mulq $m0
  385. add %rax,$A[1]
  386. mov 8($np),%rax
  387. adc \$0,%rdx
  388. mov %rdx,$A[0]
  389. mulq $m1
  390. add %rax,$N[1]
  391. mov 16($ap),%rax
  392. adc \$0,%rdx
  393. add $A[1],$N[1]
  394. lea 4($j),$j # j++
  395. adc \$0,%rdx
  396. mov $N[1],(%rsp)
  397. mov %rdx,$N[0]
  398. jmp .L1st4x
  399. .align 16
# First-pass loop, four limbs per iteration.
  400. .L1st4x:
  401. mulq $m0 # ap[j]*bp[0]
  402. add %rax,$A[0]
  403. mov -16($np,$j,8),%rax
  404. adc \$0,%rdx
  405. mov %rdx,$A[1]
  406. mulq $m1 # np[j]*m1
  407. add %rax,$N[0]
  408. mov -8($ap,$j,8),%rax
  409. adc \$0,%rdx
  410. add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
  411. adc \$0,%rdx
  412. mov $N[0],-24(%rsp,$j,8) # tp[j-1]
  413. mov %rdx,$N[1]
  414. mulq $m0 # ap[j]*bp[0]
  415. add %rax,$A[1]
  416. mov -8($np,$j,8),%rax
  417. adc \$0,%rdx
  418. mov %rdx,$A[0]
  419. mulq $m1 # np[j]*m1
  420. add %rax,$N[1]
  421. mov ($ap,$j,8),%rax
  422. adc \$0,%rdx
  423. add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
  424. adc \$0,%rdx
  425. mov $N[1],-16(%rsp,$j,8) # tp[j-1]
  426. mov %rdx,$N[0]
  427. mulq $m0 # ap[j]*bp[0]
  428. add %rax,$A[0]
  429. mov ($np,$j,8),%rax
  430. adc \$0,%rdx
  431. mov %rdx,$A[1]
  432. mulq $m1 # np[j]*m1
  433. add %rax,$N[0]
  434. mov 8($ap,$j,8),%rax
  435. adc \$0,%rdx
  436. add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
  437. adc \$0,%rdx
  438. mov $N[0],-8(%rsp,$j,8) # tp[j-1]
  439. mov %rdx,$N[1]
  440. mulq $m0 # ap[j]*bp[0]
  441. add %rax,$A[1]
  442. mov 8($np,$j,8),%rax
  443. adc \$0,%rdx
  444. lea 4($j),$j # j++
  445. mov %rdx,$A[0]
  446. mulq $m1 # np[j]*m1
  447. add %rax,$N[1]
  448. mov -16($ap,$j,8),%rax
  449. adc \$0,%rdx
  450. add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
  451. adc \$0,%rdx
  452. mov $N[1],-32(%rsp,$j,8) # tp[j-1]
  453. mov %rdx,$N[0]
  454. cmp $num,$j
  455. jl .L1st4x
# Tail of the first pass: final two limbs plus carry word.
  456. mulq $m0 # ap[j]*bp[0]
  457. add %rax,$A[0]
  458. mov -16($np,$j,8),%rax
  459. adc \$0,%rdx
  460. mov %rdx,$A[1]
  461. mulq $m1 # np[j]*m1
  462. add %rax,$N[0]
  463. mov -8($ap,$j,8),%rax
  464. adc \$0,%rdx
  465. add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
  466. adc \$0,%rdx
  467. mov $N[0],-24(%rsp,$j,8) # tp[j-1]
  468. mov %rdx,$N[1]
  469. mulq $m0 # ap[j]*bp[0]
  470. add %rax,$A[1]
  471. mov -8($np,$j,8),%rax
  472. adc \$0,%rdx
  473. mov %rdx,$A[0]
  474. mulq $m1 # np[j]*m1
  475. add %rax,$N[1]
  476. mov ($ap),%rax # ap[0]
  477. adc \$0,%rdx
  478. add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
  479. adc \$0,%rdx
  480. mov $N[1],-16(%rsp,$j,8) # tp[j-1]
  481. mov %rdx,$N[0]
  482. movq %xmm0,$m0 # bp[1]
  483. xor $N[1],$N[1]
  484. add $A[0],$N[0]
  485. adc \$0,$N[1]
  486. mov $N[0],-8(%rsp,$j,8)
  487. mov $N[1],(%rsp,$j,8) # store upmost overflow bit
  488. lea 1($i),$i # i++
  489. .align 4
# Outer loop over remaining bp words, with the next gather interleaved.
  490. .Louter4x:
  491. xor $j,$j # j=0
  492. movq `0*$STRIDE/4-96`($bp),%xmm0
  493. movq `1*$STRIDE/4-96`($bp),%xmm1
  494. pand %xmm4,%xmm0
  495. movq `2*$STRIDE/4-96`($bp),%xmm2
  496. pand %xmm5,%xmm1
  497. mov (%rsp),$A[0]
  498. mov $n0,$m1
  499. mulq $m0 # ap[0]*bp[i]
  500. add %rax,$A[0] # ap[0]*bp[i]+tp[0]
  501. mov ($np),%rax
  502. adc \$0,%rdx
  503. movq `3*$STRIDE/4-96`($bp),%xmm3
  504. pand %xmm6,%xmm2
  505. por %xmm1,%xmm0
  506. pand %xmm7,%xmm3
  507. imulq $A[0],$m1 # tp[0]*n0
  508. mov %rdx,$A[1]
  509. por %xmm2,%xmm0
  510. lea $STRIDE($bp),$bp
  511. por %xmm3,%xmm0
  512. mulq $m1 # np[0]*m1
  513. add %rax,$A[0] # "$N[0]", discarded
  514. mov 8($ap),%rax
  515. adc \$0,%rdx
  516. mov %rdx,$N[1]
  517. mulq $m0 # ap[j]*bp[i]
  518. add %rax,$A[1]
  519. mov 8($np),%rax
  520. adc \$0,%rdx
  521. add 8(%rsp),$A[1] # +tp[1]
  522. adc \$0,%rdx
  523. mov %rdx,$A[0]
  524. mulq $m1 # np[j]*m1
  525. add %rax,$N[1]
  526. mov 16($ap),%rax
  527. adc \$0,%rdx
  528. add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
  529. lea 4($j),$j # j+=2
  530. adc \$0,%rdx
  531. mov %rdx,$N[0]
  532. jmp .Linner4x
  533. .align 16
  534. .Linner4x:
  535. mulq $m0 # ap[j]*bp[i]
  536. add %rax,$A[0]
  537. mov -16($np,$j,8),%rax
  538. adc \$0,%rdx
  539. add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
  540. adc \$0,%rdx
  541. mov %rdx,$A[1]
  542. mulq $m1 # np[j]*m1
  543. add %rax,$N[0]
  544. mov -8($ap,$j,8),%rax
  545. adc \$0,%rdx
  546. add $A[0],$N[0]
  547. adc \$0,%rdx
  548. mov $N[1],-32(%rsp,$j,8) # tp[j-1]
  549. mov %rdx,$N[1]
  550. mulq $m0 # ap[j]*bp[i]
  551. add %rax,$A[1]
  552. mov -8($np,$j,8),%rax
  553. adc \$0,%rdx
  554. add -8(%rsp,$j,8),$A[1]
  555. adc \$0,%rdx
  556. mov %rdx,$A[0]
  557. mulq $m1 # np[j]*m1
  558. add %rax,$N[1]
  559. mov ($ap,$j,8),%rax
  560. adc \$0,%rdx
  561. add $A[1],$N[1]
  562. adc \$0,%rdx
  563. mov $N[0],-24(%rsp,$j,8) # tp[j-1]
  564. mov %rdx,$N[0]
  565. mulq $m0 # ap[j]*bp[i]
  566. add %rax,$A[0]
  567. mov ($np,$j,8),%rax
  568. adc \$0,%rdx
  569. add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
  570. adc \$0,%rdx
  571. mov %rdx,$A[1]
  572. mulq $m1 # np[j]*m1
  573. add %rax,$N[0]
  574. mov 8($ap,$j,8),%rax
  575. adc \$0,%rdx
  576. add $A[0],$N[0]
  577. adc \$0,%rdx
  578. mov $N[1],-16(%rsp,$j,8) # tp[j-1]
  579. mov %rdx,$N[1]
  580. mulq $m0 # ap[j]*bp[i]
  581. add %rax,$A[1]
  582. mov 8($np,$j,8),%rax
  583. adc \$0,%rdx
  584. add 8(%rsp,$j,8),$A[1]
  585. adc \$0,%rdx
  586. lea 4($j),$j # j++
  587. mov %rdx,$A[0]
  588. mulq $m1 # np[j]*m1
  589. add %rax,$N[1]
  590. mov -16($ap,$j,8),%rax
  591. adc \$0,%rdx
  592. add $A[1],$N[1]
  593. adc \$0,%rdx
  594. mov $N[0],-40(%rsp,$j,8) # tp[j-1]
  595. mov %rdx,$N[0]
  596. cmp $num,$j
  597. jl .Linner4x
# Tail of the inner loop: last two limbs, fold in the previous carry
# word, and store the new one.
  598. mulq $m0 # ap[j]*bp[i]
  599. add %rax,$A[0]
  600. mov -16($np,$j,8),%rax
  601. adc \$0,%rdx
  602. add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
  603. adc \$0,%rdx
  604. mov %rdx,$A[1]
  605. mulq $m1 # np[j]*m1
  606. add %rax,$N[0]
  607. mov -8($ap,$j,8),%rax
  608. adc \$0,%rdx
  609. add $A[0],$N[0]
  610. adc \$0,%rdx
  611. mov $N[1],-32(%rsp,$j,8) # tp[j-1]
  612. mov %rdx,$N[1]
  613. mulq $m0 # ap[j]*bp[i]
  614. add %rax,$A[1]
  615. mov -8($np,$j,8),%rax
  616. adc \$0,%rdx
  617. add -8(%rsp,$j,8),$A[1]
  618. adc \$0,%rdx
  619. lea 1($i),$i # i++
  620. mov %rdx,$A[0]
  621. mulq $m1 # np[j]*m1
  622. add %rax,$N[1]
  623. mov ($ap),%rax # ap[0]
  624. adc \$0,%rdx
  625. add $A[1],$N[1]
  626. adc \$0,%rdx
  627. mov $N[0],-24(%rsp,$j,8) # tp[j-1]
  628. mov %rdx,$N[0]
  629. movq %xmm0,$m0 # bp[i+1]
  630. mov $N[1],-16(%rsp,$j,8) # tp[j-1]
  631. xor $N[1],$N[1]
  632. add $A[0],$N[0]
  633. adc \$0,$N[1]
  634. add (%rsp,$num,8),$N[0] # pull upmost overflow bit
  635. adc \$0,$N[1]
  636. mov $N[0],-8(%rsp,$j,8)
  637. mov $N[1],(%rsp,$j,8) # store upmost overflow bit
  638. cmp $num,$i
  639. jl .Louter4x
  640. ___
  641. {
  642. my @ri=("%rax","%rdx",$m0,$m1);
  643. $code.=<<___;
# Final reduction, 4 limbs at a time: conditional subtraction of the
# modulus, branch-free tp/rp selection via masks, then a 16-byte SSE
# copy that simultaneously zeroes the temporary vector on the stack.
  644. mov 16(%rsp,$num,8),$rp # restore $rp
  645. mov 0(%rsp),@ri[0] # tp[0]
  646. pxor %xmm0,%xmm0
  647. mov 8(%rsp),@ri[1] # tp[1]
  648. shr \$2,$num # num/=4
  649. lea (%rsp),$ap # borrow ap for tp
  650. xor $i,$i # i=0 and clear CF!
  651. sub 0($np),@ri[0]
  652. mov 16($ap),@ri[2] # tp[2]
  653. mov 24($ap),@ri[3] # tp[3]
  654. sbb 8($np),@ri[1]
  655. lea -1($num),$j # j=num/4-1
  656. jmp .Lsub4x
  657. .align 16
  658. .Lsub4x:
  659. mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
  660. mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
  661. sbb 16($np,$i,8),@ri[2]
  662. mov 32($ap,$i,8),@ri[0] # tp[i+1]
  663. mov 40($ap,$i,8),@ri[1]
  664. sbb 24($np,$i,8),@ri[3]
  665. mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
  666. mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
  667. sbb 32($np,$i,8),@ri[0]
  668. mov 48($ap,$i,8),@ri[2]
  669. mov 56($ap,$i,8),@ri[3]
  670. sbb 40($np,$i,8),@ri[1]
  671. lea 4($i),$i # i++
  672. dec $j # doesnn't affect CF!
  673. jnz .Lsub4x
  674. mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
  675. mov 32($ap,$i,8),@ri[0] # load overflow bit
  676. sbb 16($np,$i,8),@ri[2]
  677. mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
  678. sbb 24($np,$i,8),@ri[3]
  679. mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
  680. sbb \$0,@ri[0] # handle upmost overflow bit
  681. mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
  682. xor $i,$i # i=0
  683. and @ri[0],$ap
  684. not @ri[0]
  685. mov $rp,$np
  686. and @ri[0],$np
  687. lea -1($num),$j
  688. or $np,$ap # ap=borrow?tp:rp
  689. movdqu ($ap),%xmm1
  690. movdqa %xmm0,(%rsp)
  691. movdqu %xmm1,($rp)
  692. jmp .Lcopy4x
  693. .align 16
  694. .Lcopy4x: # copy or in-place refresh
  695. movdqu 16($ap,$i),%xmm2
  696. movdqu 32($ap,$i),%xmm1
  697. movdqa %xmm0,16(%rsp,$i)
  698. movdqu %xmm2,16($rp,$i)
  699. movdqa %xmm0,32(%rsp,$i)
  700. movdqu %xmm1,32($rp,$i)
  701. lea 32($i),$i
  702. dec $j
  703. jnz .Lcopy4x
  704. shl \$2,$num
  705. movdqu 16($ap,$i),%xmm2
  706. movdqa %xmm0,16(%rsp,$i)
  707. movdqu %xmm2,16($rp,$i)
  708. ___
  709. }
  710. $code.=<<___;
  711. mov 8(%rsp,$num,8),%rsi # restore %rsp
  712. mov \$1,%rax
  713. ___
# Win64 only: restore callee-saved xmm6/xmm7 from the saved frame.
  714. $code.=<<___ if ($win64);
  715. movaps (%rsi),%xmm6
  716. movaps 0x10(%rsi),%xmm7
  717. lea 0x28(%rsi),%rsi
  718. ___
  719. $code.=<<___;
  720. mov (%rsi),%r15
  721. mov 8(%rsi),%r14
  722. mov 16(%rsi),%r13
  723. mov 24(%rsi),%r12
  724. mov 32(%rsi),%rbp
  725. mov 40(%rsi),%rbx
  726. lea 48(%rsi),%rsp
  727. .Lmul4x_epilogue:
  728. ret
  729. .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
  730. ___
  731. }}}
  732. {
# bn_scatter5(inp,num,tbl,idx) / bn_gather5(out,num,tbl,idx): helper
# routines used by BN_mod_exp_mont_consttime.  These are marked
# abi-omnipotent, so the argument registers are chosen here per ABI
# rather than translated by xlate.
  733. my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
  734. ("%rdi","%rsi","%rdx","%rcx"); # Unix order
  735. my $out=$inp;
  736. my $STRIDE=2**5*8;
  737. my $N=$STRIDE/4;
  738. $code.=<<___;
  739. .globl bn_scatter5
  740. .type bn_scatter5,\@abi-omnipotent
  741. .align 16
# Scatter: write num words at stride 32*8 starting at slot idx, the
# layout the gather code below expects.
  742. bn_scatter5:
  743. cmp \$0, $num
  744. jz .Lscatter_epilogue
  745. lea ($tbl,$idx,8),$tbl
  746. .Lscatter:
  747. mov ($inp),%rax
  748. lea 8($inp),$inp
  749. mov %rax,($tbl)
  750. lea 32*8($tbl),$tbl
  751. sub \$1,$num
  752. jnz .Lscatter
  753. .Lscatter_epilogue:
  754. ret
  755. .size bn_scatter5,.-bn_scatter5
  756. .globl bn_gather5
  757. .type bn_gather5,\@abi-omnipotent
  758. .align 16
  759. bn_gather5:
  760. ___
# Win64 prologue emitted as raw bytes so the unwind descriptors in
# .LSEH_info_bn_gather5 match the exact instruction encodings.
  761. $code.=<<___ if ($win64);
  762. .LSEH_begin_bn_gather5:
  763. # I can't trust assembler to use specific encoding:-(
  764. .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp
  765. .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
  766. .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
  767. ___
  768. $code.=<<___;
# Same cache-neutral mask selection as in the multiplication routines:
# all four candidate cache lines are read on every iteration and the
# unwanted ones are masked off, so accesses do not depend on idx.
  769. mov $idx,%r11
  770. shr \$`log($N/8)/log(2)`,$idx
  771. and \$`$N/8-1`,%r11
  772. not $idx
  773. lea .Lmagic_masks(%rip),%rax
  774. and \$`2**5/($N/8)-1`,$idx # 5 is "window size"
  775. lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line
  776. movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which
  777. movq 8(%rax,$idx,8),%xmm5 # cache line contains element
  778. movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument
  779. movq 24(%rax,$idx,8),%xmm7
  780. jmp .Lgather
  781. .align 16
  782. .Lgather:
  783. movq `0*$STRIDE/4-96`($tbl),%xmm0
  784. movq `1*$STRIDE/4-96`($tbl),%xmm1
  785. pand %xmm4,%xmm0
  786. movq `2*$STRIDE/4-96`($tbl),%xmm2
  787. pand %xmm5,%xmm1
  788. movq `3*$STRIDE/4-96`($tbl),%xmm3
  789. pand %xmm6,%xmm2
  790. por %xmm1,%xmm0
  791. pand %xmm7,%xmm3
  792. por %xmm2,%xmm0
  793. lea $STRIDE($tbl),$tbl
  794. por %xmm3,%xmm0
  795. movq %xmm0,($out) # m0=bp[0]
  796. lea 8($out),$out
  797. sub \$1,$num
  798. jnz .Lgather
  799. ___
  800. $code.=<<___ if ($win64);
# Win64 epilogue: xmm6/xmm7 are callee-saved and were spilled by the
# byte-encoded prologue (sub 0x28 from rsp, then two movaps stores).
# They must be RESTORED here before the frame is released.  The
# previous code had the movaps operands reversed (storing the live
# registers again instead of loading the saved copies), which left the
# caller's xmm6/xmm7 clobbered on return -- a Microsoft x64 ABI
# violation.
  801. movaps (%rsp),%xmm6
  802. movaps 0x10(%rsp),%xmm7
  803. lea 0x28(%rsp),%rsp
___
  805. $code.=<<___;
# Common return path; .LSEH_end marks the function extent for .pdata.
  806. ret
  807. .LSEH_end_bn_gather5:
  808. .size bn_gather5,.-bn_gather5
  809. ___
  810. }
  811. $code.=<<___;
# Mask table indexed by (negated) cache-line number: each 32-byte row
# supplies four 64-bit masks of which exactly one is all-ones, selecting
# one of the four cache lines read by the constant-time gathers above.
  812. .align 64
  813. .Lmagic_masks:
  814. .long 0,0, 0,0, 0,0, -1,-1
  815. .long 0,0, 0,0, 0,0, 0,0
  816. .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  817. ___
  818. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  819. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 structured exception handling: a custom language-specific
# handler plus .pdata/.xdata records so the dynamically-adjusted stack
# frames of the two multiplication routines can be unwound.
  820. if ($win64) {
  821. $rec="%rcx";
  822. $frame="%rdx";
  823. $context="%r8";
  824. $disp="%r9";
  825. $code.=<<___;
  826. .extern __imp_RtlVirtualUnwind
  827. .type mul_handler,\@abi-omnipotent
  828. .align 16
  829. mul_handler:
  830. push %rsi
  831. push %rdi
  832. push %rbx
  833. push %rbp
  834. push %r12
  835. push %r13
  836. push %r14
  837. push %r15
  838. pushfq
  839. sub \$64,%rsp
# Classify the faulting RIP against the three HandlerData labels:
# before the alloca label only the pushes happened; between alloca and
# epilogue the full frame (including the saved-rsp word at tp[num+1])
# is live; past the epilogue label nothing needs undoing.
  840. mov 120($context),%rax # pull context->Rax
  841. mov 248($context),%rbx # pull context->Rip
  842. mov 8($disp),%rsi # disp->ImageBase
  843. mov 56($disp),%r11 # disp->HandlerData
  844. mov 0(%r11),%r10d # HandlerData[0]
  845. lea (%rsi,%r10),%r10 # end of prologue label
  846. cmp %r10,%rbx # context->Rip<end of prologue label
  847. jb .Lcommon_seh_tail
  848. lea `40+48`(%rax),%rax
  849. mov 4(%r11),%r10d # HandlerData[1]
  850. lea (%rsi,%r10),%r10 # end of alloca label
  851. cmp %r10,%rbx # context->Rip<end of alloca label
  852. jb .Lcommon_seh_tail
  853. mov 152($context),%rax # pull context->Rsp
  854. mov 8(%r11),%r10d # HandlerData[2]
  855. lea (%rsi,%r10),%r10 # epilogue label
  856. cmp %r10,%rbx # context->Rip>=epilogue label
  857. jae .Lcommon_seh_tail
  858. mov 192($context),%r10 # pull $num
  859. mov 8(%rax,%r10,8),%rax # pull saved stack pointer
  860. movaps (%rax),%xmm0
  861. movaps 16(%rax),%xmm1
  862. lea `40+48`(%rax),%rax
  863. mov -8(%rax),%rbx
  864. mov -16(%rax),%rbp
  865. mov -24(%rax),%r12
  866. mov -32(%rax),%r13
  867. mov -40(%rax),%r14
  868. mov -48(%rax),%r15
  869. mov %rbx,144($context) # restore context->Rbx
  870. mov %rbp,160($context) # restore context->Rbp
  871. mov %r12,216($context) # restore context->R12
  872. mov %r13,224($context) # restore context->R13
  873. mov %r14,232($context) # restore context->R14
  874. mov %r15,240($context) # restore context->R15
  875. movups %xmm0,512($context) # restore context->Xmm6
  876. movups %xmm1,528($context) # restore context->Xmm7
  877. .Lcommon_seh_tail:
  878. mov 8(%rax),%rdi
  879. mov 16(%rax),%rsi
  880. mov %rax,152($context) # restore context->Rsp
  881. mov %rsi,168($context) # restore context->Rsi
  882. mov %rdi,176($context) # restore context->Rdi
# Hand the adjusted context to RtlVirtualUnwind and continue the search.
  883. mov 40($disp),%rdi # disp->ContextRecord
  884. mov $context,%rsi # context
  885. mov \$154,%ecx # sizeof(CONTEXT)
  886. .long 0xa548f3fc # cld; rep movsq
  887. mov $disp,%rsi
  888. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  889. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  890. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  891. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  892. mov 40(%rsi),%r10 # disp->ContextRecord
  893. lea 56(%rsi),%r11 # &disp->HandlerData
  894. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  895. mov %r10,32(%rsp) # arg5
  896. mov %r11,40(%rsp) # arg6
  897. mov %r12,48(%rsp) # arg7
  898. mov %rcx,56(%rsp) # arg8, (NULL)
  899. call *__imp_RtlVirtualUnwind(%rip)
  900. mov \$1,%eax # ExceptionContinueSearch
  901. add \$64,%rsp
  902. popfq
  903. pop %r15
  904. pop %r14
  905. pop %r13
  906. pop %r12
  907. pop %rbp
  908. pop %rbx
  909. pop %rdi
  910. pop %rsi
  911. ret
  912. .size mul_handler,.-mul_handler
# Function table (.pdata) and unwind info (.xdata).  The two mont
# routines use mul_handler with their alloca/body/epilogue labels as
# HandlerData; bn_gather5 uses plain UWOP descriptors matching its
# byte-encoded prologue.
  913. .section .pdata
  914. .align 4
  915. .rva .LSEH_begin_bn_mul_mont_gather5
  916. .rva .LSEH_end_bn_mul_mont_gather5
  917. .rva .LSEH_info_bn_mul_mont_gather5
  918. .rva .LSEH_begin_bn_mul4x_mont_gather5
  919. .rva .LSEH_end_bn_mul4x_mont_gather5
  920. .rva .LSEH_info_bn_mul4x_mont_gather5
  921. .rva .LSEH_begin_bn_gather5
  922. .rva .LSEH_end_bn_gather5
  923. .rva .LSEH_info_bn_gather5
  924. .section .xdata
  925. .align 8
  926. .LSEH_info_bn_mul_mont_gather5:
  927. .byte 9,0,0,0
  928. .rva mul_handler
  929. .rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[]
  930. .align 8
  931. .LSEH_info_bn_mul4x_mont_gather5:
  932. .byte 9,0,0,0
  933. .rva mul_handler
  934. .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
  935. .align 8
  936. .LSEH_info_bn_gather5:
  937. .byte 0x01,0x0d,0x05,0x00
  938. .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
  939. .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
  940. .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28
  941. .align 8
  942. ___
  943. }
  944. $code =~ s/\`([^\`]*)\`/eval($1)/gem;
# The substitution above constant-folds every backtick-quoted Perl
# expression embedded in the assembly (offsets, shift counts, masks)
# before the text is handed to the xlate pipe opened on STDOUT.
  945. print $code;
  946. close STDOUT;