rsaz-avx2.pl 51 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898
  1. #!/usr/bin/env perl
  2. ##############################################################################
  3. # #
  4. # Copyright (c) 2012, Intel Corporation #
  5. # #
  6. # All rights reserved. #
  7. # #
  8. # Redistribution and use in source and binary forms, with or without #
  9. # modification, are permitted provided that the following conditions are #
  10. # met: #
  11. # #
  12. # * Redistributions of source code must retain the above copyright #
  13. # notice, this list of conditions and the following disclaimer. #
  14. # #
  15. # * Redistributions in binary form must reproduce the above copyright #
  16. # notice, this list of conditions and the following disclaimer in the #
  17. # documentation and/or other materials provided with the #
  18. # distribution. #
  19. # #
  20. # * Neither the name of the Intel Corporation nor the names of its #
  21. # contributors may be used to endorse or promote products derived from #
  22. # this software without specific prior written permission. #
  23. # #
  24. # #
  25. # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY #
  26. # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
  27. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
  28. # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
  29. # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
  30. # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
  31. # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
  32. # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
  33. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
  34. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
  35. # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
  36. # #
  37. ##############################################################################
  38. # Developers and authors: #
  39. # Shay Gueron (1, 2), and Vlad Krasnov (1) #
  40. # (1) Intel Corporation, Israel Development Center, Haifa, Israel #
  41. # (2) University of Haifa, Israel #
  42. ##############################################################################
  43. # Reference: #
  44. # [1] S. Gueron, V. Krasnov: "Software Implementation of Modular #
  45. # Exponentiation, Using Advanced Vector Instructions Architectures", #
  46. # F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369, #
  47. # pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012 #
  48. # [2] S. Gueron: "Efficient Software Implementations of Modular #
  49. # Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012). #
  50. # [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE #
  51. # Proceedings of 9th International Conference on Information Technology: #
  52. # New Generations (ITNG 2012), pp.821-823 (2012) #
  53. # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
  54. # resistant 1024-bit modular exponentiation, for optimizing RSA2048 #
  55. # on AVX2 capable x86_64 platforms", #
  56. # http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
  57. ##############################################################################
  58. #
  59. # +13% improvement over original submission by <appro@openssl.org>
  60. #
  61. # rsa2048 sign/sec OpenSSL 1.0.1 scalar(*) this
  62. # 2.3GHz Haswell 621 765/+23% 1113/+79%
  63. # 2.3GHz Broadwell(**) 688 1200(***)/+74% 1120/+63%
  64. #
  65. # (*) if system doesn't support AVX2, for reference purposes;
  66. # (**) scaled to 2.3GHz to simplify comparison;
  67. # (***) scalar AD*X code is faster than AVX2 and is preferred code
  68. # path for Broadwell;
  # ---------------------------------------------------------------------
  # Driver preamble: parse the command line, locate the perlasm
  # translator, probe the toolchain for its assembler capabilities and
  # redirect STDOUT into the translator.
  #
  # Usage: rsaz-avx2.pl [flavour] [output]
  #   flavour - passed through to x86_64-xlate.pl; if this argument
  #             contains a '.', it is taken to be the output file name
  #             and the flavour is left undefined
  #   output  - passed through to x86_64-xlate.pl
  #
  # Globals set here:
  #   $win64  - 1 when flavour matches nasm/masm/mingw64 or output ends
  #             in ".asm", 0 otherwise
  #   $avx    - 0, 1 or 2 depending on the detected assembler version;
  #             the generator body only runs when $avx > 1
  #   $addx   - true when the detected version reaches the higher
  #             threshold (presumably ADCX/ADOX support -- confirm
  #             against the code that consumes it)
  # ---------------------------------------------------------------------
  $flavour = shift;
  $output  = shift;
  if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

  $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

  # Look for the translator next to this script first, then in the
  # shared perlasm directory two levels up.
  $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  die "can't locate x86_64-xlate.pl";

  # GNU assembler, queried through the C compiler driver.
  if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
          =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
      $avx = ($1>=2.19) + ($1>=2.22);
      $addx = ($1>=2.23);
  }

  # NASM, only consulted for Win64 builds when nothing matched above.
  if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
          `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
      $avx = ($1>=2.09) + ($1>=2.10);
      $addx = ($1>=2.10);
  }

  # Microsoft ml64, only consulted for Win64 builds.
  if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
          `ml64 2>&1` =~ /Version ([0-9]+)\./) {
      $avx = ($1>=10) + ($1>=11);
      $addx = ($1>=11);
  }

  # clang / LLVM-based compilers.  The major version is matched as
  # [0-9]+ rather than a single digit in [3-9]: with the single-digit
  # pattern, "clang version 10.0.0" and later never match, leaving
  # $avx unset and silently disabling the AVX2 code path.
  if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([0-9]+)\.([0-9]+)/) {
      my $ver = $2 + $3/100.0;        # 3.1->3.01, 3.10->3.10
      $avx = ($ver>=3.0) + ($ver>=3.01);
      $addx = ($ver>=3.03);
  }

  # Everything printed to STDOUT from here on is piped through the
  # translator.  The translator path is quoted so that a directory name
  # containing spaces does not split the command line, and a failure to
  # start the pipe is fatal instead of being silently ignored.
  open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
      or die "can't call $xlate: $!";
  *STDOUT = *OUT;
  99. if ($avx>1) {{{
  100. { # void AMS_WW(
  101. my $rp="%rdi"; # BN_ULONG *rp,
  102. my $ap="%rsi"; # const BN_ULONG *ap,
  103. my $np="%rdx"; # const BN_ULONG *np,
  104. my $n0="%ecx"; # const BN_ULONG n0,
  105. my $rep="%r8d"; # int repeat);
  106. # The registers that hold the accumulated redundant result
  107. # The AMM works on 1024 bit operands, and redundant word size is 29
  108. # Therefore: ceil(1024/29)/4 = 9
  109. my $ACC0="%ymm0";
  110. my $ACC1="%ymm1";
  111. my $ACC2="%ymm2";
  112. my $ACC3="%ymm3";
  113. my $ACC4="%ymm4";
  114. my $ACC5="%ymm5";
  115. my $ACC6="%ymm6";
  116. my $ACC7="%ymm7";
  117. my $ACC8="%ymm8";
  118. my $ACC9="%ymm9";
  119. # Registers that hold the broadcasted words of bp, currently used
  120. my $B1="%ymm10";
  121. my $B2="%ymm11";
  122. # Registers that hold the broadcasted words of Y, currently used
  123. my $Y1="%ymm12";
  124. my $Y2="%ymm13";
  125. # Helper registers
  126. my $TEMP1="%ymm14";
  127. my $AND_MASK="%ymm15";
  128. # alu registers that hold the first words of the ACC
  129. my $r0="%r9";
  130. my $r1="%r10";
  131. my $r2="%r11";
  132. my $r3="%r12";
  133. my $i="%r14d"; # loop counter
  134. my $tmp = "%r15";
  135. my $FrameSize=32*18+32*8; # place for A^2 and 2*A
  136. my $aap=$r0;
  137. my $tp0="%rbx";
  138. my $tp1=$r3;
  139. my $tpa=$tmp;
  140. $np="%r13"; # reassigned argument
  141. $code.=<<___;
  142. .text
  143. .globl rsaz_1024_sqr_avx2
  144. .type rsaz_1024_sqr_avx2,\@function,5
  145. .align 64
  146. rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2
  147. lea (%rsp), %rax
  148. push %rbx
  149. push %rbp
  150. push %r12
  151. push %r13
  152. push %r14
  153. push %r15
  154. vzeroupper
  155. ___
  156. $code.=<<___ if ($win64);
  157. lea -0xa8(%rsp),%rsp
  158. vmovaps %xmm6,-0xd8(%rax)
  159. vmovaps %xmm7,-0xc8(%rax)
  160. vmovaps %xmm8,-0xb8(%rax)
  161. vmovaps %xmm9,-0xa8(%rax)
  162. vmovaps %xmm10,-0x98(%rax)
  163. vmovaps %xmm11,-0x88(%rax)
  164. vmovaps %xmm12,-0x78(%rax)
  165. vmovaps %xmm13,-0x68(%rax)
  166. vmovaps %xmm14,-0x58(%rax)
  167. vmovaps %xmm15,-0x48(%rax)
  168. .Lsqr_1024_body:
  169. ___
  170. $code.=<<___;
  171. mov %rax,%rbp
  172. mov %rdx, $np # reassigned argument
  173. sub \$$FrameSize, %rsp
  174. mov $np, $tmp
  175. sub \$-128, $rp # size optimization
  176. sub \$-128, $ap
  177. sub \$-128, $np
  178. and \$4095, $tmp # see if $np crosses page
  179. add \$32*10, $tmp
  180. shr \$12, $tmp
  181. vpxor $ACC9,$ACC9,$ACC9
  182. jz .Lsqr_1024_no_n_copy
  183. # unaligned 256-bit load that crosses page boundary can
  184. # cause >2x performance degradation here, so if $np does
  185. # cross page boundary, copy it to stack and make sure stack
  186. # frame doesn't...
  187. sub \$32*10,%rsp
  188. vmovdqu 32*0-128($np), $ACC0
  189. and \$-2048, %rsp
  190. vmovdqu 32*1-128($np), $ACC1
  191. vmovdqu 32*2-128($np), $ACC2
  192. vmovdqu 32*3-128($np), $ACC3
  193. vmovdqu 32*4-128($np), $ACC4
  194. vmovdqu 32*5-128($np), $ACC5
  195. vmovdqu 32*6-128($np), $ACC6
  196. vmovdqu 32*7-128($np), $ACC7
  197. vmovdqu 32*8-128($np), $ACC8
  198. lea $FrameSize+128(%rsp),$np
  199. vmovdqu $ACC0, 32*0-128($np)
  200. vmovdqu $ACC1, 32*1-128($np)
  201. vmovdqu $ACC2, 32*2-128($np)
  202. vmovdqu $ACC3, 32*3-128($np)
  203. vmovdqu $ACC4, 32*4-128($np)
  204. vmovdqu $ACC5, 32*5-128($np)
  205. vmovdqu $ACC6, 32*6-128($np)
  206. vmovdqu $ACC7, 32*7-128($np)
  207. vmovdqu $ACC8, 32*8-128($np)
  208. vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero
  209. .Lsqr_1024_no_n_copy:
  210. and \$-1024, %rsp
  211. vmovdqu 32*1-128($ap), $ACC1
  212. vmovdqu 32*2-128($ap), $ACC2
  213. vmovdqu 32*3-128($ap), $ACC3
  214. vmovdqu 32*4-128($ap), $ACC4
  215. vmovdqu 32*5-128($ap), $ACC5
  216. vmovdqu 32*6-128($ap), $ACC6
  217. vmovdqu 32*7-128($ap), $ACC7
  218. vmovdqu 32*8-128($ap), $ACC8
  219. lea 192(%rsp), $tp0 # 64+128=192
  220. vpbroadcastq .Land_mask(%rip), $AND_MASK
  221. jmp .LOOP_GRANDE_SQR_1024
  222. .align 32
  223. .LOOP_GRANDE_SQR_1024:
  224. lea 32*18+128(%rsp), $aap # size optimization
  225. lea 448(%rsp), $tp1 # 64+128+256=448
  226. # the squaring is performed as described in Variant B of
  227. # "Speeding up Big-Number Squaring", so start by calculating
  228. # the A*2=A+A vector
  229. vpaddq $ACC1, $ACC1, $ACC1
  230. vpbroadcastq 32*0-128($ap), $B1
  231. vpaddq $ACC2, $ACC2, $ACC2
  232. vmovdqa $ACC1, 32*0-128($aap)
  233. vpaddq $ACC3, $ACC3, $ACC3
  234. vmovdqa $ACC2, 32*1-128($aap)
  235. vpaddq $ACC4, $ACC4, $ACC4
  236. vmovdqa $ACC3, 32*2-128($aap)
  237. vpaddq $ACC5, $ACC5, $ACC5
  238. vmovdqa $ACC4, 32*3-128($aap)
  239. vpaddq $ACC6, $ACC6, $ACC6
  240. vmovdqa $ACC5, 32*4-128($aap)
  241. vpaddq $ACC7, $ACC7, $ACC7
  242. vmovdqa $ACC6, 32*5-128($aap)
  243. vpaddq $ACC8, $ACC8, $ACC8
  244. vmovdqa $ACC7, 32*6-128($aap)
  245. vpxor $ACC9, $ACC9, $ACC9
  246. vmovdqa $ACC8, 32*7-128($aap)
  247. vpmuludq 32*0-128($ap), $B1, $ACC0
  248. vpbroadcastq 32*1-128($ap), $B2
  249. vmovdqu $ACC9, 32*9-192($tp0) # zero upper half
  250. vpmuludq $B1, $ACC1, $ACC1
  251. vmovdqu $ACC9, 32*10-448($tp1)
  252. vpmuludq $B1, $ACC2, $ACC2
  253. vmovdqu $ACC9, 32*11-448($tp1)
  254. vpmuludq $B1, $ACC3, $ACC3
  255. vmovdqu $ACC9, 32*12-448($tp1)
  256. vpmuludq $B1, $ACC4, $ACC4
  257. vmovdqu $ACC9, 32*13-448($tp1)
  258. vpmuludq $B1, $ACC5, $ACC5
  259. vmovdqu $ACC9, 32*14-448($tp1)
  260. vpmuludq $B1, $ACC6, $ACC6
  261. vmovdqu $ACC9, 32*15-448($tp1)
  262. vpmuludq $B1, $ACC7, $ACC7
  263. vmovdqu $ACC9, 32*16-448($tp1)
  264. vpmuludq $B1, $ACC8, $ACC8
  265. vpbroadcastq 32*2-128($ap), $B1
  266. vmovdqu $ACC9, 32*17-448($tp1)
  267. mov $ap, $tpa
  268. mov \$4, $i
  269. jmp .Lsqr_entry_1024
  270. ___
  271. $TEMP0=$Y1;
  272. $TEMP2=$Y2;
  273. $code.=<<___;
  274. .align 32
  275. .LOOP_SQR_1024:
  276. vpbroadcastq 32*1-128($tpa), $B2
  277. vpmuludq 32*0-128($ap), $B1, $ACC0
  278. vpaddq 32*0-192($tp0), $ACC0, $ACC0
  279. vpmuludq 32*0-128($aap), $B1, $ACC1
  280. vpaddq 32*1-192($tp0), $ACC1, $ACC1
  281. vpmuludq 32*1-128($aap), $B1, $ACC2
  282. vpaddq 32*2-192($tp0), $ACC2, $ACC2
  283. vpmuludq 32*2-128($aap), $B1, $ACC3
  284. vpaddq 32*3-192($tp0), $ACC3, $ACC3
  285. vpmuludq 32*3-128($aap), $B1, $ACC4
  286. vpaddq 32*4-192($tp0), $ACC4, $ACC4
  287. vpmuludq 32*4-128($aap), $B1, $ACC5
  288. vpaddq 32*5-192($tp0), $ACC5, $ACC5
  289. vpmuludq 32*5-128($aap), $B1, $ACC6
  290. vpaddq 32*6-192($tp0), $ACC6, $ACC6
  291. vpmuludq 32*6-128($aap), $B1, $ACC7
  292. vpaddq 32*7-192($tp0), $ACC7, $ACC7
  293. vpmuludq 32*7-128($aap), $B1, $ACC8
  294. vpbroadcastq 32*2-128($tpa), $B1
  295. vpaddq 32*8-192($tp0), $ACC8, $ACC8
  296. .Lsqr_entry_1024:
  297. vmovdqu $ACC0, 32*0-192($tp0)
  298. vmovdqu $ACC1, 32*1-192($tp0)
  299. vpmuludq 32*1-128($ap), $B2, $TEMP0
  300. vpaddq $TEMP0, $ACC2, $ACC2
  301. vpmuludq 32*1-128($aap), $B2, $TEMP1
  302. vpaddq $TEMP1, $ACC3, $ACC3
  303. vpmuludq 32*2-128($aap), $B2, $TEMP2
  304. vpaddq $TEMP2, $ACC4, $ACC4
  305. vpmuludq 32*3-128($aap), $B2, $TEMP0
  306. vpaddq $TEMP0, $ACC5, $ACC5
  307. vpmuludq 32*4-128($aap), $B2, $TEMP1
  308. vpaddq $TEMP1, $ACC6, $ACC6
  309. vpmuludq 32*5-128($aap), $B2, $TEMP2
  310. vpaddq $TEMP2, $ACC7, $ACC7
  311. vpmuludq 32*6-128($aap), $B2, $TEMP0
  312. vpaddq $TEMP0, $ACC8, $ACC8
  313. vpmuludq 32*7-128($aap), $B2, $ACC0
  314. vpbroadcastq 32*3-128($tpa), $B2
  315. vpaddq 32*9-192($tp0), $ACC0, $ACC0
  316. vmovdqu $ACC2, 32*2-192($tp0)
  317. vmovdqu $ACC3, 32*3-192($tp0)
  318. vpmuludq 32*2-128($ap), $B1, $TEMP2
  319. vpaddq $TEMP2, $ACC4, $ACC4
  320. vpmuludq 32*2-128($aap), $B1, $TEMP0
  321. vpaddq $TEMP0, $ACC5, $ACC5
  322. vpmuludq 32*3-128($aap), $B1, $TEMP1
  323. vpaddq $TEMP1, $ACC6, $ACC6
  324. vpmuludq 32*4-128($aap), $B1, $TEMP2
  325. vpaddq $TEMP2, $ACC7, $ACC7
  326. vpmuludq 32*5-128($aap), $B1, $TEMP0
  327. vpaddq $TEMP0, $ACC8, $ACC8
  328. vpmuludq 32*6-128($aap), $B1, $TEMP1
  329. vpaddq $TEMP1, $ACC0, $ACC0
  330. vpmuludq 32*7-128($aap), $B1, $ACC1
  331. vpbroadcastq 32*4-128($tpa), $B1
  332. vpaddq 32*10-448($tp1), $ACC1, $ACC1
  333. vmovdqu $ACC4, 32*4-192($tp0)
  334. vmovdqu $ACC5, 32*5-192($tp0)
  335. vpmuludq 32*3-128($ap), $B2, $TEMP0
  336. vpaddq $TEMP0, $ACC6, $ACC6
  337. vpmuludq 32*3-128($aap), $B2, $TEMP1
  338. vpaddq $TEMP1, $ACC7, $ACC7
  339. vpmuludq 32*4-128($aap), $B2, $TEMP2
  340. vpaddq $TEMP2, $ACC8, $ACC8
  341. vpmuludq 32*5-128($aap), $B2, $TEMP0
  342. vpaddq $TEMP0, $ACC0, $ACC0
  343. vpmuludq 32*6-128($aap), $B2, $TEMP1
  344. vpaddq $TEMP1, $ACC1, $ACC1
  345. vpmuludq 32*7-128($aap), $B2, $ACC2
  346. vpbroadcastq 32*5-128($tpa), $B2
  347. vpaddq 32*11-448($tp1), $ACC2, $ACC2
  348. vmovdqu $ACC6, 32*6-192($tp0)
  349. vmovdqu $ACC7, 32*7-192($tp0)
  350. vpmuludq 32*4-128($ap), $B1, $TEMP0
  351. vpaddq $TEMP0, $ACC8, $ACC8
  352. vpmuludq 32*4-128($aap), $B1, $TEMP1
  353. vpaddq $TEMP1, $ACC0, $ACC0
  354. vpmuludq 32*5-128($aap), $B1, $TEMP2
  355. vpaddq $TEMP2, $ACC1, $ACC1
  356. vpmuludq 32*6-128($aap), $B1, $TEMP0
  357. vpaddq $TEMP0, $ACC2, $ACC2
  358. vpmuludq 32*7-128($aap), $B1, $ACC3
  359. vpbroadcastq 32*6-128($tpa), $B1
  360. vpaddq 32*12-448($tp1), $ACC3, $ACC3
  361. vmovdqu $ACC8, 32*8-192($tp0)
  362. vmovdqu $ACC0, 32*9-192($tp0)
  363. lea 8($tp0), $tp0
  364. vpmuludq 32*5-128($ap), $B2, $TEMP2
  365. vpaddq $TEMP2, $ACC1, $ACC1
  366. vpmuludq 32*5-128($aap), $B2, $TEMP0
  367. vpaddq $TEMP0, $ACC2, $ACC2
  368. vpmuludq 32*6-128($aap), $B2, $TEMP1
  369. vpaddq $TEMP1, $ACC3, $ACC3
  370. vpmuludq 32*7-128($aap), $B2, $ACC4
  371. vpbroadcastq 32*7-128($tpa), $B2
  372. vpaddq 32*13-448($tp1), $ACC4, $ACC4
  373. vmovdqu $ACC1, 32*10-448($tp1)
  374. vmovdqu $ACC2, 32*11-448($tp1)
  375. vpmuludq 32*6-128($ap), $B1, $TEMP0
  376. vpaddq $TEMP0, $ACC3, $ACC3
  377. vpmuludq 32*6-128($aap), $B1, $TEMP1
  378. vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1
  379. vpaddq $TEMP1, $ACC4, $ACC4
  380. vpmuludq 32*7-128($aap), $B1, $ACC5
  381. vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration
  382. vpaddq 32*14-448($tp1), $ACC5, $ACC5
  383. vmovdqu $ACC3, 32*12-448($tp1)
  384. vmovdqu $ACC4, 32*13-448($tp1)
  385. lea 8($tpa), $tpa
  386. vpmuludq 32*7-128($ap), $B2, $TEMP0
  387. vpaddq $TEMP0, $ACC5, $ACC5
  388. vpmuludq 32*7-128($aap), $B2, $ACC6
  389. vpaddq 32*15-448($tp1), $ACC6, $ACC6
  390. vpmuludq 32*8-128($ap), $ACC0, $ACC7
  391. vmovdqu $ACC5, 32*14-448($tp1)
  392. vpaddq 32*16-448($tp1), $ACC7, $ACC7
  393. vmovdqu $ACC6, 32*15-448($tp1)
  394. vmovdqu $ACC7, 32*16-448($tp1)
  395. lea 8($tp1), $tp1
  396. dec $i
  397. jnz .LOOP_SQR_1024
  398. ___
  399. $ZERO = $ACC9;
  400. $TEMP0 = $B1;
  401. $TEMP2 = $B2;
  402. $TEMP3 = $Y1;
  403. $TEMP4 = $Y2;
  404. $code.=<<___;
  405. #we need to fix indexes 32-39 to avoid overflow
  406. vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
  407. vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
  408. vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
  409. lea 192(%rsp), $tp0 # 64+128=192
  410. vpsrlq \$29, $ACC8, $TEMP1
  411. vpand $AND_MASK, $ACC8, $ACC8
  412. vpsrlq \$29, $ACC1, $TEMP2
  413. vpand $AND_MASK, $ACC1, $ACC1
  414. vpermq \$0x93, $TEMP1, $TEMP1
  415. vpxor $ZERO, $ZERO, $ZERO
  416. vpermq \$0x93, $TEMP2, $TEMP2
  417. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  418. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  419. vpaddq $TEMP0, $ACC8, $ACC8
  420. vpblendd \$3, $TEMP2, $ZERO, $TEMP2
  421. vpaddq $TEMP1, $ACC1, $ACC1
  422. vpaddq $TEMP2, $ACC2, $ACC2
  423. vmovdqu $ACC1, 32*9-192($tp0)
  424. vmovdqu $ACC2, 32*10-192($tp0)
  425. mov (%rsp), %rax
  426. mov 8(%rsp), $r1
  427. mov 16(%rsp), $r2
  428. mov 24(%rsp), $r3
  429. vmovdqu 32*1(%rsp), $ACC1
  430. vmovdqu 32*2-192($tp0), $ACC2
  431. vmovdqu 32*3-192($tp0), $ACC3
  432. vmovdqu 32*4-192($tp0), $ACC4
  433. vmovdqu 32*5-192($tp0), $ACC5
  434. vmovdqu 32*6-192($tp0), $ACC6
  435. vmovdqu 32*7-192($tp0), $ACC7
  436. mov %rax, $r0
  437. imull $n0, %eax
  438. and \$0x1fffffff, %eax
  439. vmovd %eax, $Y1
  440. mov %rax, %rdx
  441. imulq -128($np), %rax
  442. vpbroadcastq $Y1, $Y1
  443. add %rax, $r0
  444. mov %rdx, %rax
  445. imulq 8-128($np), %rax
  446. shr \$29, $r0
  447. add %rax, $r1
  448. mov %rdx, %rax
  449. imulq 16-128($np), %rax
  450. add $r0, $r1
  451. add %rax, $r2
  452. imulq 24-128($np), %rdx
  453. add %rdx, $r3
  454. mov $r1, %rax
  455. imull $n0, %eax
  456. and \$0x1fffffff, %eax
  457. mov \$9, $i
  458. jmp .LOOP_REDUCE_1024
  459. .align 32
  460. .LOOP_REDUCE_1024:
  461. vmovd %eax, $Y2
  462. vpbroadcastq $Y2, $Y2
  463. vpmuludq 32*1-128($np), $Y1, $TEMP0
  464. mov %rax, %rdx
  465. imulq -128($np), %rax
  466. vpaddq $TEMP0, $ACC1, $ACC1
  467. add %rax, $r1
  468. vpmuludq 32*2-128($np), $Y1, $TEMP1
  469. mov %rdx, %rax
  470. imulq 8-128($np), %rax
  471. vpaddq $TEMP1, $ACC2, $ACC2
  472. vpmuludq 32*3-128($np), $Y1, $TEMP2
  473. .byte 0x67
  474. add %rax, $r2
  475. .byte 0x67
  476. mov %rdx, %rax
  477. imulq 16-128($np), %rax
  478. shr \$29, $r1
  479. vpaddq $TEMP2, $ACC3, $ACC3
  480. vpmuludq 32*4-128($np), $Y1, $TEMP0
  481. add %rax, $r3
  482. add $r1, $r2
  483. vpaddq $TEMP0, $ACC4, $ACC4
  484. vpmuludq 32*5-128($np), $Y1, $TEMP1
  485. mov $r2, %rax
  486. imull $n0, %eax
  487. vpaddq $TEMP1, $ACC5, $ACC5
  488. vpmuludq 32*6-128($np), $Y1, $TEMP2
  489. and \$0x1fffffff, %eax
  490. vpaddq $TEMP2, $ACC6, $ACC6
  491. vpmuludq 32*7-128($np), $Y1, $TEMP0
  492. vpaddq $TEMP0, $ACC7, $ACC7
  493. vpmuludq 32*8-128($np), $Y1, $TEMP1
  494. vmovd %eax, $Y1
  495. #vmovdqu 32*1-8-128($np), $TEMP2 # moved below
  496. vpaddq $TEMP1, $ACC8, $ACC8
  497. #vmovdqu 32*2-8-128($np), $TEMP0 # moved below
  498. vpbroadcastq $Y1, $Y1
  499. vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above
  500. vmovdqu 32*3-8-128($np), $TEMP1
  501. mov %rax, %rdx
  502. imulq -128($np), %rax
  503. vpaddq $TEMP2, $ACC1, $ACC1
  504. vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above
  505. vmovdqu 32*4-8-128($np), $TEMP2
  506. add %rax, $r2
  507. mov %rdx, %rax
  508. imulq 8-128($np), %rax
  509. vpaddq $TEMP0, $ACC2, $ACC2
  510. add $r3, %rax
  511. shr \$29, $r2
  512. vpmuludq $Y2, $TEMP1, $TEMP1
  513. vmovdqu 32*5-8-128($np), $TEMP0
  514. add $r2, %rax
  515. vpaddq $TEMP1, $ACC3, $ACC3
  516. vpmuludq $Y2, $TEMP2, $TEMP2
  517. vmovdqu 32*6-8-128($np), $TEMP1
  518. .byte 0x67
  519. mov %rax, $r3
  520. imull $n0, %eax
  521. vpaddq $TEMP2, $ACC4, $ACC4
  522. vpmuludq $Y2, $TEMP0, $TEMP0
  523. .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2
  524. and \$0x1fffffff, %eax
  525. vpaddq $TEMP0, $ACC5, $ACC5
  526. vpmuludq $Y2, $TEMP1, $TEMP1
  527. vmovdqu 32*8-8-128($np), $TEMP0
  528. vpaddq $TEMP1, $ACC6, $ACC6
  529. vpmuludq $Y2, $TEMP2, $TEMP2
  530. vmovdqu 32*9-8-128($np), $ACC9
  531. vmovd %eax, $ACC0 # borrow ACC0 for Y2
  532. imulq -128($np), %rax
  533. vpaddq $TEMP2, $ACC7, $ACC7
  534. vpmuludq $Y2, $TEMP0, $TEMP0
  535. vmovdqu 32*1-16-128($np), $TEMP1
  536. vpbroadcastq $ACC0, $ACC0
  537. vpaddq $TEMP0, $ACC8, $ACC8
  538. vpmuludq $Y2, $ACC9, $ACC9
  539. vmovdqu 32*2-16-128($np), $TEMP2
  540. add %rax, $r3
  541. ___
  542. ($ACC0,$Y2)=($Y2,$ACC0);
  543. $code.=<<___;
  544. vmovdqu 32*1-24-128($np), $ACC0
  545. vpmuludq $Y1, $TEMP1, $TEMP1
  546. vmovdqu 32*3-16-128($np), $TEMP0
  547. vpaddq $TEMP1, $ACC1, $ACC1
  548. vpmuludq $Y2, $ACC0, $ACC0
  549. vpmuludq $Y1, $TEMP2, $TEMP2
  550. .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1
  551. vpaddq $ACC1, $ACC0, $ACC0
  552. vpaddq $TEMP2, $ACC2, $ACC2
  553. vpmuludq $Y1, $TEMP0, $TEMP0
  554. vmovdqu 32*5-16-128($np), $TEMP2
  555. .byte 0x67
  556. vmovq $ACC0, %rax
  557. vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
  558. vpaddq $TEMP0, $ACC3, $ACC3
  559. vpmuludq $Y1, $TEMP1, $TEMP1
  560. vmovdqu 32*6-16-128($np), $TEMP0
  561. vpaddq $TEMP1, $ACC4, $ACC4
  562. vpmuludq $Y1, $TEMP2, $TEMP2
  563. vmovdqu 32*7-16-128($np), $TEMP1
  564. vpaddq $TEMP2, $ACC5, $ACC5
  565. vpmuludq $Y1, $TEMP0, $TEMP0
  566. vmovdqu 32*8-16-128($np), $TEMP2
  567. vpaddq $TEMP0, $ACC6, $ACC6
  568. vpmuludq $Y1, $TEMP1, $TEMP1
  569. shr \$29, $r3
  570. vmovdqu 32*9-16-128($np), $TEMP0
  571. add $r3, %rax
  572. vpaddq $TEMP1, $ACC7, $ACC7
  573. vpmuludq $Y1, $TEMP2, $TEMP2
  574. #vmovdqu 32*2-24-128($np), $TEMP1 # moved below
  575. mov %rax, $r0
  576. imull $n0, %eax
  577. vpaddq $TEMP2, $ACC8, $ACC8
  578. vpmuludq $Y1, $TEMP0, $TEMP0
  579. and \$0x1fffffff, %eax
  580. vmovd %eax, $Y1
  581. vmovdqu 32*3-24-128($np), $TEMP2
  582. .byte 0x67
  583. vpaddq $TEMP0, $ACC9, $ACC9
  584. vpbroadcastq $Y1, $Y1
  585. vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above
  586. vmovdqu 32*4-24-128($np), $TEMP0
  587. mov %rax, %rdx
  588. imulq -128($np), %rax
  589. mov 8(%rsp), $r1
  590. vpaddq $TEMP1, $ACC2, $ACC1
  591. vpmuludq $Y2, $TEMP2, $TEMP2
  592. vmovdqu 32*5-24-128($np), $TEMP1
  593. add %rax, $r0
  594. mov %rdx, %rax
  595. imulq 8-128($np), %rax
  596. .byte 0x67
  597. shr \$29, $r0
  598. mov 16(%rsp), $r2
  599. vpaddq $TEMP2, $ACC3, $ACC2
  600. vpmuludq $Y2, $TEMP0, $TEMP0
  601. vmovdqu 32*6-24-128($np), $TEMP2
  602. add %rax, $r1
  603. mov %rdx, %rax
  604. imulq 16-128($np), %rax
  605. vpaddq $TEMP0, $ACC4, $ACC3
  606. vpmuludq $Y2, $TEMP1, $TEMP1
  607. vmovdqu 32*7-24-128($np), $TEMP0
  608. imulq 24-128($np), %rdx # future $r3
  609. add %rax, $r2
  610. lea ($r0,$r1), %rax
  611. vpaddq $TEMP1, $ACC5, $ACC4
  612. vpmuludq $Y2, $TEMP2, $TEMP2
  613. vmovdqu 32*8-24-128($np), $TEMP1
  614. mov %rax, $r1
  615. imull $n0, %eax
  616. vpmuludq $Y2, $TEMP0, $TEMP0
  617. vpaddq $TEMP2, $ACC6, $ACC5
  618. vmovdqu 32*9-24-128($np), $TEMP2
  619. and \$0x1fffffff, %eax
  620. vpaddq $TEMP0, $ACC7, $ACC6
  621. vpmuludq $Y2, $TEMP1, $TEMP1
  622. add 24(%rsp), %rdx
  623. vpaddq $TEMP1, $ACC8, $ACC7
  624. vpmuludq $Y2, $TEMP2, $TEMP2
  625. vpaddq $TEMP2, $ACC9, $ACC8
  626. vmovq $r3, $ACC9
  627. mov %rdx, $r3
  628. dec $i
  629. jnz .LOOP_REDUCE_1024
  630. ___
  631. ($ACC0,$Y2)=($Y2,$ACC0);
  632. $code.=<<___;
  633. lea 448(%rsp), $tp1 # size optimization
  634. vpaddq $ACC9, $Y2, $ACC0
  635. vpxor $ZERO, $ZERO, $ZERO
  636. vpaddq 32*9-192($tp0), $ACC0, $ACC0
  637. vpaddq 32*10-448($tp1), $ACC1, $ACC1
  638. vpaddq 32*11-448($tp1), $ACC2, $ACC2
  639. vpaddq 32*12-448($tp1), $ACC3, $ACC3
  640. vpaddq 32*13-448($tp1), $ACC4, $ACC4
  641. vpaddq 32*14-448($tp1), $ACC5, $ACC5
  642. vpaddq 32*15-448($tp1), $ACC6, $ACC6
  643. vpaddq 32*16-448($tp1), $ACC7, $ACC7
  644. vpaddq 32*17-448($tp1), $ACC8, $ACC8
  645. vpsrlq \$29, $ACC0, $TEMP1
  646. vpand $AND_MASK, $ACC0, $ACC0
  647. vpsrlq \$29, $ACC1, $TEMP2
  648. vpand $AND_MASK, $ACC1, $ACC1
  649. vpsrlq \$29, $ACC2, $TEMP3
  650. vpermq \$0x93, $TEMP1, $TEMP1
  651. vpand $AND_MASK, $ACC2, $ACC2
  652. vpsrlq \$29, $ACC3, $TEMP4
  653. vpermq \$0x93, $TEMP2, $TEMP2
  654. vpand $AND_MASK, $ACC3, $ACC3
  655. vpermq \$0x93, $TEMP3, $TEMP3
  656. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  657. vpermq \$0x93, $TEMP4, $TEMP4
  658. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  659. vpaddq $TEMP0, $ACC0, $ACC0
  660. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  661. vpaddq $TEMP1, $ACC1, $ACC1
  662. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  663. vpaddq $TEMP2, $ACC2, $ACC2
  664. vpblendd \$3, $TEMP4, $ZERO, $TEMP4
  665. vpaddq $TEMP3, $ACC3, $ACC3
  666. vpaddq $TEMP4, $ACC4, $ACC4
  667. vpsrlq \$29, $ACC0, $TEMP1
  668. vpand $AND_MASK, $ACC0, $ACC0
  669. vpsrlq \$29, $ACC1, $TEMP2
  670. vpand $AND_MASK, $ACC1, $ACC1
  671. vpsrlq \$29, $ACC2, $TEMP3
  672. vpermq \$0x93, $TEMP1, $TEMP1
  673. vpand $AND_MASK, $ACC2, $ACC2
  674. vpsrlq \$29, $ACC3, $TEMP4
  675. vpermq \$0x93, $TEMP2, $TEMP2
  676. vpand $AND_MASK, $ACC3, $ACC3
  677. vpermq \$0x93, $TEMP3, $TEMP3
  678. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  679. vpermq \$0x93, $TEMP4, $TEMP4
  680. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  681. vpaddq $TEMP0, $ACC0, $ACC0
  682. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  683. vpaddq $TEMP1, $ACC1, $ACC1
  684. vmovdqu $ACC0, 32*0-128($rp)
  685. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  686. vpaddq $TEMP2, $ACC2, $ACC2
  687. vmovdqu $ACC1, 32*1-128($rp)
  688. vpblendd \$3, $TEMP4, $ZERO, $TEMP4
  689. vpaddq $TEMP3, $ACC3, $ACC3
  690. vmovdqu $ACC2, 32*2-128($rp)
  691. vpaddq $TEMP4, $ACC4, $ACC4
  692. vmovdqu $ACC3, 32*3-128($rp)
  693. ___
  694. $TEMP5=$ACC0;
  695. $code.=<<___;
  696. vpsrlq \$29, $ACC4, $TEMP1
  697. vpand $AND_MASK, $ACC4, $ACC4
  698. vpsrlq \$29, $ACC5, $TEMP2
  699. vpand $AND_MASK, $ACC5, $ACC5
  700. vpsrlq \$29, $ACC6, $TEMP3
  701. vpermq \$0x93, $TEMP1, $TEMP1
  702. vpand $AND_MASK, $ACC6, $ACC6
  703. vpsrlq \$29, $ACC7, $TEMP4
  704. vpermq \$0x93, $TEMP2, $TEMP2
  705. vpand $AND_MASK, $ACC7, $ACC7
  706. vpsrlq \$29, $ACC8, $TEMP5
  707. vpermq \$0x93, $TEMP3, $TEMP3
  708. vpand $AND_MASK, $ACC8, $ACC8
  709. vpermq \$0x93, $TEMP4, $TEMP4
  710. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  711. vpermq \$0x93, $TEMP5, $TEMP5
  712. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  713. vpaddq $TEMP0, $ACC4, $ACC4
  714. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  715. vpaddq $TEMP1, $ACC5, $ACC5
  716. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  717. vpaddq $TEMP2, $ACC6, $ACC6
  718. vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
  719. vpaddq $TEMP3, $ACC7, $ACC7
  720. vpaddq $TEMP4, $ACC8, $ACC8
  721. vpsrlq \$29, $ACC4, $TEMP1
  722. vpand $AND_MASK, $ACC4, $ACC4
  723. vpsrlq \$29, $ACC5, $TEMP2
  724. vpand $AND_MASK, $ACC5, $ACC5
  725. vpsrlq \$29, $ACC6, $TEMP3
  726. vpermq \$0x93, $TEMP1, $TEMP1
  727. vpand $AND_MASK, $ACC6, $ACC6
  728. vpsrlq \$29, $ACC7, $TEMP4
  729. vpermq \$0x93, $TEMP2, $TEMP2
  730. vpand $AND_MASK, $ACC7, $ACC7
  731. vpsrlq \$29, $ACC8, $TEMP5
  732. vpermq \$0x93, $TEMP3, $TEMP3
  733. vpand $AND_MASK, $ACC8, $ACC8
  734. vpermq \$0x93, $TEMP4, $TEMP4
  735. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  736. vpermq \$0x93, $TEMP5, $TEMP5
  737. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  738. vpaddq $TEMP0, $ACC4, $ACC4
  739. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  740. vpaddq $TEMP1, $ACC5, $ACC5
  741. vmovdqu $ACC4, 32*4-128($rp)
  742. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  743. vpaddq $TEMP2, $ACC6, $ACC6
  744. vmovdqu $ACC5, 32*5-128($rp)
  745. vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
  746. vpaddq $TEMP3, $ACC7, $ACC7
  747. vmovdqu $ACC6, 32*6-128($rp)
  748. vpaddq $TEMP4, $ACC8, $ACC8
  749. vmovdqu $ACC7, 32*7-128($rp)
  750. vmovdqu $ACC8, 32*8-128($rp)
  751. mov $rp, $ap
  752. dec $rep
  753. jne .LOOP_GRANDE_SQR_1024
  754. vzeroall
  755. mov %rbp, %rax
  756. ___
  757. $code.=<<___ if ($win64);
  758. movaps -0xd8(%rax),%xmm6
  759. movaps -0xc8(%rax),%xmm7
  760. movaps -0xb8(%rax),%xmm8
  761. movaps -0xa8(%rax),%xmm9
  762. movaps -0x98(%rax),%xmm10
  763. movaps -0x88(%rax),%xmm11
  764. movaps -0x78(%rax),%xmm12
  765. movaps -0x68(%rax),%xmm13
  766. movaps -0x58(%rax),%xmm14
  767. movaps -0x48(%rax),%xmm15
  768. ___
  769. $code.=<<___;
  770. mov -48(%rax),%r15
  771. mov -40(%rax),%r14
  772. mov -32(%rax),%r13
  773. mov -24(%rax),%r12
  774. mov -16(%rax),%rbp
  775. mov -8(%rax),%rbx
  776. lea (%rax),%rsp # restore %rsp
  777. .Lsqr_1024_epilogue:
  778. ret
  779. .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
  780. ___
  781. }
  782. { # void AMM_WW(
  783. my $rp="%rdi"; # BN_ULONG *rp,
  784. my $ap="%rsi"; # const BN_ULONG *ap,
  785. my $bp="%rdx"; # const BN_ULONG *bp,
  786. my $np="%rcx"; # const BN_ULONG *np,
  787. my $n0="%r8d"; # unsigned int n0);
  #
  # Generator for rsaz_1024_mul_avx2(rp, ap, bp, np, n0).
  # The emitted routine multiplies ap by bp and interleaves a reduction
  # by np: in every step the low digit of the running sum is multiplied
  # by n0 and masked to 29 bits, and that value times np is added in
  # before the sum is shifted down by one 29-bit digit.
  # NOTE(review): "AMM" presumably stands for Almost Montgomery
  # Multiplication -- confirm against the project documentation.
  # Operands are addressed as 32-byte vectors at offsets 32*0..32*9 with
  # one digit per 64-bit lane; nine result vectors are written to rp and
  # the tenth vector of rp is zeroed.
  #
  788. # The registers that hold the accumulated redundant result
  789. # The AMM works on 1024 bit operands, and redundant word size is 29
  790. # Therefore: ceil(1024/29)/4 = 9
  791. my $ACC0="%ymm0";
  792. my $ACC1="%ymm1";
  793. my $ACC2="%ymm2";
  794. my $ACC3="%ymm3";
  795. my $ACC4="%ymm4";
  796. my $ACC5="%ymm5";
  797. my $ACC6="%ymm6";
  798. my $ACC7="%ymm7";
  799. my $ACC8="%ymm8";
  800. my $ACC9="%ymm9";
  801. # Registers that hold the broadcasted words of multiplier, currently used
  802. my $Bi="%ymm10";
  803. my $Yi="%ymm11";
  804. # Helper registers
  # $TEMP0 aliases $ACC0 inside the main loop: the lowest four digits live
  # in the scalar registers $r0-$r3 there, so %ymm0 is used as scratch.
  805. my $TEMP0=$ACC0;
  806. my $TEMP1="%ymm12";
  807. my $TEMP2="%ymm13";
  808. my $ZERO="%ymm14";
  809. my $AND_MASK="%ymm15";
  810. # alu registers that hold the first words of the ACC
  811. my $r0="%r9";
  812. my $r1="%r10";
  813. my $r2="%r11";
  814. my $r3="%r12";
  # loop counter and general scratch register
  815. my $i="%r14d";
  816. my $tmp="%r15";
  817. $bp="%r13"; # reassigned argument
  # Entry: remember the incoming %rsp in %rax and push the six
  # callee-saved general-purpose registers used by the routine.
  818. $code.=<<___;
  819. .globl rsaz_1024_mul_avx2
  820. .type rsaz_1024_mul_avx2,\@function,5
  821. .align 64
  822. rsaz_1024_mul_avx2:
  823. lea (%rsp), %rax
  824. push %rbx
  825. push %rbp
  826. push %r12
  827. push %r13
  828. push %r14
  829. push %r15
  830. ___
  # Win64 only: reserve 0xa8 bytes and spill %xmm6-%xmm15 below the pushed
  # registers, addressed relative to the entry %rsp held in %rax. Note
  # that the .Lmul_1024_body label is emitted only in this variant.
  831. $code.=<<___ if ($win64);
  832. vzeroupper
  833. lea -0xa8(%rsp),%rsp
  834. vmovaps %xmm6,-0xd8(%rax)
  835. vmovaps %xmm7,-0xc8(%rax)
  836. vmovaps %xmm8,-0xb8(%rax)
  837. vmovaps %xmm9,-0xa8(%rax)
  838. vmovaps %xmm10,-0x98(%rax)
  839. vmovaps %xmm11,-0x88(%rax)
  840. vmovaps %xmm12,-0x78(%rax)
  841. vmovaps %xmm13,-0x68(%rax)
  842. vmovaps %xmm14,-0x58(%rax)
  843. vmovaps %xmm15,-0x48(%rax)
  844. .Lmul_1024_body:
  845. ___
  # Common setup and main loop.
  #  * %rbp keeps the entry %rsp as frame anchor; all vector registers are
  #    zeroed; bp is moved from %rdx to %r13.
  #  * Page-crossing workarounds (see the inline comments): ap and bp are
  #    swapped if ap crosses a 4K boundary, and np is copied to an aligned
  #    stack area if np does.
  #  * $ap/$np/$rp are biased by +128 so displacements fit in one byte.
  #  * .Loop_mul_1024 runs 9 times; each pass consumes four 64-bit words
  #    of $bp (offsets 0, 8, 16, 24, then $bp advances by 32). For each
  #    word the scalar unit forms the lowest digit, derives the 29-bit
  #    reduction multiplier ($n0 * digit, masked) and broadcasts it into
  #    $Yi, while the vector unit accumulates $Bi*ap and $Yi*np. Steps
  #    2-4 load ap/np at byte offsets -8/-16/-24, i.e. shifted by one
  #    64-bit lane per step.
  #  * At the end of a pass the accumulators move down one register
  #    ($ACC1 -> $ACC0 ... $ACC9 -> $ACC8) and the low vector is passed to
  #    the scalar side through the 32 bytes at (%rsp).
  846. $code.=<<___;
  847. mov %rax,%rbp
  848. vzeroall
  849. mov %rdx, $bp # reassigned argument
  850. sub \$64,%rsp
  851. # unaligned 256-bit load that crosses page boundary can
  852. # cause severe performance degradation here, so if $ap does
  853. # cross page boundary, swap it with $bp [meaning that caller
  854. # is advised to lay down $ap and $bp next to each other, so
  855. # that only one can cross page boundary].
  856. .byte 0x67,0x67
  857. mov $ap, $tmp
  858. and \$4095, $tmp
  859. add \$32*10, $tmp
  860. shr \$12, $tmp
  861. mov $ap, $tmp
  862. cmovnz $bp, $ap
  863. cmovnz $tmp, $bp
  864. mov $np, $tmp
  865. sub \$-128,$ap # size optimization
  866. sub \$-128,$np
  867. sub \$-128,$rp
  868. and \$4095, $tmp # see if $np crosses page
  869. add \$32*10, $tmp
  870. .byte 0x67,0x67
  871. shr \$12, $tmp
  872. jz .Lmul_1024_no_n_copy
  873. # unaligned 256-bit load that crosses page boundary can
  874. # cause severe performance degradation here, so if $np does
  875. # cross page boundary, copy it to stack and make sure stack
  876. # frame doesn't...
  877. sub \$32*10,%rsp
  878. vmovdqu 32*0-128($np), $ACC0
  879. and \$-512, %rsp
  880. vmovdqu 32*1-128($np), $ACC1
  881. vmovdqu 32*2-128($np), $ACC2
  882. vmovdqu 32*3-128($np), $ACC3
  883. vmovdqu 32*4-128($np), $ACC4
  884. vmovdqu 32*5-128($np), $ACC5
  885. vmovdqu 32*6-128($np), $ACC6
  886. vmovdqu 32*7-128($np), $ACC7
  887. vmovdqu 32*8-128($np), $ACC8
  888. lea 64+128(%rsp),$np
  889. vmovdqu $ACC0, 32*0-128($np)
  890. vpxor $ACC0, $ACC0, $ACC0
  891. vmovdqu $ACC1, 32*1-128($np)
  892. vpxor $ACC1, $ACC1, $ACC1
  893. vmovdqu $ACC2, 32*2-128($np)
  894. vpxor $ACC2, $ACC2, $ACC2
  895. vmovdqu $ACC3, 32*3-128($np)
  896. vpxor $ACC3, $ACC3, $ACC3
  897. vmovdqu $ACC4, 32*4-128($np)
  898. vpxor $ACC4, $ACC4, $ACC4
  899. vmovdqu $ACC5, 32*5-128($np)
  900. vpxor $ACC5, $ACC5, $ACC5
  901. vmovdqu $ACC6, 32*6-128($np)
  902. vpxor $ACC6, $ACC6, $ACC6
  903. vmovdqu $ACC7, 32*7-128($np)
  904. vpxor $ACC7, $ACC7, $ACC7
  905. vmovdqu $ACC8, 32*8-128($np)
  906. vmovdqa $ACC0, $ACC8
  907. vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall
  908. .Lmul_1024_no_n_copy:
  909. and \$-64,%rsp
  910. mov ($bp), %rbx
  911. vpbroadcastq ($bp), $Bi
  912. vmovdqu $ACC0, (%rsp) # clear top of stack
  913. xor $r0, $r0
  914. .byte 0x67
  915. xor $r1, $r1
  916. xor $r2, $r2
  917. xor $r3, $r3
  918. vmovdqu .Land_mask(%rip), $AND_MASK
  919. mov \$9, $i
  920. vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall
  921. jmp .Loop_mul_1024
  922. .align 32
  923. .Loop_mul_1024:
  924. vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*)
  925. mov %rbx, %rax
  926. imulq -128($ap), %rax
  927. add $r0, %rax
  928. mov %rbx, $r1
  929. imulq 8-128($ap), $r1
  930. add 8(%rsp), $r1
  931. mov %rax, $r0
  932. imull $n0, %eax
  933. and \$0x1fffffff, %eax
  934. mov %rbx, $r2
  935. imulq 16-128($ap), $r2
  936. add 16(%rsp), $r2
  937. mov %rbx, $r3
  938. imulq 24-128($ap), $r3
  939. add 24(%rsp), $r3
  940. vpmuludq 32*1-128($ap),$Bi,$TEMP0
  941. vmovd %eax, $Yi
  942. vpaddq $TEMP0,$ACC1,$ACC1
  943. vpmuludq 32*2-128($ap),$Bi,$TEMP1
  944. vpbroadcastq $Yi, $Yi
  945. vpaddq $TEMP1,$ACC2,$ACC2
  946. vpmuludq 32*3-128($ap),$Bi,$TEMP2
  947. vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3
  948. vpaddq $TEMP2,$ACC3,$ACC3
  949. vpmuludq 32*4-128($ap),$Bi,$TEMP0
  950. vpaddq $TEMP0,$ACC4,$ACC4
  951. vpmuludq 32*5-128($ap),$Bi,$TEMP1
  952. vpaddq $TEMP1,$ACC5,$ACC5
  953. vpmuludq 32*6-128($ap),$Bi,$TEMP2
  954. vpaddq $TEMP2,$ACC6,$ACC6
  955. vpmuludq 32*7-128($ap),$Bi,$TEMP0
  956. vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3
  957. vpaddq $TEMP0,$ACC7,$ACC7
  958. vpmuludq 32*8-128($ap),$Bi,$TEMP1
  959. vpbroadcastq 8($bp), $Bi
  960. vpaddq $TEMP1,$ACC8,$ACC8
  961. mov %rax,%rdx
  962. imulq -128($np),%rax
  963. add %rax,$r0
  964. mov %rdx,%rax
  965. imulq 8-128($np),%rax
  966. add %rax,$r1
  967. mov %rdx,%rax
  968. imulq 16-128($np),%rax
  969. add %rax,$r2
  970. shr \$29, $r0
  971. imulq 24-128($np),%rdx
  972. add %rdx,$r3
  973. add $r0, $r1
  974. vpmuludq 32*1-128($np),$Yi,$TEMP2
  975. vmovq $Bi, %rbx
  976. vpaddq $TEMP2,$ACC1,$ACC1
  977. vpmuludq 32*2-128($np),$Yi,$TEMP0
  978. vpaddq $TEMP0,$ACC2,$ACC2
  979. vpmuludq 32*3-128($np),$Yi,$TEMP1
  980. vpaddq $TEMP1,$ACC3,$ACC3
  981. vpmuludq 32*4-128($np),$Yi,$TEMP2
  982. vpaddq $TEMP2,$ACC4,$ACC4
  983. vpmuludq 32*5-128($np),$Yi,$TEMP0
  984. vpaddq $TEMP0,$ACC5,$ACC5
  985. vpmuludq 32*6-128($np),$Yi,$TEMP1
  986. vpaddq $TEMP1,$ACC6,$ACC6
  987. vpmuludq 32*7-128($np),$Yi,$TEMP2
  988. vpblendd \$3, $ZERO, $ACC9, $ACC9 # correct $ACC3
  989. vpaddq $TEMP2,$ACC7,$ACC7
  990. vpmuludq 32*8-128($np),$Yi,$TEMP0
  991. vpaddq $ACC9, $ACC3, $ACC3 # correct $ACC3
  992. vpaddq $TEMP0,$ACC8,$ACC8
  993. mov %rbx, %rax
  994. imulq -128($ap),%rax
  995. add %rax,$r1
  996. vmovdqu -8+32*1-128($ap),$TEMP1
  997. mov %rbx, %rax
  998. imulq 8-128($ap),%rax
  999. add %rax,$r2
  1000. vmovdqu -8+32*2-128($ap),$TEMP2
  1001. mov $r1, %rax
  1002. imull $n0, %eax
  1003. and \$0x1fffffff, %eax
  1004. imulq 16-128($ap),%rbx
  1005. add %rbx,$r3
  1006. vpmuludq $Bi,$TEMP1,$TEMP1
  1007. vmovd %eax, $Yi
  1008. vmovdqu -8+32*3-128($ap),$TEMP0
  1009. vpaddq $TEMP1,$ACC1,$ACC1
  1010. vpmuludq $Bi,$TEMP2,$TEMP2
  1011. vpbroadcastq $Yi, $Yi
  1012. vmovdqu -8+32*4-128($ap),$TEMP1
  1013. vpaddq $TEMP2,$ACC2,$ACC2
  1014. vpmuludq $Bi,$TEMP0,$TEMP0
  1015. vmovdqu -8+32*5-128($ap),$TEMP2
  1016. vpaddq $TEMP0,$ACC3,$ACC3
  1017. vpmuludq $Bi,$TEMP1,$TEMP1
  1018. vmovdqu -8+32*6-128($ap),$TEMP0
  1019. vpaddq $TEMP1,$ACC4,$ACC4
  1020. vpmuludq $Bi,$TEMP2,$TEMP2
  1021. vmovdqu -8+32*7-128($ap),$TEMP1
  1022. vpaddq $TEMP2,$ACC5,$ACC5
  1023. vpmuludq $Bi,$TEMP0,$TEMP0
  1024. vmovdqu -8+32*8-128($ap),$TEMP2
  1025. vpaddq $TEMP0,$ACC6,$ACC6
  1026. vpmuludq $Bi,$TEMP1,$TEMP1
  1027. vmovdqu -8+32*9-128($ap),$ACC9
  1028. vpaddq $TEMP1,$ACC7,$ACC7
  1029. vpmuludq $Bi,$TEMP2,$TEMP2
  1030. vpaddq $TEMP2,$ACC8,$ACC8
  1031. vpmuludq $Bi,$ACC9,$ACC9
  1032. vpbroadcastq 16($bp), $Bi
  1033. mov %rax,%rdx
  1034. imulq -128($np),%rax
  1035. add %rax,$r1
  1036. vmovdqu -8+32*1-128($np),$TEMP0
  1037. mov %rdx,%rax
  1038. imulq 8-128($np),%rax
  1039. add %rax,$r2
  1040. vmovdqu -8+32*2-128($np),$TEMP1
  1041. shr \$29, $r1
  1042. imulq 16-128($np),%rdx
  1043. add %rdx,$r3
  1044. add $r1, $r2
  1045. vpmuludq $Yi,$TEMP0,$TEMP0
  1046. vmovq $Bi, %rbx
  1047. vmovdqu -8+32*3-128($np),$TEMP2
  1048. vpaddq $TEMP0,$ACC1,$ACC1
  1049. vpmuludq $Yi,$TEMP1,$TEMP1
  1050. vmovdqu -8+32*4-128($np),$TEMP0
  1051. vpaddq $TEMP1,$ACC2,$ACC2
  1052. vpmuludq $Yi,$TEMP2,$TEMP2
  1053. vmovdqu -8+32*5-128($np),$TEMP1
  1054. vpaddq $TEMP2,$ACC3,$ACC3
  1055. vpmuludq $Yi,$TEMP0,$TEMP0
  1056. vmovdqu -8+32*6-128($np),$TEMP2
  1057. vpaddq $TEMP0,$ACC4,$ACC4
  1058. vpmuludq $Yi,$TEMP1,$TEMP1
  1059. vmovdqu -8+32*7-128($np),$TEMP0
  1060. vpaddq $TEMP1,$ACC5,$ACC5
  1061. vpmuludq $Yi,$TEMP2,$TEMP2
  1062. vmovdqu -8+32*8-128($np),$TEMP1
  1063. vpaddq $TEMP2,$ACC6,$ACC6
  1064. vpmuludq $Yi,$TEMP0,$TEMP0
  1065. vmovdqu -8+32*9-128($np),$TEMP2
  1066. vpaddq $TEMP0,$ACC7,$ACC7
  1067. vpmuludq $Yi,$TEMP1,$TEMP1
  1068. vpaddq $TEMP1,$ACC8,$ACC8
  1069. vpmuludq $Yi,$TEMP2,$TEMP2
  1070. vpaddq $TEMP2,$ACC9,$ACC9
  1071. vmovdqu -16+32*1-128($ap),$TEMP0
  1072. mov %rbx,%rax
  1073. imulq -128($ap),%rax
  1074. add $r2,%rax
  1075. vmovdqu -16+32*2-128($ap),$TEMP1
  1076. mov %rax,$r2
  1077. imull $n0, %eax
  1078. and \$0x1fffffff, %eax
  1079. imulq 8-128($ap),%rbx
  1080. add %rbx,$r3
  1081. vpmuludq $Bi,$TEMP0,$TEMP0
  1082. vmovd %eax, $Yi
  1083. vmovdqu -16+32*3-128($ap),$TEMP2
  1084. vpaddq $TEMP0,$ACC1,$ACC1
  1085. vpmuludq $Bi,$TEMP1,$TEMP1
  1086. vpbroadcastq $Yi, $Yi
  1087. vmovdqu -16+32*4-128($ap),$TEMP0
  1088. vpaddq $TEMP1,$ACC2,$ACC2
  1089. vpmuludq $Bi,$TEMP2,$TEMP2
  1090. vmovdqu -16+32*5-128($ap),$TEMP1
  1091. vpaddq $TEMP2,$ACC3,$ACC3
  1092. vpmuludq $Bi,$TEMP0,$TEMP0
  1093. vmovdqu -16+32*6-128($ap),$TEMP2
  1094. vpaddq $TEMP0,$ACC4,$ACC4
  1095. vpmuludq $Bi,$TEMP1,$TEMP1
  1096. vmovdqu -16+32*7-128($ap),$TEMP0
  1097. vpaddq $TEMP1,$ACC5,$ACC5
  1098. vpmuludq $Bi,$TEMP2,$TEMP2
  1099. vmovdqu -16+32*8-128($ap),$TEMP1
  1100. vpaddq $TEMP2,$ACC6,$ACC6
  1101. vpmuludq $Bi,$TEMP0,$TEMP0
  1102. vmovdqu -16+32*9-128($ap),$TEMP2
  1103. vpaddq $TEMP0,$ACC7,$ACC7
  1104. vpmuludq $Bi,$TEMP1,$TEMP1
  1105. vpaddq $TEMP1,$ACC8,$ACC8
  1106. vpmuludq $Bi,$TEMP2,$TEMP2
  1107. vpbroadcastq 24($bp), $Bi
  1108. vpaddq $TEMP2,$ACC9,$ACC9
  1109. vmovdqu -16+32*1-128($np),$TEMP0
  1110. mov %rax,%rdx
  1111. imulq -128($np),%rax
  1112. add %rax,$r2
  1113. vmovdqu -16+32*2-128($np),$TEMP1
  1114. imulq 8-128($np),%rdx
  1115. add %rdx,$r3
  1116. shr \$29, $r2
  1117. vpmuludq $Yi,$TEMP0,$TEMP0
  1118. vmovq $Bi, %rbx
  1119. vmovdqu -16+32*3-128($np),$TEMP2
  1120. vpaddq $TEMP0,$ACC1,$ACC1
  1121. vpmuludq $Yi,$TEMP1,$TEMP1
  1122. vmovdqu -16+32*4-128($np),$TEMP0
  1123. vpaddq $TEMP1,$ACC2,$ACC2
  1124. vpmuludq $Yi,$TEMP2,$TEMP2
  1125. vmovdqu -16+32*5-128($np),$TEMP1
  1126. vpaddq $TEMP2,$ACC3,$ACC3
  1127. vpmuludq $Yi,$TEMP0,$TEMP0
  1128. vmovdqu -16+32*6-128($np),$TEMP2
  1129. vpaddq $TEMP0,$ACC4,$ACC4
  1130. vpmuludq $Yi,$TEMP1,$TEMP1
  1131. vmovdqu -16+32*7-128($np),$TEMP0
  1132. vpaddq $TEMP1,$ACC5,$ACC5
  1133. vpmuludq $Yi,$TEMP2,$TEMP2
  1134. vmovdqu -16+32*8-128($np),$TEMP1
  1135. vpaddq $TEMP2,$ACC6,$ACC6
  1136. vpmuludq $Yi,$TEMP0,$TEMP0
  1137. vmovdqu -16+32*9-128($np),$TEMP2
  1138. vpaddq $TEMP0,$ACC7,$ACC7
  1139. vpmuludq $Yi,$TEMP1,$TEMP1
  1140. vmovdqu -24+32*1-128($ap),$TEMP0
  1141. vpaddq $TEMP1,$ACC8,$ACC8
  1142. vpmuludq $Yi,$TEMP2,$TEMP2
  1143. vmovdqu -24+32*2-128($ap),$TEMP1
  1144. vpaddq $TEMP2,$ACC9,$ACC9
  1145. add $r2, $r3
  1146. imulq -128($ap),%rbx
  1147. add %rbx,$r3
  1148. mov $r3, %rax
  1149. imull $n0, %eax
  1150. and \$0x1fffffff, %eax
  1151. vpmuludq $Bi,$TEMP0,$TEMP0
  1152. vmovd %eax, $Yi
  1153. vmovdqu -24+32*3-128($ap),$TEMP2
  1154. vpaddq $TEMP0,$ACC1,$ACC1
  1155. vpmuludq $Bi,$TEMP1,$TEMP1
  1156. vpbroadcastq $Yi, $Yi
  1157. vmovdqu -24+32*4-128($ap),$TEMP0
  1158. vpaddq $TEMP1,$ACC2,$ACC2
  1159. vpmuludq $Bi,$TEMP2,$TEMP2
  1160. vmovdqu -24+32*5-128($ap),$TEMP1
  1161. vpaddq $TEMP2,$ACC3,$ACC3
  1162. vpmuludq $Bi,$TEMP0,$TEMP0
  1163. vmovdqu -24+32*6-128($ap),$TEMP2
  1164. vpaddq $TEMP0,$ACC4,$ACC4
  1165. vpmuludq $Bi,$TEMP1,$TEMP1
  1166. vmovdqu -24+32*7-128($ap),$TEMP0
  1167. vpaddq $TEMP1,$ACC5,$ACC5
  1168. vpmuludq $Bi,$TEMP2,$TEMP2
  1169. vmovdqu -24+32*8-128($ap),$TEMP1
  1170. vpaddq $TEMP2,$ACC6,$ACC6
  1171. vpmuludq $Bi,$TEMP0,$TEMP0
  1172. vmovdqu -24+32*9-128($ap),$TEMP2
  1173. vpaddq $TEMP0,$ACC7,$ACC7
  1174. vpmuludq $Bi,$TEMP1,$TEMP1
  1175. vpaddq $TEMP1,$ACC8,$ACC8
  1176. vpmuludq $Bi,$TEMP2,$TEMP2
  1177. vpbroadcastq 32($bp), $Bi
  1178. vpaddq $TEMP2,$ACC9,$ACC9
  1179. add \$32, $bp # $bp++
  1180. vmovdqu -24+32*1-128($np),$TEMP0
  1181. imulq -128($np),%rax
  1182. add %rax,$r3
  1183. shr \$29, $r3
  1184. vmovdqu -24+32*2-128($np),$TEMP1
  1185. vpmuludq $Yi,$TEMP0,$TEMP0
  1186. vmovq $Bi, %rbx
  1187. vmovdqu -24+32*3-128($np),$TEMP2
  1188. vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0
  1189. vpmuludq $Yi,$TEMP1,$TEMP1
  1190. vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
  1191. vpaddq $TEMP1,$ACC2,$ACC1
  1192. vmovdqu -24+32*4-128($np),$TEMP0
  1193. vpmuludq $Yi,$TEMP2,$TEMP2
  1194. vmovdqu -24+32*5-128($np),$TEMP1
  1195. vpaddq $TEMP2,$ACC3,$ACC2
  1196. vpmuludq $Yi,$TEMP0,$TEMP0
  1197. vmovdqu -24+32*6-128($np),$TEMP2
  1198. vpaddq $TEMP0,$ACC4,$ACC3
  1199. vpmuludq $Yi,$TEMP1,$TEMP1
  1200. vmovdqu -24+32*7-128($np),$TEMP0
  1201. vpaddq $TEMP1,$ACC5,$ACC4
  1202. vpmuludq $Yi,$TEMP2,$TEMP2
  1203. vmovdqu -24+32*8-128($np),$TEMP1
  1204. vpaddq $TEMP2,$ACC6,$ACC5
  1205. vpmuludq $Yi,$TEMP0,$TEMP0
  1206. vmovdqu -24+32*9-128($np),$TEMP2
  1207. mov $r3, $r0
  1208. vpaddq $TEMP0,$ACC7,$ACC6
  1209. vpmuludq $Yi,$TEMP1,$TEMP1
  1210. add (%rsp), $r0
  1211. vpaddq $TEMP1,$ACC8,$ACC7
  1212. vpmuludq $Yi,$TEMP2,$TEMP2
  1213. vmovq $r3, $TEMP1
  1214. vpaddq $TEMP2,$ACC9,$ACC8
  1215. dec $i
  1216. jnz .Loop_mul_1024
  1217. ___
  1218. # (*) Original implementation was correcting ACC1-ACC3 for overflow
  1219. # after 7 loop runs, or after 28 iterations, or 56 additions.
  1220. # But as we underutilize resources, it's possible to correct in
  1221. # each iteration with marginal performance loss. But then, as
  1222. # we do it in each iteration, we can correct fewer digits, and
  1223. # avoid performance penalties completely. Also note that we
  1224. # correct only three digits out of four. This works because
  1225. # most significant digit is subjected to less additions.
  # After the loop $ACC9, $Bi and $Yi are no longer referenced under those
  # names, so their registers are reused as scratch for normalization.
  1226. $TEMP0 = $ACC9;
  1227. $TEMP3 = $Bi;
  1228. $TEMP4 = $Yi;
  # Final normalization, low half:
  #  * broadcast lane 0 of $AND_MASK so that all four lanes hold the
  #    29-bit mask (the loaded constant has all-ones in its top lane);
  #  * add the scalar carry left in $TEMP1 by the loop to the low vector
  #    saved at (%rsp), giving $ACC0;
  #  * run two carry-propagation passes over $ACC0-$ACC3: each pass splits
  #    every lane into its low 29 bits and the overflow, rotates the
  #    overflow up by one lane (vpermq 0x93) and blends neighbouring
  #    registers so a carry leaving the top lane enters lane 0 of the
  #    next register; the carry out of $ACC3 is added into $ACC4;
  #  * store $ACC0-$ACC3 to rp.
  1229. $code.=<<___;
  1230. vpermq \$0, $AND_MASK, $AND_MASK
  1231. vpaddq (%rsp), $TEMP1, $ACC0
  1232. vpsrlq \$29, $ACC0, $TEMP1
  1233. vpand $AND_MASK, $ACC0, $ACC0
  1234. vpsrlq \$29, $ACC1, $TEMP2
  1235. vpand $AND_MASK, $ACC1, $ACC1
  1236. vpsrlq \$29, $ACC2, $TEMP3
  1237. vpermq \$0x93, $TEMP1, $TEMP1
  1238. vpand $AND_MASK, $ACC2, $ACC2
  1239. vpsrlq \$29, $ACC3, $TEMP4
  1240. vpermq \$0x93, $TEMP2, $TEMP2
  1241. vpand $AND_MASK, $ACC3, $ACC3
  1242. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  1243. vpermq \$0x93, $TEMP3, $TEMP3
  1244. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  1245. vpermq \$0x93, $TEMP4, $TEMP4
  1246. vpaddq $TEMP0, $ACC0, $ACC0
  1247. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  1248. vpaddq $TEMP1, $ACC1, $ACC1
  1249. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  1250. vpaddq $TEMP2, $ACC2, $ACC2
  1251. vpblendd \$3, $TEMP4, $ZERO, $TEMP4
  1252. vpaddq $TEMP3, $ACC3, $ACC3
  1253. vpaddq $TEMP4, $ACC4, $ACC4
  1254. vpsrlq \$29, $ACC0, $TEMP1
  1255. vpand $AND_MASK, $ACC0, $ACC0
  1256. vpsrlq \$29, $ACC1, $TEMP2
  1257. vpand $AND_MASK, $ACC1, $ACC1
  1258. vpsrlq \$29, $ACC2, $TEMP3
  1259. vpermq \$0x93, $TEMP1, $TEMP1
  1260. vpand $AND_MASK, $ACC2, $ACC2
  1261. vpsrlq \$29, $ACC3, $TEMP4
  1262. vpermq \$0x93, $TEMP2, $TEMP2
  1263. vpand $AND_MASK, $ACC3, $ACC3
  1264. vpermq \$0x93, $TEMP3, $TEMP3
  1265. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  1266. vpermq \$0x93, $TEMP4, $TEMP4
  1267. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  1268. vpaddq $TEMP0, $ACC0, $ACC0
  1269. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  1270. vpaddq $TEMP1, $ACC1, $ACC1
  1271. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  1272. vpaddq $TEMP2, $ACC2, $ACC2
  1273. vpblendd \$3, $TEMP4, $ZERO, $TEMP4
  1274. vpaddq $TEMP3, $ACC3, $ACC3
  1275. vpaddq $TEMP4, $ACC4, $ACC4
  1276. vmovdqu $ACC0, 0-128($rp)
  1277. vmovdqu $ACC1, 32-128($rp)
  1278. vmovdqu $ACC2, 64-128($rp)
  1279. vmovdqu $ACC3, 96-128($rp)
  1280. ___
  # $ACC0 has just been stored, so its register becomes a fifth scratch.
  1281. $TEMP5=$ACC0;
  # Final normalization, high half: the same two carry-propagation passes
  # applied to $ACC4-$ACC8, followed by the stores to rp, vzeroupper and
  # reloading the frame anchor (%rbp) into %rax for the epilogue.
  1282. $code.=<<___;
  1283. vpsrlq \$29, $ACC4, $TEMP1
  1284. vpand $AND_MASK, $ACC4, $ACC4
  1285. vpsrlq \$29, $ACC5, $TEMP2
  1286. vpand $AND_MASK, $ACC5, $ACC5
  1287. vpsrlq \$29, $ACC6, $TEMP3
  1288. vpermq \$0x93, $TEMP1, $TEMP1
  1289. vpand $AND_MASK, $ACC6, $ACC6
  1290. vpsrlq \$29, $ACC7, $TEMP4
  1291. vpermq \$0x93, $TEMP2, $TEMP2
  1292. vpand $AND_MASK, $ACC7, $ACC7
  1293. vpsrlq \$29, $ACC8, $TEMP5
  1294. vpermq \$0x93, $TEMP3, $TEMP3
  1295. vpand $AND_MASK, $ACC8, $ACC8
  1296. vpermq \$0x93, $TEMP4, $TEMP4
  1297. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  1298. vpermq \$0x93, $TEMP5, $TEMP5
  1299. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  1300. vpaddq $TEMP0, $ACC4, $ACC4
  1301. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  1302. vpaddq $TEMP1, $ACC5, $ACC5
  1303. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  1304. vpaddq $TEMP2, $ACC6, $ACC6
  1305. vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
  1306. vpaddq $TEMP3, $ACC7, $ACC7
  1307. vpaddq $TEMP4, $ACC8, $ACC8
  1308. vpsrlq \$29, $ACC4, $TEMP1
  1309. vpand $AND_MASK, $ACC4, $ACC4
  1310. vpsrlq \$29, $ACC5, $TEMP2
  1311. vpand $AND_MASK, $ACC5, $ACC5
  1312. vpsrlq \$29, $ACC6, $TEMP3
  1313. vpermq \$0x93, $TEMP1, $TEMP1
  1314. vpand $AND_MASK, $ACC6, $ACC6
  1315. vpsrlq \$29, $ACC7, $TEMP4
  1316. vpermq \$0x93, $TEMP2, $TEMP2
  1317. vpand $AND_MASK, $ACC7, $ACC7
  1318. vpsrlq \$29, $ACC8, $TEMP5
  1319. vpermq \$0x93, $TEMP3, $TEMP3
  1320. vpand $AND_MASK, $ACC8, $ACC8
  1321. vpermq \$0x93, $TEMP4, $TEMP4
  1322. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  1323. vpermq \$0x93, $TEMP5, $TEMP5
  1324. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  1325. vpaddq $TEMP0, $ACC4, $ACC4
  1326. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  1327. vpaddq $TEMP1, $ACC5, $ACC5
  1328. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  1329. vpaddq $TEMP2, $ACC6, $ACC6
  1330. vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
  1331. vpaddq $TEMP3, $ACC7, $ACC7
  1332. vpaddq $TEMP4, $ACC8, $ACC8
  1333. vmovdqu $ACC4, 128-128($rp)
  1334. vmovdqu $ACC5, 160-128($rp)
  1335. vmovdqu $ACC6, 192-128($rp)
  1336. vmovdqu $ACC7, 224-128($rp)
  1337. vmovdqu $ACC8, 256-128($rp)
  1338. vzeroupper
  1339. mov %rbp, %rax
  1340. ___
  # Win64 only: reload %xmm6-%xmm15 from the slots written in the prologue.
  1341. $code.=<<___ if ($win64);
  1342. movaps -0xd8(%rax),%xmm6
  1343. movaps -0xc8(%rax),%xmm7
  1344. movaps -0xb8(%rax),%xmm8
  1345. movaps -0xa8(%rax),%xmm9
  1346. movaps -0x98(%rax),%xmm10
  1347. movaps -0x88(%rax),%xmm11
  1348. movaps -0x78(%rax),%xmm12
  1349. movaps -0x68(%rax),%xmm13
  1350. movaps -0x58(%rax),%xmm14
  1351. movaps -0x48(%rax),%xmm15
  1352. ___
  # Epilogue: reload the six pushed registers relative to the entry %rsp
  # (in %rax), restore %rsp itself and return.
  1353. $code.=<<___;
  1354. mov -48(%rax),%r15
  1355. mov -40(%rax),%r14
  1356. mov -32(%rax),%r13
  1357. mov -24(%rax),%r12
  1358. mov -16(%rax),%rbp
  1359. mov -8(%rax),%rbx
  1360. lea (%rax),%rsp # restore %rsp
  1361. .Lmul_1024_epilogue:
  1362. ret
  1363. .size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
  1364. ___
  1365. }
  1366. {
  # Generators for two fully unrolled, straight-line converters:
  #   rsaz_1024_red2norm_avx2(out, inp) - packs words holding one 29-bit
  #       digit each into 16 contiguous 64-bit words;
  #   rsaz_1024_norm2red_avx2(out, inp) - splits 16 64-bit words into 36
  #       words of 29 bits each and appends four zero words.
  # Both are declared abi-omnipotent, so the argument registers are chosen
  # here according to $win64. @T is a rotating pool of scratch registers
  # (%r8-%r11).
  1367. my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
  1368. my @T = map("%r$_",(8..11));
  1369. $code.=<<___;
  1370. .globl rsaz_1024_red2norm_avx2
  1371. .type rsaz_1024_red2norm_avx2,\@abi-omnipotent
  1372. .align 32
  1373. rsaz_1024_red2norm_avx2:
  1374. sub \$-128,$inp # size optimization
  1375. xor %rax,%rax
  1376. ___
  # red2norm: $i indexes the output word, $j the input digit. For every
  # output word, load each digit whose bit offset 29*$j lies below the
  # word's upper boundary 64*($i+1); @T is rotated after each load so the
  # digits just loaded are the trailing elements of @T. All of them are
  # shifted into position and added into %rax. The last digit is also
  # copied and shifted right by the negated count; that copy, plus the
  # carry collected by adc, becomes the initial %rax of the next word.
  # NOTE(review): shift immediates here can exceed 63 or be negative; this
  # appears to rely on the count being reduced modulo 64 -- confirm how
  # the assembler in use encodes them.
  1377. for ($j=0,$i=0; $i<16; $i++) {
  1378. my $k=0;
  1379. while (29*$j<64*($i+1)) { # load data till boundary
  1380. $code.=" mov `8*$j-128`($inp), @T[0]\n";
  1381. $j++; $k++; push(@T,shift(@T));
  1382. }
  1383. $l=$k;
  1384. while ($k>1) { # shift loaded data but last value
  1385. $code.=" shl \$`29*($j-$k)`,@T[-$k]\n";
  1386. $k--;
  1387. }
  1388. $code.=<<___; # shift last value
  1389. mov @T[-1], @T[0]
  1390. shl \$`29*($j-1)`, @T[-1]
  1391. shr \$`-29*($j-1)`, @T[0]
  1392. ___
  1393. while ($l) { # accumulate all values
  1394. $code.=" add @T[-$l], %rax\n";
  1395. $l--;
  1396. }
  1397. $code.=<<___;
  1398. adc \$0, @T[0] # consume eventual carry
  1399. mov %rax, 8*$i($out)
  1400. mov @T[0], %rax
  1401. ___
  1402. push(@T,shift(@T));
  1403. }
  # End of red2norm; start of norm2red. $out is biased by +128 so that the
  # store displacements stay small, the first input word is preloaded
  # into @T[0] and %eax is set to the 29-bit mask.
  1404. $code.=<<___;
  1405. ret
  1406. .size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
  1407. .globl rsaz_1024_norm2red_avx2
  1408. .type rsaz_1024_norm2red_avx2,\@abi-omnipotent
  1409. .align 32
  1410. rsaz_1024_norm2red_avx2:
  1411. sub \$-128,$out # size optimization
  1412. mov ($inp),@T[0]
  1413. mov \$0x1fffffff,%eax
  1414. ___
  # norm2red: $i indexes the input word, $j the output digit. @T[0] holds
  # the current input word and @T[1] the next one (cleared with xor for
  # the last word). Digits that end below the current word's upper
  # boundary are extracted with shr+and into a spare register; the digit
  # reaching the boundary is extracted with shrd, which pulls its upper
  # bits from @T[1]. @T is then rotated so the next word becomes @T[0].
  1415. for ($j=0,$i=0; $i<16; $i++) {
  1416. $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15);
  1417. $code.=" xor @T[1],@T[1]\n" if ($i==15);
  1418. my $k=1;
  1419. while (29*($j+1)<64*($i+1)) {
  1420. $code.=<<___;
  1421. mov @T[0],@T[-$k]
  1422. shr \$`29*$j`,@T[-$k]
  1423. and %rax,@T[-$k] # &0x1fffffff
  1424. mov @T[-$k],`8*$j-128`($out)
  1425. ___
  1426. $j++; $k++;
  1427. }
  1428. $code.=<<___;
  1429. shrd \$`29*$j`,@T[1],@T[0]
  1430. and %rax,@T[0]
  1431. mov @T[0],`8*$j-128`($out)
  1432. ___
  1433. $j++;
  1434. push(@T,shift(@T));
  1435. }
  # After the final rotation @T[0] names the register that was cleared by
  # the xor in the last iteration; it is stored four times to zero the
  # words following the last digit.
  1436. $code.=<<___;
  1437. mov @T[0],`8*$j-128`($out) # zero
  1438. mov @T[0],`8*($j+1)-128`($out)
  1439. mov @T[0],`8*($j+2)-128`($out)
  1440. mov @T[0],`8*($j+3)-128`($out)
  1441. ret
  1442. .size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
  1443. ___
  1444. }
  1445. {
  # Generators for the table store/load helpers. Both are declared
  # abi-omnipotent, so argument registers are selected via $win64.
  #
  # rsaz_1024_scatter5_avx2(out, inp, power): each of the 9 iterations
  # reads 32 bytes from inp, uses .Lscatter_permd to gather dwords 0,2,4,6
  # (the low half of each 64-bit lane) into the low 128 bits and stores
  # those 16 bytes at out + 16*power; out then advances by 16*32 bytes.
  1446. my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
  1447. $code.=<<___;
  1448. .globl rsaz_1024_scatter5_avx2
  1449. .type rsaz_1024_scatter5_avx2,\@abi-omnipotent
  1450. .align 32
  1451. rsaz_1024_scatter5_avx2:
  1452. vzeroupper
  1453. vmovdqu .Lscatter_permd(%rip),%ymm5
  1454. shl \$4,$power
  1455. lea ($out,$power),$out
  1456. mov \$9,%eax
  1457. jmp .Loop_scatter_1024
  1458. .align 32
  1459. .Loop_scatter_1024:
  1460. vmovdqu ($inp),%ymm0
  1461. lea 32($inp),$inp
  1462. vpermd %ymm0,%ymm5,%ymm0
  1463. vmovdqu %xmm0,($out)
  1464. lea 16*32($out),$out
  1465. dec %eax
  1466. jnz .Loop_scatter_1024
  1467. vzeroupper
  1468. ret
  1469. .size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
  1470. .globl rsaz_1024_gather5_avx2
  1471. .type rsaz_1024_gather5_avx2,\@abi-omnipotent
  1472. .align 32
  1473. rsaz_1024_gather5_avx2:
  1474. ___
  # Win64 only: gather5 uses %xmm6-%xmm15, so they are spilled to a 0xa8
  # byte frame first. The prologue is emitted as raw bytes (decoded in the
  # inline annotations) so that its encoding does not depend on the
  # assembler.
  # NOTE(review): the byte lengths presumably have to agree with the
  # unwind codes in .LSEH_info_rsaz_1024_gather5 -- confirm before
  # changing either.
  1475. $code.=<<___ if ($win64);
  1476. lea -0x88(%rsp),%rax
  1477. vzeroupper
  1478. .LSEH_begin_rsaz_1024_gather5:
  1479. # I can't trust assembler to use specific encoding:-(
  1480. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
  1481. .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6,-0x20(%rax)
  1482. .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7,-0x10(%rax)
  1483. .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8,0(%rax)
  1484. .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9,0x10(%rax)
  1485. .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10,0x20(%rax)
  1486. .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11,0x30(%rax)
  1487. .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12,0x40(%rax)
  1488. .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13,0x50(%rax)
  1489. .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14,0x60(%rax)
  1490. .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15,0x70(%rax)
  1491. ___
  # rsaz_1024_gather5_avx2(out, inp, power): reads back what scatter5
  # stored. power>>2 selects one of eight 64-byte lines and (power&3)<<4
  # the 16-byte slot inside a line. Eight byte masks are broadcast from
  # around .Lgather_table so that only the mask belonging to the selected
  # line is 0xff. Every iteration reads the slot from all eight lines,
  # ANDs each with its mask and ORs the results together; .Lgather_permd
  # (loaded from -32(%r11)) then spreads the four dwords to the low halves
  # of the four 64-bit lanes, taking the upper halves from dword 7, which
  # is zero because the preceding 128-bit VEX operations clear the upper
  # half of %ymm6. Nine vectors are written, followed by one zero vector.
  # NOTE(review): all eight lines are touched in every iteration, but the
  # offset inside a line still depends on $power -- confirm this access
  # pattern is acceptable if $power is secret.
  1492. $code.=<<___;
  1493. lea .Lgather_table(%rip),%r11
  1494. mov $power,%eax
  1495. and \$3,$power
  1496. shr \$2,%eax # cache line number
  1497. shl \$4,$power # offset within cache line
  1498. vmovdqu -32(%r11),%ymm7 # .Lgather_permd
  1499. vpbroadcastb 8(%r11,%rax), %xmm8
  1500. vpbroadcastb 7(%r11,%rax), %xmm9
  1501. vpbroadcastb 6(%r11,%rax), %xmm10
  1502. vpbroadcastb 5(%r11,%rax), %xmm11
  1503. vpbroadcastb 4(%r11,%rax), %xmm12
  1504. vpbroadcastb 3(%r11,%rax), %xmm13
  1505. vpbroadcastb 2(%r11,%rax), %xmm14
  1506. vpbroadcastb 1(%r11,%rax), %xmm15
  1507. lea 64($inp,$power),$inp
  1508. mov \$64,%r11 # size optimization
  1509. mov \$9,%eax
  1510. jmp .Loop_gather_1024
  1511. .align 32
  1512. .Loop_gather_1024:
  1513. vpand -64($inp), %xmm8,%xmm0
  1514. vpand ($inp), %xmm9,%xmm1
  1515. vpand 64($inp), %xmm10,%xmm2
  1516. vpand ($inp,%r11,2), %xmm11,%xmm3
  1517. vpor %xmm0,%xmm1,%xmm1
  1518. vpand 64($inp,%r11,2), %xmm12,%xmm4
  1519. vpor %xmm2,%xmm3,%xmm3
  1520. vpand ($inp,%r11,4), %xmm13,%xmm5
  1521. vpor %xmm1,%xmm3,%xmm3
  1522. vpand 64($inp,%r11,4), %xmm14,%xmm6
  1523. vpor %xmm4,%xmm5,%xmm5
  1524. vpand -128($inp,%r11,8), %xmm15,%xmm2
  1525. lea ($inp,%r11,8),$inp
  1526. vpor %xmm3,%xmm5,%xmm5
  1527. vpor %xmm2,%xmm6,%xmm6
  1528. vpor %xmm5,%xmm6,%xmm6
  1529. vpermd %ymm6,%ymm7,%ymm6
  1530. vmovdqu %ymm6,($out)
  1531. lea 32($out),$out
  1532. dec %eax
  1533. jnz .Loop_gather_1024
  1534. vpxor %ymm0,%ymm0,%ymm0
  1535. vmovdqu %ymm0,($out)
  1536. vzeroupper
  1537. ___
  # Win64 only: reload %xmm6-%xmm15 from the frame and release it.
  1538. $code.=<<___ if ($win64);
  1539. movaps (%rsp),%xmm6
  1540. movaps 0x10(%rsp),%xmm7
  1541. movaps 0x20(%rsp),%xmm8
  1542. movaps 0x30(%rsp),%xmm9
  1543. movaps 0x40(%rsp),%xmm10
  1544. movaps 0x50(%rsp),%xmm11
  1545. movaps 0x60(%rsp),%xmm12
  1546. movaps 0x70(%rsp),%xmm13
  1547. movaps 0x80(%rsp),%xmm14
  1548. movaps 0x90(%rsp),%xmm15
  1549. lea 0xa8(%rsp),%rsp
  1550. .LSEH_end_rsaz_1024_gather5:
  1551. ___
  1552. $code.=<<___;
  1553. ret
  1554. .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
  1555. ___
  1556. }
  # rsaz_avx2_eligible(): takes no arguments and returns 0 or 1 in %eax,
  # derived from the dword at OPENSSL_ia32cap_P+8.
  # NOTE(review): that dword presumably mirrors CPUID leaf 7 EBX, making
  # bit 5 the AVX2 flag -- confirm against the OPENSSL_ia32cap docs.
  1557. $code.=<<___;
  1558. .extern OPENSSL_ia32cap_P
  1559. .globl rsaz_avx2_eligible
  1560. .type rsaz_avx2_eligible,\@abi-omnipotent
  1561. .align 32
  1562. rsaz_avx2_eligible:
  1563. mov OPENSSL_ia32cap_P+8(%rip),%eax
  1564. ___
  # Emitted only when the generator has $addx set: if bits 8 and 19 are
  # both set (BMI2 and AD*X per the inline comment), the capability word
  # is replaced by zero so the routine reports "not eligible".
  # NOTE(review): presumably a different implementation is preferred on
  # such CPUs -- confirm with the callers.
  1565. $code.=<<___ if ($addx);
  1566. mov \$`1<<8|1<<19`,%ecx
  1567. mov \$0,%edx
  1568. and %eax,%ecx
  1569. cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X
  1570. cmove %edx,%eax
  1571. ___
  # Isolate bit 5 and return it as 0/1, then emit the 64-byte aligned
  # constants shared by the routines in this file:
  #   .Land_mask      29-bit mask in lanes 0-2, all-ones in lane 3
  #   .Lscatter_permd dword indices used by scatter5
  #   .Lgather_permd  dword indices used by gather5
  #   .Lgather_table  16 bytes with a single 0xff at offset 8
  # .Lgather_permd must stay exactly 32 bytes before .Lgather_table:
  # gather5 loads it from -32 relative to .Lgather_table.
  1572. $code.=<<___;
  1573. and \$`1<<5`,%eax
  1574. shr \$5,%eax
  1575. ret
  1576. .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
  1577. .align 64
  1578. .Land_mask:
  1579. .quad 0x1fffffff,0x1fffffff,0x1fffffff,-1
  1580. .Lscatter_permd:
  1581. .long 0,2,4,6,7,7,7,7
  1582. .Lgather_permd:
  1583. .long 0,7,1,7,2,7,3,7
  1584. .Lgather_table:
  1585. .byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
  1586. .align 64
  1587. ___
  # Win64 only: emit a structured-exception (SEH) language handler, followed by
  # the .pdata/.xdata tables that register it.  The four scalars below name the
  # registers that carry the handler's four arguments under the Microsoft x64
  # calling convention (rcx, rdx, r8, r9); only $context and $disp are actually
  # referenced by the handler text in this chunk.
  1588. if ($win64) {
  1589. $rec="%rcx";
  1590. $frame="%rdx";
  1591. $context="%r8";
  1592. $disp="%r9";
  # Everything from here to the closing ___ is assembly text appended to $code.
  # The heredoc interpolates, so comments inside it must not contain sigils.
  1593. $code.=<<___
  1594. .extern __imp_RtlVirtualUnwind
  #
  # rsaz_se_handler -- exception handler named by the unwind records of
  # rsaz_1024_sqr_avx2 and rsaz_1024_mul_avx2 (see the .xdata section).
  #   in:  %r8 = CONTEXT record, %r9 = DISPATCHER_CONTEXT
  #   out: %eax = 1 (ExceptionContinueSearch)
  # It decides how far the interrupted routine had progressed, patches the
  # not-yet-restored registers into the CONTEXT, copies the CONTEXT into
  # disp->ContextRecord and hands over to RtlVirtualUnwind.
  #
  1595. .type rsaz_se_handler,\@abi-omnipotent
  1596. .align 16
  1597. rsaz_se_handler:
  # Save every register the handler overwrites below (all are callee-saved
  # under the Win64 ABI), plus the flags because of the cld further down.
  1598. push %rsi
  1599. push %rdi
  1600. push %rbx
  1601. push %rbp
  1602. push %r12
  1603. push %r13
  1604. push %r14
  1605. push %r15
  1606. pushfq
  # 64 bytes: 32 bytes of shadow space plus the four stack arguments written at
  # 32..56(%rsp) before the call.  Nine pushes + 64 + the return address total
  # 144 bytes, a multiple of 16.
  1607. sub \$64,%rsp
  1608. mov 120($context),%rax # pull context->Rax
  1609. mov 248($context),%rbx # pull context->Rip
  1610. mov 8($disp),%rsi # disp->ImageBase
  1611. mov 56($disp),%r11 # disp->HandlerData
  # Case 1: fault before the body label.  %rax keeps context->Rax and the tail
  # treats it as the frame base -- presumably the routines copy their entry
  # stack pointer into rax before that label; TODO confirm against the
  # function prologues, which are outside this chunk.
  1612. mov 0(%r11),%r10d # HandlerData[0]
  1613. lea (%rsi,%r10),%r10 # prologue label
  1614. cmp %r10,%rbx # context->Rip<prologue label
  1615. jb .Lcommon_seh_tail
  # Case 2: fault at or after the epilogue label.  The tail uses context->Rsp
  # as the frame base.
  1616. mov 152($context),%rax # pull context->Rsp
  1617. mov 4(%r11),%r10d # HandlerData[1]
  1618. lea (%rsi,%r10),%r10 # epilogue label
  1619. cmp %r10,%rbx # context->Rip>=epilogue label
  1620. jae .Lcommon_seh_tail
  # Case 3: fault inside the body.  context->Rbp is used as the frame base; the
  # six quadwords just below it are reloaded as r15, r14, r13, r12, rbp, rbx.
  1621. mov 160($context),%rax # pull context->Rbp
  1622. mov -48(%rax),%r15
  1623. mov -40(%rax),%r14
  1624. mov -32(%rax),%r13
  1625. mov -24(%rax),%r12
  1626. mov -16(%rax),%rbp
  1627. mov -8(%rax),%rbx
  # Publish the recovered values into the CONTEXT record.
  1628. mov %r15,240($context)
  1629. mov %r14,232($context)
  1630. mov %r13,224($context)
  1631. mov %r12,216($context)
  1632. mov %rbp,160($context)
  1633. mov %rbx,144($context)
  # Copy 20 quadwords (ten 16-byte registers) from the save area at
  # -0xd8(%rax) into the CONTEXT starting at Xmm6.
  1634. lea -0xd8(%rax),%rsi # %xmm save area
  1635. lea 512($context),%rdi # & context.Xmm6
  1636. mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
  1637. .long 0xa548f3fc # cld; rep movsq
  # Common tail; %rax holds the frame base chosen above.  The quadwords at
  # 8(%rax) and 16(%rax) are taken as the interrupted routine's saved rdi and
  # rsi -- presumably spilled there by the Win64 prologue, which is outside
  # this chunk; TODO confirm.
  1638. .Lcommon_seh_tail:
  1639. mov 8(%rax),%rdi
  1640. mov 16(%rax),%rsi
  1641. mov %rax,152($context) # restore context->Rsp
  1642. mov %rsi,168($context) # restore context->Rsi
  1643. mov %rdi,176($context) # restore context->Rdi
  # Copy the patched CONTEXT (154 quadwords = 1232 bytes) to
  # disp->ContextRecord.
  1644. mov 40($disp),%rdi # disp->ContextRecord
  1645. mov $context,%rsi # context
  1646. mov \$154,%ecx # sizeof(CONTEXT)
  1647. .long 0xa548f3fc # cld; rep movsq
  # Marshal the eight RtlVirtualUnwind arguments: four in registers, four on
  # the stack above the shadow space.  %rsi takes over the dispatcher pointer
  # because %r9 itself is about to be reloaded as arg4.
  1648. mov $disp,%rsi
  1649. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1650. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1651. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1652. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1653. mov 40(%rsi),%r10 # disp->ContextRecord
  1654. lea 56(%rsi),%r11 # &disp->HandlerData
  1655. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1656. mov %r10,32(%rsp) # arg5
  1657. mov %r11,40(%rsp) # arg6
  1658. mov %r12,48(%rsp) # arg7
  1659. mov %rcx,56(%rsp) # arg8, (NULL)
  1660. call *__imp_RtlVirtualUnwind(%rip)
  # Return ExceptionContinueSearch and undo the entry sequence in reverse.
  1661. mov \$1,%eax # ExceptionContinueSearch
  1662. add \$64,%rsp
  1663. popfq
  1664. pop %r15
  1665. pop %r14
  1666. pop %r13
  1667. pop %r12
  1668. pop %rbp
  1669. pop %rbx
  1670. pop %rdi
  1671. pop %rsi
  1672. ret
  1673. .size rsaz_se_handler,.-rsaz_se_handler
  # Function table: one triple of image-relative addresses (.rva) per covered
  # routine, in the order begin label, end label, unwind-info label.  The
  # .LSEH_info_* labels are defined in the .xdata section; the .LSEH_begin_*
  # and .LSEH_end_* labels are defined outside this chunk.
  1674. .section .pdata
  1675. .align 4
  # rsaz_1024_sqr_avx2
  1676. .rva .LSEH_begin_rsaz_1024_sqr_avx2
  1677. .rva .LSEH_end_rsaz_1024_sqr_avx2
  1678. .rva .LSEH_info_rsaz_1024_sqr_avx2
  # rsaz_1024_mul_avx2
  1679. .rva .LSEH_begin_rsaz_1024_mul_avx2
  1680. .rva .LSEH_end_rsaz_1024_mul_avx2
  1681. .rva .LSEH_info_rsaz_1024_mul_avx2
  # gather5 (these labels carry no _avx2 suffix)
  1682. .rva .LSEH_begin_rsaz_1024_gather5
  1683. .rva .LSEH_end_rsaz_1024_gather5
  1684. .rva .LSEH_info_rsaz_1024_gather5
  # Unwind information referenced from the .pdata table.
  1685. .section .xdata
  1686. .align 8
  # sqr and mul use the custom handler.  Header byte 9 = version 1 with the
  # exception-handler flag set; prologue size, code count and frame register
  # are all zero.  The two trailing .rva values are the handler data read by
  # rsaz_se_handler as HandlerData[0] (body label) and HandlerData[1]
  # (epilogue label).
  1687. .LSEH_info_rsaz_1024_sqr_avx2:
  1688. .byte 9,0,0,0
  1689. .rva rsaz_se_handler
  1690. .rva .Lsqr_1024_body,.Lsqr_1024_epilogue
  1691. .LSEH_info_rsaz_1024_mul_avx2:
  1692. .byte 9,0,0,0
  1693. .rva rsaz_se_handler
  1694. .rva .Lmul_1024_body,.Lmul_1024_epilogue
  # gather5 is described with plain unwind codes and no handler.
  # Header: version 1, no flags; prologue size 0x36; 0x16 = 22 code slots
  # (ten two-slot xmm saves plus one two-slot stack allocation); no frame
  # register.  The prologue size has to reach the end of the last prologue
  # instruction, and the xmm15 save below is recorded as ending at offset
  # 0x36, so the size is 0x36 (it used to read 0x33, which is smaller than
  # that code offset).
  # Each entry: offset of the end of the instruction, operation/register
  # byte, then the operand (save slot divided by 16, or size divided by 8).
  1695. .LSEH_info_rsaz_1024_gather5:
  1696. .byte 0x01,0x36,0x16,0x00
  1697. .byte 0x36,0xf8,0x09,0x00 #vmovaps 0x90(rsp),xmm15
  1698. .byte 0x31,0xe8,0x08,0x00 #vmovaps 0x80(rsp),xmm14
  1699. .byte 0x2c,0xd8,0x07,0x00 #vmovaps 0x70(rsp),xmm13
  1700. .byte 0x27,0xc8,0x06,0x00 #vmovaps 0x60(rsp),xmm12
  1701. .byte 0x22,0xb8,0x05,0x00 #vmovaps 0x50(rsp),xmm11
  1702. .byte 0x1d,0xa8,0x04,0x00 #vmovaps 0x40(rsp),xmm10
  1703. .byte 0x18,0x98,0x03,0x00 #vmovaps 0x30(rsp),xmm9
  1704. .byte 0x13,0x88,0x02,0x00 #vmovaps 0x20(rsp),xmm8
  1705. .byte 0x0e,0x78,0x01,0x00 #vmovaps 0x10(rsp),xmm7
  1706. .byte 0x09,0x68,0x00,0x00 #vmovaps 0x00(rsp),xmm6
  1707. .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
  1708. ___
  1709. }
  # Post-process the accumulated assembly text one line at a time (the line is
  # in $_) and print it.
  1710. foreach (split("\n",$code)) {
  # Replace every back-quoted span with the result of evaluating it as Perl.
  1711. s/\`([^\`]*)\`/eval($1)/ge;
  # Then apply at most ONE of the following rewrites: the substitutions are
  # chained with "or", so the first that matches ends the chain.
  #   1. shr/shl/shrd/shld with an immediate count: reduce the count modulo 64.
  #   2. vmovd/vmovq:        last %ymmN on the line becomes %xmmN.
  #   3. vmovdqu:            a register written as %x%ymmN becomes %xmmN
  #                          (presumably the generator prefixes %x to a ymm
  #                          register variable to request this; the call sites
  #                          are outside this chunk).
  #   4. vpinsrq/vpinsrd:    last %ymmN on the line becomes %xmmN.
  #   5. vpextrq/vpextrd:    last %ymmN on the line becomes %xmmN.
  #   6. vpbroadcastq/d:     a %ymmN directly after the mnemonic (the source
  #                          operand in AT&T order) becomes %xmmN.
  1712. s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or
  1713. s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
  1714. s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
  1715. s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
  1716. s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
  1717. s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
  # Emit the (possibly rewritten) line; split() removed the newline.
  1718. print $_,"\n";
  1719. }
  1720. }}} else {{{
  # Fallback branch: print stub definitions so the exported symbols still
  # exist.  This text is printed directly; it does not go through the
  # per-line rewriting loop of the other branch.
  1721. print <<___; # assembler is too old
  1722. .text
  # rsaz_avx2_eligible: always returns 0 in eax.
  1723. .globl rsaz_avx2_eligible
  1724. .type rsaz_avx2_eligible,\@abi-omnipotent
  1725. rsaz_avx2_eligible:
  1726. xor %eax,%eax
  1727. ret
  1728. .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
  # The six AVX2 entry points are exported but share a single body that
  # executes ud2 (invalid-opcode trap) as its first instruction.
  1729. .globl rsaz_1024_sqr_avx2
  1730. .globl rsaz_1024_mul_avx2
  1731. .globl rsaz_1024_norm2red_avx2
  1732. .globl rsaz_1024_red2norm_avx2
  1733. .globl rsaz_1024_scatter5_avx2
  1734. .globl rsaz_1024_gather5_avx2
  # Only the first label receives .type and .size directives.
  1735. .type rsaz_1024_sqr_avx2,\@abi-omnipotent
  1736. rsaz_1024_sqr_avx2:
  1737. rsaz_1024_mul_avx2:
  1738. rsaz_1024_norm2red_avx2:
  1739. rsaz_1024_red2norm_avx2:
  1740. rsaz_1024_scatter5_avx2:
  1741. rsaz_1024_gather5_avx2:
  1742. .byte 0x0f,0x0b # ud2
  1743. ret
  1744. .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
  1745. ___
  1746. }}}
  # Flush and close the output, and fail loudly if that does not succeed.
  # close() reports a failed buffered write (and, when the handle is a pipe, a
  # non-zero exit of the downstream process) only through its return value;
  # ignoring it would let the script exit 0 with truncated or missing output.
  # NOTE(review): where STDOUT is redirected is outside this chunk.
  1747. close STDOUT or die "error closing STDOUT: $!";