  1. #!/usr/bin/env perl
  2. ##############################################################################
  3. # #
  4. # Copyright (c) 2012, Intel Corporation #
  5. # #
  6. # All rights reserved. #
  7. # #
  8. # Redistribution and use in source and binary forms, with or without #
  9. # modification, are permitted provided that the following conditions are #
  10. # met: #
  11. # #
  12. # * Redistributions of source code must retain the above copyright #
  13. # notice, this list of conditions and the following disclaimer. #
  14. # #
  15. # * Redistributions in binary form must reproduce the above copyright #
  16. # notice, this list of conditions and the following disclaimer in the #
  17. # documentation and/or other materials provided with the #
  18. # distribution. #
  19. # #
  20. # * Neither the name of the Intel Corporation nor the names of its #
  21. # contributors may be used to endorse or promote products derived from #
  22. # this software without specific prior written permission. #
  23. # #
  24. # #
  25. # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
  26. # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
  27. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
  28. # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
  29. # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
  30. # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
  31. # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
  32. # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
  33. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
  34. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
  35. # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
  36. # #
  37. ##############################################################################
  38. # Developers and authors: #
  39. # Shay Gueron (1, 2), and Vlad Krasnov (1) #
  40. # (1) Intel Architecture Group, Microprocessor and Chipset Development, #
  41. # Israel Development Center, Haifa, Israel #
  42. # (2) University of Haifa #
  43. ##############################################################################
  44. # Reference: #
  45. # [1] S. Gueron, "Efficient Software Implementations of Modular #
  46. # Exponentiation", http://eprint.iacr.org/2011/239 #
  47. # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". #
  48. # IEEE Proceedings of 9th International Conference on Information #
  49. # Technology: New Generations (ITNG 2012), 821-823 (2012). #
  50. # [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
  51. # Journal of Cryptographic Engineering 2:31-43 (2012). #
  52. # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
  53. # resistant 512-bit and 1024-bit modular exponentiation for optimizing #
  54. # RSA1024 and RSA2048 on x86_64 platforms", #
  55. # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
  56. ##############################################################################
  57. # While the original submission covers 512- and 1024-bit exponentiation,
  58. # this module is limited to the 512-bit version only (and as such
  59. # accelerates RSA1024 sign). This is because the improvement for longer
  60. # keys is not high enough to justify the effort; the highest measured
  61. # gain was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, which
  62. # was upcoming at the time of this writing.] Nor does this module
  63. # implement a "monolithic" complete-exponentiation jumbo subroutine;
  64. # it adheres to a more modular mixture of C and assembly. And it is
  65. # optimized even for processors other than the Intel Core family (see
  66. # the table below for improvement coefficients).
  67. # <appro@openssl.org>
  68. #
  69. # RSA1024 sign/sec      this/original   |this/rsax(*)   this/fips(*)
  70. # ----------------------+--------------------------------------------
  71. # Opteron               +13%            |+5%            +20%
  72. # Bulldozer             -0%             |-1%            +10%
  73. # P4                    +11%            |+7%            +8%
  74. # Westmere              +5%             |+14%           +17%
  75. # Sandy Bridge          +2%             |+12%           +29%
  76. # Ivy Bridge            +1%             |+11%           +35%
  77. # Haswell(**)           -0%             |+12%           +39%
  78. # Atom                  +13%            |+11%           +4%
  79. # VIA Nano              +70%            |+9%            +25%
  80. #
  81. # (*) rsax engine and fips numbers are presented for reference
  82. # purposes;
  83. # (**) MULX was attempted, but found to give only marginal improvement;
  84. $flavour = shift;
  85. $output = shift;
  86. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  87. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  88. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  89. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  90. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  91. die "can't locate x86_64-xlate.pl";
  92. open OUT,"| \"$^X\" $xlate $flavour $output";
  93. *STDOUT=*OUT;
  94. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  95. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  96. $addx = ($1>=2.23);
  97. }
  98. if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  99. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  100. $addx = ($1>=2.10);
  101. }
  102. if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  103. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  104. $addx = ($1>=12);
  105. }
  106. if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
  107. my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
  108. $addx = ($ver>=3.03);
  109. }
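# $addx is set only if the toolchain is known to understand the ADCX/ADOX
# and MULX encodings (GNU as 2.23+, nasm 2.10+, ml64 version 12+, or an
# LLVM 3.3+ based assembler); it gates generation of the alternative *x
# code paths below, which are additionally selected at run time by an
# OPENSSL_ia32cap_P check.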
  110. ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
  111. {
  112. my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
  113. $code.=<<___;
  114. .text
  115. .extern OPENSSL_ia32cap_P
  116. .globl rsaz_512_sqr
  117. .type rsaz_512_sqr,\@function,5
  118. .align 32
  119. rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
  120. push %rbx
  121. push %rbp
  122. push %r12
  123. push %r13
  124. push %r14
  125. push %r15
  126. subq \$128+24, %rsp
  127. .Lsqr_body:
  128. movq $mod, %rbp # common argument
  129. movq ($inp), %rdx
  130. movq 8($inp), %rax
  131. movq $n0, 128(%rsp)
  132. ___
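# The 0x80100 mask below tests two bits of the dword at OPENSSL_ia32cap_P+8
# (CPUID.(EAX=7,ECX=0):EBX): bit 8 is BMI2 (MULX) and bit 19 is ADX
# (ADCX/ADOX).  The same run-time check guards every *x path in this file.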
  133. $code.=<<___ if ($addx);
  134. movl \$0x80100,%r11d
  135. andl OPENSSL_ia32cap_P+8(%rip),%r11d
  136. cmpl \$0x80100,%r11d # check for MULX and ADO/CX
  137. je .Loop_sqrx
  138. ___
  139. $code.=<<___;
  140. jmp .Loop_sqr
  141. .align 32
  142. .Loop_sqr:
  143. movl $times,128+8(%rsp)
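# The eight blocks below square the 8-limb input schoolbook-style:
# block i multiplies limb i by limbs i+1..7, folds in the doubled cross
# products accumulated so far, adds the square of limb i, and stores two
# 64-bit words of the 1024-bit result to the stack per block.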
  144. #first iteration
  145. movq %rdx, %rbx
  146. mulq %rdx
  147. movq %rax, %r8
  148. movq 16($inp), %rax
  149. movq %rdx, %r9
  150. mulq %rbx
  151. addq %rax, %r9
  152. movq 24($inp), %rax
  153. movq %rdx, %r10
  154. adcq \$0, %r10
  155. mulq %rbx
  156. addq %rax, %r10
  157. movq 32($inp), %rax
  158. movq %rdx, %r11
  159. adcq \$0, %r11
  160. mulq %rbx
  161. addq %rax, %r11
  162. movq 40($inp), %rax
  163. movq %rdx, %r12
  164. adcq \$0, %r12
  165. mulq %rbx
  166. addq %rax, %r12
  167. movq 48($inp), %rax
  168. movq %rdx, %r13
  169. adcq \$0, %r13
  170. mulq %rbx
  171. addq %rax, %r13
  172. movq 56($inp), %rax
  173. movq %rdx, %r14
  174. adcq \$0, %r14
  175. mulq %rbx
  176. addq %rax, %r14
  177. movq %rbx, %rax
  178. movq %rdx, %r15
  179. adcq \$0, %r15
  180. addq %r8, %r8 #shlq \$1, %r8
  181. movq %r9, %rcx
  182. adcq %r9, %r9 #shld \$1, %r8, %r9
  183. mulq %rax
  184. movq %rax, (%rsp)
  185. addq %rdx, %r8
  186. adcq \$0, %r9
  187. movq %r8, 8(%rsp)
  188. shrq \$63, %rcx
  189. #second iteration
  190. movq 8($inp), %r8
  191. movq 16($inp), %rax
  192. mulq %r8
  193. addq %rax, %r10
  194. movq 24($inp), %rax
  195. movq %rdx, %rbx
  196. adcq \$0, %rbx
  197. mulq %r8
  198. addq %rax, %r11
  199. movq 32($inp), %rax
  200. adcq \$0, %rdx
  201. addq %rbx, %r11
  202. movq %rdx, %rbx
  203. adcq \$0, %rbx
  204. mulq %r8
  205. addq %rax, %r12
  206. movq 40($inp), %rax
  207. adcq \$0, %rdx
  208. addq %rbx, %r12
  209. movq %rdx, %rbx
  210. adcq \$0, %rbx
  211. mulq %r8
  212. addq %rax, %r13
  213. movq 48($inp), %rax
  214. adcq \$0, %rdx
  215. addq %rbx, %r13
  216. movq %rdx, %rbx
  217. adcq \$0, %rbx
  218. mulq %r8
  219. addq %rax, %r14
  220. movq 56($inp), %rax
  221. adcq \$0, %rdx
  222. addq %rbx, %r14
  223. movq %rdx, %rbx
  224. adcq \$0, %rbx
  225. mulq %r8
  226. addq %rax, %r15
  227. movq %r8, %rax
  228. adcq \$0, %rdx
  229. addq %rbx, %r15
  230. movq %rdx, %r8
  231. movq %r10, %rdx
  232. adcq \$0, %r8
  233. add %rdx, %rdx
  234. lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
  235. movq %r11, %rbx
  236. adcq %r11, %r11 #shld \$1, %r10, %r11
  237. mulq %rax
  238. addq %rax, %r9
  239. adcq %rdx, %r10
  240. adcq \$0, %r11
  241. movq %r9, 16(%rsp)
  242. movq %r10, 24(%rsp)
  243. shrq \$63, %rbx
  244. #third iteration
  245. movq 16($inp), %r9
  246. movq 24($inp), %rax
  247. mulq %r9
  248. addq %rax, %r12
  249. movq 32($inp), %rax
  250. movq %rdx, %rcx
  251. adcq \$0, %rcx
  252. mulq %r9
  253. addq %rax, %r13
  254. movq 40($inp), %rax
  255. adcq \$0, %rdx
  256. addq %rcx, %r13
  257. movq %rdx, %rcx
  258. adcq \$0, %rcx
  259. mulq %r9
  260. addq %rax, %r14
  261. movq 48($inp), %rax
  262. adcq \$0, %rdx
  263. addq %rcx, %r14
  264. movq %rdx, %rcx
  265. adcq \$0, %rcx
  266. mulq %r9
  267. movq %r12, %r10
  268. lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
  269. addq %rax, %r15
  270. movq 56($inp), %rax
  271. adcq \$0, %rdx
  272. addq %rcx, %r15
  273. movq %rdx, %rcx
  274. adcq \$0, %rcx
  275. mulq %r9
  276. shrq \$63, %r10
  277. addq %rax, %r8
  278. movq %r9, %rax
  279. adcq \$0, %rdx
  280. addq %rcx, %r8
  281. movq %rdx, %r9
  282. adcq \$0, %r9
  283. movq %r13, %rcx
  284. leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
  285. mulq %rax
  286. addq %rax, %r11
  287. adcq %rdx, %r12
  288. adcq \$0, %r13
  289. movq %r11, 32(%rsp)
  290. movq %r12, 40(%rsp)
  291. shrq \$63, %rcx
  292. #fourth iteration
  293. movq 24($inp), %r10
  294. movq 32($inp), %rax
  295. mulq %r10
  296. addq %rax, %r14
  297. movq 40($inp), %rax
  298. movq %rdx, %rbx
  299. adcq \$0, %rbx
  300. mulq %r10
  301. addq %rax, %r15
  302. movq 48($inp), %rax
  303. adcq \$0, %rdx
  304. addq %rbx, %r15
  305. movq %rdx, %rbx
  306. adcq \$0, %rbx
  307. mulq %r10
  308. movq %r14, %r12
  309. leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
  310. addq %rax, %r8
  311. movq 56($inp), %rax
  312. adcq \$0, %rdx
  313. addq %rbx, %r8
  314. movq %rdx, %rbx
  315. adcq \$0, %rbx
  316. mulq %r10
  317. shrq \$63, %r12
  318. addq %rax, %r9
  319. movq %r10, %rax
  320. adcq \$0, %rdx
  321. addq %rbx, %r9
  322. movq %rdx, %r10
  323. adcq \$0, %r10
  324. movq %r15, %rbx
  325. leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
  326. mulq %rax
  327. addq %rax, %r13
  328. adcq %rdx, %r14
  329. adcq \$0, %r15
  330. movq %r13, 48(%rsp)
  331. movq %r14, 56(%rsp)
  332. shrq \$63, %rbx
  333. #fifth iteration
  334. movq 32($inp), %r11
  335. movq 40($inp), %rax
  336. mulq %r11
  337. addq %rax, %r8
  338. movq 48($inp), %rax
  339. movq %rdx, %rcx
  340. adcq \$0, %rcx
  341. mulq %r11
  342. addq %rax, %r9
  343. movq 56($inp), %rax
  344. adcq \$0, %rdx
  345. movq %r8, %r12
  346. leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
  347. addq %rcx, %r9
  348. movq %rdx, %rcx
  349. adcq \$0, %rcx
  350. mulq %r11
  351. shrq \$63, %r12
  352. addq %rax, %r10
  353. movq %r11, %rax
  354. adcq \$0, %rdx
  355. addq %rcx, %r10
  356. movq %rdx, %r11
  357. adcq \$0, %r11
  358. movq %r9, %rcx
  359. leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
  360. mulq %rax
  361. addq %rax, %r15
  362. adcq %rdx, %r8
  363. adcq \$0, %r9
  364. movq %r15, 64(%rsp)
  365. movq %r8, 72(%rsp)
  366. shrq \$63, %rcx
  367. #sixth iteration
  368. movq 40($inp), %r12
  369. movq 48($inp), %rax
  370. mulq %r12
  371. addq %rax, %r10
  372. movq 56($inp), %rax
  373. movq %rdx, %rbx
  374. adcq \$0, %rbx
  375. mulq %r12
  376. addq %rax, %r11
  377. movq %r12, %rax
  378. movq %r10, %r15
  379. leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
  380. adcq \$0, %rdx
  381. shrq \$63, %r15
  382. addq %rbx, %r11
  383. movq %rdx, %r12
  384. adcq \$0, %r12
  385. movq %r11, %rbx
  386. leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
  387. mulq %rax
  388. addq %rax, %r9
  389. adcq %rdx, %r10
  390. adcq \$0, %r11
  391. movq %r9, 80(%rsp)
  392. movq %r10, 88(%rsp)
  393. #seventh iteration
  394. movq 48($inp), %r13
  395. movq 56($inp), %rax
  396. mulq %r13
  397. addq %rax, %r12
  398. movq %r13, %rax
  399. movq %rdx, %r13
  400. adcq \$0, %r13
  401. xorq %r14, %r14
  402. shlq \$1, %rbx
  403. adcq %r12, %r12 #shld \$1, %rbx, %r12
  404. adcq %r13, %r13 #shld \$1, %r12, %r13
  405. adcq %r14, %r14 #shld \$1, %r13, %r14
  406. mulq %rax
  407. addq %rax, %r11
  408. adcq %rdx, %r12
  409. adcq \$0, %r13
  410. movq %r11, 96(%rsp)
  411. movq %r12, 104(%rsp)
  412. #eighth iteration
  413. movq 56($inp), %rax
  414. mulq %rax
  415. addq %rax, %r13
  416. adcq \$0, %rdx
  417. addq %rdx, %r14
  418. movq %r13, 112(%rsp)
  419. movq %r14, 120(%rsp)
  420. movq (%rsp), %r8
  421. movq 8(%rsp), %r9
  422. movq 16(%rsp), %r10
  423. movq 24(%rsp), %r11
  424. movq 32(%rsp), %r12
  425. movq 40(%rsp), %r13
  426. movq 48(%rsp), %r14
  427. movq 56(%rsp), %r15
  428. call __rsaz_512_reduce
  429. addq 64(%rsp), %r8
  430. adcq 72(%rsp), %r9
  431. adcq 80(%rsp), %r10
  432. adcq 88(%rsp), %r11
  433. adcq 96(%rsp), %r12
  434. adcq 104(%rsp), %r13
  435. adcq 112(%rsp), %r14
  436. adcq 120(%rsp), %r15
  437. sbbq %rcx, %rcx
  438. call __rsaz_512_subtract
  439. movq %r8, %rdx
  440. movq %r9, %rax
  441. movl 128+8(%rsp), $times
  442. movq $out, $inp
  443. decl $times
  444. jnz .Loop_sqr
  445. ___
  446. if ($addx) {
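# The .Loop_sqrx path below mirrors .Loop_sqr using MULX (a 64x64->128
# multiply that leaves the flags untouched) plus ADCX/ADOX, whose separate
# CF and OF chains allow two interleaved carry propagations per pass; the
# out and mod pointers are parked in %xmm0/%xmm1 so %rbp can serve as a
# zero register for the carry chains.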
  447. $code.=<<___;
  448. jmp .Lsqr_tail
  449. .align 32
  450. .Loop_sqrx:
  451. movl $times,128+8(%rsp)
  452. movq $out, %xmm0 # off-load
  453. movq %rbp, %xmm1 # off-load
  454. #first iteration
  455. mulx %rax, %r8, %r9
  456. mulx 16($inp), %rcx, %r10
  457. xor %rbp, %rbp # cf=0, of=0
  458. mulx 24($inp), %rax, %r11
  459. adcx %rcx, %r9
  460. mulx 32($inp), %rcx, %r12
  461. adcx %rax, %r10
  462. mulx 40($inp), %rax, %r13
  463. adcx %rcx, %r11
  464. .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
  465. adcx %rax, %r12
  466. adcx %rcx, %r13
  467. .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
  468. adcx %rax, %r14
  469. adcx %rbp, %r15 # %rbp is 0
  470. mov %r9, %rcx
  471. shld \$1, %r8, %r9
  472. shl \$1, %r8
  473. xor %ebp, %ebp
  474. mulx %rdx, %rax, %rdx
  475. adcx %rdx, %r8
  476. mov 8($inp), %rdx
  477. adcx %rbp, %r9
  478. mov %rax, (%rsp)
  479. mov %r8, 8(%rsp)
  480. #second iteration
  481. mulx 16($inp), %rax, %rbx
  482. adox %rax, %r10
  483. adcx %rbx, %r11
  484. .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
  485. adox $out, %r11
  486. adcx %r8, %r12
  487. mulx 32($inp), %rax, %rbx
  488. adox %rax, %r12
  489. adcx %rbx, %r13
  490. mulx 40($inp), $out, %r8
  491. adox $out, %r13
  492. adcx %r8, %r14
  493. .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
  494. adox %rax, %r14
  495. adcx %rbx, %r15
  496. .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
  497. adox $out, %r15
  498. adcx %rbp, %r8
  499. adox %rbp, %r8
  500. mov %r11, %rbx
  501. shld \$1, %r10, %r11
  502. shld \$1, %rcx, %r10
  503. xor %ebp,%ebp
  504. mulx %rdx, %rax, %rcx
  505. mov 16($inp), %rdx
  506. adcx %rax, %r9
  507. adcx %rcx, %r10
  508. adcx %rbp, %r11
  509. mov %r9, 16(%rsp)
  510. .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
  511. #third iteration
  512. .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
  513. adox $out, %r12
  514. adcx %r9, %r13
  515. mulx 32($inp), %rax, %rcx
  516. adox %rax, %r13
  517. adcx %rcx, %r14
  518. mulx 40($inp), $out, %r9
  519. adox $out, %r14
  520. adcx %r9, %r15
  521. .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
  522. adox %rax, %r15
  523. adcx %rcx, %r8
  524. .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
  525. adox $out, %r8
  526. adcx %rbp, %r9
  527. adox %rbp, %r9
  528. mov %r13, %rcx
  529. shld \$1, %r12, %r13
  530. shld \$1, %rbx, %r12
  531. xor %ebp, %ebp
  532. mulx %rdx, %rax, %rdx
  533. adcx %rax, %r11
  534. adcx %rdx, %r12
  535. mov 24($inp), %rdx
  536. adcx %rbp, %r13
  537. mov %r11, 32(%rsp)
  538. .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
  539. #fourth iteration
  540. .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
  541. adox %rax, %r14
  542. adcx %rbx, %r15
  543. mulx 40($inp), $out, %r10
  544. adox $out, %r15
  545. adcx %r10, %r8
  546. mulx 48($inp), %rax, %rbx
  547. adox %rax, %r8
  548. adcx %rbx, %r9
  549. mulx 56($inp), $out, %r10
  550. adox $out, %r9
  551. adcx %rbp, %r10
  552. adox %rbp, %r10
  553. .byte 0x66
  554. mov %r15, %rbx
  555. shld \$1, %r14, %r15
  556. shld \$1, %rcx, %r14
  557. xor %ebp, %ebp
  558. mulx %rdx, %rax, %rdx
  559. adcx %rax, %r13
  560. adcx %rdx, %r14
  561. mov 32($inp), %rdx
  562. adcx %rbp, %r15
  563. mov %r13, 48(%rsp)
  564. mov %r14, 56(%rsp)
  565. #fifth iteration
  566. .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
  567. adox $out, %r8
  568. adcx %r11, %r9
  569. mulx 48($inp), %rax, %rcx
  570. adox %rax, %r9
  571. adcx %rcx, %r10
  572. mulx 56($inp), $out, %r11
  573. adox $out, %r10
  574. adcx %rbp, %r11
  575. adox %rbp, %r11
  576. mov %r9, %rcx
  577. shld \$1, %r8, %r9
  578. shld \$1, %rbx, %r8
  579. xor %ebp, %ebp
  580. mulx %rdx, %rax, %rdx
  581. adcx %rax, %r15
  582. adcx %rdx, %r8
  583. mov 40($inp), %rdx
  584. adcx %rbp, %r9
  585. mov %r15, 64(%rsp)
  586. mov %r8, 72(%rsp)
  587. #sixth iteration
  588. .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
  589. adox %rax, %r10
  590. adcx %rbx, %r11
  591. .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
  592. adox $out, %r11
  593. adcx %rbp, %r12
  594. adox %rbp, %r12
  595. mov %r11, %rbx
  596. shld \$1, %r10, %r11
  597. shld \$1, %rcx, %r10
  598. xor %ebp, %ebp
  599. mulx %rdx, %rax, %rdx
  600. adcx %rax, %r9
  601. adcx %rdx, %r10
  602. mov 48($inp), %rdx
  603. adcx %rbp, %r11
  604. mov %r9, 80(%rsp)
  605. mov %r10, 88(%rsp)
  606. #seventh iteration
  607. .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
  608. adox %rax, %r12
  609. adox %rbp, %r13
  610. xor %r14, %r14
  611. shld \$1, %r13, %r14
  612. shld \$1, %r12, %r13
  613. shld \$1, %rbx, %r12
  614. xor %ebp, %ebp
  615. mulx %rdx, %rax, %rdx
  616. adcx %rax, %r11
  617. adcx %rdx, %r12
  618. mov 56($inp), %rdx
  619. adcx %rbp, %r13
  620. .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
  621. .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
  622. #eighth iteration
  623. mulx %rdx, %rax, %rdx
  624. adox %rax, %r13
  625. adox %rbp, %rdx
  626. .byte 0x66
  627. add %rdx, %r14
  628. movq %r13, 112(%rsp)
  629. movq %r14, 120(%rsp)
  630. movq %xmm0, $out
  631. movq %xmm1, %rbp
  632. movq 128(%rsp), %rdx # pull $n0
  633. movq (%rsp), %r8
  634. movq 8(%rsp), %r9
  635. movq 16(%rsp), %r10
  636. movq 24(%rsp), %r11
  637. movq 32(%rsp), %r12
  638. movq 40(%rsp), %r13
  639. movq 48(%rsp), %r14
  640. movq 56(%rsp), %r15
  641. call __rsaz_512_reducex
  642. addq 64(%rsp), %r8
  643. adcq 72(%rsp), %r9
  644. adcq 80(%rsp), %r10
  645. adcq 88(%rsp), %r11
  646. adcq 96(%rsp), %r12
  647. adcq 104(%rsp), %r13
  648. adcq 112(%rsp), %r14
  649. adcq 120(%rsp), %r15
  650. sbbq %rcx, %rcx
  651. call __rsaz_512_subtract
  652. movq %r8, %rdx
  653. movq %r9, %rax
  654. movl 128+8(%rsp), $times
  655. movq $out, $inp
  656. decl $times
  657. jnz .Loop_sqrx
  658. .Lsqr_tail:
  659. ___
  660. }
  661. $code.=<<___;
  662. leaq 128+24+48(%rsp), %rax
  663. movq -48(%rax), %r15
  664. movq -40(%rax), %r14
  665. movq -32(%rax), %r13
  666. movq -24(%rax), %r12
  667. movq -16(%rax), %rbp
  668. movq -8(%rax), %rbx
  669. leaq (%rax), %rsp
  670. .Lsqr_epilogue:
  671. ret
  672. .size rsaz_512_sqr,.-rsaz_512_sqr
  673. ___
  674. }
  675. {
  676. my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
  677. $code.=<<___;
  678. .globl rsaz_512_mul
  679. .type rsaz_512_mul,\@function,5
  680. .align 32
  681. rsaz_512_mul:
  682. push %rbx
  683. push %rbp
  684. push %r12
  685. push %r13
  686. push %r14
  687. push %r15
  688. subq \$128+24, %rsp
  689. .Lmul_body:
  690. movq $out, %xmm0 # off-load arguments
  691. movq $mod, %xmm1
  692. movq $n0, 128(%rsp)
  693. ___
  694. $code.=<<___ if ($addx);
  695. movl \$0x80100,%r11d
  696. andl OPENSSL_ia32cap_P+8(%rip),%r11d
  697. cmpl \$0x80100,%r11d # check for MULX and ADO/CX
  698. je .Lmulx
  699. ___
  700. $code.=<<___;
  701. movq ($bp), %rbx # pass b[0]
  702. movq $bp, %rbp # pass argument
  703. call __rsaz_512_mul
  704. movq %xmm0, $out
  705. movq %xmm1, %rbp
  706. movq (%rsp), %r8
  707. movq 8(%rsp), %r9
  708. movq 16(%rsp), %r10
  709. movq 24(%rsp), %r11
  710. movq 32(%rsp), %r12
  711. movq 40(%rsp), %r13
  712. movq 48(%rsp), %r14
  713. movq 56(%rsp), %r15
  714. call __rsaz_512_reduce
  715. ___
  716. $code.=<<___ if ($addx);
  717. jmp .Lmul_tail
  718. .align 32
  719. .Lmulx:
  720. movq $bp, %rbp # pass argument
  721. movq ($bp), %rdx # pass b[0]
  722. call __rsaz_512_mulx
  723. movq %xmm0, $out
  724. movq %xmm1, %rbp
  725. movq 128(%rsp), %rdx # pull $n0
  726. movq (%rsp), %r8
  727. movq 8(%rsp), %r9
  728. movq 16(%rsp), %r10
  729. movq 24(%rsp), %r11
  730. movq 32(%rsp), %r12
  731. movq 40(%rsp), %r13
  732. movq 48(%rsp), %r14
  733. movq 56(%rsp), %r15
  734. call __rsaz_512_reducex
  735. .Lmul_tail:
  736. ___
  737. $code.=<<___;
  738. addq 64(%rsp), %r8
  739. adcq 72(%rsp), %r9
  740. adcq 80(%rsp), %r10
  741. adcq 88(%rsp), %r11
  742. adcq 96(%rsp), %r12
  743. adcq 104(%rsp), %r13
  744. adcq 112(%rsp), %r14
  745. adcq 120(%rsp), %r15
  746. sbbq %rcx, %rcx
  747. call __rsaz_512_subtract
  748. leaq 128+24+48(%rsp), %rax
  749. movq -48(%rax), %r15
  750. movq -40(%rax), %r14
  751. movq -32(%rax), %r13
  752. movq -24(%rax), %r12
  753. movq -16(%rax), %rbp
  754. movq -8(%rax), %rbx
  755. leaq (%rax), %rsp
  756. .Lmul_epilogue:
  757. ret
  758. .size rsaz_512_mul,.-rsaz_512_mul
  759. ___
  760. }
  761. {
  762. my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
  763. $code.=<<___;
  764. .globl rsaz_512_mul_gather4
  765. .type rsaz_512_mul_gather4,\@function,6
  766. .align 32
  767. rsaz_512_mul_gather4:
  768. push %rbx
  769. push %rbp
  770. push %r12
  771. push %r13
  772. push %r14
  773. push %r15
  774. mov $pwr, $pwr
  775. subq \$128+24, %rsp
  776. .Lmul_gather4_body:
  777. ___
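# In the gather variant each 64-bit limb of b[] is reassembled from the
# interleaved table: its 32-bit halves sit 64 bytes apart (consecutive
# powers are 4 bytes apart), so the next limb is fetched with two movd
# loads, shifted into place with pslldq and merged with por one iteration
# ahead of its use, while %rbp steps through the table in 128-byte strides.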
  778. $code.=<<___ if ($addx);
  779. movl \$0x80100,%r11d
  780. andl OPENSSL_ia32cap_P+8(%rip),%r11d
  781. cmpl \$0x80100,%r11d # check for MULX and ADO/CX
  782. je .Lmulx_gather
  783. ___
  784. $code.=<<___;
  785. movl 64($bp,$pwr,4), %eax
  786. movq $out, %xmm0 # off-load arguments
  787. movl ($bp,$pwr,4), %ebx
  788. movq $mod, %xmm1
  789. movq $n0, 128(%rsp)
  790. shlq \$32, %rax
  791. or %rax, %rbx
  792. movq ($ap), %rax
  793. movq 8($ap), %rcx
  794. leaq 128($bp,$pwr,4), %rbp
  795. mulq %rbx # 0 iteration
  796. movq %rax, (%rsp)
  797. movq %rcx, %rax
  798. movq %rdx, %r8
  799. mulq %rbx
  800. movd (%rbp), %xmm4
  801. addq %rax, %r8
  802. movq 16($ap), %rax
  803. movq %rdx, %r9
  804. adcq \$0, %r9
  805. mulq %rbx
  806. movd 64(%rbp), %xmm5
  807. addq %rax, %r9
  808. movq 24($ap), %rax
  809. movq %rdx, %r10
  810. adcq \$0, %r10
  811. mulq %rbx
  812. pslldq \$4, %xmm5
  813. addq %rax, %r10
  814. movq 32($ap), %rax
  815. movq %rdx, %r11
  816. adcq \$0, %r11
  817. mulq %rbx
  818. por %xmm5, %xmm4
  819. addq %rax, %r11
  820. movq 40($ap), %rax
  821. movq %rdx, %r12
  822. adcq \$0, %r12
  823. mulq %rbx
  824. addq %rax, %r12
  825. movq 48($ap), %rax
  826. movq %rdx, %r13
  827. adcq \$0, %r13
  828. mulq %rbx
  829. leaq 128(%rbp), %rbp
  830. addq %rax, %r13
  831. movq 56($ap), %rax
  832. movq %rdx, %r14
  833. adcq \$0, %r14
  834. mulq %rbx
  835. movq %xmm4, %rbx
  836. addq %rax, %r14
  837. movq ($ap), %rax
  838. movq %rdx, %r15
  839. adcq \$0, %r15
  840. leaq 8(%rsp), %rdi
  841. movl \$7, %ecx
  842. jmp .Loop_mul_gather
  843. .align 32
  844. .Loop_mul_gather:
  845. mulq %rbx
  846. addq %rax, %r8
  847. movq 8($ap), %rax
  848. movq %r8, (%rdi)
  849. movq %rdx, %r8
  850. adcq \$0, %r8
  851. mulq %rbx
  852. movd (%rbp), %xmm4
  853. addq %rax, %r9
  854. movq 16($ap), %rax
  855. adcq \$0, %rdx
  856. addq %r9, %r8
  857. movq %rdx, %r9
  858. adcq \$0, %r9
  859. mulq %rbx
  860. movd 64(%rbp), %xmm5
  861. addq %rax, %r10
  862. movq 24($ap), %rax
  863. adcq \$0, %rdx
  864. addq %r10, %r9
  865. movq %rdx, %r10
  866. adcq \$0, %r10
  867. mulq %rbx
  868. pslldq \$4, %xmm5
  869. addq %rax, %r11
  870. movq 32($ap), %rax
  871. adcq \$0, %rdx
  872. addq %r11, %r10
  873. movq %rdx, %r11
  874. adcq \$0, %r11
  875. mulq %rbx
  876. por %xmm5, %xmm4
  877. addq %rax, %r12
  878. movq 40($ap), %rax
  879. adcq \$0, %rdx
  880. addq %r12, %r11
  881. movq %rdx, %r12
  882. adcq \$0, %r12
  883. mulq %rbx
  884. addq %rax, %r13
  885. movq 48($ap), %rax
  886. adcq \$0, %rdx
  887. addq %r13, %r12
  888. movq %rdx, %r13
  889. adcq \$0, %r13
  890. mulq %rbx
  891. addq %rax, %r14
  892. movq 56($ap), %rax
  893. adcq \$0, %rdx
  894. addq %r14, %r13
  895. movq %rdx, %r14
  896. adcq \$0, %r14
  897. mulq %rbx
  898. movq %xmm4, %rbx
  899. addq %rax, %r15
  900. movq ($ap), %rax
  901. adcq \$0, %rdx
  902. addq %r15, %r14
  903. movq %rdx, %r15
  904. adcq \$0, %r15
  905. leaq 128(%rbp), %rbp
  906. leaq 8(%rdi), %rdi
  907. decl %ecx
  908. jnz .Loop_mul_gather
  909. movq %r8, (%rdi)
  910. movq %r9, 8(%rdi)
  911. movq %r10, 16(%rdi)
  912. movq %r11, 24(%rdi)
  913. movq %r12, 32(%rdi)
  914. movq %r13, 40(%rdi)
  915. movq %r14, 48(%rdi)
  916. movq %r15, 56(%rdi)
  917. movq %xmm0, $out
  918. movq %xmm1, %rbp
  919. movq (%rsp), %r8
  920. movq 8(%rsp), %r9
  921. movq 16(%rsp), %r10
  922. movq 24(%rsp), %r11
  923. movq 32(%rsp), %r12
  924. movq 40(%rsp), %r13
  925. movq 48(%rsp), %r14
  926. movq 56(%rsp), %r15
  927. call __rsaz_512_reduce
  928. ___
  929. $code.=<<___ if ($addx);
  930. jmp .Lmul_gather_tail
  931. .align 32
  932. .Lmulx_gather:
  933. mov 64($bp,$pwr,4), %eax
  934. movq $out, %xmm0 # off-load arguments
  935. lea 128($bp,$pwr,4), %rbp
  936. mov ($bp,$pwr,4), %edx
  937. movq $mod, %xmm1
  938. mov $n0, 128(%rsp)
  939. shl \$32, %rax
  940. or %rax, %rdx
  941. mulx ($ap), %rbx, %r8 # 0 iteration
  942. mov %rbx, (%rsp)
  943. xor %edi, %edi # cf=0, of=0
  944. mulx 8($ap), %rax, %r9
  945. movd (%rbp), %xmm4
  946. mulx 16($ap), %rbx, %r10
  947. movd 64(%rbp), %xmm5
  948. adcx %rax, %r8
  949. mulx 24($ap), %rax, %r11
  950. pslldq \$4, %xmm5
  951. adcx %rbx, %r9
  952. mulx 32($ap), %rbx, %r12
  953. por %xmm5, %xmm4
  954. adcx %rax, %r10
  955. mulx 40($ap), %rax, %r13
  956. adcx %rbx, %r11
  957. mulx 48($ap), %rbx, %r14
  958. lea 128(%rbp), %rbp
  959. adcx %rax, %r12
  960. mulx 56($ap), %rax, %r15
  961. movq %xmm4, %rdx
  962. adcx %rbx, %r13
  963. adcx %rax, %r14
  964. mov %r8, %rbx
  965. adcx %rdi, %r15 # %rdi is 0
  966. mov \$-7, %rcx
  967. jmp .Loop_mulx_gather
  968. .align 32
  969. .Loop_mulx_gather:
  970. mulx ($ap), %rax, %r8
  971. adcx %rax, %rbx
  972. adox %r9, %r8
  973. mulx 8($ap), %rax, %r9
  974. .byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4
  975. adcx %rax, %r8
  976. adox %r10, %r9
  977. mulx 16($ap), %rax, %r10
  978. movd 64(%rbp), %xmm5
  979. lea 128(%rbp), %rbp
  980. adcx %rax, %r9
  981. adox %r11, %r10
  982. .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
  983. pslldq \$4, %xmm5
  984. por %xmm5, %xmm4
  985. adcx %rax, %r10
  986. adox %r12, %r11
  987. mulx 32($ap), %rax, %r12
  988. adcx %rax, %r11
  989. adox %r13, %r12
  990. mulx 40($ap), %rax, %r13
  991. adcx %rax, %r12
  992. adox %r14, %r13
  993. .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
  994. adcx %rax, %r13
  995. adox %r15, %r14
  996. mulx 56($ap), %rax, %r15
  997. movq %xmm4, %rdx
  998. mov %rbx, 64(%rsp,%rcx,8)
  999. adcx %rax, %r14
  1000. adox %rdi, %r15
  1001. mov %r8, %rbx
  1002. adcx %rdi, %r15 # cf=0
  1003. inc %rcx # of=0
  1004. jnz .Loop_mulx_gather
  1005. mov %r8, 64(%rsp)
  1006. mov %r9, 64+8(%rsp)
  1007. mov %r10, 64+16(%rsp)
  1008. mov %r11, 64+24(%rsp)
  1009. mov %r12, 64+32(%rsp)
  1010. mov %r13, 64+40(%rsp)
  1011. mov %r14, 64+48(%rsp)
  1012. mov %r15, 64+56(%rsp)
  1013. movq %xmm0, $out
  1014. movq %xmm1, %rbp
  1015. mov 128(%rsp), %rdx # pull $n0
  1016. mov (%rsp), %r8
  1017. mov 8(%rsp), %r9
  1018. mov 16(%rsp), %r10
  1019. mov 24(%rsp), %r11
  1020. mov 32(%rsp), %r12
  1021. mov 40(%rsp), %r13
  1022. mov 48(%rsp), %r14
  1023. mov 56(%rsp), %r15
  1024. call __rsaz_512_reducex
  1025. .Lmul_gather_tail:
  1026. ___
  1027. $code.=<<___;
  1028. addq 64(%rsp), %r8
  1029. adcq 72(%rsp), %r9
  1030. adcq 80(%rsp), %r10
  1031. adcq 88(%rsp), %r11
  1032. adcq 96(%rsp), %r12
  1033. adcq 104(%rsp), %r13
  1034. adcq 112(%rsp), %r14
  1035. adcq 120(%rsp), %r15
  1036. sbbq %rcx, %rcx
  1037. call __rsaz_512_subtract
  1038. leaq 128+24+48(%rsp), %rax
  1039. movq -48(%rax), %r15
  1040. movq -40(%rax), %r14
  1041. movq -32(%rax), %r13
  1042. movq -24(%rax), %r12
  1043. movq -16(%rax), %rbp
  1044. movq -8(%rax), %rbx
  1045. leaq (%rax), %rsp
  1046. .Lmul_gather4_epilogue:
  1047. ret
  1048. .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
  1049. ___
  1050. }
  1051. {
  1052. my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
  1053. $code.=<<___;
  1054. .globl rsaz_512_mul_scatter4
  1055. .type rsaz_512_mul_scatter4,\@function,6
  1056. .align 32
  1057. rsaz_512_mul_scatter4:
  1058. push %rbx
  1059. push %rbp
  1060. push %r12
  1061. push %r13
  1062. push %r14
  1063. push %r15
  1064. mov $pwr, $pwr
  1065. subq \$128+24, %rsp
  1066. .Lmul_scatter4_body:
  1067. leaq ($tbl,$pwr,4), $tbl
  1068. movq $out, %xmm0 # off-load arguments
  1069. movq $mod, %xmm1
  1070. movq $tbl, %xmm2
  1071. movq $n0, 128(%rsp)
  1072. movq $out, %rbp
  1073. ___
  1074. $code.=<<___ if ($addx);
  1075. movl \$0x80100,%r11d
  1076. andl OPENSSL_ia32cap_P+8(%rip),%r11d
  1077. cmpl \$0x80100,%r11d # check for MULX and ADO/CX
  1078. je .Lmulx_scatter
  1079. ___
  1080. $code.=<<___;
  1081. movq ($out),%rbx # pass b[0]
  1082. call __rsaz_512_mul
  1083. movq %xmm0, $out
  1084. movq %xmm1, %rbp
  1085. movq (%rsp), %r8
  1086. movq 8(%rsp), %r9
  1087. movq 16(%rsp), %r10
  1088. movq 24(%rsp), %r11
  1089. movq 32(%rsp), %r12
  1090. movq 40(%rsp), %r13
  1091. movq 48(%rsp), %r14
  1092. movq 56(%rsp), %r15
  1093. call __rsaz_512_reduce
  1094. ___
  1095. $code.=<<___ if ($addx);
  1096. jmp .Lmul_scatter_tail
  1097. .align 32
  1098. .Lmulx_scatter:
  1099. movq ($out), %rdx # pass b[0]
  1100. call __rsaz_512_mulx
  1101. movq %xmm0, $out
  1102. movq %xmm1, %rbp
  1103. movq 128(%rsp), %rdx # pull $n0
  1104. movq (%rsp), %r8
  1105. movq 8(%rsp), %r9
  1106. movq 16(%rsp), %r10
  1107. movq 24(%rsp), %r11
  1108. movq 32(%rsp), %r12
  1109. movq 40(%rsp), %r13
  1110. movq 48(%rsp), %r14
  1111. movq 56(%rsp), %r15
  1112. call __rsaz_512_reducex
  1113. .Lmul_scatter_tail:
  1114. ___
  1115. $code.=<<___;
  1116. addq 64(%rsp), %r8
  1117. adcq 72(%rsp), %r9
  1118. adcq 80(%rsp), %r10
  1119. adcq 88(%rsp), %r11
  1120. adcq 96(%rsp), %r12
  1121. adcq 104(%rsp), %r13
  1122. adcq 112(%rsp), %r14
  1123. adcq 120(%rsp), %r15
  1124. movq %xmm2, $inp
  1125. sbbq %rcx, %rcx
  1126. call __rsaz_512_subtract
  1127. movl %r8d, 64*0($inp) # scatter
  1128. shrq \$32, %r8
  1129. movl %r9d, 64*2($inp)
  1130. shrq \$32, %r9
  1131. movl %r10d, 64*4($inp)
  1132. shrq \$32, %r10
  1133. movl %r11d, 64*6($inp)
  1134. shrq \$32, %r11
  1135. movl %r12d, 64*8($inp)
  1136. shrq \$32, %r12
  1137. movl %r13d, 64*10($inp)
  1138. shrq \$32, %r13
  1139. movl %r14d, 64*12($inp)
  1140. shrq \$32, %r14
  1141. movl %r15d, 64*14($inp)
  1142. shrq \$32, %r15
  1143. movl %r8d, 64*1($inp)
  1144. movl %r9d, 64*3($inp)
  1145. movl %r10d, 64*5($inp)
  1146. movl %r11d, 64*7($inp)
  1147. movl %r12d, 64*9($inp)
  1148. movl %r13d, 64*11($inp)
  1149. movl %r14d, 64*13($inp)
  1150. movl %r15d, 64*15($inp)
  1151. leaq 128+24+48(%rsp), %rax
  1152. movq -48(%rax), %r15
  1153. movq -40(%rax), %r14
  1154. movq -32(%rax), %r13
  1155. movq -24(%rax), %r12
  1156. movq -16(%rax), %rbp
  1157. movq -8(%rax), %rbx
  1158. leaq (%rax), %rsp
  1159. .Lmul_scatter4_epilogue:
  1160. ret
  1161. .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
  1162. ___
  1163. }
  1164. {
  1165. my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
  1166. $code.=<<___;
  1167. .globl rsaz_512_mul_by_one
  1168. .type rsaz_512_mul_by_one,\@function,4
  1169. .align 32
  1170. rsaz_512_mul_by_one:
  1171. push %rbx
  1172. push %rbp
  1173. push %r12
  1174. push %r13
  1175. push %r14
  1176. push %r15
  1177. subq \$128+24, %rsp
  1178. .Lmul_by_one_body:
  1179. ___
  1180. $code.=<<___ if ($addx);
  1181. movl OPENSSL_ia32cap_P+8(%rip),%eax
  1182. ___
  1183. $code.=<<___;
  1184. movq $mod, %rbp # reassign argument
  1185. movq $n0, 128(%rsp)
  1186. movq ($inp), %r8
  1187. pxor %xmm0, %xmm0
  1188. movq 8($inp), %r9
  1189. movq 16($inp), %r10
  1190. movq 24($inp), %r11
  1191. movq 32($inp), %r12
  1192. movq 40($inp), %r13
  1193. movq 48($inp), %r14
  1194. movq 56($inp), %r15
  1195. movdqa %xmm0, (%rsp)
  1196. movdqa %xmm0, 16(%rsp)
  1197. movdqa %xmm0, 32(%rsp)
  1198. movdqa %xmm0, 48(%rsp)
  1199. movdqa %xmm0, 64(%rsp)
  1200. movdqa %xmm0, 80(%rsp)
  1201. movdqa %xmm0, 96(%rsp)
  1202. ___
  1203. $code.=<<___ if ($addx);
  1204. andl \$0x80100,%eax
  1205. cmpl \$0x80100,%eax # check for MULX and ADO/CX
  1206. je .Lby_one_callx
  1207. ___
  1208. $code.=<<___;
  1209. call __rsaz_512_reduce
  1210. ___
  1211. $code.=<<___ if ($addx);
  1212. jmp .Lby_one_tail
  1213. .align 32
  1214. .Lby_one_callx:
  1215. movq 128(%rsp), %rdx # pull $n0
  1216. call __rsaz_512_reducex
  1217. .Lby_one_tail:
  1218. ___
  1219. $code.=<<___;
  1220. movq %r8, ($out)
  1221. movq %r9, 8($out)
  1222. movq %r10, 16($out)
  1223. movq %r11, 24($out)
  1224. movq %r12, 32($out)
  1225. movq %r13, 40($out)
  1226. movq %r14, 48($out)
  1227. movq %r15, 56($out)
  1228. leaq 128+24+48(%rsp), %rax
  1229. movq -48(%rax), %r15
  1230. movq -40(%rax), %r14
  1231. movq -32(%rax), %r13
  1232. movq -24(%rax), %r12
  1233. movq -16(%rax), %rbp
  1234. movq -8(%rax), %rbx
  1235. leaq (%rax), %rsp
  1236. .Lmul_by_one_epilogue:
  1237. ret
  1238. .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
  1239. ___
  1240. }
  1241. { # __rsaz_512_reduce
  1242. #
  1243. # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
  1244. # output: %r8-%r15
  1245. # clobbers: everything except %rbp and %rdi
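# This performs word-by-word Montgomery reduction: each of the 8 passes
# takes m = T[0]*n0 mod 2^64 (in %rbx, with the next factor precomputed
# into %rsi mid-pass), adds m*mod so the least significant limb cancels,
# and moves the 8-limb window down by one limb.  n0 is read from
# 128+8(%rsp) below because the call pushed a return address on top of
# the caller's frame, where n0 lives at 128(%rsp).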
  1246. $code.=<<___;
  1247. .type __rsaz_512_reduce,\@abi-omnipotent
  1248. .align 32
  1249. __rsaz_512_reduce:
  1250. movq %r8, %rbx
  1251. imulq 128+8(%rsp), %rbx
  1252. movq 0(%rbp), %rax
  1253. movl \$8, %ecx
  1254. jmp .Lreduction_loop
  1255. .align 32
  1256. .Lreduction_loop:
  1257. mulq %rbx
  1258. movq 8(%rbp), %rax
  1259. negq %r8
  1260. movq %rdx, %r8
  1261. adcq \$0, %r8
  1262. mulq %rbx
  1263. addq %rax, %r9
  1264. movq 16(%rbp), %rax
  1265. adcq \$0, %rdx
  1266. addq %r9, %r8
  1267. movq %rdx, %r9
  1268. adcq \$0, %r9
  1269. mulq %rbx
  1270. addq %rax, %r10
  1271. movq 24(%rbp), %rax
  1272. adcq \$0, %rdx
  1273. addq %r10, %r9
  1274. movq %rdx, %r10
  1275. adcq \$0, %r10
  1276. mulq %rbx
  1277. addq %rax, %r11
  1278. movq 32(%rbp), %rax
  1279. adcq \$0, %rdx
  1280. addq %r11, %r10
  1281. movq 128+8(%rsp), %rsi
  1282. #movq %rdx, %r11
  1283. #adcq \$0, %r11
  1284. adcq \$0, %rdx
  1285. movq %rdx, %r11
  1286. mulq %rbx
  1287. addq %rax, %r12
  1288. movq 40(%rbp), %rax
  1289. adcq \$0, %rdx
  1290. imulq %r8, %rsi
  1291. addq %r12, %r11
  1292. movq %rdx, %r12
  1293. adcq \$0, %r12
  1294. mulq %rbx
  1295. addq %rax, %r13
  1296. movq 48(%rbp), %rax
  1297. adcq \$0, %rdx
  1298. addq %r13, %r12
  1299. movq %rdx, %r13
  1300. adcq \$0, %r13
  1301. mulq %rbx
  1302. addq %rax, %r14
  1303. movq 56(%rbp), %rax
  1304. adcq \$0, %rdx
  1305. addq %r14, %r13
  1306. movq %rdx, %r14
  1307. adcq \$0, %r14
  1308. mulq %rbx
  1309. movq %rsi, %rbx
  1310. addq %rax, %r15
  1311. movq 0(%rbp), %rax
  1312. adcq \$0, %rdx
  1313. addq %r15, %r14
  1314. movq %rdx, %r15
  1315. adcq \$0, %r15
  1316. decl %ecx
  1317. jne .Lreduction_loop
  1318. ret
  1319. .size __rsaz_512_reduce,.-__rsaz_512_reduce
  1320. ___
  1321. }
  1322. if ($addx) {
  1323. # __rsaz_512_reducex
  1324. #
  1325. # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
  1326. # output: %r8-%r15
  1327. # clobbers: everything except %rbp and %rdi
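# Same reduction as __rsaz_512_reduce, but with MULX/ADCX/ADOX: the caller
# preloads n0 in %rdx, the first m is formed by the imulq below, and each
# pass multiplies the whole modulus by m using the two independent carry
# chains while the next m is computed on the fly from n0 at 128+8(%rsp).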
  1328. $code.=<<___;
  1329. .type __rsaz_512_reducex,\@abi-omnipotent
  1330. .align 32
  1331. __rsaz_512_reducex:
  1332. #movq 128+8(%rsp), %rdx # pull $n0
  1333. imulq %r8, %rdx
  1334. xorq %rsi, %rsi # cf=0,of=0
  1335. movl \$8, %ecx
  1336. jmp .Lreduction_loopx
  1337. .align 32
  1338. .Lreduction_loopx:
  1339. mov %r8, %rbx
  1340. mulx 0(%rbp), %rax, %r8
  1341. adcx %rbx, %rax
  1342. adox %r9, %r8
  1343. mulx 8(%rbp), %rax, %r9
  1344. adcx %rax, %r8
  1345. adox %r10, %r9
  1346. mulx 16(%rbp), %rbx, %r10
  1347. adcx %rbx, %r9
  1348. adox %r11, %r10
  1349. mulx 24(%rbp), %rbx, %r11
  1350. adcx %rbx, %r10
  1351. adox %r12, %r11
  1352. .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
  1353. mov %rdx, %rax
  1354. mov %r8, %rdx
  1355. adcx %rbx, %r11
  1356. adox %r13, %r12
  1357. mulx 128+8(%rsp), %rbx, %rdx
  1358. mov %rax, %rdx
  1359. mulx 40(%rbp), %rax, %r13
  1360. adcx %rax, %r12
  1361. adox %r14, %r13
  1362. .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
  1363. adcx %rax, %r13
  1364. adox %r15, %r14
  1365. mulx 56(%rbp), %rax, %r15
  1366. mov %rbx, %rdx
  1367. adcx %rax, %r14
  1368. adox %rsi, %r15 # %rsi is 0
  1369. adcx %rsi, %r15 # cf=0
  1370. decl %ecx # of=0
  1371. jne .Lreduction_loopx
  1372. ret
  1373. .size __rsaz_512_reducex,.-__rsaz_512_reducex
  1374. ___
  1375. }
  1376. { # __rsaz_512_subtract
  1377. # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
  1378. # output:
  1379. # clobbers: everything but %rdi, %rsi and %rbp
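# Constant-time conditional subtraction: %rcx is an all-zeros/all-ones
# mask (the borrow captured by the caller's sbbq), the negated modulus is
# ANDed with it and added to the value just stored at $out, so the modulus
# is subtracted exactly when the preceding addition carried out, without
# a data-dependent branch.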
  1380. $code.=<<___;
  1381. .type __rsaz_512_subtract,\@abi-omnipotent
  1382. .align 32
  1383. __rsaz_512_subtract:
  1384. movq %r8, ($out)
  1385. movq %r9, 8($out)
  1386. movq %r10, 16($out)
  1387. movq %r11, 24($out)
  1388. movq %r12, 32($out)
  1389. movq %r13, 40($out)
  1390. movq %r14, 48($out)
  1391. movq %r15, 56($out)
  1392. movq 0($mod), %r8
  1393. movq 8($mod), %r9
  1394. negq %r8
  1395. notq %r9
  1396. andq %rcx, %r8
  1397. movq 16($mod), %r10
  1398. andq %rcx, %r9
  1399. notq %r10
  1400. movq 24($mod), %r11
  1401. andq %rcx, %r10
  1402. notq %r11
  1403. movq 32($mod), %r12
  1404. andq %rcx, %r11
  1405. notq %r12
  1406. movq 40($mod), %r13
  1407. andq %rcx, %r12
  1408. notq %r13
  1409. movq 48($mod), %r14
  1410. andq %rcx, %r13
  1411. notq %r14
  1412. movq 56($mod), %r15
  1413. andq %rcx, %r14
  1414. notq %r15
  1415. andq %rcx, %r15
  1416. addq ($out), %r8
  1417. adcq 8($out), %r9
  1418. adcq 16($out), %r10
  1419. adcq 24($out), %r11
  1420. adcq 32($out), %r12
  1421. adcq 40($out), %r13
  1422. adcq 48($out), %r14
  1423. adcq 56($out), %r15
  1424. movq %r8, ($out)
  1425. movq %r9, 8($out)
  1426. movq %r10, 16($out)
  1427. movq %r11, 24($out)
  1428. movq %r12, 32($out)
  1429. movq %r13, 40($out)
  1430. movq %r14, 48($out)
  1431. movq %r15, 56($out)
  1432. ret
  1433. .size __rsaz_512_subtract,.-__rsaz_512_subtract
  1434. ___
  1435. }
  1436. { # __rsaz_512_mul
  1437. #
  1438. # input: %rsi - ap, %rbp - bp
  1439. # output:
  1440. # clobbers: everything
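# Schoolbook multiplication: the caller passes b[0] in %rbx; the straight-
# line block below produces the first row, then .Loop_mul runs seven more
# times, each pass fetching the next b limb, accumulating a[]*b[i] and
# retiring one finished low limb per pass.  The 16-qword product lands at
# 8(%rsp), i.e. at the caller's 0(%rsp) once the return address is
# accounted for.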
  1441. my ($ap,$bp) = ("%rsi","%rbp");
  1442. $code.=<<___;
  1443. .type __rsaz_512_mul,\@abi-omnipotent
  1444. .align 32
  1445. __rsaz_512_mul:
  1446. leaq 8(%rsp), %rdi
  1447. movq ($ap), %rax
  1448. mulq %rbx
  1449. movq %rax, (%rdi)
  1450. movq 8($ap), %rax
  1451. movq %rdx, %r8
  1452. mulq %rbx
  1453. addq %rax, %r8
  1454. movq 16($ap), %rax
  1455. movq %rdx, %r9
  1456. adcq \$0, %r9
  1457. mulq %rbx
  1458. addq %rax, %r9
  1459. movq 24($ap), %rax
  1460. movq %rdx, %r10
  1461. adcq \$0, %r10
  1462. mulq %rbx
  1463. addq %rax, %r10
  1464. movq 32($ap), %rax
  1465. movq %rdx, %r11
  1466. adcq \$0, %r11
  1467. mulq %rbx
  1468. addq %rax, %r11
  1469. movq 40($ap), %rax
  1470. movq %rdx, %r12
  1471. adcq \$0, %r12
  1472. mulq %rbx
  1473. addq %rax, %r12
  1474. movq 48($ap), %rax
  1475. movq %rdx, %r13
  1476. adcq \$0, %r13
  1477. mulq %rbx
  1478. addq %rax, %r13
  1479. movq 56($ap), %rax
  1480. movq %rdx, %r14
  1481. adcq \$0, %r14
  1482. mulq %rbx
  1483. addq %rax, %r14
  1484. movq ($ap), %rax
  1485. movq %rdx, %r15
  1486. adcq \$0, %r15
  1487. leaq 8($bp), $bp
  1488. leaq 8(%rdi), %rdi
  1489. movl \$7, %ecx
  1490. jmp .Loop_mul
  1491. .align 32
  1492. .Loop_mul:
  1493. movq ($bp), %rbx
  1494. mulq %rbx
  1495. addq %rax, %r8
  1496. movq 8($ap), %rax
  1497. movq %r8, (%rdi)
  1498. movq %rdx, %r8
  1499. adcq \$0, %r8
  1500. mulq %rbx
  1501. addq %rax, %r9
  1502. movq 16($ap), %rax
  1503. adcq \$0, %rdx
  1504. addq %r9, %r8
  1505. movq %rdx, %r9
  1506. adcq \$0, %r9
  1507. mulq %rbx
  1508. addq %rax, %r10
  1509. movq 24($ap), %rax
  1510. adcq \$0, %rdx
  1511. addq %r10, %r9
  1512. movq %rdx, %r10
  1513. adcq \$0, %r10
  1514. mulq %rbx
  1515. addq %rax, %r11
  1516. movq 32($ap), %rax
  1517. adcq \$0, %rdx
  1518. addq %r11, %r10
  1519. movq %rdx, %r11
  1520. adcq \$0, %r11
  1521. mulq %rbx
  1522. addq %rax, %r12
  1523. movq 40($ap), %rax
  1524. adcq \$0, %rdx
  1525. addq %r12, %r11
  1526. movq %rdx, %r12
  1527. adcq \$0, %r12
  1528. mulq %rbx
  1529. addq %rax, %r13
  1530. movq 48($ap), %rax
  1531. adcq \$0, %rdx
  1532. addq %r13, %r12
  1533. movq %rdx, %r13
  1534. adcq \$0, %r13
  1535. mulq %rbx
  1536. addq %rax, %r14
  1537. movq 56($ap), %rax
  1538. adcq \$0, %rdx
  1539. addq %r14, %r13
  1540. movq %rdx, %r14
  1541. leaq 8($bp), $bp
  1542. adcq \$0, %r14
  1543. mulq %rbx
  1544. addq %rax, %r15
  1545. movq ($ap), %rax
  1546. adcq \$0, %rdx
  1547. addq %r15, %r14
  1548. movq %rdx, %r15
  1549. adcq \$0, %r15
  1550. leaq 8(%rdi), %rdi
  1551. decl %ecx
  1552. jnz .Loop_mul
  1553. movq %r8, (%rdi)
  1554. movq %r9, 8(%rdi)
  1555. movq %r10, 16(%rdi)
  1556. movq %r11, 24(%rdi)
  1557. movq %r12, 32(%rdi)
  1558. movq %r13, 40(%rdi)
  1559. movq %r14, 48(%rdi)
  1560. movq %r15, 56(%rdi)
  1561. ret
  1562. .size __rsaz_512_mul,.-__rsaz_512_mul
  1563. ___
  1564. }
  1565. if ($addx) {
  1566. # __rsaz_512_mulx
  1567. #
  1568. # input: %rsi - ap, %rbp - bp
  1569. # output:
  1570. # clobbers: everything
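# MULX flavour of __rsaz_512_mul: b[0] arrives preloaded in %rdx, %rdi is
# kept at zero for the adcx/adox chains, the first and last rows are
# handled outside .Loop_mulx, and each loop pass preloads the next b limb
# via 64($bp,%rcx,8) while %rcx counts up from -6 towards zero.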
  1571. my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
  1572. $code.=<<___;
  1573. .type __rsaz_512_mulx,\@abi-omnipotent
  1574. .align 32
  1575. __rsaz_512_mulx:
  1576. mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
  1577. mov \$-6, %rcx
  1578. mulx 8($ap), %rax, %r9
  1579. movq %rbx, 8(%rsp)
  1580. mulx 16($ap), %rbx, %r10
  1581. adc %rax, %r8
  1582. mulx 24($ap), %rax, %r11
  1583. adc %rbx, %r9
  1584. mulx 32($ap), %rbx, %r12
  1585. adc %rax, %r10
  1586. mulx 40($ap), %rax, %r13
  1587. adc %rbx, %r11
  1588. mulx 48($ap), %rbx, %r14
  1589. adc %rax, %r12
  1590. mulx 56($ap), %rax, %r15
  1591. mov 8($bp), %rdx
  1592. adc %rbx, %r13
  1593. adc %rax, %r14
  1594. adc \$0, %r15
  1595. xor $zero, $zero # cf=0,of=0
  1596. jmp .Loop_mulx
  1597. .align 32
  1598. .Loop_mulx:
  1599. movq %r8, %rbx
  1600. mulx ($ap), %rax, %r8
  1601. adcx %rax, %rbx
  1602. adox %r9, %r8
  1603. mulx 8($ap), %rax, %r9
  1604. adcx %rax, %r8
  1605. adox %r10, %r9
  1606. mulx 16($ap), %rax, %r10
  1607. adcx %rax, %r9
  1608. adox %r11, %r10
  1609. mulx 24($ap), %rax, %r11
  1610. adcx %rax, %r10
  1611. adox %r12, %r11
  1612. .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
  1613. adcx %rax, %r11
  1614. adox %r13, %r12
  1615. mulx 40($ap), %rax, %r13
  1616. adcx %rax, %r12
  1617. adox %r14, %r13
  1618. mulx 48($ap), %rax, %r14
  1619. adcx %rax, %r13
  1620. adox %r15, %r14
  1621. mulx 56($ap), %rax, %r15
  1622. movq 64($bp,%rcx,8), %rdx
  1623. movq %rbx, 8+64-8(%rsp,%rcx,8)
  1624. adcx %rax, %r14
  1625. adox $zero, %r15
  1626. adcx $zero, %r15 # cf=0
  1627. inc %rcx # of=0
  1628. jnz .Loop_mulx
  1629. movq %r8, %rbx
  1630. mulx ($ap), %rax, %r8
  1631. adcx %rax, %rbx
  1632. adox %r9, %r8
  1633. .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
  1634. adcx %rax, %r8
  1635. adox %r10, %r9
  1636. .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
  1637. adcx %rax, %r9
  1638. adox %r11, %r10
  1639. mulx 24($ap), %rax, %r11
  1640. adcx %rax, %r10
  1641. adox %r12, %r11
  1642. mulx 32($ap), %rax, %r12
  1643. adcx %rax, %r11
  1644. adox %r13, %r12
  1645. mulx 40($ap), %rax, %r13
  1646. adcx %rax, %r12
  1647. adox %r14, %r13
  1648. .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
  1649. adcx %rax, %r13
  1650. adox %r15, %r14
  1651. .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
  1652. adcx %rax, %r14
  1653. adox $zero, %r15
  1654. adcx $zero, %r15
  1655. mov %rbx, 8+64-8(%rsp)
  1656. mov %r8, 8+64(%rsp)
  1657. mov %r9, 8+64+8(%rsp)
  1658. mov %r10, 8+64+16(%rsp)
  1659. mov %r11, 8+64+24(%rsp)
  1660. mov %r12, 8+64+32(%rsp)
  1661. mov %r13, 8+64+40(%rsp)
  1662. mov %r14, 8+64+48(%rsp)
  1663. mov %r15, 8+64+56(%rsp)
  1664. ret
  1665. .size __rsaz_512_mulx,.-__rsaz_512_mulx
  1666. ___
  1667. }
  1668. {
  1669. my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
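# rsaz_512_scatter4/gather4 use an interleaved power table: a power's
# eight 64-bit limbs are split into 32-bit halves, with the low half of
# limb j at offset 128*j and the high half at 128*j+64, while consecutive
# powers sit 4 bytes apart (the 64-byte row width implies a table of 16
# powers).  rsaz_512_mul_gather4/scatter4 above address the table the
# same way, scaling the power index by 4.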
  1670. $code.=<<___;
  1671. .globl rsaz_512_scatter4
  1672. .type rsaz_512_scatter4,\@abi-omnipotent
  1673. .align 16
  1674. rsaz_512_scatter4:
  1675. leaq ($out,$power,4), $out
  1676. movl \$8, %r9d
  1677. jmp .Loop_scatter
  1678. .align 16
  1679. .Loop_scatter:
  1680. movq ($inp), %rax
  1681. leaq 8($inp), $inp
  1682. movl %eax, ($out)
  1683. shrq \$32, %rax
  1684. movl %eax, 64($out)
  1685. leaq 128($out), $out
  1686. decl %r9d
  1687. jnz .Loop_scatter
  1688. ret
  1689. .size rsaz_512_scatter4,.-rsaz_512_scatter4
  1690. .globl rsaz_512_gather4
  1691. .type rsaz_512_gather4,\@abi-omnipotent
  1692. .align 16
  1693. rsaz_512_gather4:
  1694. leaq ($inp,$power,4), $inp
  1695. movl \$8, %r9d
  1696. jmp .Loop_gather
  1697. .align 16
  1698. .Loop_gather:
  1699. movl ($inp), %eax
  1700. movl 64($inp), %r8d
  1701. leaq 128($inp), $inp
  1702. shlq \$32, %r8
  1703. or %r8, %rax
  1704. movq %rax, ($out)
  1705. leaq 8($out), $out
  1706. decl %r9d
  1707. jnz .Loop_gather
  1708. ret
  1709. .size rsaz_512_gather4,.-rsaz_512_gather4
  1710. ___
  1711. }
  1712. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  1713. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
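# se_handler is shared by the five SEH-covered entry points: when the
# faulting RIP lies between a function's body and epilogue labels (the
# HandlerData pairs below), it rebuilds the non-volatile registers from
# the fixed frame layout (128+24 bytes of local storage plus six saved
# registers) before letting the unwinder continue.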
  1714. if ($win64) {
  1715. $rec="%rcx";
  1716. $frame="%rdx";
  1717. $context="%r8";
  1718. $disp="%r9";
  1719. $code.=<<___;
  1720. .extern __imp_RtlVirtualUnwind
  1721. .type se_handler,\@abi-omnipotent
  1722. .align 16
  1723. se_handler:
  1724. push %rsi
  1725. push %rdi
  1726. push %rbx
  1727. push %rbp
  1728. push %r12
  1729. push %r13
  1730. push %r14
  1731. push %r15
  1732. pushfq
  1733. sub \$64,%rsp
  1734. mov 120($context),%rax # pull context->Rax
  1735. mov 248($context),%rbx # pull context->Rip
  1736. mov 8($disp),%rsi # disp->ImageBase
  1737. mov 56($disp),%r11 # disp->HandlerData
  1738. mov 0(%r11),%r10d # HandlerData[0]
  1739. lea (%rsi,%r10),%r10 # end of prologue label
  1740. cmp %r10,%rbx # context->Rip<end of prologue label
  1741. jb .Lcommon_seh_tail
  1742. mov 152($context),%rax # pull context->Rsp
  1743. mov 4(%r11),%r10d # HandlerData[1]
  1744. lea (%rsi,%r10),%r10 # epilogue label
  1745. cmp %r10,%rbx # context->Rip>=epilogue label
  1746. jae .Lcommon_seh_tail
  1747. lea 128+24+48(%rax),%rax
  1748. mov -8(%rax),%rbx
  1749. mov -16(%rax),%rbp
  1750. mov -24(%rax),%r12
  1751. mov -32(%rax),%r13
  1752. mov -40(%rax),%r14
  1753. mov -48(%rax),%r15
  1754. mov %rbx,144($context) # restore context->Rbx
  1755. mov %rbp,160($context) # restore context->Rbp
  1756. mov %r12,216($context) # restore context->R12
  1757. mov %r13,224($context) # restore context->R13
  1758. mov %r14,232($context) # restore context->R14
  1759. mov %r15,240($context) # restore context->R15
  1760. .Lcommon_seh_tail:
  1761. mov 8(%rax),%rdi
  1762. mov 16(%rax),%rsi
  1763. mov %rax,152($context) # restore context->Rsp
  1764. mov %rsi,168($context) # restore context->Rsi
  1765. mov %rdi,176($context) # restore context->Rdi
  1766. mov 40($disp),%rdi # disp->ContextRecord
  1767. mov $context,%rsi # context
  1768. mov \$154,%ecx # sizeof(CONTEXT)
  1769. .long 0xa548f3fc # cld; rep movsq
  1770. mov $disp,%rsi
  1771. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1772. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1773. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1774. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1775. mov 40(%rsi),%r10 # disp->ContextRecord
  1776. lea 56(%rsi),%r11 # &disp->HandlerData
  1777. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1778. mov %r10,32(%rsp) # arg5
  1779. mov %r11,40(%rsp) # arg6
  1780. mov %r12,48(%rsp) # arg7
  1781. mov %rcx,56(%rsp) # arg8, (NULL)
  1782. call *__imp_RtlVirtualUnwind(%rip)
  1783. mov \$1,%eax # ExceptionContinueSearch
  1784. add \$64,%rsp
  1785. popfq
  1786. pop %r15
  1787. pop %r14
  1788. pop %r13
  1789. pop %r12
  1790. pop %rbp
  1791. pop %rbx
  1792. pop %rdi
  1793. pop %rsi
  1794. ret
  1795. .size se_handler,.-se_handler
  1796. .section .pdata
  1797. .align 4
  1798. .rva .LSEH_begin_rsaz_512_sqr
  1799. .rva .LSEH_end_rsaz_512_sqr
  1800. .rva .LSEH_info_rsaz_512_sqr
  1801. .rva .LSEH_begin_rsaz_512_mul
  1802. .rva .LSEH_end_rsaz_512_mul
  1803. .rva .LSEH_info_rsaz_512_mul
  1804. .rva .LSEH_begin_rsaz_512_mul_gather4
  1805. .rva .LSEH_end_rsaz_512_mul_gather4
  1806. .rva .LSEH_info_rsaz_512_mul_gather4
  1807. .rva .LSEH_begin_rsaz_512_mul_scatter4
  1808. .rva .LSEH_end_rsaz_512_mul_scatter4
  1809. .rva .LSEH_info_rsaz_512_mul_scatter4
  1810. .rva .LSEH_begin_rsaz_512_mul_by_one
  1811. .rva .LSEH_end_rsaz_512_mul_by_one
  1812. .rva .LSEH_info_rsaz_512_mul_by_one
  1813. .section .xdata
  1814. .align 8
  1815. .LSEH_info_rsaz_512_sqr:
  1816. .byte 9,0,0,0
  1817. .rva se_handler
  1818. .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
  1819. .LSEH_info_rsaz_512_mul:
  1820. .byte 9,0,0,0
  1821. .rva se_handler
  1822. .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
  1823. .LSEH_info_rsaz_512_mul_gather4:
  1824. .byte 9,0,0,0
  1825. .rva se_handler
  1826. .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
  1827. .LSEH_info_rsaz_512_mul_scatter4:
  1828. .byte 9,0,0,0
  1829. .rva se_handler
  1830. .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
  1831. .LSEH_info_rsaz_512_mul_by_one:
  1832. .byte 9,0,0,0
  1833. .rva se_handler
  1834. .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
  1835. ___
  1836. }
  1837. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  1838. print $code;
  1839. close STDOUT;