#!/usr/bin/env perl
##############################################################################
# #
# Copyright (c) 2012, Intel Corporation #
# #
# All rights reserved. #
# #
# Redistribution and use in source and binary forms, with or without #
# modification, are permitted provided that the following conditions are #
# met: #
# #
# * Redistributions of source code must retain the above copyright #
# notice, this list of conditions and the following disclaimer. #
# #
# * Redistributions in binary form must reproduce the above copyright #
# notice, this list of conditions and the following disclaimer in the #
# documentation and/or other materials provided with the #
# distribution. #
# #
# * Neither the name of the Intel Corporation nor the names of its #
# contributors may be used to endorse or promote products derived from #
# this software without specific prior written permission. #
# #
# #
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
# #
##############################################################################
# Developers and authors: #
# Shay Gueron (1, 2), and Vlad Krasnov (1) #
# (1) Intel Architecture Group, Microprocessor and Chipset Development, #
# Israel Development Center, Haifa, Israel #
# (2) University of Haifa #
##############################################################################
# Reference: #
# [1] S. Gueron, "Efficient Software Implementations of Modular #
# Exponentiation", http://eprint.iacr.org/2011/239 #
# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". #
# IEEE Proceedings of 9th International Conference on Information #
# Technology: New Generations (ITNG 2012), 821-823 (2012). #
# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
# Journal of Cryptographic Engineering 2:31-43 (2012). #
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
# resistant 512-bit and 1024-bit modular exponentiation for optimizing #
# RSA1024 and RSA2048 on x86_64 platforms", #
# http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
##############################################################################
# While original submission covers 512- and 1024-bit exponentiation,
# this module is limited to 512-bit version only (and as such
# accelerates RSA1024 sign). This is because improvement for longer
# keys is not high enough to justify the effort, highest measured
# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
# for the moment of this writing!] Nor does this module implement
# "monolithic" complete exponentiation jumbo-subroutine, but adheres
# to more modular mixture of C and assembly. And it's optimized even
# for processors other than Intel Core family (see table below for
# improvement coefficients).
# <appro@openssl.org>
#
# RSA1024 sign/sec    this/original  |this/rsax(*)  this/fips(*)
#                     ---------------+----------------------------
# Opteron             +13%           |+5%           +20%
# Bulldozer           -0%            |-1%           +10%
# P4                  +11%           |+7%           +8%
# Westmere            +5%            |+14%          +17%
# Sandy Bridge        +2%            |+12%          +29%
# Ivy Bridge          +1%            |+11%          +35%
# Haswell(**)         -0%            |+12%          +39%
# Atom                +13%           |+11%          +4%
# VIA Nano            +70%           |+9%           +25%
#
# (*) rsax engine and fips numbers are presented for reference
# purposes;
# (**) MULX was attempted, but found to give only marginal improvement;
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
$addx = ($1>=2.23);
}
if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
$addx = ($1>=2.10);
}
if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
`ml64 2>&1` =~ /Version ([0-9]+)\./) {
$addx = ($1>=12);
}
if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
$addx = ($ver>=3.03);
}
($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
{
my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
$code.=<<___;
.text
.extern OPENSSL_ia32cap_P
.globl rsaz_512_sqr
.type rsaz_512_sqr,\@function,5
.align 32
rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
subq \$128+24, %rsp
.Lsqr_body:
movq $mod, %rbp # common argument
movq ($inp), %rdx
movq 8($inp), %rax
movq $n0, 128(%rsp)
___
$code.=<<___ if ($addx);
movl \$0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl \$0x80100,%r11d # check for MULX and ADO/CX
je .Loop_sqrx
___
$code.=<<___;
jmp .Loop_sqr
.align 32
.Loop_sqr:
movl $times,128+8(%rsp)
#first iteration
movq %rdx, %rbx
mulq %rdx
movq %rax, %r8
movq 16($inp), %rax
movq %rdx, %r9
mulq %rbx
addq %rax, %r9
movq 24($inp), %rax
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
addq %rax, %r10
movq 32($inp), %rax
movq %rdx, %r11
adcq \$0, %r11
mulq %rbx
addq %rax, %r11
movq 40($inp), %rax
movq %rdx, %r12
adcq \$0, %r12
mulq %rbx
addq %rax, %r12
movq 48($inp), %rax
movq %rdx, %r13
adcq \$0, %r13
mulq %rbx
addq %rax, %r13
movq 56($inp), %rax
movq %rdx, %r14
adcq \$0, %r14
mulq %rbx
addq %rax, %r14
movq %rbx, %rax
movq %rdx, %r15
adcq \$0, %r15
addq %r8, %r8 #shlq \$1, %r8
movq %r9, %rcx
adcq %r9, %r9 #shld \$1, %r8, %r9
mulq %rax
movq %rax, (%rsp)
addq %rdx, %r8
adcq \$0, %r9
movq %r8, 8(%rsp)
shrq \$63, %rcx
#second iteration
movq 8($inp), %r8
movq 16($inp), %rax
mulq %r8
addq %rax, %r10
movq 24($inp), %rax
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r8
addq %rax, %r11
movq 32($inp), %rax
adcq \$0, %rdx
addq %rbx, %r11
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r8
addq %rax, %r12
movq 40($inp), %rax
adcq \$0, %rdx
addq %rbx, %r12
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r8
addq %rax, %r13
movq 48($inp), %rax
adcq \$0, %rdx
addq %rbx, %r13
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r8
addq %rax, %r14
movq 56($inp), %rax
adcq \$0, %rdx
addq %rbx, %r14
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r8
addq %rax, %r15
movq %r8, %rax
adcq \$0, %rdx
addq %rbx, %r15
movq %rdx, %r8
movq %r10, %rdx
adcq \$0, %r8
add %rdx, %rdx
lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
movq %r11, %rbx
adcq %r11, %r11 #shld \$1, %r10, %r11
mulq %rax
addq %rax, %r9
adcq %rdx, %r10
adcq \$0, %r11
movq %r9, 16(%rsp)
movq %r10, 24(%rsp)
shrq \$63, %rbx
#third iteration
movq 16($inp), %r9
movq 24($inp), %rax
mulq %r9
addq %rax, %r12
movq 32($inp), %rax
movq %rdx, %rcx
adcq \$0, %rcx
mulq %r9
addq %rax, %r13
movq 40($inp), %rax
adcq \$0, %rdx
addq %rcx, %r13
movq %rdx, %rcx
adcq \$0, %rcx
mulq %r9
addq %rax, %r14
movq 48($inp), %rax
adcq \$0, %rdx
addq %rcx, %r14
movq %rdx, %rcx
adcq \$0, %rcx
mulq %r9
movq %r12, %r10
lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
addq %rax, %r15
movq 56($inp), %rax
adcq \$0, %rdx
addq %rcx, %r15
movq %rdx, %rcx
adcq \$0, %rcx
mulq %r9
shrq \$63, %r10
addq %rax, %r8
movq %r9, %rax
adcq \$0, %rdx
addq %rcx, %r8
movq %rdx, %r9
adcq \$0, %r9
movq %r13, %rcx
leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
mulq %rax
addq %rax, %r11
adcq %rdx, %r12
adcq \$0, %r13
movq %r11, 32(%rsp)
movq %r12, 40(%rsp)
shrq \$63, %rcx
#fourth iteration
movq 24($inp), %r10
movq 32($inp), %rax
mulq %r10
addq %rax, %r14
movq 40($inp), %rax
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r10
addq %rax, %r15
movq 48($inp), %rax
adcq \$0, %rdx
addq %rbx, %r15
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r10
movq %r14, %r12
leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
addq %rax, %r8
movq 56($inp), %rax
adcq \$0, %rdx
addq %rbx, %r8
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r10
shrq \$63, %r12
addq %rax, %r9
movq %r10, %rax
adcq \$0, %rdx
addq %rbx, %r9
movq %rdx, %r10
adcq \$0, %r10
movq %r15, %rbx
leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
mulq %rax
addq %rax, %r13
adcq %rdx, %r14
adcq \$0, %r15
movq %r13, 48(%rsp)
movq %r14, 56(%rsp)
shrq \$63, %rbx
#fifth iteration
movq 32($inp), %r11
movq 40($inp), %rax
mulq %r11
addq %rax, %r8
movq 48($inp), %rax
movq %rdx, %rcx
adcq \$0, %rcx
mulq %r11
addq %rax, %r9
movq 56($inp), %rax
adcq \$0, %rdx
movq %r8, %r12
leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
addq %rcx, %r9
movq %rdx, %rcx
adcq \$0, %rcx
mulq %r11
shrq \$63, %r12
addq %rax, %r10
movq %r11, %rax
adcq \$0, %rdx
addq %rcx, %r10
movq %rdx, %r11
adcq \$0, %r11
movq %r9, %rcx
leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
mulq %rax
addq %rax, %r15
adcq %rdx, %r8
adcq \$0, %r9
movq %r15, 64(%rsp)
movq %r8, 72(%rsp)
shrq \$63, %rcx
#sixth iteration
movq 40($inp), %r12
movq 48($inp), %rax
mulq %r12
addq %rax, %r10
movq 56($inp), %rax
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r12
addq %rax, %r11
movq %r12, %rax
movq %r10, %r15
leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
adcq \$0, %rdx
shrq \$63, %r15
addq %rbx, %r11
movq %rdx, %r12
adcq \$0, %r12
movq %r11, %rbx
leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
mulq %rax
addq %rax, %r9
adcq %rdx, %r10
adcq \$0, %r11
movq %r9, 80(%rsp)
movq %r10, 88(%rsp)
#seventh iteration
movq 48($inp), %r13
movq 56($inp), %rax
mulq %r13
addq %rax, %r12
movq %r13, %rax
movq %rdx, %r13
adcq \$0, %r13
xorq %r14, %r14
shlq \$1, %rbx
adcq %r12, %r12 #shld \$1, %rbx, %r12
adcq %r13, %r13 #shld \$1, %r12, %r13
adcq %r14, %r14 #shld \$1, %r13, %r14
mulq %rax
addq %rax, %r11
adcq %rdx, %r12
adcq \$0, %r13
movq %r11, 96(%rsp)
movq %r12, 104(%rsp)
#eighth iteration
movq 56($inp), %rax
mulq %rax
addq %rax, %r13
adcq \$0, %rdx
addq %rdx, %r14
movq %r13, 112(%rsp)
movq %r14, 120(%rsp)
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
call __rsaz_512_reduce
addq 64(%rsp), %r8
adcq 72(%rsp), %r9
adcq 80(%rsp), %r10
adcq 88(%rsp), %r11
adcq 96(%rsp), %r12
adcq 104(%rsp), %r13
adcq 112(%rsp), %r14
adcq 120(%rsp), %r15
sbbq %rcx, %rcx
call __rsaz_512_subtract
movq %r8, %rdx
movq %r9, %rax
movl 128+8(%rsp), $times
movq $out, $inp
decl $times
jnz .Loop_sqr
___
if ($addx) {
$code.=<<___;
jmp .Lsqr_tail
.align 32
.Loop_sqrx:
movl $times,128+8(%rsp)
movq $out, %xmm0 # off-load
movq %rbp, %xmm1 # off-load
#first iteration
mulx %rax, %r8, %r9
mulx 16($inp), %rcx, %r10
xor %rbp, %rbp # cf=0, of=0
mulx 24($inp), %rax, %r11
adcx %rcx, %r9
mulx 32($inp), %rcx, %r12
adcx %rax, %r10
mulx 40($inp), %rax, %r13
adcx %rcx, %r11
.byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
adcx %rax, %r12
adcx %rcx, %r13
.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
adcx %rax, %r14
adcx %rbp, %r15 # %rbp is 0
mov %r9, %rcx
shld \$1, %r8, %r9
shl \$1, %r8
xor %ebp, %ebp
mulx %rdx, %rax, %rdx
adcx %rdx, %r8
mov 8($inp), %rdx
adcx %rbp, %r9
mov %rax, (%rsp)
mov %r8, 8(%rsp)
#second iteration
mulx 16($inp), %rax, %rbx
adox %rax, %r10
adcx %rbx, %r11
.byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
adox $out, %r11
adcx %r8, %r12
mulx 32($inp), %rax, %rbx
adox %rax, %r12
adcx %rbx, %r13
mulx 40($inp), $out, %r8
adox $out, %r13
adcx %r8, %r14
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
adox %rax, %r14
adcx %rbx, %r15
.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
adox $out, %r15
adcx %rbp, %r8
adox %rbp, %r8
mov %r11, %rbx
shld \$1, %r10, %r11
shld \$1, %rcx, %r10
xor %ebp,%ebp
mulx %rdx, %rax, %rcx
mov 16($inp), %rdx
adcx %rax, %r9
adcx %rcx, %r10
adcx %rbp, %r11
mov %r9, 16(%rsp)
.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
#third iteration
.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
adox $out, %r12
adcx %r9, %r13
mulx 32($inp), %rax, %rcx
adox %rax, %r13
adcx %rcx, %r14
mulx 40($inp), $out, %r9
adox $out, %r14
adcx %r9, %r15
.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
adox %rax, %r15
adcx %rcx, %r8
.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
adox $out, %r8
adcx %rbp, %r9
adox %rbp, %r9
mov %r13, %rcx
shld \$1, %r12, %r13
shld \$1, %rbx, %r12
xor %ebp, %ebp
mulx %rdx, %rax, %rdx
adcx %rax, %r11
adcx %rdx, %r12
mov 24($inp), %rdx
adcx %rbp, %r13
mov %r11, 32(%rsp)
.byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
#fourth iteration
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
adox %rax, %r14
adcx %rbx, %r15
mulx 40($inp), $out, %r10
adox $out, %r15
adcx %r10, %r8
mulx 48($inp), %rax, %rbx
adox %rax, %r8
adcx %rbx, %r9
mulx 56($inp), $out, %r10
adox $out, %r9
adcx %rbp, %r10
adox %rbp, %r10
.byte 0x66
mov %r15, %rbx
shld \$1, %r14, %r15
shld \$1, %rcx, %r14
xor %ebp, %ebp
mulx %rdx, %rax, %rdx
adcx %rax, %r13
adcx %rdx, %r14
mov 32($inp), %rdx
adcx %rbp, %r15
mov %r13, 48(%rsp)
mov %r14, 56(%rsp)
#fifth iteration
.byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
adox $out, %r8
adcx %r11, %r9
mulx 48($inp), %rax, %rcx
adox %rax, %r9
adcx %rcx, %r10
mulx 56($inp), $out, %r11
adox $out, %r10
adcx %rbp, %r11
adox %rbp, %r11
mov %r9, %rcx
shld \$1, %r8, %r9
shld \$1, %rbx, %r8
xor %ebp, %ebp
mulx %rdx, %rax, %rdx
adcx %rax, %r15
adcx %rdx, %r8
mov 40($inp), %rdx
adcx %rbp, %r9
mov %r15, 64(%rsp)
mov %r8, 72(%rsp)
#sixth iteration
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
adox %rax, %r10
adcx %rbx, %r11
.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
adox $out, %r11
adcx %rbp, %r12
adox %rbp, %r12
mov %r11, %rbx
shld \$1, %r10, %r11
shld \$1, %rcx, %r10
xor %ebp, %ebp
mulx %rdx, %rax, %rdx
adcx %rax, %r9
adcx %rdx, %r10
mov 48($inp), %rdx
adcx %rbp, %r11
mov %r9, 80(%rsp)
mov %r10, 88(%rsp)
#seventh iteration
.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
adox %rax, %r12
adox %rbp, %r13
xor %r14, %r14
shld \$1, %r13, %r14
shld \$1, %r12, %r13
shld \$1, %rbx, %r12
xor %ebp, %ebp
mulx %rdx, %rax, %rdx
adcx %rax, %r11
adcx %rdx, %r12
mov 56($inp), %rdx
adcx %rbp, %r13
.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
#eighth iteration
mulx %rdx, %rax, %rdx
adox %rax, %r13
adox %rbp, %rdx
.byte 0x66
add %rdx, %r14
movq %r13, 112(%rsp)
movq %r14, 120(%rsp)
movq %xmm0, $out
movq %xmm1, %rbp
movq 128(%rsp), %rdx # pull $n0
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
call __rsaz_512_reducex
addq 64(%rsp), %r8
adcq 72(%rsp), %r9
adcq 80(%rsp), %r10
adcq 88(%rsp), %r11
adcq 96(%rsp), %r12
adcq 104(%rsp), %r13
adcq 112(%rsp), %r14
adcq 120(%rsp), %r15
sbbq %rcx, %rcx
call __rsaz_512_subtract
movq %r8, %rdx
movq %r9, %rax
movl 128+8(%rsp), $times
movq $out, $inp
decl $times
jnz .Loop_sqrx
.Lsqr_tail:
___
}
$code.=<<___;
leaq 128+24+48(%rsp), %rax
movq -48(%rax), %r15
movq -40(%rax), %r14
movq -32(%rax), %r13
movq -24(%rax), %r12
movq -16(%rax), %rbp
movq -8(%rax), %rbx
leaq (%rax), %rsp
.Lsqr_epilogue:
ret
.size rsaz_512_sqr,.-rsaz_512_sqr
___
}
{
my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$code.=<<___;
.globl rsaz_512_mul
.type rsaz_512_mul,\@function,5
.align 32
rsaz_512_mul:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
subq \$128+24, %rsp
.Lmul_body:
movq $out, %xmm0 # off-load arguments
movq $mod, %xmm1
movq $n0, 128(%rsp)
___
$code.=<<___ if ($addx);
movl \$0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl \$0x80100,%r11d # check for MULX and ADO/CX
je .Lmulx
___
$code.=<<___;
movq ($bp), %rbx # pass b[0]
movq $bp, %rbp # pass argument
call __rsaz_512_mul
movq %xmm0, $out
movq %xmm1, %rbp
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
call __rsaz_512_reduce
___
$code.=<<___ if ($addx);
jmp .Lmul_tail
.align 32
.Lmulx:
movq $bp, %rbp # pass argument
movq ($bp), %rdx # pass b[0]
call __rsaz_512_mulx
movq %xmm0, $out
movq %xmm1, %rbp
movq 128(%rsp), %rdx # pull $n0
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
call __rsaz_512_reducex
.Lmul_tail:
___
$code.=<<___;
addq 64(%rsp), %r8
adcq 72(%rsp), %r9
adcq 80(%rsp), %r10
adcq 88(%rsp), %r11
adcq 96(%rsp), %r12
adcq 104(%rsp), %r13
adcq 112(%rsp), %r14
adcq 120(%rsp), %r15
sbbq %rcx, %rcx
call __rsaz_512_subtract
leaq 128+24+48(%rsp), %rax
movq -48(%rax), %r15
movq -40(%rax), %r14
movq -32(%rax), %r13
movq -24(%rax), %r12
movq -16(%rax), %rbp
movq -8(%rax), %rbx
leaq (%rax), %rsp
.Lmul_epilogue:
ret
.size rsaz_512_mul,.-rsaz_512_mul
___
}
{
my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl rsaz_512_mul_gather4
.type rsaz_512_mul_gather4,\@function,6
.align 32
rsaz_512_mul_gather4:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
subq \$`128+24+($win64?0xb0:0)`, %rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,0xa0(%rsp)
movaps %xmm7,0xb0(%rsp)
movaps %xmm8,0xc0(%rsp)
movaps %xmm9,0xd0(%rsp)
movaps %xmm10,0xe0(%rsp)
movaps %xmm11,0xf0(%rsp)
movaps %xmm12,0x100(%rsp)
movaps %xmm13,0x110(%rsp)
movaps %xmm14,0x120(%rsp)
movaps %xmm15,0x130(%rsp)
___
$code.=<<___;
.Lmul_gather4_body:
movd $pwr,%xmm8
movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
pshufd \$0,%xmm8,%xmm8 # broadcast $power
movdqa %xmm1,%xmm7
movdqa %xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
#
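# Hedged reference only (kept in comments, never emitted into the generated
# assembly): the point of the mask/gather below is to pick table entry number
# $power without issuing a $power-dependent address.  All 16 entries are
# loaded, AND-ed with an all-ones or all-zeros mask, and OR-ed together, so
# the memory access pattern is independent of the secret index.  A scalar
# Perl sketch of the selection semantics, with illustrative names ($tbl,
# $power) that are not part of this module's API:
#
#   sub gather_ref {
#       my ($tbl, $power) = @_;                 # $tbl: ref to 16 64-bit entries
#       my $acc = 0;
#       for my $i (0 .. 15) {
#           my $mask = ($i == $power) ? ~0 : 0; # pcmpeqd builds this branch-free
#           $acc |= $tbl->[$i] & $mask;         # pand/por below
#       }
#       return $acc;
#   }
#
# The sketch uses a branch for clarity; the SSE2 code builds the sixteen
# masks two dwords at a time in %xmm0-%xmm7 with pcmpeqd, so no secret-
# dependent branch or address ever occurs.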
for($i=0;$i<4;$i++) {
$code.=<<___;
paddd %xmm`$i`,%xmm`$i+1`
pcmpeqd %xmm8,%xmm`$i`
movdqa %xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
paddd %xmm`$i`,%xmm`$i+1`
pcmpeqd %xmm8,%xmm`$i`
___
}
$code.=<<___;
pcmpeqd %xmm8,%xmm7
movdqa 16*0($bp),%xmm8
movdqa 16*1($bp),%xmm9
movdqa 16*2($bp),%xmm10
movdqa 16*3($bp),%xmm11
pand %xmm0,%xmm8
movdqa 16*4($bp),%xmm12
pand %xmm1,%xmm9
movdqa 16*5($bp),%xmm13
pand %xmm2,%xmm10
movdqa 16*6($bp),%xmm14
pand %xmm3,%xmm11
movdqa 16*7($bp),%xmm15
leaq 128($bp), %rbp
pand %xmm4,%xmm12
pand %xmm5,%xmm13
pand %xmm6,%xmm14
pand %xmm7,%xmm15
por %xmm10,%xmm8
por %xmm11,%xmm9
por %xmm12,%xmm8
por %xmm13,%xmm9
por %xmm14,%xmm8
por %xmm15,%xmm9
por %xmm9,%xmm8
pshufd \$0x4e,%xmm8,%xmm9
por %xmm9,%xmm8
___
$code.=<<___ if ($addx);
movl \$0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl \$0x80100,%r11d # check for MULX and ADO/CX
je .Lmulx_gather
___
$code.=<<___;
movq %xmm8,%rbx
movq $n0, 128(%rsp) # off-load arguments
movq $out, 128+8(%rsp)
movq $mod, 128+16(%rsp)
movq ($ap), %rax
movq 8($ap), %rcx
mulq %rbx # 0 iteration
movq %rax, (%rsp)
movq %rcx, %rax
movq %rdx, %r8
mulq %rbx
addq %rax, %r8
movq 16($ap), %rax
movq %rdx, %r9
adcq \$0, %r9
mulq %rbx
addq %rax, %r9
movq 24($ap), %rax
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
addq %rax, %r10
movq 32($ap), %rax
movq %rdx, %r11
adcq \$0, %r11
mulq %rbx
addq %rax, %r11
movq 40($ap), %rax
movq %rdx, %r12
adcq \$0, %r12
mulq %rbx
addq %rax, %r12
movq 48($ap), %rax
movq %rdx, %r13
adcq \$0, %r13
mulq %rbx
addq %rax, %r13
movq 56($ap), %rax
movq %rdx, %r14
adcq \$0, %r14
mulq %rbx
addq %rax, %r14
movq ($ap), %rax
movq %rdx, %r15
adcq \$0, %r15
leaq 8(%rsp), %rdi
movl \$7, %ecx
jmp .Loop_mul_gather
.align 32
.Loop_mul_gather:
movdqa 16*0(%rbp),%xmm8
movdqa 16*1(%rbp),%xmm9
movdqa 16*2(%rbp),%xmm10
movdqa 16*3(%rbp),%xmm11
pand %xmm0,%xmm8
movdqa 16*4(%rbp),%xmm12
pand %xmm1,%xmm9
movdqa 16*5(%rbp),%xmm13
pand %xmm2,%xmm10
movdqa 16*6(%rbp),%xmm14
pand %xmm3,%xmm11
movdqa 16*7(%rbp),%xmm15
leaq 128(%rbp), %rbp
pand %xmm4,%xmm12
pand %xmm5,%xmm13
pand %xmm6,%xmm14
pand %xmm7,%xmm15
por %xmm10,%xmm8
por %xmm11,%xmm9
por %xmm12,%xmm8
por %xmm13,%xmm9
por %xmm14,%xmm8
por %xmm15,%xmm9
por %xmm9,%xmm8
pshufd \$0x4e,%xmm8,%xmm9
por %xmm9,%xmm8
movq %xmm8,%rbx
mulq %rbx
addq %rax, %r8
movq 8($ap), %rax
movq %r8, (%rdi)
movq %rdx, %r8
adcq \$0, %r8
mulq %rbx
addq %rax, %r9
movq 16($ap), %rax
adcq \$0, %rdx
addq %r9, %r8
movq %rdx, %r9
adcq \$0, %r9
mulq %rbx
addq %rax, %r10
movq 24($ap), %rax
adcq \$0, %rdx
addq %r10, %r9
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
addq %rax, %r11
movq 32($ap), %rax
adcq \$0, %rdx
addq %r11, %r10
movq %rdx, %r11
adcq \$0, %r11
mulq %rbx
addq %rax, %r12
movq 40($ap), %rax
adcq \$0, %rdx
addq %r12, %r11
movq %rdx, %r12
adcq \$0, %r12
mulq %rbx
addq %rax, %r13
movq 48($ap), %rax
adcq \$0, %rdx
addq %r13, %r12
movq %rdx, %r13
adcq \$0, %r13
mulq %rbx
addq %rax, %r14
movq 56($ap), %rax
adcq \$0, %rdx
addq %r14, %r13
movq %rdx, %r14
adcq \$0, %r14
mulq %rbx
addq %rax, %r15
movq ($ap), %rax
adcq \$0, %rdx
addq %r15, %r14
movq %rdx, %r15
adcq \$0, %r15
leaq 8(%rdi), %rdi
decl %ecx
jnz .Loop_mul_gather
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, 32(%rdi)
movq %r13, 40(%rdi)
movq %r14, 48(%rdi)
movq %r15, 56(%rdi)
movq 128+8(%rsp), $out
movq 128+16(%rsp), %rbp
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
call __rsaz_512_reduce
___
$code.=<<___ if ($addx);
jmp .Lmul_gather_tail
.align 32
.Lmulx_gather:
movq %xmm8,%rdx
mov $n0, 128(%rsp) # off-load arguments
mov $out, 128+8(%rsp)
mov $mod, 128+16(%rsp)
mulx ($ap), %rbx, %r8 # 0 iteration
mov %rbx, (%rsp)
xor %edi, %edi # cf=0, of=0
mulx 8($ap), %rax, %r9
mulx 16($ap), %rbx, %r10
adcx %rax, %r8
mulx 24($ap), %rax, %r11
adcx %rbx, %r9
mulx 32($ap), %rbx, %r12
adcx %rax, %r10
mulx 40($ap), %rax, %r13
adcx %rbx, %r11
mulx 48($ap), %rbx, %r14
adcx %rax, %r12
mulx 56($ap), %rax, %r15
adcx %rbx, %r13
adcx %rax, %r14
.byte 0x67
mov %r8, %rbx
adcx %rdi, %r15 # %rdi is 0
mov \$-7, %rcx
jmp .Loop_mulx_gather
.align 32
.Loop_mulx_gather:
movdqa 16*0(%rbp),%xmm8
movdqa 16*1(%rbp),%xmm9
movdqa 16*2(%rbp),%xmm10
movdqa 16*3(%rbp),%xmm11
pand %xmm0,%xmm8
movdqa 16*4(%rbp),%xmm12
pand %xmm1,%xmm9
movdqa 16*5(%rbp),%xmm13
pand %xmm2,%xmm10
movdqa 16*6(%rbp),%xmm14
pand %xmm3,%xmm11
movdqa 16*7(%rbp),%xmm15
leaq 128(%rbp), %rbp
pand %xmm4,%xmm12
pand %xmm5,%xmm13
pand %xmm6,%xmm14
pand %xmm7,%xmm15
por %xmm10,%xmm8
por %xmm11,%xmm9
por %xmm12,%xmm8
por %xmm13,%xmm9
por %xmm14,%xmm8
por %xmm15,%xmm9
por %xmm9,%xmm8
pshufd \$0x4e,%xmm8,%xmm9
por %xmm9,%xmm8
movq %xmm8,%rdx
.byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8
adcx %rax, %rbx
adox %r9, %r8
mulx 8($ap), %rax, %r9
adcx %rax, %r8
adox %r10, %r9
mulx 16($ap), %rax, %r10
adcx %rax, %r9
adox %r11, %r10
.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
adcx %rax, %r10
adox %r12, %r11
mulx 32($ap), %rax, %r12
adcx %rax, %r11
adox %r13, %r12
mulx 40($ap), %rax, %r13
adcx %rax, %r12
adox %r14, %r13
.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
adcx %rax, %r13
.byte 0x67
adox %r15, %r14
mulx 56($ap), %rax, %r15
mov %rbx, 64(%rsp,%rcx,8)
adcx %rax, %r14
adox %rdi, %r15
mov %r8, %rbx
adcx %rdi, %r15 # cf=0
inc %rcx # of=0
jnz .Loop_mulx_gather
mov %r8, 64(%rsp)
mov %r9, 64+8(%rsp)
mov %r10, 64+16(%rsp)
mov %r11, 64+24(%rsp)
mov %r12, 64+32(%rsp)
mov %r13, 64+40(%rsp)
mov %r14, 64+48(%rsp)
mov %r15, 64+56(%rsp)
mov 128(%rsp), %rdx # pull arguments
mov 128+8(%rsp), $out
mov 128+16(%rsp), %rbp
mov (%rsp), %r8
mov 8(%rsp), %r9
mov 16(%rsp), %r10
mov 24(%rsp), %r11
mov 32(%rsp), %r12
mov 40(%rsp), %r13
mov 48(%rsp), %r14
mov 56(%rsp), %r15
call __rsaz_512_reducex
.Lmul_gather_tail:
___
$code.=<<___;
addq 64(%rsp), %r8
adcq 72(%rsp), %r9
adcq 80(%rsp), %r10
adcq 88(%rsp), %r11
adcq 96(%rsp), %r12
adcq 104(%rsp), %r13
adcq 112(%rsp), %r14
adcq 120(%rsp), %r15
sbbq %rcx, %rcx
call __rsaz_512_subtract
leaq 128+24+48(%rsp), %rax
___
$code.=<<___ if ($win64);
movaps 0xa0-0xc8(%rax),%xmm6
movaps 0xb0-0xc8(%rax),%xmm7
movaps 0xc0-0xc8(%rax),%xmm8
movaps 0xd0-0xc8(%rax),%xmm9
movaps 0xe0-0xc8(%rax),%xmm10
movaps 0xf0-0xc8(%rax),%xmm11
movaps 0x100-0xc8(%rax),%xmm12
movaps 0x110-0xc8(%rax),%xmm13
movaps 0x120-0xc8(%rax),%xmm14
movaps 0x130-0xc8(%rax),%xmm15
lea 0xb0(%rax),%rax
___
$code.=<<___;
movq -48(%rax), %r15
movq -40(%rax), %r14
movq -32(%rax), %r13
movq -24(%rax), %r12
movq -16(%rax), %rbp
movq -8(%rax), %rbx
leaq (%rax), %rsp
.Lmul_gather4_epilogue:
ret
.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
___
}
{
my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl rsaz_512_mul_scatter4
.type rsaz_512_mul_scatter4,\@function,6
.align 32
rsaz_512_mul_scatter4:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
mov $pwr, $pwr
subq \$128+24, %rsp
.Lmul_scatter4_body:
leaq ($tbl,$pwr,8), $tbl
movq $out, %xmm0 # off-load arguments
movq $mod, %xmm1
movq $tbl, %xmm2
movq $n0, 128(%rsp)
movq $out, %rbp
___
$code.=<<___ if ($addx);
movl \$0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl \$0x80100,%r11d # check for MULX and ADO/CX
je .Lmulx_scatter
___
$code.=<<___;
movq ($out),%rbx # pass b[0]
call __rsaz_512_mul
movq %xmm0, $out
movq %xmm1, %rbp
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
call __rsaz_512_reduce
___
$code.=<<___ if ($addx);
jmp .Lmul_scatter_tail
.align 32
.Lmulx_scatter:
movq ($out), %rdx # pass b[0]
call __rsaz_512_mulx
movq %xmm0, $out
movq %xmm1, %rbp
movq 128(%rsp), %rdx # pull $n0
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
call __rsaz_512_reducex
.Lmul_scatter_tail:
___
$code.=<<___;
addq 64(%rsp), %r8
adcq 72(%rsp), %r9
adcq 80(%rsp), %r10
adcq 88(%rsp), %r11
adcq 96(%rsp), %r12
adcq 104(%rsp), %r13
adcq 112(%rsp), %r14
adcq 120(%rsp), %r15
movq %xmm2, $inp
sbbq %rcx, %rcx
call __rsaz_512_subtract
movq %r8, 128*0($inp) # scatter
movq %r9, 128*1($inp)
movq %r10, 128*2($inp)
movq %r11, 128*3($inp)
movq %r12, 128*4($inp)
movq %r13, 128*5($inp)
movq %r14, 128*6($inp)
movq %r15, 128*7($inp)
leaq 128+24+48(%rsp), %rax
movq -48(%rax), %r15
movq -40(%rax), %r14
movq -32(%rax), %r13
movq -24(%rax), %r12
movq -16(%rax), %rbp
movq -8(%rax), %rbx
leaq (%rax), %rsp
.Lmul_scatter4_epilogue:
ret
.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
___
}
{
my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
$code.=<<___;
.globl rsaz_512_mul_by_one
.type rsaz_512_mul_by_one,\@function,4
.align 32
rsaz_512_mul_by_one:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
subq \$128+24, %rsp
.Lmul_by_one_body:
___
$code.=<<___ if ($addx);
movl OPENSSL_ia32cap_P+8(%rip),%eax
___
$code.=<<___;
movq $mod, %rbp # reassign argument
movq $n0, 128(%rsp)
movq ($inp), %r8
pxor %xmm0, %xmm0
movq 8($inp), %r9
movq 16($inp), %r10
movq 24($inp), %r11
movq 32($inp), %r12
movq 40($inp), %r13
movq 48($inp), %r14
movq 56($inp), %r15
movdqa %xmm0, (%rsp)
movdqa %xmm0, 16(%rsp)
movdqa %xmm0, 32(%rsp)
movdqa %xmm0, 48(%rsp)
movdqa %xmm0, 64(%rsp)
movdqa %xmm0, 80(%rsp)
movdqa %xmm0, 96(%rsp)
___
$code.=<<___ if ($addx);
andl \$0x80100,%eax
cmpl \$0x80100,%eax # check for MULX and ADO/CX
je .Lby_one_callx
___
$code.=<<___;
call __rsaz_512_reduce
___
$code.=<<___ if ($addx);
jmp .Lby_one_tail
.align 32
.Lby_one_callx:
movq 128(%rsp), %rdx # pull $n0
call __rsaz_512_reducex
.Lby_one_tail:
___
$code.=<<___;
movq %r8, ($out)
movq %r9, 8($out)
movq %r10, 16($out)
movq %r11, 24($out)
movq %r12, 32($out)
movq %r13, 40($out)
movq %r14, 48($out)
movq %r15, 56($out)
leaq 128+24+48(%rsp), %rax
movq -48(%rax), %r15
movq -40(%rax), %r14
movq -32(%rax), %r13
movq -24(%rax), %r12
movq -16(%rax), %rbp
movq -8(%rax), %rbx
leaq (%rax), %rsp
.Lmul_by_one_epilogue:
ret
.size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
___
}
{ # __rsaz_512_reduce
#
# input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
# output: %r8-%r15
# clobbers: everything except %rbp and %rdi
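#
# Hedged reference sketch (comments only, not executed and not emitted):
# the loop below is word-by-word Montgomery reduction, where 128+8(%rsp)
# holds n0 = -N^{-1} mod 2^64.  In Math::BigInt terms, with illustrative
# names that are not part of this module's API, reducing a 1024-bit
# product T by the 512-bit modulus N looks like:
#
#   use Math::BigInt;
#   sub mont_reduce_ref {
#       my ($t, $n, $n0) = @_;                    # product, modulus, -1/n mod 2^64
#       my $w = Math::BigInt->new(1)->blsft(64);  # limb base 2^64
#       for (1 .. 8) {                            # cf. "movl \$8, %ecx" / .Lreduction_loop
#           my $m = (($t % $w) * $n0) % $w;       # m = t[0]*n0 mod 2^64
#           $t = ($t + $m * $n) / $w;             # add m*N, drop the now-zero low limb
#       }
#       $t -= $n if $t >= $n;                     # single conditional subtraction
#       return $t;                                # == T * 2^-512 mod N
#   }
#
# The assembly splits the bookkeeping differently: this subroutine reduces
# the low eight limbs in %r8-%r15, and the callers then fold in the upper
# half (64(%rsp)..120(%rsp)) and do the conditional subtraction via
# __rsaz_512_subtract.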
$code.=<<___;
.type __rsaz_512_reduce,\@abi-omnipotent
.align 32
__rsaz_512_reduce:
movq %r8, %rbx
imulq 128+8(%rsp), %rbx
movq 0(%rbp), %rax
movl \$8, %ecx
jmp .Lreduction_loop
.align 32
.Lreduction_loop:
mulq %rbx
movq 8(%rbp), %rax
negq %r8
movq %rdx, %r8
adcq \$0, %r8
mulq %rbx
addq %rax, %r9
movq 16(%rbp), %rax
adcq \$0, %rdx
addq %r9, %r8
movq %rdx, %r9
adcq \$0, %r9
mulq %rbx
addq %rax, %r10
movq 24(%rbp), %rax
adcq \$0, %rdx
addq %r10, %r9
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
addq %rax, %r11
movq 32(%rbp), %rax
adcq \$0, %rdx
addq %r11, %r10
movq 128+8(%rsp), %rsi
#movq %rdx, %r11
#adcq \$0, %r11
adcq \$0, %rdx
movq %rdx, %r11
mulq %rbx
addq %rax, %r12
movq 40(%rbp), %rax
adcq \$0, %rdx
imulq %r8, %rsi
addq %r12, %r11
movq %rdx, %r12
adcq \$0, %r12
mulq %rbx
addq %rax, %r13
movq 48(%rbp), %rax
adcq \$0, %rdx
addq %r13, %r12
movq %rdx, %r13
adcq \$0, %r13
mulq %rbx
addq %rax, %r14
movq 56(%rbp), %rax
adcq \$0, %rdx
addq %r14, %r13
movq %rdx, %r14
adcq \$0, %r14
mulq %rbx
movq %rsi, %rbx
addq %rax, %r15
movq 0(%rbp), %rax
adcq \$0, %rdx
addq %r15, %r14
movq %rdx, %r15
adcq \$0, %r15
decl %ecx
jne .Lreduction_loop
ret
.size __rsaz_512_reduce,.-__rsaz_512_reduce
___
}
if ($addx) {
# __rsaz_512_reducex
#
# input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
# output: %r8-%r15
# clobbers: everything except %rbp and %rdi
$code.=<<___;
.type __rsaz_512_reducex,\@abi-omnipotent
.align 32
__rsaz_512_reducex:
#movq 128+8(%rsp), %rdx # pull $n0
imulq %r8, %rdx
xorq %rsi, %rsi # cf=0,of=0
movl \$8, %ecx
jmp .Lreduction_loopx
.align 32
.Lreduction_loopx:
mov %r8, %rbx
mulx 0(%rbp), %rax, %r8
adcx %rbx, %rax
adox %r9, %r8
mulx 8(%rbp), %rax, %r9
adcx %rax, %r8
adox %r10, %r9
mulx 16(%rbp), %rbx, %r10
adcx %rbx, %r9
adox %r11, %r10
mulx 24(%rbp), %rbx, %r11
adcx %rbx, %r10
adox %r12, %r11
.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
mov %rdx, %rax
mov %r8, %rdx
adcx %rbx, %r11
adox %r13, %r12
mulx 128+8(%rsp), %rbx, %rdx
mov %rax, %rdx
mulx 40(%rbp), %rax, %r13
adcx %rax, %r12
adox %r14, %r13
.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
adcx %rax, %r13
adox %r15, %r14
mulx 56(%rbp), %rax, %r15
mov %rbx, %rdx
adcx %rax, %r14
adox %rsi, %r15 # %rsi is 0
adcx %rsi, %r15 # cf=0
decl %ecx # of=0
jne .Lreduction_loopx
ret
.size __rsaz_512_reducex,.-__rsaz_512_reducex
___
}
{ # __rsaz_512_subtract
# input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
# output:
# clobbers: everything but %rdi, %rsi and %rbp
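#
# Hedged reference (comments only): %rcx arrives as 0 or all-ones (the
# "sbbq %rcx, %rcx" in the callers), and the code below computes
# out = r + (mask & (2^512 - mod)) modulo 2^512, i.e. it subtracts the
# modulus exactly when the mask is all-ones, with no data-dependent branch.
# A Math::BigInt sketch with illustrative names, not part of the module's API:
#
#   use Math::BigInt;
#   sub cond_sub_ref {
#       my ($r, $mod, $carry) = @_;                  # $carry: 0/1 from the adds
#       my $w = Math::BigInt->new(1)->blsft(512);
#       my $mask = $carry ? $w - 1 : Math::BigInt->new(0);   # %rcx after sbbq
#       return ($r + (($w - $mod) & $mask)) % $w;    # r unchanged, or r - mod (mod 2^512)
#   }
#
# The real code chains this across eight limbs with neg/not plus adc so the
# borrows propagate exactly as in a 512-bit subtraction.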
$code.=<<___;
.type __rsaz_512_subtract,\@abi-omnipotent
.align 32
__rsaz_512_subtract:
movq %r8, ($out)
movq %r9, 8($out)
movq %r10, 16($out)
movq %r11, 24($out)
movq %r12, 32($out)
movq %r13, 40($out)
movq %r14, 48($out)
movq %r15, 56($out)
movq 0($mod), %r8
movq 8($mod), %r9
negq %r8
notq %r9
andq %rcx, %r8
movq 16($mod), %r10
andq %rcx, %r9
notq %r10
movq 24($mod), %r11
andq %rcx, %r10
notq %r11
movq 32($mod), %r12
andq %rcx, %r11
notq %r12
movq 40($mod), %r13
andq %rcx, %r12
notq %r13
movq 48($mod), %r14
andq %rcx, %r13
notq %r14
movq 56($mod), %r15
andq %rcx, %r14
notq %r15
andq %rcx, %r15
addq ($out), %r8
adcq 8($out), %r9
adcq 16($out), %r10
adcq 24($out), %r11
adcq 32($out), %r12
adcq 40($out), %r13
adcq 48($out), %r14
adcq 56($out), %r15
movq %r8, ($out)
movq %r9, 8($out)
movq %r10, 16($out)
movq %r11, 24($out)
movq %r12, 32($out)
movq %r13, 40($out)
movq %r14, 48($out)
movq %r15, 56($out)
ret
.size __rsaz_512_subtract,.-__rsaz_512_subtract
___
}
{ # __rsaz_512_mul
#
# input: %rsi - ap, %rbp - bp
# output:
# clobbers: everything
my ($ap,$bp) = ("%rsi","%rbp");
$code.=<<___;
.type __rsaz_512_mul,\@abi-omnipotent
.align 32
__rsaz_512_mul:
leaq 8(%rsp), %rdi
movq ($ap), %rax
mulq %rbx
movq %rax, (%rdi)
movq 8($ap), %rax
movq %rdx, %r8
mulq %rbx
addq %rax, %r8
movq 16($ap), %rax
movq %rdx, %r9
adcq \$0, %r9
mulq %rbx
addq %rax, %r9
movq 24($ap), %rax
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
addq %rax, %r10
movq 32($ap), %rax
movq %rdx, %r11
adcq \$0, %r11
mulq %rbx
addq %rax, %r11
movq 40($ap), %rax
movq %rdx, %r12
adcq \$0, %r12
mulq %rbx
addq %rax, %r12
movq 48($ap), %rax
movq %rdx, %r13
adcq \$0, %r13
mulq %rbx
addq %rax, %r13
movq 56($ap), %rax
movq %rdx, %r14
adcq \$0, %r14
mulq %rbx
addq %rax, %r14
movq ($ap), %rax
movq %rdx, %r15
adcq \$0, %r15
leaq 8($bp), $bp
leaq 8(%rdi), %rdi
movl \$7, %ecx
jmp .Loop_mul
.align 32
.Loop_mul:
movq ($bp), %rbx
mulq %rbx
addq %rax, %r8
movq 8($ap), %rax
movq %r8, (%rdi)
movq %rdx, %r8
adcq \$0, %r8
mulq %rbx
addq %rax, %r9
movq 16($ap), %rax
adcq \$0, %rdx
addq %r9, %r8
movq %rdx, %r9
adcq \$0, %r9
mulq %rbx
addq %rax, %r10
movq 24($ap), %rax
adcq \$0, %rdx
addq %r10, %r9
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
addq %rax, %r11
movq 32($ap), %rax
adcq \$0, %rdx
addq %r11, %r10
movq %rdx, %r11
adcq \$0, %r11
mulq %rbx
addq %rax, %r12
movq 40($ap), %rax
adcq \$0, %rdx
addq %r12, %r11
movq %rdx, %r12
adcq \$0, %r12
mulq %rbx
addq %rax, %r13
movq 48($ap), %rax
adcq \$0, %rdx
addq %r13, %r12
movq %rdx, %r13
adcq \$0, %r13
mulq %rbx
addq %rax, %r14
movq 56($ap), %rax
adcq \$0, %rdx
addq %r14, %r13
movq %rdx, %r14
leaq 8($bp), $bp
adcq \$0, %r14
mulq %rbx
addq %rax, %r15
movq ($ap), %rax
adcq \$0, %rdx
addq %r15, %r14
movq %rdx, %r15
adcq \$0, %r15
leaq 8(%rdi), %rdi
decl %ecx
jnz .Loop_mul
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, 32(%rdi)
movq %r13, 40(%rdi)
movq %r14, 48(%rdi)
movq %r15, 56(%rdi)
ret
.size __rsaz_512_mul,.-__rsaz_512_mul
___
}
if ($addx) {
# __rsaz_512_mulx
#
# input: %rsi - ap, %rbp - bp
# output:
  1657. # clobbers: everything
my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
$code.=<<___;
.type __rsaz_512_mulx,\@abi-omnipotent
.align 32
__rsaz_512_mulx:
	mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
	mov \$-6, %rcx
	mulx 8($ap), %rax, %r9
	movq %rbx, 8(%rsp)
	mulx 16($ap), %rbx, %r10
	adc %rax, %r8
	mulx 24($ap), %rax, %r11
	adc %rbx, %r9
	mulx 32($ap), %rbx, %r12
	adc %rax, %r10
	mulx 40($ap), %rax, %r13
	adc %rbx, %r11
	mulx 48($ap), %rbx, %r14
	adc %rax, %r12
	mulx 56($ap), %rax, %r15
	mov 8($bp), %rdx
	adc %rbx, %r13
	adc %rax, %r14
	adc \$0, %r15
	xor $zero, $zero # cf=0,of=0
	jmp .Loop_mulx
.align 32
.Loop_mulx:
	movq %r8, %rbx
	mulx ($ap), %rax, %r8
	adcx %rax, %rbx
	adox %r9, %r8
	mulx 8($ap), %rax, %r9
	adcx %rax, %r8
	adox %r10, %r9
	mulx 16($ap), %rax, %r10
	adcx %rax, %r9
	adox %r11, %r10
	mulx 24($ap), %rax, %r11
	adcx %rax, %r10
	adox %r12, %r11
	.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
	adcx %rax, %r11
	adox %r13, %r12
	mulx 40($ap), %rax, %r13
	adcx %rax, %r12
	adox %r14, %r13
	mulx 48($ap), %rax, %r14
	adcx %rax, %r13
	adox %r15, %r14
	mulx 56($ap), %rax, %r15
	movq 64($bp,%rcx,8), %rdx
	movq %rbx, 8+64-8(%rsp,%rcx,8)
	adcx %rax, %r14
	adox $zero, %r15
	adcx $zero, %r15 # cf=0
	inc %rcx # of=0
	jnz .Loop_mulx
	movq %r8, %rbx
	mulx ($ap), %rax, %r8
	adcx %rax, %rbx
	adox %r9, %r8
	.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
	adcx %rax, %r8
	adox %r10, %r9
	.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
	adcx %rax, %r9
	adox %r11, %r10
	mulx 24($ap), %rax, %r11
	adcx %rax, %r10
	adox %r12, %r11
	mulx 32($ap), %rax, %r12
	adcx %rax, %r11
	adox %r13, %r12
	mulx 40($ap), %rax, %r13
	adcx %rax, %r12
	adox %r14, %r13
	.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
	adcx %rax, %r13
	adox %r15, %r14
	.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
	adcx %rax, %r14
	adox $zero, %r15
	adcx $zero, %r15
	mov %rbx, 8+64-8(%rsp)
	mov %r8, 8+64(%rsp)
	mov %r9, 8+64+8(%rsp)
	mov %r10, 8+64+16(%rsp)
	mov %r11, 8+64+24(%rsp)
	mov %r12, 8+64+32(%rsp)
	mov %r13, 8+64+40(%rsp)
	mov %r14, 8+64+48(%rsp)
	mov %r15, 8+64+56(%rsp)
	ret
.size __rsaz_512_mulx,.-__rsaz_512_mulx
___
}
{
my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
$code.=<<___;
.globl rsaz_512_scatter4
.type rsaz_512_scatter4,\@abi-omnipotent
.align 16
rsaz_512_scatter4:
	leaq ($out,$power,8), $out
	movl \$8, %r9d
	jmp .Loop_scatter
.align 16
.Loop_scatter:
	movq ($inp), %rax
	leaq 8($inp), $inp
	movq %rax, ($out)
	leaq 128($out), $out
	decl %r9d
	jnz .Loop_scatter
	ret
.size rsaz_512_scatter4,.-rsaz_512_scatter4
.globl rsaz_512_gather4
.type rsaz_512_gather4,\@abi-omnipotent
.align 16
rsaz_512_gather4:
___
$code.=<<___ if ($win64);
.LSEH_begin_rsaz_512_gather4:
	.byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp
	.byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp)
	.byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp)
	.byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp)
	.byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp)
	.byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp)
	.byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp)
	.byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp)
	.byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp)
	.byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp)
	.byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp)
___
$code.=<<___;
	movd $power,%xmm8
	movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
	movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
	pshufd \$0,%xmm8,%xmm8 # broadcast $power
	movdqa %xmm1,%xmm7
	movdqa %xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
#
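# Layout note: rsaz_512_scatter4 above stores limb i of table entry p at
# byte offset p*8 + i*128, so the 16 candidate entries are interleaved.
# A rough C model of the constant-time gather (illustrative only; the
# function name is hypothetical): every one of the 16 entries is read on
# every iteration and the pcmpeqd-generated masks select the requested
# one, so the memory access pattern does not depend on $power.
#
#	#include <stdint.h>
#	static void gather4_sketch(uint64_t out[8], const uint64_t *tbl,
#	                           uint32_t power)	/* power in 0..15 */
#	{
#		for (int i = 0; i < 8; i++) {		/* 8 output limbs */
#			uint64_t acc = 0;
#			for (uint32_t p = 0; p < 16; p++) {	/* 16 entries */
#				uint64_t mask = 0 - (uint64_t)(p == power);
#				acc |= tbl[i*16 + p] & mask;
#			}
#			out[i] = acc;
#		}
#	}
#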
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd %xmm`$i`,%xmm`$i+1`
	pcmpeqd %xmm8,%xmm`$i`
	movdqa %xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd %xmm`$i`,%xmm`$i+1`
	pcmpeqd %xmm8,%xmm`$i`
___
}
$code.=<<___;
	pcmpeqd %xmm8,%xmm7
	movl \$8, %r9d
	jmp .Loop_gather
.align 16
.Loop_gather:
	movdqa 16*0($inp),%xmm8
	movdqa 16*1($inp),%xmm9
	movdqa 16*2($inp),%xmm10
	movdqa 16*3($inp),%xmm11
	pand %xmm0,%xmm8
	movdqa 16*4($inp),%xmm12
	pand %xmm1,%xmm9
	movdqa 16*5($inp),%xmm13
	pand %xmm2,%xmm10
	movdqa 16*6($inp),%xmm14
	pand %xmm3,%xmm11
	movdqa 16*7($inp),%xmm15
	leaq 128($inp), $inp
	pand %xmm4,%xmm12
	pand %xmm5,%xmm13
	pand %xmm6,%xmm14
	pand %xmm7,%xmm15
	por %xmm10,%xmm8
	por %xmm11,%xmm9
	por %xmm12,%xmm8
	por %xmm13,%xmm9
	por %xmm14,%xmm8
	por %xmm15,%xmm9
	por %xmm9,%xmm8
	pshufd \$0x4e,%xmm8,%xmm9
	por %xmm9,%xmm8
	movq %xmm8,($out)
	leaq 8($out), $out
	decl %r9d
	jnz .Loop_gather
___
$code.=<<___ if ($win64);
	movaps 0x00(%rsp),%xmm6
	movaps 0x10(%rsp),%xmm7
	movaps 0x20(%rsp),%xmm8
	movaps 0x30(%rsp),%xmm9
	movaps 0x40(%rsp),%xmm10
	movaps 0x50(%rsp),%xmm11
	movaps 0x60(%rsp),%xmm12
	movaps 0x70(%rsp),%xmm13
	movaps 0x80(%rsp),%xmm14
	movaps 0x90(%rsp),%xmm15
	add \$0xa8,%rsp
___
$code.=<<___;
	ret
.LSEH_end_rsaz_512_gather4:
.size rsaz_512_gather4,.-rsaz_512_gather4
.align 64
.Linc:
	.long 0,0, 1,1
	.long 2,2, 2,2
___
}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
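#
# Roughly, in C terms (an illustrative outline only, using the names from
# the comment above and the HandlerData[] .rva pairs emitted in .xdata
# below), the handler does the following:
#
#	if (context->Rip <  ImageBase + HandlerData[0] ||	/* still in prologue */
#	    context->Rip >= ImageBase + HandlerData[1])		/* already in epilogue */
#		/* registers not yet saved or already restored: skip fix-up */;
#	else
#		/* recover %rbx,%rbp,%r12-%r15 (and, for rsaz_512_mul_gather4,
#		   the saved xmm registers) from the frame at context->Rsp */;
#	RtlVirtualUnwind(UNW_FLAG_NHANDLER, disp->ImageBase, disp->ControlPc,
#	                 disp->FunctionEntry, disp->ContextRecord,
#	                 &disp->HandlerData, &disp->EstablisherFrame, NULL);
#	return ExceptionContinueSearch;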
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
	push %rsi
	push %rdi
	push %rbx
	push %rbp
	push %r12
	push %r13
	push %r14
	push %r15
	pushfq
	sub \$64,%rsp
	mov 120($context),%rax # pull context->Rax
	mov 248($context),%rbx # pull context->Rip
	mov 8($disp),%rsi # disp->ImageBase
	mov 56($disp),%r11 # disp->HandlerData
	mov 0(%r11),%r10d # HandlerData[0]
	lea (%rsi,%r10),%r10 # end of prologue label
	cmp %r10,%rbx # context->Rip<end of prologue label
	jb .Lcommon_seh_tail
	mov 152($context),%rax # pull context->Rsp
	mov 4(%r11),%r10d # HandlerData[1]
	lea (%rsi,%r10),%r10 # epilogue label
	cmp %r10,%rbx # context->Rip>=epilogue label
	jae .Lcommon_seh_tail
	lea 128+24+48(%rax),%rax
	lea .Lmul_gather4_epilogue(%rip),%rbx
	cmp %r10,%rbx
	jne .Lse_not_in_mul_gather4
	lea 0xb0(%rax),%rax
	lea -48-0xa8(%rax),%rsi
	lea 512($context),%rdi
	mov \$20,%ecx
	.long 0xa548f3fc # cld; rep movsq
.Lse_not_in_mul_gather4:
	mov -8(%rax),%rbx
	mov -16(%rax),%rbp
	mov -24(%rax),%r12
	mov -32(%rax),%r13
	mov -40(%rax),%r14
	mov -48(%rax),%r15
	mov %rbx,144($context) # restore context->Rbx
	mov %rbp,160($context) # restore context->Rbp
	mov %r12,216($context) # restore context->R12
	mov %r13,224($context) # restore context->R13
	mov %r14,232($context) # restore context->R14
	mov %r15,240($context) # restore context->R15
.Lcommon_seh_tail:
	mov 8(%rax),%rdi
	mov 16(%rax),%rsi
	mov %rax,152($context) # restore context->Rsp
	mov %rsi,168($context) # restore context->Rsi
	mov %rdi,176($context) # restore context->Rdi
	mov 40($disp),%rdi # disp->ContextRecord
	mov $context,%rsi # context
	mov \$154,%ecx # sizeof(CONTEXT)
	.long 0xa548f3fc # cld; rep movsq
	mov $disp,%rsi
	xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
	mov 8(%rsi),%rdx # arg2, disp->ImageBase
	mov 0(%rsi),%r8 # arg3, disp->ControlPc
	mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
	mov 40(%rsi),%r10 # disp->ContextRecord
	lea 56(%rsi),%r11 # &disp->HandlerData
	lea 24(%rsi),%r12 # &disp->EstablisherFrame
	mov %r10,32(%rsp) # arg5
	mov %r11,40(%rsp) # arg6
	mov %r12,48(%rsp) # arg7
	mov %rcx,56(%rsp) # arg8, (NULL)
	call *__imp_RtlVirtualUnwind(%rip)
	mov \$1,%eax # ExceptionContinueSearch
	add \$64,%rsp
	popfq
	pop %r15
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	pop %rdi
	pop %rsi
	ret
.size se_handler,.-se_handler
.section .pdata
.align 4
	.rva .LSEH_begin_rsaz_512_sqr
	.rva .LSEH_end_rsaz_512_sqr
	.rva .LSEH_info_rsaz_512_sqr
	.rva .LSEH_begin_rsaz_512_mul
	.rva .LSEH_end_rsaz_512_mul
	.rva .LSEH_info_rsaz_512_mul
	.rva .LSEH_begin_rsaz_512_mul_gather4
	.rva .LSEH_end_rsaz_512_mul_gather4
	.rva .LSEH_info_rsaz_512_mul_gather4
	.rva .LSEH_begin_rsaz_512_mul_scatter4
	.rva .LSEH_end_rsaz_512_mul_scatter4
	.rva .LSEH_info_rsaz_512_mul_scatter4
	.rva .LSEH_begin_rsaz_512_mul_by_one
	.rva .LSEH_end_rsaz_512_mul_by_one
	.rva .LSEH_info_rsaz_512_mul_by_one
	.rva .LSEH_begin_rsaz_512_gather4
	.rva .LSEH_end_rsaz_512_gather4
	.rva .LSEH_info_rsaz_512_gather4
.section .xdata
.align 8
.LSEH_info_rsaz_512_sqr:
	.byte 9,0,0,0
	.rva se_handler
	.rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
.LSEH_info_rsaz_512_mul:
	.byte 9,0,0,0
	.rva se_handler
	.rva .Lmul_body,.Lmul_epilogue # HandlerData[]
.LSEH_info_rsaz_512_mul_gather4:
	.byte 9,0,0,0
	.rva se_handler
	.rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
.LSEH_info_rsaz_512_mul_scatter4:
	.byte 9,0,0,0
	.rva se_handler
	.rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
.LSEH_info_rsaz_512_mul_by_one:
	.byte 9,0,0,0
	.rva se_handler
	.rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
.LSEH_info_rsaz_512_gather4:
	.byte 0x01,0x46,0x16,0x00
	.byte 0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
	.byte 0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
	.byte 0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
	.byte 0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
	.byte 0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
	.byte 0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
	.byte 0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
	.byte 0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
	.byte 0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
	.byte 0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
	.byte 0x07,0x01,0x15,0x00 # sub rsp,0xa8
___
}
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;