#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2012, Intel Corporation. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
#
# References:
# [1] S. Gueron, "Efficient Software Implementations of Modular
#     Exponentiation", http://eprint.iacr.org/2011/239
# [2] S. Gueron, V. Krasnov, "Speeding up Big-Numbers Squaring",
#     IEEE Proceedings of 9th International Conference on Information
#     Technology: New Generations (ITNG 2012), 821-823 (2012).
# [3] S. Gueron, "Efficient Software Implementations of Modular Exponentiation",
#     Journal of Cryptographic Engineering 2:31-43 (2012).
# [4] S. Gueron, V. Krasnov, "[PATCH] Efficient and side channel analysis
#     resistant 512-bit and 1024-bit modular exponentiation for optimizing
#     RSA1024 and RSA2048 on x86_64 platforms",
#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest
#
# While the original submission covers 512- and 1024-bit exponentiation,
# this module is limited to the 512-bit version only (and as such
# accelerates RSA1024 signing). This is because the improvement for
# longer keys is not high enough to justify the effort; the highest
# measured was ~5% on Westmere. [This is relative to OpenSSL 1.0.2,
# upcoming at the time of this writing!] Nor does this module implement
# a "monolithic" all-in-one exponentiation subroutine; it adheres to a
# more modular mixture of C and assembly. It is also optimized for
# processors other than the Intel Core family (see the table below for
# improvement coefficients).
# <appro@openssl.org>
#
# RSA1024 sign/sec	this/original	|this/rsax(*)	this/fips(*)
#			----------------+---------------------------
# Opteron		+13%		|+5%		+20%
# Bulldozer		-0%		|-1%		+10%
# P4			+11%		|+7%		+8%
# Westmere		+5%		|+14%		+17%
# Sandy Bridge		+2%		|+12%		+29%
# Ivy Bridge		+1%		|+11%		+35%
# Haswell(**)		-0%		|+12%		+39%
# Atom			+13%		|+11%		+4%
# VIA Nano		+70%		|+9%		+25%
#
# (*)	rsax engine and fips numbers are presented for reference
#	purposes;
# (**)	MULX was attempted, but found to give only marginal improvement;
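#
# For context: these routines are driven from C (in OpenSSL, by
# crypto/rsa/rsaz_exp.c). As a minimal sketch -- the exact signatures
# below are an assumption for illustration, the authoritative
# prototypes live in rsaz_exp.h -- the two workhorses look roughly
# like:
#
#   void rsaz_512_mul(uint64_t ret[8], const uint64_t a[8],
#                     const uint64_t b[8], const uint64_t m[8],
#                     uint64_t n0);
#   void rsaz_512_sqr(uint64_t ret[8], const uint64_t a[8],
#                     const uint64_t m[8], uint64_t n0, int count);
#
# All routines operate on 512-bit (8x64-bit limb) operands kept in
# Montgomery form, with n0 = -m^{-1} mod 2^64 precomputed by the
# caller.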
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}
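# $addx records whether the assembler understands the ADX/BMI2
# extensions (MULX/ADCX/ADOX): GNU as 2.23+, nasm 2.10+, ml64 12+, or
# a clang/LLVM-based assembler from version 3.3 on.  When it is set,
# alternative code paths using those instructions are emitted next to
# the generic ones and selected at run time via OPENSSL_ia32cap_P.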
($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp");	# common internal API
{
my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	rsaz_512_sqr
.type	rsaz_512_sqr,\@function,5
.align	32
rsaz_512_sqr:				# 25-29% faster than rsaz_512_mul
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lsqr_body:
	movq	$mod, %rbp		# common argument
	movq	($inp), %rdx
	movq	8($inp), %rax
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Loop_sqrx
___
$code.=<<___;
	jmp	.Loop_sqr

.align	32
.Loop_sqr:
	movl	$times,128+8(%rsp)
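# What follows is schoolbook squaring of the 8-limb input: for each
# limb a[i] the cross products a[i]*a[j], j>i, are accumulated, the
# running sum is doubled (the add/adc/lea sequences annotated "shld"
# emulate a 1-bit left shift across the carry chain), and the square
# of the diagonal limb a[i]^2 is folded in before each pair of output
# words is stored.  The full 16-word square is assembled at (%rsp) and
# Montgomery-reduced further down.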
#first iteration
	movq %rdx, %rbx
	mulq %rdx
	movq %rax, %r8
	movq 16($inp), %rax
	movq %rdx, %r9
	mulq %rbx
	addq %rax, %r9
	movq 24($inp), %rax
	movq %rdx, %r10
	adcq \$0, %r10
	mulq %rbx
	addq %rax, %r10
	movq 32($inp), %rax
	movq %rdx, %r11
	adcq \$0, %r11
	mulq %rbx
	addq %rax, %r11
	movq 40($inp), %rax
	movq %rdx, %r12
	adcq \$0, %r12
	mulq %rbx
	addq %rax, %r12
	movq 48($inp), %rax
	movq %rdx, %r13
	adcq \$0, %r13
	mulq %rbx
	addq %rax, %r13
	movq 56($inp), %rax
	movq %rdx, %r14
	adcq \$0, %r14
	mulq %rbx
	addq %rax, %r14
	movq %rbx, %rax
	movq %rdx, %r15
	adcq \$0, %r15
	addq %r8, %r8 #shlq \$1, %r8
	movq %r9, %rcx
	adcq %r9, %r9 #shld \$1, %r8, %r9
	mulq %rax
	movq %rax, (%rsp)
	addq %rdx, %r8
	adcq \$0, %r9
	movq %r8, 8(%rsp)
	shrq \$63, %rcx

#second iteration
	movq 8($inp), %r8
	movq 16($inp), %rax
	mulq %r8
	addq %rax, %r10
	movq 24($inp), %rax
	movq %rdx, %rbx
	adcq \$0, %rbx
	mulq %r8
	addq %rax, %r11
	movq 32($inp), %rax
	adcq \$0, %rdx
	addq %rbx, %r11
	movq %rdx, %rbx
	adcq \$0, %rbx
	mulq %r8
	addq %rax, %r12
	movq 40($inp), %rax
	adcq \$0, %rdx
	addq %rbx, %r12
	movq %rdx, %rbx
	adcq \$0, %rbx
	mulq %r8
	addq %rax, %r13
	movq 48($inp), %rax
	adcq \$0, %rdx
	addq %rbx, %r13
	movq %rdx, %rbx
	adcq \$0, %rbx
	mulq %r8
	addq %rax, %r14
	movq 56($inp), %rax
	adcq \$0, %rdx
	addq %rbx, %r14
	movq %rdx, %rbx
	adcq \$0, %rbx
	mulq %r8
	addq %rax, %r15
	movq %r8, %rax
	adcq \$0, %rdx
	addq %rbx, %r15
	movq %rdx, %r8
	movq %r10, %rdx
	adcq \$0, %r8
	add %rdx, %rdx
	lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
	movq %r11, %rbx
	adcq %r11, %r11 #shld \$1, %r10, %r11
	mulq %rax
	addq %rax, %r9
	adcq %rdx, %r10
	adcq \$0, %r11
	movq %r9, 16(%rsp)
	movq %r10, 24(%rsp)
	shrq \$63, %rbx

#third iteration
	movq 16($inp), %r9
	movq 24($inp), %rax
	mulq %r9
	addq %rax, %r12
	movq 32($inp), %rax
	movq %rdx, %rcx
	adcq \$0, %rcx
	mulq %r9
	addq %rax, %r13
	movq 40($inp), %rax
	adcq \$0, %rdx
	addq %rcx, %r13
	movq %rdx, %rcx
	adcq \$0, %rcx
	mulq %r9
	addq %rax, %r14
	movq 48($inp), %rax
	adcq \$0, %rdx
	addq %rcx, %r14
	movq %rdx, %rcx
	adcq \$0, %rcx
	mulq %r9
	movq %r12, %r10
	lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
	addq %rax, %r15
	movq 56($inp), %rax
	adcq \$0, %rdx
	addq %rcx, %r15
	movq %rdx, %rcx
	adcq \$0, %rcx
	mulq %r9
	shrq \$63, %r10
	addq %rax, %r8
	movq %r9, %rax
	adcq \$0, %rdx
	addq %rcx, %r8
	movq %rdx, %r9
	adcq \$0, %r9
	movq %r13, %rcx
	leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
	mulq %rax
	addq %rax, %r11
	adcq %rdx, %r12
	adcq \$0, %r13
	movq %r11, 32(%rsp)
	movq %r12, 40(%rsp)
	shrq \$63, %rcx

#fourth iteration
	movq 24($inp), %r10
	movq 32($inp), %rax
	mulq %r10
	addq %rax, %r14
	movq 40($inp), %rax
	movq %rdx, %rbx
	adcq \$0, %rbx
	mulq %r10
	addq %rax, %r15
	movq 48($inp), %rax
	adcq \$0, %rdx
	addq %rbx, %r15
	movq %rdx, %rbx
	adcq \$0, %rbx
	mulq %r10
	movq %r14, %r12
	leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
	addq %rax, %r8
	movq 56($inp), %rax
	adcq \$0, %rdx
	addq %rbx, %r8
	movq %rdx, %rbx
	adcq \$0, %rbx
	mulq %r10
	shrq \$63, %r12
	addq %rax, %r9
	movq %r10, %rax
	adcq \$0, %rdx
	addq %rbx, %r9
	movq %rdx, %r10
	adcq \$0, %r10
	movq %r15, %rbx
	leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
	mulq %rax
	addq %rax, %r13
	adcq %rdx, %r14
	adcq \$0, %r15
	movq %r13, 48(%rsp)
	movq %r14, 56(%rsp)
	shrq \$63, %rbx

#fifth iteration
	movq 32($inp), %r11
	movq 40($inp), %rax
	mulq %r11
	addq %rax, %r8
	movq 48($inp), %rax
	movq %rdx, %rcx
	adcq \$0, %rcx
	mulq %r11
	addq %rax, %r9
	movq 56($inp), %rax
	adcq \$0, %rdx
	movq %r8, %r12
	leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
	addq %rcx, %r9
	movq %rdx, %rcx
	adcq \$0, %rcx
	mulq %r11
	shrq \$63, %r12
	addq %rax, %r10
	movq %r11, %rax
	adcq \$0, %rdx
	addq %rcx, %r10
	movq %rdx, %r11
	adcq \$0, %r11
	movq %r9, %rcx
	leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
	mulq %rax
	addq %rax, %r15
	adcq %rdx, %r8
	adcq \$0, %r9
	movq %r15, 64(%rsp)
	movq %r8, 72(%rsp)
	shrq \$63, %rcx

#sixth iteration
	movq 40($inp), %r12
	movq 48($inp), %rax
	mulq %r12
	addq %rax, %r10
	movq 56($inp), %rax
	movq %rdx, %rbx
	adcq \$0, %rbx
	mulq %r12
	addq %rax, %r11
	movq %r12, %rax
	movq %r10, %r15
	leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
	adcq \$0, %rdx
	shrq \$63, %r15
	addq %rbx, %r11
	movq %rdx, %r12
	adcq \$0, %r12
	movq %r11, %rbx
	leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
	mulq %rax
	addq %rax, %r9
	adcq %rdx, %r10
	adcq \$0, %r11
	movq %r9, 80(%rsp)
	movq %r10, 88(%rsp)

#seventh iteration
	movq 48($inp), %r13
	movq 56($inp), %rax
	mulq %r13
	addq %rax, %r12
	movq %r13, %rax
	movq %rdx, %r13
	adcq \$0, %r13
	xorq %r14, %r14
	shlq \$1, %rbx
	adcq %r12, %r12 #shld \$1, %rbx, %r12
	adcq %r13, %r13 #shld \$1, %r12, %r13
	adcq %r14, %r14 #shld \$1, %r13, %r14
	mulq %rax
	addq %rax, %r11
	adcq %rdx, %r12
	adcq \$0, %r13
	movq %r11, 96(%rsp)
	movq %r12, 104(%rsp)

#eighth iteration
	movq 56($inp), %rax
	mulq %rax
	addq %rax, %r13
	adcq \$0, %rdx
	addq %rdx, %r14
	movq %r13, 112(%rsp)
	movq %r14, 120(%rsp)
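# The 1024-bit square is now complete at (%rsp).  Load its lower half,
# Montgomery-reduce it modulo $mod, add the upper half back in, and
# let __rsaz_512_subtract perform the final conditional subtraction of
# the modulus (the sbbq below turns the final carry into an
# all-ones/all-zeros mask).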
	movq (%rsp), %r8
	movq 8(%rsp), %r9
	movq 16(%rsp), %r10
	movq 24(%rsp), %r11
	movq 32(%rsp), %r12
	movq 40(%rsp), %r13
	movq 48(%rsp), %r14
	movq 56(%rsp), %r15

	call __rsaz_512_reduce

	addq 64(%rsp), %r8
	adcq 72(%rsp), %r9
	adcq 80(%rsp), %r10
	adcq 88(%rsp), %r11
	adcq 96(%rsp), %r12
	adcq 104(%rsp), %r13
	adcq 112(%rsp), %r14
	adcq 120(%rsp), %r15
	sbbq %rcx, %rcx

	call __rsaz_512_subtract

	movq %r8, %rdx
	movq %r9, %rax
	movl 128+8(%rsp), $times
	movq $out, $inp

	decl $times
	jnz .Loop_sqr
___
if ($addx) {
$code.=<<___;
	jmp .Lsqr_tail

.align	32
.Loop_sqrx:
	movl $times,128+8(%rsp)
	movq $out, %xmm0 # off-load
	movq %rbp, %xmm1 # off-load
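# The MULX path replays the same schoolbook squaring with two
# independent carry chains: adcx propagates through CF and adox
# through OF, so the two accumulation streams interleave without
# contending for a single flags register.  The .byte sequences below
# are hand-encoded forms of the mulx instructions shown in their
# comments, for the benefit of assemblers that cannot encode them
# natively.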
#first iteration
	mulx %rax, %r8, %r9
	mulx 16($inp), %rcx, %r10
	xor %rbp, %rbp # cf=0, of=0
	mulx 24($inp), %rax, %r11
	adcx %rcx, %r9
	mulx 32($inp), %rcx, %r12
	adcx %rax, %r10
	mulx 40($inp), %rax, %r13
	adcx %rcx, %r11
.byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
	adcx %rax, %r12
	adcx %rcx, %r13
.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
	adcx %rax, %r14
	adcx %rbp, %r15 # %rbp is 0
	mov %r9, %rcx
	shld \$1, %r8, %r9
	shl \$1, %r8
	xor %ebp, %ebp
	mulx %rdx, %rax, %rdx
	adcx %rdx, %r8
	mov 8($inp), %rdx
	adcx %rbp, %r9
	mov %rax, (%rsp)
	mov %r8, 8(%rsp)

#second iteration
	mulx 16($inp), %rax, %rbx
	adox %rax, %r10
	adcx %rbx, %r11
.byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
	adox $out, %r11
	adcx %r8, %r12
	mulx 32($inp), %rax, %rbx
	adox %rax, %r12
	adcx %rbx, %r13
	mulx 40($inp), $out, %r8
	adox $out, %r13
	adcx %r8, %r14
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
	adox %rax, %r14
	adcx %rbx, %r15
.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
	adox $out, %r15
	adcx %rbp, %r8
	adox %rbp, %r8
	mov %r11, %rbx
	shld \$1, %r10, %r11
	shld \$1, %rcx, %r10
	xor %ebp,%ebp
	mulx %rdx, %rax, %rcx
	mov 16($inp), %rdx
	adcx %rax, %r9
	adcx %rcx, %r10
	adcx %rbp, %r11
	mov %r9, 16(%rsp)
.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)

#third iteration
.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
	adox $out, %r12
	adcx %r9, %r13
	mulx 32($inp), %rax, %rcx
	adox %rax, %r13
	adcx %rcx, %r14
	mulx 40($inp), $out, %r9
	adox $out, %r14
	adcx %r9, %r15
.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
	adox %rax, %r15
	adcx %rcx, %r8
.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
	adox $out, %r8
	adcx %rbp, %r9
	adox %rbp, %r9
	mov %r13, %rcx
	shld \$1, %r12, %r13
	shld \$1, %rbx, %r12
	xor %ebp, %ebp
	mulx %rdx, %rax, %rdx
	adcx %rax, %r11
	adcx %rdx, %r12
	mov 24($inp), %rdx
	adcx %rbp, %r13
	mov %r11, 32(%rsp)
.byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)

#fourth iteration
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
	adox %rax, %r14
	adcx %rbx, %r15
	mulx 40($inp), $out, %r10
	adox $out, %r15
	adcx %r10, %r8
	mulx 48($inp), %rax, %rbx
	adox %rax, %r8
	adcx %rbx, %r9
	mulx 56($inp), $out, %r10
	adox $out, %r9
	adcx %rbp, %r10
	adox %rbp, %r10
.byte 0x66
	mov %r15, %rbx
	shld \$1, %r14, %r15
	shld \$1, %rcx, %r14
	xor %ebp, %ebp
	mulx %rdx, %rax, %rdx
	adcx %rax, %r13
	adcx %rdx, %r14
	mov 32($inp), %rdx
	adcx %rbp, %r15
	mov %r13, 48(%rsp)
	mov %r14, 56(%rsp)

#fifth iteration
.byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
	adox $out, %r8
	adcx %r11, %r9
	mulx 48($inp), %rax, %rcx
	adox %rax, %r9
	adcx %rcx, %r10
	mulx 56($inp), $out, %r11
	adox $out, %r10
	adcx %rbp, %r11
	adox %rbp, %r11
	mov %r9, %rcx
	shld \$1, %r8, %r9
	shld \$1, %rbx, %r8
	xor %ebp, %ebp
	mulx %rdx, %rax, %rdx
	adcx %rax, %r15
	adcx %rdx, %r8
	mov 40($inp), %rdx
	adcx %rbp, %r9
	mov %r15, 64(%rsp)
	mov %r8, 72(%rsp)

#sixth iteration
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
	adox %rax, %r10
	adcx %rbx, %r11
.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
	adox $out, %r11
	adcx %rbp, %r12
	adox %rbp, %r12
	mov %r11, %rbx
	shld \$1, %r10, %r11
	shld \$1, %rcx, %r10
	xor %ebp, %ebp
	mulx %rdx, %rax, %rdx
	adcx %rax, %r9
	adcx %rdx, %r10
	mov 48($inp), %rdx
	adcx %rbp, %r11
	mov %r9, 80(%rsp)
	mov %r10, 88(%rsp)

#seventh iteration
.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
	adox %rax, %r12
	adox %rbp, %r13
	xor %r14, %r14
	shld \$1, %r13, %r14
	shld \$1, %r12, %r13
	shld \$1, %rbx, %r12
	xor %ebp, %ebp
	mulx %rdx, %rax, %rdx
	adcx %rax, %r11
	adcx %rdx, %r12
	mov 56($inp), %rdx
	adcx %rbp, %r13
.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)

#eighth iteration
	mulx %rdx, %rax, %rdx
	adox %rax, %r13
	adox %rbp, %rdx
.byte 0x66
	add %rdx, %r14

	movq %r13, 112(%rsp)
	movq %r14, 120(%rsp)

	movq %xmm0, $out
	movq %xmm1, %rbp

	movq 128(%rsp), %rdx # pull $n0
	movq (%rsp), %r8
	movq 8(%rsp), %r9
	movq 16(%rsp), %r10
	movq 24(%rsp), %r11
	movq 32(%rsp), %r12
	movq 40(%rsp), %r13
	movq 48(%rsp), %r14
	movq 56(%rsp), %r15

	call __rsaz_512_reducex

	addq 64(%rsp), %r8
	adcq 72(%rsp), %r9
	adcq 80(%rsp), %r10
	adcq 88(%rsp), %r11
	adcq 96(%rsp), %r12
	adcq 104(%rsp), %r13
	adcq 112(%rsp), %r14
	adcq 120(%rsp), %r15
	sbbq %rcx, %rcx

	call __rsaz_512_subtract

	movq %r8, %rdx
	movq %r9, %rax
	movl 128+8(%rsp), $times
	movq $out, $inp

	decl $times
	jnz .Loop_sqrx

.Lsqr_tail:
___
}
$code.=<<___;
	leaq 128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq -48(%rax), %r15
.cfi_restore	%r15
	movq -40(%rax), %r14
.cfi_restore	%r14
	movq -32(%rax), %r13
.cfi_restore	%r13
	movq -24(%rax), %r12
.cfi_restore	%r12
	movq -16(%rax), %rbp
.cfi_restore	%rbp
	movq -8(%rax), %rbx
.cfi_restore	%rbx
	leaq (%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lsqr_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_sqr,.-rsaz_512_sqr
___
}

{
my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$code.=<<___;
.globl	rsaz_512_mul
.type	rsaz_512_mul,\@function,5
.align	32
rsaz_512_mul:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_body:
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx
___
$code.=<<___;
	movq	($bp), %rbx		# pass b[0]
	movq	$bp, %rbp		# pass argument
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_tail

.align	32
.Lmulx:
	movq	$bp, %rbp		# pass argument
	movq	($bp), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex
.Lmul_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul,.-rsaz_512_mul
___
}

{
my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl	rsaz_512_mul_gather4
.type	rsaz_512_mul_gather4,\@function,6
.align	32
rsaz_512_mul_gather4:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	subq	\$`128+24+($win64?0xb0:0)`, %rsp
.cfi_adjust_cfa_offset	`128+24+($win64?0xb0:0)`
___
$code.=<<___ if ($win64);
	movaps	%xmm6,0xa0(%rsp)
	movaps	%xmm7,0xb0(%rsp)
	movaps	%xmm8,0xc0(%rsp)
	movaps	%xmm9,0xd0(%rsp)
	movaps	%xmm10,0xe0(%rsp)
	movaps	%xmm11,0xf0(%rsp)
	movaps	%xmm12,0x100(%rsp)
	movaps	%xmm13,0x110(%rsp)
	movaps	%xmm14,0x120(%rsp)
	movaps	%xmm15,0x130(%rsp)
___
$code.=<<___;
.Lmul_gather4_body:
	movd	$pwr,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000
	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
#
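# The table was laid out by rsaz_512_scatter4 so that the limbs of the
# 16 candidate powers interleave at a 128-byte stride.  Every
# iteration below therefore touches all 16 entries and selects the
# requested one with pand/por under the pcmpeqd-generated masks, i.e.
# no memory address depends on the secret $power; this keeps the table
# lookup constant-time and defends it against cache-timing attacks.
#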
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
	movdqa	%xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
___
}
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7

	movdqa	16*0($bp),%xmm8
	movdqa	16*1($bp),%xmm9
	movdqa	16*2($bp),%xmm10
	movdqa	16*3($bp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($bp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($bp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($bp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($bp),%xmm15
	leaq	128($bp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx_gather
___
$code.=<<___;
	movq	%xmm8,%rbx

	movq	$n0, 128(%rsp)		# off-load arguments
	movq	$out, 128+8(%rsp)
	movq	$mod, 128+16(%rsp)

	movq	($ap), %rax
	movq	8($ap), %rcx
	mulq	%rbx			# 0 iteration
	movq	%rax, (%rsp)
	movq	%rcx, %rax
	movq	%rdx, %r8

	mulq	%rbx
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rsp), %rdi
	movl	\$7, %ecx
	jmp	.Loop_mul_gather

.align	32
.Loop_mul_gather:
	movdqa	16*0(%rbp),%xmm8
	movdqa	16*1(%rbp),%xmm9
	movdqa	16*2(%rbp),%xmm10
	movdqa	16*3(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7(%rbp),%xmm15
	leaq	128(%rbp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,%rbx

	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul_gather

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	movq	128+8(%rsp), $out
	movq	128+16(%rsp), %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_gather_tail

.align	32
.Lmulx_gather:
	movq	%xmm8,%rdx

	mov	$n0, 128(%rsp)		# off-load arguments
	mov	$out, 128+8(%rsp)
	mov	$mod, 128+16(%rsp)

	mulx	($ap), %rbx, %r8	# 0 iteration
	mov	%rbx, (%rsp)
	xor	%edi, %edi		# cf=0, of=0

	mulx	8($ap), %rax, %r9

	mulx	16($ap), %rbx, %r10
	adcx	%rax, %r8

	mulx	24($ap), %rax, %r11
	adcx	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adcx	%rax, %r10

	mulx	40($ap), %rax, %r13
	adcx	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adcx	%rax, %r12

	mulx	56($ap), %rax, %r15
	adcx	%rbx, %r13
	adcx	%rax, %r14
.byte	0x67
	mov	%r8, %rbx
	adcx	%rdi, %r15		# %rdi is 0

	mov	\$-7, %rcx
	jmp	.Loop_mulx_gather

.align	32
.Loop_mulx_gather:
	movdqa	16*0(%rbp),%xmm8
	movdqa	16*1(%rbp),%xmm9
	movdqa	16*2(%rbp),%xmm10
	movdqa	16*3(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7(%rbp),%xmm15
	leaq	128(%rbp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,%rdx

.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00	# mulx ($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	# mulx 24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx 48($ap), %rax, %r14
	adcx	%rax, %r13
.byte	0x67
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	mov	%rbx, 64(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	%rdi, %r15
	mov	%r8, %rbx
	adcx	%rdi, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx_gather

	mov	%r8, 64(%rsp)
	mov	%r9, 64+8(%rsp)
	mov	%r10, 64+16(%rsp)
	mov	%r11, 64+24(%rsp)
	mov	%r12, 64+32(%rsp)
	mov	%r13, 64+40(%rsp)
	mov	%r14, 64+48(%rsp)
	mov	%r15, 64+56(%rsp)

	mov	128(%rsp), %rdx		# pull arguments
	mov	128+8(%rsp), $out
	mov	128+16(%rsp), %rbp

	mov	(%rsp), %r8
	mov	8(%rsp), %r9
	mov	16(%rsp), %r10
	mov	24(%rsp), %r11
	mov	32(%rsp), %r12
	mov	40(%rsp), %r13
	mov	48(%rsp), %r14
	mov	56(%rsp), %r15

	call	__rsaz_512_reducex
.Lmul_gather_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
___
$code.=<<___ if ($win64);
	movaps	0xa0-0xc8(%rax),%xmm6
	movaps	0xb0-0xc8(%rax),%xmm7
	movaps	0xc0-0xc8(%rax),%xmm8
	movaps	0xd0-0xc8(%rax),%xmm9
	movaps	0xe0-0xc8(%rax),%xmm10
	movaps	0xf0-0xc8(%rax),%xmm11
	movaps	0x100-0xc8(%rax),%xmm12
	movaps	0x110-0xc8(%rax),%xmm13
	movaps	0x120-0xc8(%rax),%xmm14
	movaps	0x130-0xc8(%rax),%xmm15
	lea	0xb0(%rax),%rax
___
$code.=<<___;
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_gather4_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
___
}

{
my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl	rsaz_512_mul_scatter4
.type	rsaz_512_mul_scatter4,\@function,6
.align	32
rsaz_512_mul_scatter4:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	mov	$pwr, $pwr		# zero-extend $pwr
	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_scatter4_body:
	leaq	($tbl,$pwr,8), $tbl
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$tbl, %xmm2
	movq	$n0, 128(%rsp)

	movq	$out, %rbp
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx_scatter
___
$code.=<<___;
	movq	($out),%rbx		# pass b[0]
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_scatter_tail

.align	32
.Lmulx_scatter:
	movq	($out), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex
.Lmul_scatter_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	movq	%xmm2, $inp
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, 128*0($inp)	# scatter
	movq	%r9, 128*1($inp)
	movq	%r10, 128*2($inp)
	movq	%r11, 128*3($inp)
	movq	%r12, 128*4($inp)
	movq	%r13, 128*5($inp)
	movq	%r14, 128*6($inp)
	movq	%r15, 128*7($inp)

	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_scatter4_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
___
}

{
my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
$code.=<<___;
.globl	rsaz_512_mul_by_one
.type	rsaz_512_mul_by_one,\@function,4
.align	32
rsaz_512_mul_by_one:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_by_one_body:
___
$code.=<<___ if ($addx);
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
___
$code.=<<___;
	movq	$mod, %rbp		# reassign argument
	movq	$n0, 128(%rsp)

	movq	($inp), %r8
	pxor	%xmm0, %xmm0
	movq	8($inp), %r9
	movq	16($inp), %r10
	movq	24($inp), %r11
	movq	32($inp), %r12
	movq	40($inp), %r13
	movq	48($inp), %r14
	movq	56($inp), %r15

	movdqa	%xmm0, (%rsp)
	movdqa	%xmm0, 16(%rsp)
	movdqa	%xmm0, 32(%rsp)
	movdqa	%xmm0, 48(%rsp)
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm0, 80(%rsp)
	movdqa	%xmm0, 96(%rsp)
___
$code.=<<___ if ($addx);
	andl	\$0x80100,%eax
	cmpl	\$0x80100,%eax		# check for MULX and ADO/CX
	je	.Lby_one_callx
___
$code.=<<___;
	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lby_one_tail

.align	32
.Lby_one_callx:
	movq	128(%rsp), %rdx		# pull $n0
	call	__rsaz_512_reducex
.Lby_one_tail:
___
$code.=<<___;
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_by_one_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
___
}

{	# __rsaz_512_reduce
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
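	#
	# The eight iterations implement word-by-word Montgomery
	# reduction: each round multiplies the modulus by
	# %rbx = r0*n0 mod 2^64 (which makes the bottom word of the
	# accumulator vanish), shifts the window down one word, and
	# derives the next multiplier from the new bottom word.  Note
	# that n0 is read from 128+8(%rsp): the caller stored it at
	# 128(%rsp) and the call pushed a return address on top.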
$code.=<<___;
.type	__rsaz_512_reduce,\@abi-omnipotent
.align	32
__rsaz_512_reduce:
	movq	%r8, %rbx
	imulq	128+8(%rsp), %rbx
	movq	0(%rbp), %rax
	movl	\$8, %ecx
	jmp	.Lreduction_loop

.align	32
.Lreduction_loop:
	mulq	%rbx
	movq	8(%rbp), %rax
	negq	%r8
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	128+8(%rsp), %rsi
	#movq	%rdx, %r11
	#adcq	\$0, %r11
	adcq	\$0, %rdx
	movq	%rdx, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40(%rbp), %rax
	adcq	\$0, %rdx
	imulq	%r8, %rsi
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	movq	%rsi, %rbx
	addq	%rax, %r15
	movq	0(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	decl	%ecx
	jne	.Lreduction_loop

	ret
.size	__rsaz_512_reduce,.-__rsaz_512_reduce
___
}

if ($addx) {
	# __rsaz_512_reducex
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
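	#
	# Same word-by-word Montgomery reduction as __rsaz_512_reduce,
	# but expressed with mulx and the adcx/adox dual carry chains.
	# The caller preloads %rdx with n0 and the imulq on entry turns
	# it into the first multiplier n0*r0; each round then derives
	# the next multiplier via mulx against 128+8(%rsp) (n0 again,
	# through the return-address-displaced slot).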
$code.=<<___;
.type	__rsaz_512_reducex,\@abi-omnipotent
.align	32
__rsaz_512_reducex:
	#movq	128+8(%rsp), %rdx	# pull $n0
	imulq	%r8, %rdx
	xorq	%rsi, %rsi		# cf=0,of=0
	movl	\$8, %ecx
	jmp	.Lreduction_loopx

.align	32
.Lreduction_loopx:
	mov	%r8, %rbx
	mulx	0(%rbp), %rax, %r8
	adcx	%rbx, %rax
	adox	%r9, %r8

	mulx	8(%rbp), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16(%rbp), %rbx, %r10
	adcx	%rbx, %r9
	adox	%r11, %r10

	mulx	24(%rbp), %rbx, %r11
	adcx	%rbx, %r10
	adox	%r12, %r11

.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx 32(%rbp), %rbx, %r12
	mov	%rdx, %rax
	mov	%r8, %rdx
	adcx	%rbx, %r11
	adox	%r13, %r12

	mulx	128+8(%rsp), %rbx, %rdx
	mov	%rax, %rdx

	mulx	40(%rbp), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	# mulx 48(%rbp), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56(%rbp), %rax, %r15
	mov	%rbx, %rdx
	adcx	%rax, %r14
	adox	%rsi, %r15		# %rsi is 0
	adcx	%rsi, %r15		# cf=0

	decl	%ecx			# of=0
	jne	.Lreduction_loopx

	ret
.size	__rsaz_512_reducex,.-__rsaz_512_reducex
___
}

{	# __rsaz_512_subtract
	# input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
	# output:
	# clobbers: everything but %rdi, %rsi and %rbp
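	#
	# Constant-time conditional subtraction: %rcx arrives as an
	# all-ones mask if the preceding addition carried out of 512
	# bits (sbbq of the carry into itself) or all-zeros otherwise.
	# neg/not build the two's complement of the modulus, the mask
	# ANDs it down to either -mod or 0, and the masked value is
	# added back, so the store/load pattern is identical on both
	# paths.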
$code.=<<___;
.type	__rsaz_512_subtract,\@abi-omnipotent
.align	32
__rsaz_512_subtract:
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	movq	0($mod), %r8
	movq	8($mod), %r9
	negq	%r8
	notq	%r9
	andq	%rcx, %r8
	movq	16($mod), %r10
	andq	%rcx, %r9
	notq	%r10
	movq	24($mod), %r11
	andq	%rcx, %r10
	notq	%r11
	movq	32($mod), %r12
	andq	%rcx, %r11
	notq	%r12
	movq	40($mod), %r13
	andq	%rcx, %r12
	notq	%r13
	movq	48($mod), %r14
	andq	%rcx, %r13
	notq	%r14
	movq	56($mod), %r15
	andq	%rcx, %r14
	notq	%r15
	andq	%rcx, %r15

	addq	($out), %r8
	adcq	8($out), %r9
	adcq	16($out), %r10
	adcq	24($out), %r11
	adcq	32($out), %r12
	adcq	40($out), %r13
	adcq	48($out), %r14
	adcq	56($out), %r15

	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	ret
.size	__rsaz_512_subtract,.-__rsaz_512_subtract
___
}

{	# __rsaz_512_mul
	#
	# input: %rsi - ap, %rbp - bp
	# output:
	# clobbers: everything
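	#
	# Plain schoolbook multiplication: one pass with b[0] (loaded
	# into %rbx by the caller) seeds the 8-word accumulator, then
	# .Loop_mul runs seven more passes, each folding a[0..7]*b[i]
	# into the window and emitting one finished word to the
	# 16-word scratch buffer at 8(%rsp).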
my ($ap,$bp) = ("%rsi","%rbp");
$code.=<<___;
.type	__rsaz_512_mul,\@abi-omnipotent
.align	32
__rsaz_512_mul:
	leaq	8(%rsp), %rdi

	movq	($ap), %rax
	mulq	%rbx
	movq	%rax, (%rdi)
	movq	8($ap), %rax
	movq	%rdx, %r8

	mulq	%rbx
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8($bp), $bp
	leaq	8(%rdi), %rdi

	movl	\$7, %ecx
	jmp	.Loop_mul

.align	32
.Loop_mul:
	movq	($bp), %rbx
	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	leaq	8($bp), $bp
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	ret
.size	__rsaz_512_mul,.-__rsaz_512_mul
___
}

if ($addx) {
	# __rsaz_512_mulx
	#
	# input: %rsi - ap, %rbp - bp
	# output:
	# clobbers: everything
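	#
	# MULX flavour of __rsaz_512_mul: b[0] arrives preloaded in
	# %rdx, the first pass runs on a plain adc chain, and
	# .Loop_mulx then uses the adcx/adox dual carry chains for the
	# middle passes, pulling the next b[i] into %rdx as it goes,
	# before one final unrolled pass.  Results accumulate in the
	# same 16-word scratch buffer at 8(%rsp).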
  1713. my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
  1714. $code.=<<___;
  1715. .type __rsaz_512_mulx,\@abi-omnipotent
  1716. .align 32
  1717. __rsaz_512_mulx:
  1718. mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
  1719. mov \$-6, %rcx
  1720. mulx 8($ap), %rax, %r9
  1721. movq %rbx, 8(%rsp)
  1722. mulx 16($ap), %rbx, %r10
  1723. adc %rax, %r8
  1724. mulx 24($ap), %rax, %r11
  1725. adc %rbx, %r9
  1726. mulx 32($ap), %rbx, %r12
  1727. adc %rax, %r10
  1728. mulx 40($ap), %rax, %r13
  1729. adc %rbx, %r11
  1730. mulx 48($ap), %rbx, %r14
  1731. adc %rax, %r12
  1732. mulx 56($ap), %rax, %r15
  1733. mov 8($bp), %rdx
  1734. adc %rbx, %r13
  1735. adc %rax, %r14
  1736. adc \$0, %r15
  1737. xor $zero, $zero # cf=0,of=0
  1738. jmp .Loop_mulx
.align	32
.Loop_mulx:
	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx 32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	movq	64($bp,%rcx,8), %rdx
	movq	%rbx, 8+64-8(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx
	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx 8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx 16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx 48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx 56($ap), %rax, %r15
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15

	mov	%rbx, 8+64-8(%rsp)
	mov	%r8, 8+64(%rsp)
	mov	%r9, 8+64+8(%rsp)
	mov	%r10, 8+64+16(%rsp)
	mov	%r11, 8+64+24(%rsp)
	mov	%r12, 8+64+32(%rsp)
	mov	%r13, 8+64+40(%rsp)
	mov	%r14, 8+64+48(%rsp)
	mov	%r15, 8+64+56(%rsp)

	ret
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
___
}
{
my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
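# The scatter/gather table interleaves sixteen 512-bit entries: qword j of
# entry p lives at tbl + 8*p + 128*j, so no single entry occupies one
# contiguous, cache-line-identifiable region.  In C terms (a sketch under
# that layout; the prototype is illustrative, not OpenSSL's API):
#
#	void scatter4(uint64_t tbl[16*8], const uint64_t val[8], int power)
#	{
#		for (int j = 0; j < 8; j++)
#			tbl[power + 16*j] = val[j];	/* 128-byte stride */
#	}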
$code.=<<___;
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,\@abi-omnipotent
.align	16
rsaz_512_scatter4:
	leaq	($out,$power,8), $out
	movl	\$8, %r9d
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	($inp), %rax
	leaq	8($inp), $inp
	movq	%rax, ($out)
	leaq	128($out), $out
	decl	%r9d
	jnz	.Loop_scatter
	ret
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

.globl	rsaz_512_gather4
.type	rsaz_512_gather4,\@abi-omnipotent
.align	16
rsaz_512_gather4:
___
$code.=<<___ if ($win64);
.LSEH_begin_rsaz_512_gather4:
	.byte	0x48,0x81,0xec,0xa8,0x00,0x00,0x00	# sub    $0xa8,%rsp
	.byte	0x0f,0x29,0x34,0x24			# movaps %xmm6,(%rsp)
	.byte	0x0f,0x29,0x7c,0x24,0x10		# movaps %xmm7,0x10(%rsp)
	.byte	0x44,0x0f,0x29,0x44,0x24,0x20		# movaps %xmm8,0x20(%rsp)
	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30		# movaps %xmm9,0x30(%rsp)
	.byte	0x44,0x0f,0x29,0x54,0x24,0x40		# movaps %xmm10,0x40(%rsp)
	.byte	0x44,0x0f,0x29,0x5c,0x24,0x50		# movaps %xmm11,0x50(%rsp)
	.byte	0x44,0x0f,0x29,0x64,0x24,0x60		# movaps %xmm12,0x60(%rsp)
	.byte	0x44,0x0f,0x29,0x6c,0x24,0x70		# movaps %xmm13,0x70(%rsp)
	.byte	0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0	# movaps %xmm14,0x80(%rsp)
	.byte	0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0	# movaps %xmm15,0x90(%rsp)
___
$code.=<<___;
	movd	$power,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000
	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
#
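# pcmpeqd leaves all-ones only in the lane pair whose index equals $power,
# so the pand/por sequence in .Loop_gather reduces, per output qword, to
# the constant-time selection
#
#	acc = 0;
#	for (i = 0; i < 16; i++)
#		acc |= tbl[i] & mask[i];	/* mask[i] = -(i == power) */
#
# i.e. every table entry is read on every iteration, regardless of $power.
#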
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
	movdqa	%xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
___
}
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7

	movl	\$8, %r9d
	jmp	.Loop_gather
.align	16
.Loop_gather:
	movdqa	16*0($inp),%xmm8
	movdqa	16*1($inp),%xmm9
	movdqa	16*2($inp),%xmm10
	movdqa	16*3($inp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($inp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($inp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($inp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($inp),%xmm15
	leaq	128($inp), $inp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9
	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,($out)
	leaq	8($out), $out
	decl	%r9d
	jnz	.Loop_gather
___
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	add	\$0xa8,%rsp
___
$code.=<<___;
	ret
.LSEH_end_rsaz_512_gather4:
.size	rsaz_512_gather4,.-rsaz_512_gather4

.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
___
}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
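#
# In rough pseudo-C, se_handler below amounts to:
#
#	if (Rip < end_of_prologue || Rip >= epilogue)
#		goto common_tail;	/* frame not (or no longer) set up */
#	/* recover Rsp and the saved rbx/rbp/r12-r15 from the frame;    */
#	/* for mul_gather4 also copy the saved xmm6-xmm15 into context  */
#	common_tail: fix up CONTEXT and hand off to RtlVirtualUnwind
#
# HandlerData[0]/[1] hold the image-relative body and epilogue labels
# that delimit the region where the frame is fully established.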
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	128+24+48(%rax),%rax

	lea	.Lmul_gather4_epilogue(%rip),%rbx
	cmp	%r10,%rbx
	jne	.Lse_not_in_mul_gather4

	lea	0xb0(%rax),%rax

	lea	-48-0xa8(%rax),%rsi
	lea	512($context),%rdi
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lse_not_in_mul_gather4:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_512_sqr
	.rva	.LSEH_end_rsaz_512_sqr
	.rva	.LSEH_info_rsaz_512_sqr

	.rva	.LSEH_begin_rsaz_512_mul
	.rva	.LSEH_end_rsaz_512_mul
	.rva	.LSEH_info_rsaz_512_mul

	.rva	.LSEH_begin_rsaz_512_mul_gather4
	.rva	.LSEH_end_rsaz_512_mul_gather4
	.rva	.LSEH_info_rsaz_512_mul_gather4

	.rva	.LSEH_begin_rsaz_512_mul_scatter4
	.rva	.LSEH_end_rsaz_512_mul_scatter4
	.rva	.LSEH_info_rsaz_512_mul_scatter4

	.rva	.LSEH_begin_rsaz_512_mul_by_one
	.rva	.LSEH_end_rsaz_512_mul_by_one
	.rva	.LSEH_info_rsaz_512_mul_by_one

	.rva	.LSEH_begin_rsaz_512_gather4
	.rva	.LSEH_end_rsaz_512_gather4
	.rva	.LSEH_info_rsaz_512_gather4

.section	.xdata
.align	8
.LSEH_info_rsaz_512_sqr:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul_gather4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_scatter4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_by_one:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
.LSEH_info_rsaz_512_gather4:
	.byte	0x01,0x46,0x16,0x00
	.byte	0x46,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x3d,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x34,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x2e,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x28,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x22,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x1c,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x16,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x10,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x0b,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x07,0x01,0x15,0x00	# sub rsp,0xa8
___
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;