rsaz-avx2.pl 51 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984
  1. #! /usr/bin/env perl
  2. # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. # Copyright (c) 2012, Intel Corporation. All Rights Reserved.
  4. #
  5. # Licensed under the Apache License 2.0 (the "License"). You may not use
  6. # this file except in compliance with the License. You can obtain a copy
  7. # in the file LICENSE in the source distribution or at
  8. # https://www.openssl.org/source/license.html
  9. #
  10. # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
  11. # (1) Intel Corporation, Israel Development Center, Haifa, Israel
  12. # (2) University of Haifa, Israel
  13. #
  14. # References:
  15. # [1] S. Gueron, V. Krasnov: "Software Implementation of Modular
  16. # Exponentiation, Using Advanced Vector Instructions Architectures",
  17. # F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,
  18. # pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012
  19. # [2] S. Gueron: "Efficient Software Implementations of Modular
  20. # Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).
  21. # [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring", IEEE
  22. # Proceedings of 9th International Conference on Information Technology:
  23. # New Generations (ITNG 2012), pp.821-823 (2012)
  24. # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
  25. # resistant 1024-bit modular exponentiation, for optimizing RSA2048
  26. # on AVX2 capable x86_64 platforms",
  27. # http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest
  28. #
  29. # +13% improvement over original submission by <appro@openssl.org>
  30. #
  31. # rsa2048 sign/sec OpenSSL 1.0.1 scalar(*) this
  32. # 2.3GHz Haswell 621 765/+23% 1113/+79%
  33. # 2.3GHz Broadwell(**) 688 1200(***)/+74% 1120/+63%
  34. #
  35. # (*) if system doesn't support AVX2, for reference purposes;
  36. # (**) scaled to 2.3GHz to simplify comparison;
  37. # (***) scalar AD*X code is faster than AVX2 and is preferred code
  38. # path for Broadwell;
  # ---------------------------------------------------------------------
  # Script preamble: parse arguments, locate the perlasm translator, probe
  # the toolchain for AVX2/ADX support and redirect STDOUT through the
  # translator.
  #
  # Results left in file-scope variables:
  #   $output  - output file name, or undef
  #   $flavour - perlasm flavour string, or undef
  #   $win64   - 1 when targeting Win64 (nasm/masm/mingw64 or *.asm output)
  #   $xlate   - path of x86_64-xlate.pl
  #   $avx     - 0, 1 or 2 (sum of two version comparisons); the AVX2 code
  #              below is generated only when $avx > 1
  #   $addx    - true when the assembler is new enough per the thresholds
  #              below (not consumed in the part of the file shown here)
  # ---------------------------------------------------------------------

  # $output is the last argument if it looks like a file (it has an extension)
  # $flavour is the first argument if it doesn't look like a file
  $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

  $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

  # Look for the translator next to this script first, then in ../../perlasm
  # relative to the script's directory.
  $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  die "can't locate x86_64-xlate.pl";

  # Probe 1: GNU assembler, queried through the compiler driver.
  if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
          =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
      $avx = ($1>=2.19) + ($1>=2.22);
      $addx = ($1>=2.23);
  }

  # Probe 2: NASM, only for Win64 builds and only if nothing matched yet.
  if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
          `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
      $avx = ($1>=2.09) + ($1>=2.10);
      $addx = ($1>=2.10);
  }

  # Probe 3: Microsoft ml64, only for Win64 builds and only if nothing
  # matched yet.
  if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
          `ml64 2>&1` =~ /Version ([0-9]+)\./) {
      $avx = ($1>=10) + ($1>=11);
      $addx = ($1>=11);
  }

  # Probe 4: clang / LLVM based compilers.
  # The "clang version" alternative is deliberately NOT anchored with '^':
  # without /m that anchor matches only at the very start of the `-v`
  # output, so vendor-prefixed banners such as "Ubuntu clang version 14.0.0"
  # or "Apple clang version 12.0.0" were never recognised and the AVX2 path
  # was silently left out of the build.
  if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+)\.([0-9]+)/) {
      my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
      $avx = ($ver>=3.0) + ($ver>=3.01);
      $addx = ($ver>=3.03);
  }

  # Everything printed from here on is piped through x86_64-xlate.pl, run
  # with the same perl interpreter ($^X), the flavour and the output name.
  open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
      or die "can't call $xlate: $!";
  *STDOUT = *OUT;
  71. if ($avx>1) {{{
  72. { # void AMS_WW(
  73. my $rp="%rdi"; # BN_ULONG *rp,
  74. my $ap="%rsi"; # const BN_ULONG *ap,
  75. my $np="%rdx"; # const BN_ULONG *np,
  76. my $n0="%ecx"; # const BN_ULONG n0,
  77. my $rep="%r8d"; # int repeat);
  78. # The registers that hold the accumulated redundant result
  79. # The AMM works on 1024 bit operands, and redundant word size is 29
  80. # Therefore: ceil(1024/29)/4 = 9
  81. my $ACC0="%ymm0";
  82. my $ACC1="%ymm1";
  83. my $ACC2="%ymm2";
  84. my $ACC3="%ymm3";
  85. my $ACC4="%ymm4";
  86. my $ACC5="%ymm5";
  87. my $ACC6="%ymm6";
  88. my $ACC7="%ymm7";
  89. my $ACC8="%ymm8";
  90. my $ACC9="%ymm9";
  91. # Registers that hold the broadcasted words of bp, currently used
  92. my $B1="%ymm10";
  93. my $B2="%ymm11";
  94. # Registers that hold the broadcasted words of Y, currently used
  95. my $Y1="%ymm12";
  96. my $Y2="%ymm13";
  97. # Helper registers
  98. my $TEMP1="%ymm14";
  99. my $AND_MASK="%ymm15";
  100. # alu registers that hold the first words of the ACC
  101. my $r0="%r9";
  102. my $r1="%r10";
  103. my $r2="%r11";
  104. my $r3="%r12";
  105. my $i="%r14d"; # loop counter
  106. my $tmp = "%r15";
  107. my $FrameSize=32*18+32*8; # place for A^2 and 2*A
  108. my $aap=$r0;
  109. my $tp0="%rbx";
  110. my $tp1=$r3;
  111. my $tpa=$tmp;
  112. $np="%r13"; # reassigned argument
  113. $code.=<<___;
  114. .text
  115. .globl rsaz_1024_sqr_avx2
  116. .type rsaz_1024_sqr_avx2,\@function,5
  117. .align 64
  118. rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2
  119. .cfi_startproc
  120. lea (%rsp), %rax
  121. .cfi_def_cfa_register %rax
  122. push %rbx
  123. .cfi_push %rbx
  124. push %rbp
  125. .cfi_push %rbp
  126. push %r12
  127. .cfi_push %r12
  128. push %r13
  129. .cfi_push %r13
  130. push %r14
  131. .cfi_push %r14
  132. push %r15
  133. .cfi_push %r15
  134. vzeroupper
  135. ___
  136. $code.=<<___ if ($win64);
  137. lea -0xa8(%rsp),%rsp
  138. vmovaps %xmm6,-0xd8(%rax)
  139. vmovaps %xmm7,-0xc8(%rax)
  140. vmovaps %xmm8,-0xb8(%rax)
  141. vmovaps %xmm9,-0xa8(%rax)
  142. vmovaps %xmm10,-0x98(%rax)
  143. vmovaps %xmm11,-0x88(%rax)
  144. vmovaps %xmm12,-0x78(%rax)
  145. vmovaps %xmm13,-0x68(%rax)
  146. vmovaps %xmm14,-0x58(%rax)
  147. vmovaps %xmm15,-0x48(%rax)
  148. .Lsqr_1024_body:
  149. ___
  150. $code.=<<___;
  151. mov %rax,%rbp
  152. .cfi_def_cfa_register %rbp
  153. mov %rdx, $np # reassigned argument
  154. sub \$$FrameSize, %rsp
  155. mov $np, $tmp
  156. sub \$-128, $rp # size optimization
  157. sub \$-128, $ap
  158. sub \$-128, $np
  159. and \$4095, $tmp # see if $np crosses page
  160. add \$32*10, $tmp
  161. shr \$12, $tmp
  162. vpxor $ACC9,$ACC9,$ACC9
  163. jz .Lsqr_1024_no_n_copy
  164. # unaligned 256-bit load that crosses page boundary can
  165. # cause >2x performance degradation here, so if $np does
  166. # cross page boundary, copy it to stack and make sure stack
  167. # frame doesn't...
  168. sub \$32*10,%rsp
  169. vmovdqu 32*0-128($np), $ACC0
  170. and \$-2048, %rsp
  171. vmovdqu 32*1-128($np), $ACC1
  172. vmovdqu 32*2-128($np), $ACC2
  173. vmovdqu 32*3-128($np), $ACC3
  174. vmovdqu 32*4-128($np), $ACC4
  175. vmovdqu 32*5-128($np), $ACC5
  176. vmovdqu 32*6-128($np), $ACC6
  177. vmovdqu 32*7-128($np), $ACC7
  178. vmovdqu 32*8-128($np), $ACC8
  179. lea $FrameSize+128(%rsp),$np
  180. vmovdqu $ACC0, 32*0-128($np)
  181. vmovdqu $ACC1, 32*1-128($np)
  182. vmovdqu $ACC2, 32*2-128($np)
  183. vmovdqu $ACC3, 32*3-128($np)
  184. vmovdqu $ACC4, 32*4-128($np)
  185. vmovdqu $ACC5, 32*5-128($np)
  186. vmovdqu $ACC6, 32*6-128($np)
  187. vmovdqu $ACC7, 32*7-128($np)
  188. vmovdqu $ACC8, 32*8-128($np)
  189. vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero
  190. .Lsqr_1024_no_n_copy:
  191. and \$-1024, %rsp
  192. vmovdqu 32*1-128($ap), $ACC1
  193. vmovdqu 32*2-128($ap), $ACC2
  194. vmovdqu 32*3-128($ap), $ACC3
  195. vmovdqu 32*4-128($ap), $ACC4
  196. vmovdqu 32*5-128($ap), $ACC5
  197. vmovdqu 32*6-128($ap), $ACC6
  198. vmovdqu 32*7-128($ap), $ACC7
  199. vmovdqu 32*8-128($ap), $ACC8
  200. lea 192(%rsp), $tp0 # 64+128=192
  201. vmovdqu .Land_mask(%rip), $AND_MASK
  202. jmp .LOOP_GRANDE_SQR_1024
  203. .align 32
  204. .LOOP_GRANDE_SQR_1024:
  205. lea 32*18+128(%rsp), $aap # size optimization
  206. lea 448(%rsp), $tp1 # 64+128+256=448
  207. # the squaring is performed as described in Variant B of
  208. # "Speeding up Big-Number Squaring", so start by calculating
  209. # the A*2=A+A vector
  210. vpaddq $ACC1, $ACC1, $ACC1
  211. vpbroadcastq 32*0-128($ap), $B1
  212. vpaddq $ACC2, $ACC2, $ACC2
  213. vmovdqa $ACC1, 32*0-128($aap)
  214. vpaddq $ACC3, $ACC3, $ACC3
  215. vmovdqa $ACC2, 32*1-128($aap)
  216. vpaddq $ACC4, $ACC4, $ACC4
  217. vmovdqa $ACC3, 32*2-128($aap)
  218. vpaddq $ACC5, $ACC5, $ACC5
  219. vmovdqa $ACC4, 32*3-128($aap)
  220. vpaddq $ACC6, $ACC6, $ACC6
  221. vmovdqa $ACC5, 32*4-128($aap)
  222. vpaddq $ACC7, $ACC7, $ACC7
  223. vmovdqa $ACC6, 32*5-128($aap)
  224. vpaddq $ACC8, $ACC8, $ACC8
  225. vmovdqa $ACC7, 32*6-128($aap)
  226. vpxor $ACC9, $ACC9, $ACC9
  227. vmovdqa $ACC8, 32*7-128($aap)
  228. vpmuludq 32*0-128($ap), $B1, $ACC0
  229. vpbroadcastq 32*1-128($ap), $B2
  230. vmovdqu $ACC9, 32*9-192($tp0) # zero upper half
  231. vpmuludq $B1, $ACC1, $ACC1
  232. vmovdqu $ACC9, 32*10-448($tp1)
  233. vpmuludq $B1, $ACC2, $ACC2
  234. vmovdqu $ACC9, 32*11-448($tp1)
  235. vpmuludq $B1, $ACC3, $ACC3
  236. vmovdqu $ACC9, 32*12-448($tp1)
  237. vpmuludq $B1, $ACC4, $ACC4
  238. vmovdqu $ACC9, 32*13-448($tp1)
  239. vpmuludq $B1, $ACC5, $ACC5
  240. vmovdqu $ACC9, 32*14-448($tp1)
  241. vpmuludq $B1, $ACC6, $ACC6
  242. vmovdqu $ACC9, 32*15-448($tp1)
  243. vpmuludq $B1, $ACC7, $ACC7
  244. vmovdqu $ACC9, 32*16-448($tp1)
  245. vpmuludq $B1, $ACC8, $ACC8
  246. vpbroadcastq 32*2-128($ap), $B1
  247. vmovdqu $ACC9, 32*17-448($tp1)
  248. mov $ap, $tpa
  249. mov \$4, $i
  250. jmp .Lsqr_entry_1024
  251. ___
  252. $TEMP0=$Y1;
  253. $TEMP2=$Y2;
  254. $code.=<<___;
  255. .align 32
  256. .LOOP_SQR_1024:
  257. vpbroadcastq 32*1-128($tpa), $B2
  258. vpmuludq 32*0-128($ap), $B1, $ACC0
  259. vpaddq 32*0-192($tp0), $ACC0, $ACC0
  260. vpmuludq 32*0-128($aap), $B1, $ACC1
  261. vpaddq 32*1-192($tp0), $ACC1, $ACC1
  262. vpmuludq 32*1-128($aap), $B1, $ACC2
  263. vpaddq 32*2-192($tp0), $ACC2, $ACC2
  264. vpmuludq 32*2-128($aap), $B1, $ACC3
  265. vpaddq 32*3-192($tp0), $ACC3, $ACC3
  266. vpmuludq 32*3-128($aap), $B1, $ACC4
  267. vpaddq 32*4-192($tp0), $ACC4, $ACC4
  268. vpmuludq 32*4-128($aap), $B1, $ACC5
  269. vpaddq 32*5-192($tp0), $ACC5, $ACC5
  270. vpmuludq 32*5-128($aap), $B1, $ACC6
  271. vpaddq 32*6-192($tp0), $ACC6, $ACC6
  272. vpmuludq 32*6-128($aap), $B1, $ACC7
  273. vpaddq 32*7-192($tp0), $ACC7, $ACC7
  274. vpmuludq 32*7-128($aap), $B1, $ACC8
  275. vpbroadcastq 32*2-128($tpa), $B1
  276. vpaddq 32*8-192($tp0), $ACC8, $ACC8
  277. .Lsqr_entry_1024:
  278. vmovdqu $ACC0, 32*0-192($tp0)
  279. vmovdqu $ACC1, 32*1-192($tp0)
  280. vpmuludq 32*1-128($ap), $B2, $TEMP0
  281. vpaddq $TEMP0, $ACC2, $ACC2
  282. vpmuludq 32*1-128($aap), $B2, $TEMP1
  283. vpaddq $TEMP1, $ACC3, $ACC3
  284. vpmuludq 32*2-128($aap), $B2, $TEMP2
  285. vpaddq $TEMP2, $ACC4, $ACC4
  286. vpmuludq 32*3-128($aap), $B2, $TEMP0
  287. vpaddq $TEMP0, $ACC5, $ACC5
  288. vpmuludq 32*4-128($aap), $B2, $TEMP1
  289. vpaddq $TEMP1, $ACC6, $ACC6
  290. vpmuludq 32*5-128($aap), $B2, $TEMP2
  291. vpaddq $TEMP2, $ACC7, $ACC7
  292. vpmuludq 32*6-128($aap), $B2, $TEMP0
  293. vpaddq $TEMP0, $ACC8, $ACC8
  294. vpmuludq 32*7-128($aap), $B2, $ACC0
  295. vpbroadcastq 32*3-128($tpa), $B2
  296. vpaddq 32*9-192($tp0), $ACC0, $ACC0
  297. vmovdqu $ACC2, 32*2-192($tp0)
  298. vmovdqu $ACC3, 32*3-192($tp0)
  299. vpmuludq 32*2-128($ap), $B1, $TEMP2
  300. vpaddq $TEMP2, $ACC4, $ACC4
  301. vpmuludq 32*2-128($aap), $B1, $TEMP0
  302. vpaddq $TEMP0, $ACC5, $ACC5
  303. vpmuludq 32*3-128($aap), $B1, $TEMP1
  304. vpaddq $TEMP1, $ACC6, $ACC6
  305. vpmuludq 32*4-128($aap), $B1, $TEMP2
  306. vpaddq $TEMP2, $ACC7, $ACC7
  307. vpmuludq 32*5-128($aap), $B1, $TEMP0
  308. vpaddq $TEMP0, $ACC8, $ACC8
  309. vpmuludq 32*6-128($aap), $B1, $TEMP1
  310. vpaddq $TEMP1, $ACC0, $ACC0
  311. vpmuludq 32*7-128($aap), $B1, $ACC1
  312. vpbroadcastq 32*4-128($tpa), $B1
  313. vpaddq 32*10-448($tp1), $ACC1, $ACC1
  314. vmovdqu $ACC4, 32*4-192($tp0)
  315. vmovdqu $ACC5, 32*5-192($tp0)
  316. vpmuludq 32*3-128($ap), $B2, $TEMP0
  317. vpaddq $TEMP0, $ACC6, $ACC6
  318. vpmuludq 32*3-128($aap), $B2, $TEMP1
  319. vpaddq $TEMP1, $ACC7, $ACC7
  320. vpmuludq 32*4-128($aap), $B2, $TEMP2
  321. vpaddq $TEMP2, $ACC8, $ACC8
  322. vpmuludq 32*5-128($aap), $B2, $TEMP0
  323. vpaddq $TEMP0, $ACC0, $ACC0
  324. vpmuludq 32*6-128($aap), $B2, $TEMP1
  325. vpaddq $TEMP1, $ACC1, $ACC1
  326. vpmuludq 32*7-128($aap), $B2, $ACC2
  327. vpbroadcastq 32*5-128($tpa), $B2
  328. vpaddq 32*11-448($tp1), $ACC2, $ACC2
  329. vmovdqu $ACC6, 32*6-192($tp0)
  330. vmovdqu $ACC7, 32*7-192($tp0)
  331. vpmuludq 32*4-128($ap), $B1, $TEMP0
  332. vpaddq $TEMP0, $ACC8, $ACC8
  333. vpmuludq 32*4-128($aap), $B1, $TEMP1
  334. vpaddq $TEMP1, $ACC0, $ACC0
  335. vpmuludq 32*5-128($aap), $B1, $TEMP2
  336. vpaddq $TEMP2, $ACC1, $ACC1
  337. vpmuludq 32*6-128($aap), $B1, $TEMP0
  338. vpaddq $TEMP0, $ACC2, $ACC2
  339. vpmuludq 32*7-128($aap), $B1, $ACC3
  340. vpbroadcastq 32*6-128($tpa), $B1
  341. vpaddq 32*12-448($tp1), $ACC3, $ACC3
  342. vmovdqu $ACC8, 32*8-192($tp0)
  343. vmovdqu $ACC0, 32*9-192($tp0)
  344. lea 8($tp0), $tp0
  345. vpmuludq 32*5-128($ap), $B2, $TEMP2
  346. vpaddq $TEMP2, $ACC1, $ACC1
  347. vpmuludq 32*5-128($aap), $B2, $TEMP0
  348. vpaddq $TEMP0, $ACC2, $ACC2
  349. vpmuludq 32*6-128($aap), $B2, $TEMP1
  350. vpaddq $TEMP1, $ACC3, $ACC3
  351. vpmuludq 32*7-128($aap), $B2, $ACC4
  352. vpbroadcastq 32*7-128($tpa), $B2
  353. vpaddq 32*13-448($tp1), $ACC4, $ACC4
  354. vmovdqu $ACC1, 32*10-448($tp1)
  355. vmovdqu $ACC2, 32*11-448($tp1)
  356. vpmuludq 32*6-128($ap), $B1, $TEMP0
  357. vpaddq $TEMP0, $ACC3, $ACC3
  358. vpmuludq 32*6-128($aap), $B1, $TEMP1
  359. vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1
  360. vpaddq $TEMP1, $ACC4, $ACC4
  361. vpmuludq 32*7-128($aap), $B1, $ACC5
  362. vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration
  363. vpaddq 32*14-448($tp1), $ACC5, $ACC5
  364. vmovdqu $ACC3, 32*12-448($tp1)
  365. vmovdqu $ACC4, 32*13-448($tp1)
  366. lea 8($tpa), $tpa
  367. vpmuludq 32*7-128($ap), $B2, $TEMP0
  368. vpaddq $TEMP0, $ACC5, $ACC5
  369. vpmuludq 32*7-128($aap), $B2, $ACC6
  370. vpaddq 32*15-448($tp1), $ACC6, $ACC6
  371. vpmuludq 32*8-128($ap), $ACC0, $ACC7
  372. vmovdqu $ACC5, 32*14-448($tp1)
  373. vpaddq 32*16-448($tp1), $ACC7, $ACC7
  374. vmovdqu $ACC6, 32*15-448($tp1)
  375. vmovdqu $ACC7, 32*16-448($tp1)
  376. lea 8($tp1), $tp1
  377. dec $i
  378. jnz .LOOP_SQR_1024
  379. ___
  380. $ZERO = $ACC9;
  381. $TEMP0 = $B1;
  382. $TEMP2 = $B2;
  383. $TEMP3 = $Y1;
  384. $TEMP4 = $Y2;
  385. $code.=<<___;
  386. # we need to fix indices 32-39 to avoid overflow
  387. vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
  388. vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
  389. vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
  390. lea 192(%rsp), $tp0 # 64+128=192
  391. vpsrlq \$29, $ACC8, $TEMP1
  392. vpand $AND_MASK, $ACC8, $ACC8
  393. vpsrlq \$29, $ACC1, $TEMP2
  394. vpand $AND_MASK, $ACC1, $ACC1
  395. vpermq \$0x93, $TEMP1, $TEMP1
  396. vpxor $ZERO, $ZERO, $ZERO
  397. vpermq \$0x93, $TEMP2, $TEMP2
  398. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  399. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  400. vpaddq $TEMP0, $ACC8, $ACC8
  401. vpblendd \$3, $TEMP2, $ZERO, $TEMP2
  402. vpaddq $TEMP1, $ACC1, $ACC1
  403. vpaddq $TEMP2, $ACC2, $ACC2
  404. vmovdqu $ACC1, 32*9-192($tp0)
  405. vmovdqu $ACC2, 32*10-192($tp0)
  406. mov (%rsp), %rax
  407. mov 8(%rsp), $r1
  408. mov 16(%rsp), $r2
  409. mov 24(%rsp), $r3
  410. vmovdqu 32*1(%rsp), $ACC1
  411. vmovdqu 32*2-192($tp0), $ACC2
  412. vmovdqu 32*3-192($tp0), $ACC3
  413. vmovdqu 32*4-192($tp0), $ACC4
  414. vmovdqu 32*5-192($tp0), $ACC5
  415. vmovdqu 32*6-192($tp0), $ACC6
  416. vmovdqu 32*7-192($tp0), $ACC7
  417. mov %rax, $r0
  418. imull $n0, %eax
  419. and \$0x1fffffff, %eax
  420. vmovd %eax, $Y1
  421. mov %rax, %rdx
  422. imulq -128($np), %rax
  423. vpbroadcastq $Y1, $Y1
  424. add %rax, $r0
  425. mov %rdx, %rax
  426. imulq 8-128($np), %rax
  427. shr \$29, $r0
  428. add %rax, $r1
  429. mov %rdx, %rax
  430. imulq 16-128($np), %rax
  431. add $r0, $r1
  432. add %rax, $r2
  433. imulq 24-128($np), %rdx
  434. add %rdx, $r3
  435. mov $r1, %rax
  436. imull $n0, %eax
  437. and \$0x1fffffff, %eax
  438. mov \$9, $i
  439. jmp .LOOP_REDUCE_1024
  440. .align 32
  441. .LOOP_REDUCE_1024:
  442. vmovd %eax, $Y2
  443. vpbroadcastq $Y2, $Y2
  444. vpmuludq 32*1-128($np), $Y1, $TEMP0
  445. mov %rax, %rdx
  446. imulq -128($np), %rax
  447. vpaddq $TEMP0, $ACC1, $ACC1
  448. add %rax, $r1
  449. vpmuludq 32*2-128($np), $Y1, $TEMP1
  450. mov %rdx, %rax
  451. imulq 8-128($np), %rax
  452. vpaddq $TEMP1, $ACC2, $ACC2
  453. vpmuludq 32*3-128($np), $Y1, $TEMP2
  454. .byte 0x67
  455. add %rax, $r2
  456. .byte 0x67
  457. mov %rdx, %rax
  458. imulq 16-128($np), %rax
  459. shr \$29, $r1
  460. vpaddq $TEMP2, $ACC3, $ACC3
  461. vpmuludq 32*4-128($np), $Y1, $TEMP0
  462. add %rax, $r3
  463. add $r1, $r2
  464. vpaddq $TEMP0, $ACC4, $ACC4
  465. vpmuludq 32*5-128($np), $Y1, $TEMP1
  466. mov $r2, %rax
  467. imull $n0, %eax
  468. vpaddq $TEMP1, $ACC5, $ACC5
  469. vpmuludq 32*6-128($np), $Y1, $TEMP2
  470. and \$0x1fffffff, %eax
  471. vpaddq $TEMP2, $ACC6, $ACC6
  472. vpmuludq 32*7-128($np), $Y1, $TEMP0
  473. vpaddq $TEMP0, $ACC7, $ACC7
  474. vpmuludq 32*8-128($np), $Y1, $TEMP1
  475. vmovd %eax, $Y1
  476. #vmovdqu 32*1-8-128($np), $TEMP2 # moved below
  477. vpaddq $TEMP1, $ACC8, $ACC8
  478. #vmovdqu 32*2-8-128($np), $TEMP0 # moved below
  479. vpbroadcastq $Y1, $Y1
  480. vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above
  481. vmovdqu 32*3-8-128($np), $TEMP1
  482. mov %rax, %rdx
  483. imulq -128($np), %rax
  484. vpaddq $TEMP2, $ACC1, $ACC1
  485. vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above
  486. vmovdqu 32*4-8-128($np), $TEMP2
  487. add %rax, $r2
  488. mov %rdx, %rax
  489. imulq 8-128($np), %rax
  490. vpaddq $TEMP0, $ACC2, $ACC2
  491. add $r3, %rax
  492. shr \$29, $r2
  493. vpmuludq $Y2, $TEMP1, $TEMP1
  494. vmovdqu 32*5-8-128($np), $TEMP0
  495. add $r2, %rax
  496. vpaddq $TEMP1, $ACC3, $ACC3
  497. vpmuludq $Y2, $TEMP2, $TEMP2
  498. vmovdqu 32*6-8-128($np), $TEMP1
  499. .byte 0x67
  500. mov %rax, $r3
  501. imull $n0, %eax
  502. vpaddq $TEMP2, $ACC4, $ACC4
  503. vpmuludq $Y2, $TEMP0, $TEMP0
  504. .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2
  505. and \$0x1fffffff, %eax
  506. vpaddq $TEMP0, $ACC5, $ACC5
  507. vpmuludq $Y2, $TEMP1, $TEMP1
  508. vmovdqu 32*8-8-128($np), $TEMP0
  509. vpaddq $TEMP1, $ACC6, $ACC6
  510. vpmuludq $Y2, $TEMP2, $TEMP2
  511. vmovdqu 32*9-8-128($np), $ACC9
  512. vmovd %eax, $ACC0 # borrow ACC0 for Y2
  513. imulq -128($np), %rax
  514. vpaddq $TEMP2, $ACC7, $ACC7
  515. vpmuludq $Y2, $TEMP0, $TEMP0
  516. vmovdqu 32*1-16-128($np), $TEMP1
  517. vpbroadcastq $ACC0, $ACC0
  518. vpaddq $TEMP0, $ACC8, $ACC8
  519. vpmuludq $Y2, $ACC9, $ACC9
  520. vmovdqu 32*2-16-128($np), $TEMP2
  521. add %rax, $r3
  522. ___
  523. ($ACC0,$Y2)=($Y2,$ACC0);
  524. $code.=<<___;
  525. vmovdqu 32*1-24-128($np), $ACC0
  526. vpmuludq $Y1, $TEMP1, $TEMP1
  527. vmovdqu 32*3-16-128($np), $TEMP0
  528. vpaddq $TEMP1, $ACC1, $ACC1
  529. vpmuludq $Y2, $ACC0, $ACC0
  530. vpmuludq $Y1, $TEMP2, $TEMP2
  531. .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1
  532. vpaddq $ACC1, $ACC0, $ACC0
  533. vpaddq $TEMP2, $ACC2, $ACC2
  534. vpmuludq $Y1, $TEMP0, $TEMP0
  535. vmovdqu 32*5-16-128($np), $TEMP2
  536. .byte 0x67
  537. vmovq $ACC0, %rax
  538. vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
  539. vpaddq $TEMP0, $ACC3, $ACC3
  540. vpmuludq $Y1, $TEMP1, $TEMP1
  541. vmovdqu 32*6-16-128($np), $TEMP0
  542. vpaddq $TEMP1, $ACC4, $ACC4
  543. vpmuludq $Y1, $TEMP2, $TEMP2
  544. vmovdqu 32*7-16-128($np), $TEMP1
  545. vpaddq $TEMP2, $ACC5, $ACC5
  546. vpmuludq $Y1, $TEMP0, $TEMP0
  547. vmovdqu 32*8-16-128($np), $TEMP2
  548. vpaddq $TEMP0, $ACC6, $ACC6
  549. vpmuludq $Y1, $TEMP1, $TEMP1
  550. shr \$29, $r3
  551. vmovdqu 32*9-16-128($np), $TEMP0
  552. add $r3, %rax
  553. vpaddq $TEMP1, $ACC7, $ACC7
  554. vpmuludq $Y1, $TEMP2, $TEMP2
  555. #vmovdqu 32*2-24-128($np), $TEMP1 # moved below
  556. mov %rax, $r0
  557. imull $n0, %eax
  558. vpaddq $TEMP2, $ACC8, $ACC8
  559. vpmuludq $Y1, $TEMP0, $TEMP0
  560. and \$0x1fffffff, %eax
  561. vmovd %eax, $Y1
  562. vmovdqu 32*3-24-128($np), $TEMP2
  563. .byte 0x67
  564. vpaddq $TEMP0, $ACC9, $ACC9
  565. vpbroadcastq $Y1, $Y1
  566. vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above
  567. vmovdqu 32*4-24-128($np), $TEMP0
  568. mov %rax, %rdx
  569. imulq -128($np), %rax
  570. mov 8(%rsp), $r1
  571. vpaddq $TEMP1, $ACC2, $ACC1
  572. vpmuludq $Y2, $TEMP2, $TEMP2
  573. vmovdqu 32*5-24-128($np), $TEMP1
  574. add %rax, $r0
  575. mov %rdx, %rax
  576. imulq 8-128($np), %rax
  577. .byte 0x67
  578. shr \$29, $r0
  579. mov 16(%rsp), $r2
  580. vpaddq $TEMP2, $ACC3, $ACC2
  581. vpmuludq $Y2, $TEMP0, $TEMP0
  582. vmovdqu 32*6-24-128($np), $TEMP2
  583. add %rax, $r1
  584. mov %rdx, %rax
  585. imulq 16-128($np), %rax
  586. vpaddq $TEMP0, $ACC4, $ACC3
  587. vpmuludq $Y2, $TEMP1, $TEMP1
  588. vmovdqu 32*7-24-128($np), $TEMP0
  589. imulq 24-128($np), %rdx # future $r3
  590. add %rax, $r2
  591. lea ($r0,$r1), %rax
  592. vpaddq $TEMP1, $ACC5, $ACC4
  593. vpmuludq $Y2, $TEMP2, $TEMP2
  594. vmovdqu 32*8-24-128($np), $TEMP1
  595. mov %rax, $r1
  596. imull $n0, %eax
  597. vpmuludq $Y2, $TEMP0, $TEMP0
  598. vpaddq $TEMP2, $ACC6, $ACC5
  599. vmovdqu 32*9-24-128($np), $TEMP2
  600. and \$0x1fffffff, %eax
  601. vpaddq $TEMP0, $ACC7, $ACC6
  602. vpmuludq $Y2, $TEMP1, $TEMP1
  603. add 24(%rsp), %rdx
  604. vpaddq $TEMP1, $ACC8, $ACC7
  605. vpmuludq $Y2, $TEMP2, $TEMP2
  606. vpaddq $TEMP2, $ACC9, $ACC8
  607. vmovq $r3, $ACC9
  608. mov %rdx, $r3
  609. dec $i
  610. jnz .LOOP_REDUCE_1024
  611. ___
  612. ($ACC0,$Y2)=($Y2,$ACC0);
  613. $code.=<<___;
  614. lea 448(%rsp), $tp1 # size optimization
  615. vpaddq $ACC9, $Y2, $ACC0
  616. vpxor $ZERO, $ZERO, $ZERO
  617. vpaddq 32*9-192($tp0), $ACC0, $ACC0
  618. vpaddq 32*10-448($tp1), $ACC1, $ACC1
  619. vpaddq 32*11-448($tp1), $ACC2, $ACC2
  620. vpaddq 32*12-448($tp1), $ACC3, $ACC3
  621. vpaddq 32*13-448($tp1), $ACC4, $ACC4
  622. vpaddq 32*14-448($tp1), $ACC5, $ACC5
  623. vpaddq 32*15-448($tp1), $ACC6, $ACC6
  624. vpaddq 32*16-448($tp1), $ACC7, $ACC7
  625. vpaddq 32*17-448($tp1), $ACC8, $ACC8
  626. vpsrlq \$29, $ACC0, $TEMP1
  627. vpand $AND_MASK, $ACC0, $ACC0
  628. vpsrlq \$29, $ACC1, $TEMP2
  629. vpand $AND_MASK, $ACC1, $ACC1
  630. vpsrlq \$29, $ACC2, $TEMP3
  631. vpermq \$0x93, $TEMP1, $TEMP1
  632. vpand $AND_MASK, $ACC2, $ACC2
  633. vpsrlq \$29, $ACC3, $TEMP4
  634. vpermq \$0x93, $TEMP2, $TEMP2
  635. vpand $AND_MASK, $ACC3, $ACC3
  636. vpermq \$0x93, $TEMP3, $TEMP3
  637. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  638. vpermq \$0x93, $TEMP4, $TEMP4
  639. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  640. vpaddq $TEMP0, $ACC0, $ACC0
  641. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  642. vpaddq $TEMP1, $ACC1, $ACC1
  643. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  644. vpaddq $TEMP2, $ACC2, $ACC2
  645. vpblendd \$3, $TEMP4, $ZERO, $TEMP4
  646. vpaddq $TEMP3, $ACC3, $ACC3
  647. vpaddq $TEMP4, $ACC4, $ACC4
  648. vpsrlq \$29, $ACC0, $TEMP1
  649. vpand $AND_MASK, $ACC0, $ACC0
  650. vpsrlq \$29, $ACC1, $TEMP2
  651. vpand $AND_MASK, $ACC1, $ACC1
  652. vpsrlq \$29, $ACC2, $TEMP3
  653. vpermq \$0x93, $TEMP1, $TEMP1
  654. vpand $AND_MASK, $ACC2, $ACC2
  655. vpsrlq \$29, $ACC3, $TEMP4
  656. vpermq \$0x93, $TEMP2, $TEMP2
  657. vpand $AND_MASK, $ACC3, $ACC3
  658. vpermq \$0x93, $TEMP3, $TEMP3
  659. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  660. vpermq \$0x93, $TEMP4, $TEMP4
  661. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  662. vpaddq $TEMP0, $ACC0, $ACC0
  663. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  664. vpaddq $TEMP1, $ACC1, $ACC1
  665. vmovdqu $ACC0, 32*0-128($rp)
  666. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  667. vpaddq $TEMP2, $ACC2, $ACC2
  668. vmovdqu $ACC1, 32*1-128($rp)
  669. vpblendd \$3, $TEMP4, $ZERO, $TEMP4
  670. vpaddq $TEMP3, $ACC3, $ACC3
  671. vmovdqu $ACC2, 32*2-128($rp)
  672. vpaddq $TEMP4, $ACC4, $ACC4
  673. vmovdqu $ACC3, 32*3-128($rp)
  674. ___
  675. $TEMP5=$ACC0;
  676. $code.=<<___;
  677. vpsrlq \$29, $ACC4, $TEMP1
  678. vpand $AND_MASK, $ACC4, $ACC4
  679. vpsrlq \$29, $ACC5, $TEMP2
  680. vpand $AND_MASK, $ACC5, $ACC5
  681. vpsrlq \$29, $ACC6, $TEMP3
  682. vpermq \$0x93, $TEMP1, $TEMP1
  683. vpand $AND_MASK, $ACC6, $ACC6
  684. vpsrlq \$29, $ACC7, $TEMP4
  685. vpermq \$0x93, $TEMP2, $TEMP2
  686. vpand $AND_MASK, $ACC7, $ACC7
  687. vpsrlq \$29, $ACC8, $TEMP5
  688. vpermq \$0x93, $TEMP3, $TEMP3
  689. vpand $AND_MASK, $ACC8, $ACC8
  690. vpermq \$0x93, $TEMP4, $TEMP4
  691. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  692. vpermq \$0x93, $TEMP5, $TEMP5
  693. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  694. vpaddq $TEMP0, $ACC4, $ACC4
  695. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  696. vpaddq $TEMP1, $ACC5, $ACC5
  697. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  698. vpaddq $TEMP2, $ACC6, $ACC6
  699. vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
  700. vpaddq $TEMP3, $ACC7, $ACC7
  701. vpaddq $TEMP4, $ACC8, $ACC8
  702. vpsrlq \$29, $ACC4, $TEMP1
  703. vpand $AND_MASK, $ACC4, $ACC4
  704. vpsrlq \$29, $ACC5, $TEMP2
  705. vpand $AND_MASK, $ACC5, $ACC5
  706. vpsrlq \$29, $ACC6, $TEMP3
  707. vpermq \$0x93, $TEMP1, $TEMP1
  708. vpand $AND_MASK, $ACC6, $ACC6
  709. vpsrlq \$29, $ACC7, $TEMP4
  710. vpermq \$0x93, $TEMP2, $TEMP2
  711. vpand $AND_MASK, $ACC7, $ACC7
  712. vpsrlq \$29, $ACC8, $TEMP5
  713. vpermq \$0x93, $TEMP3, $TEMP3
  714. vpand $AND_MASK, $ACC8, $ACC8
  715. vpermq \$0x93, $TEMP4, $TEMP4
  716. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  717. vpermq \$0x93, $TEMP5, $TEMP5
  718. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  719. vpaddq $TEMP0, $ACC4, $ACC4
  720. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  721. vpaddq $TEMP1, $ACC5, $ACC5
  722. vmovdqu $ACC4, 32*4-128($rp)
  723. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  724. vpaddq $TEMP2, $ACC6, $ACC6
  725. vmovdqu $ACC5, 32*5-128($rp)
  726. vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
  727. vpaddq $TEMP3, $ACC7, $ACC7
  728. vmovdqu $ACC6, 32*6-128($rp)
  729. vpaddq $TEMP4, $ACC8, $ACC8
  730. vmovdqu $ACC7, 32*7-128($rp)
  731. vmovdqu $ACC8, 32*8-128($rp)
  732. mov $rp, $ap
  733. dec $rep
  734. jne .LOOP_GRANDE_SQR_1024
  735. vzeroall
  736. mov %rbp, %rax
  737. .cfi_def_cfa_register %rax
  738. ___
  739. $code.=<<___ if ($win64);
  740. .Lsqr_1024_in_tail:
  741. movaps -0xd8(%rax),%xmm6
  742. movaps -0xc8(%rax),%xmm7
  743. movaps -0xb8(%rax),%xmm8
  744. movaps -0xa8(%rax),%xmm9
  745. movaps -0x98(%rax),%xmm10
  746. movaps -0x88(%rax),%xmm11
  747. movaps -0x78(%rax),%xmm12
  748. movaps -0x68(%rax),%xmm13
  749. movaps -0x58(%rax),%xmm14
  750. movaps -0x48(%rax),%xmm15
  751. ___
  752. $code.=<<___;
  753. mov -48(%rax),%r15
  754. .cfi_restore %r15
  755. mov -40(%rax),%r14
  756. .cfi_restore %r14
  757. mov -32(%rax),%r13
  758. .cfi_restore %r13
  759. mov -24(%rax),%r12
  760. .cfi_restore %r12
  761. mov -16(%rax),%rbp
  762. .cfi_restore %rbp
  763. mov -8(%rax),%rbx
  764. .cfi_restore %rbx
  765. lea (%rax),%rsp # restore %rsp
  766. .cfi_def_cfa_register %rsp
  767. .Lsqr_1024_epilogue:
  768. ret
  769. .cfi_endproc
  770. .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
  771. ___
  772. }
  773. { # void AMM_WW(
  # Emits rsaz_1024_mul_avx2(rp, ap, bp, np, n0).
  # Operands use the redundant form described below: 29-bit digits, one per
  # 64-bit lane, four lanes per %ymm register.  The main loop runs 9 times and
  # consumes four words of bp per pass; each of its four steps adds Bi*ap and
  # Yi*np to the accumulators, where Yi = (low digit * n0) & 0x1fffffff.
  # NOTE(review): "AMM" presumably stands for almost-Montgomery
  # multiplication; confirm against the callers.
  774. my $rp="%rdi"; # BN_ULONG *rp,
  775. my $ap="%rsi"; # const BN_ULONG *ap,
  776. my $bp="%rdx"; # const BN_ULONG *bp,
  777. my $np="%rcx"; # const BN_ULONG *np,
  778. my $n0="%r8d"; # unsigned int n0);
  779. # The registers that hold the accumulated redundant result
  780. # The AMM works on 1024 bit operands, and redundant word size is 29
  781. # Therefore: ceil(1024/29)/4 = 9
  782. my $ACC0="%ymm0";
  783. my $ACC1="%ymm1";
  784. my $ACC2="%ymm2";
  785. my $ACC3="%ymm3";
  786. my $ACC4="%ymm4";
  787. my $ACC5="%ymm5";
  788. my $ACC6="%ymm6";
  789. my $ACC7="%ymm7";
  790. my $ACC8="%ymm8";
  791. my $ACC9="%ymm9";
  792. # Registers that hold the broadcasted words of multiplier, currently used
  793. my $Bi="%ymm10";
  794. my $Yi="%ymm11";
  795. # Helper registers
  # $TEMP0 shares %ymm0 with $ACC0: during the loop the four lowest digits are
  # tracked in the scalar registers $r0-$r3, which leaves %ymm0 free as scratch
  # until the last step of a pass writes $ACC0 again.
  796. my $TEMP0=$ACC0;
  797. my $TEMP1="%ymm12";
  798. my $TEMP2="%ymm13";
  799. my $ZERO="%ymm14";
  800. my $AND_MASK="%ymm15";
  801. # alu registers that hold the first words of the ACC
  802. my $r0="%r9";
  803. my $r1="%r10";
  804. my $r2="%r11";
  805. my $r3="%r12";
  806. my $i="%r14d";
  807. my $tmp="%r15";
  808. $bp="%r13"; # reassigned argument
  # Prologue: keep the entry %rsp in %rax and push the six callee-saved GPRs.
  809. $code.=<<___;
  810. .globl rsaz_1024_mul_avx2
  811. .type rsaz_1024_mul_avx2,\@function,5
  812. .align 64
  813. rsaz_1024_mul_avx2:
  814. .cfi_startproc
  815. lea (%rsp), %rax
  816. .cfi_def_cfa_register %rax
  817. push %rbx
  818. .cfi_push %rbx
  819. push %rbp
  820. .cfi_push %rbp
  821. push %r12
  822. .cfi_push %r12
  823. push %r13
  824. .cfi_push %r13
  825. push %r14
  826. .cfi_push %r14
  827. push %r15
  828. .cfi_push %r15
  829. ___
  # Win64 only: %xmm6-%xmm15 are stored below the pushed GPRs, addressed from
  # the entry %rsp that is still held in %rax.
  830. $code.=<<___ if ($win64);
  831. vzeroupper
  832. lea -0xa8(%rsp),%rsp
  833. vmovaps %xmm6,-0xd8(%rax)
  834. vmovaps %xmm7,-0xc8(%rax)
  835. vmovaps %xmm8,-0xb8(%rax)
  836. vmovaps %xmm9,-0xa8(%rax)
  837. vmovaps %xmm10,-0x98(%rax)
  838. vmovaps %xmm11,-0x88(%rax)
  839. vmovaps %xmm12,-0x78(%rax)
  840. vmovaps %xmm13,-0x68(%rax)
  841. vmovaps %xmm14,-0x58(%rax)
  842. vmovaps %xmm15,-0x48(%rax)
  843. .Lmul_1024_body:
  844. ___
  # Frame setup, page-crossing workarounds and the main loop.  %rbp keeps the
  # entry %rsp so the epilogue can locate the saved registers after %rsp has
  # been lowered and realigned.  vzeroall clears every %ymm register, which the
  # code below relies on for its zero values.
  845. $code.=<<___;
  846. mov %rax,%rbp
  847. .cfi_def_cfa_register %rbp
  848. vzeroall
  849. mov %rdx, $bp # reassigned argument
  850. sub \$64,%rsp
  851. # unaligned 256-bit load that crosses page boundary can
  852. # cause severe performance degradation here, so if $ap does
  853. # cross page boundary, swap it with $bp [meaning that caller
  854. # is advised to lay down $ap and $bp next to each other, so
  855. # that only one can cross page boundary].
  # NOTE(review): 0x67 is the address-size prefix; these bytes look like padding for instruction alignment - confirm.
  856. .byte 0x67,0x67
  857. mov $ap, $tmp
  858. and \$4095, $tmp
  859. add \$32*10, $tmp
  860. shr \$12, $tmp
  # mov and cmov do not modify flags, so ZF from the shr above still selects the swap
  861. mov $ap, $tmp
  862. cmovnz $bp, $ap
  863. cmovnz $tmp, $bp
  864. mov $np, $tmp
  865. sub \$-128,$ap # size optimization
  866. sub \$-128,$np
  867. sub \$-128,$rp
  868. and \$4095, $tmp # see if $np crosses page
  869. add \$32*10, $tmp
  870. .byte 0x67,0x67
  871. shr \$12, $tmp
  872. jz .Lmul_1024_no_n_copy
  873. # unaligned 256-bit load that crosses page boundary can
  874. # cause severe performance degradation here, so if $np does
  875. # cross page boundary, copy it to stack and make sure stack
  876. # frame doesn't...
  877. sub \$32*10,%rsp
  878. vmovdqu 32*0-128($np), $ACC0
  879. and \$-512, %rsp
  880. vmovdqu 32*1-128($np), $ACC1
  881. vmovdqu 32*2-128($np), $ACC2
  882. vmovdqu 32*3-128($np), $ACC3
  883. vmovdqu 32*4-128($np), $ACC4
  884. vmovdqu 32*5-128($np), $ACC5
  885. vmovdqu 32*6-128($np), $ACC6
  886. vmovdqu 32*7-128($np), $ACC7
  887. vmovdqu 32*8-128($np), $ACC8
  # from here on np refers to the stack copy, with the same +128 bias; each accumulator is zeroed again right after it is stored
  888. lea 64+128(%rsp),$np
  889. vmovdqu $ACC0, 32*0-128($np)
  890. vpxor $ACC0, $ACC0, $ACC0
  891. vmovdqu $ACC1, 32*1-128($np)
  892. vpxor $ACC1, $ACC1, $ACC1
  893. vmovdqu $ACC2, 32*2-128($np)
  894. vpxor $ACC2, $ACC2, $ACC2
  895. vmovdqu $ACC3, 32*3-128($np)
  896. vpxor $ACC3, $ACC3, $ACC3
  897. vmovdqu $ACC4, 32*4-128($np)
  898. vpxor $ACC4, $ACC4, $ACC4
  899. vmovdqu $ACC5, 32*5-128($np)
  900. vpxor $ACC5, $ACC5, $ACC5
  901. vmovdqu $ACC6, 32*6-128($np)
  902. vpxor $ACC6, $ACC6, $ACC6
  903. vmovdqu $ACC7, 32*7-128($np)
  904. vpxor $ACC7, $ACC7, $ACC7
  905. vmovdqu $ACC8, 32*8-128($np)
  906. vmovdqa $ACC0, $ACC8
  907. vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall
  908. .Lmul_1024_no_n_copy:
  909. and \$-64,%rsp
  910. mov ($bp), %rbx
  911. vpbroadcastq ($bp), $Bi
  912. vmovdqu $ACC0, (%rsp) # clear top of stack
  913. xor $r0, $r0
  914. .byte 0x67
  915. xor $r1, $r1
  916. xor $r2, $r2
  917. xor $r3, $r3
  918. vmovdqu .Land_mask(%rip), $AND_MASK
  919. mov \$9, $i
  920. vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall
  921. jmp .Loop_mul_1024
  922. .align 32
  923. .Loop_mul_1024:
  # step 1 of 4: multiplier word 0 of this pass (in %rbx and Bi); vector operands are read at their natural offsets
  924. vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*)
  925. mov %rbx, %rax
  926. imulq -128($ap), %rax
  927. add $r0, %rax
  928. mov %rbx, $r1
  929. imulq 8-128($ap), $r1
  930. add 8(%rsp), $r1
  931. mov %rax, $r0
  932. imull $n0, %eax
  933. and \$0x1fffffff, %eax
  934. mov %rbx, $r2
  935. imulq 16-128($ap), $r2
  936. add 16(%rsp), $r2
  937. mov %rbx, $r3
  938. imulq 24-128($ap), $r3
  939. add 24(%rsp), $r3
  940. vpmuludq 32*1-128($ap),$Bi,$TEMP0
  941. vmovd %eax, $Yi
  942. vpaddq $TEMP0,$ACC1,$ACC1
  943. vpmuludq 32*2-128($ap),$Bi,$TEMP1
  944. vpbroadcastq $Yi, $Yi
  945. vpaddq $TEMP1,$ACC2,$ACC2
  946. vpmuludq 32*3-128($ap),$Bi,$TEMP2
  947. vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3
  948. vpaddq $TEMP2,$ACC3,$ACC3
  949. vpmuludq 32*4-128($ap),$Bi,$TEMP0
  950. vpaddq $TEMP0,$ACC4,$ACC4
  951. vpmuludq 32*5-128($ap),$Bi,$TEMP1
  952. vpaddq $TEMP1,$ACC5,$ACC5
  953. vpmuludq 32*6-128($ap),$Bi,$TEMP2
  954. vpaddq $TEMP2,$ACC6,$ACC6
  955. vpmuludq 32*7-128($ap),$Bi,$TEMP0
  956. vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3
  957. vpaddq $TEMP0,$ACC7,$ACC7
  958. vpmuludq 32*8-128($ap),$Bi,$TEMP1
  959. vpbroadcastq 8($bp), $Bi
  960. vpaddq $TEMP1,$ACC8,$ACC8
  961. mov %rax,%rdx
  962. imulq -128($np),%rax
  963. add %rax,$r0
  964. mov %rdx,%rax
  965. imulq 8-128($np),%rax
  966. add %rax,$r1
  967. mov %rdx,%rax
  968. imulq 16-128($np),%rax
  969. add %rax,$r2
  970. shr \$29, $r0
  971. imulq 24-128($np),%rdx
  972. add %rdx,$r3
  973. add $r0, $r1
  974. vpmuludq 32*1-128($np),$Yi,$TEMP2
  975. vmovq $Bi, %rbx
  976. vpaddq $TEMP2,$ACC1,$ACC1
  977. vpmuludq 32*2-128($np),$Yi,$TEMP0
  978. vpaddq $TEMP0,$ACC2,$ACC2
  979. vpmuludq 32*3-128($np),$Yi,$TEMP1
  980. vpaddq $TEMP1,$ACC3,$ACC3
  981. vpmuludq 32*4-128($np),$Yi,$TEMP2
  982. vpaddq $TEMP2,$ACC4,$ACC4
  983. vpmuludq 32*5-128($np),$Yi,$TEMP0
  984. vpaddq $TEMP0,$ACC5,$ACC5
  985. vpmuludq 32*6-128($np),$Yi,$TEMP1
  986. vpaddq $TEMP1,$ACC6,$ACC6
  987. vpmuludq 32*7-128($np),$Yi,$TEMP2
  988. vpblendd \$3, $ZERO, $ACC9, $TEMP1 # correct $ACC3
  989. vpaddq $TEMP2,$ACC7,$ACC7
  990. vpmuludq 32*8-128($np),$Yi,$TEMP0
  991. vpaddq $TEMP1, $ACC3, $ACC3 # correct $ACC3
  992. vpaddq $TEMP0,$ACC8,$ACC8
  # step 2 of 4: multiplier word 1; vector operands are loaded 8 bytes lower, i.e. one digit down
  993. mov %rbx, %rax
  994. imulq -128($ap),%rax
  995. add %rax,$r1
  996. vmovdqu -8+32*1-128($ap),$TEMP1
  997. mov %rbx, %rax
  998. imulq 8-128($ap),%rax
  999. add %rax,$r2
  1000. vmovdqu -8+32*2-128($ap),$TEMP2
  1001. mov $r1, %rax
  1002. vpblendd \$0xfc, $ZERO, $ACC9, $ACC9 # correct $ACC3
  1003. imull $n0, %eax
  1004. vpaddq $ACC9,$ACC4,$ACC4 # correct $ACC3
  1005. and \$0x1fffffff, %eax
  1006. imulq 16-128($ap),%rbx
  1007. add %rbx,$r3
  1008. vpmuludq $Bi,$TEMP1,$TEMP1
  1009. vmovd %eax, $Yi
  1010. vmovdqu -8+32*3-128($ap),$TEMP0
  1011. vpaddq $TEMP1,$ACC1,$ACC1
  1012. vpmuludq $Bi,$TEMP2,$TEMP2
  1013. vpbroadcastq $Yi, $Yi
  1014. vmovdqu -8+32*4-128($ap),$TEMP1
  1015. vpaddq $TEMP2,$ACC2,$ACC2
  1016. vpmuludq $Bi,$TEMP0,$TEMP0
  1017. vmovdqu -8+32*5-128($ap),$TEMP2
  1018. vpaddq $TEMP0,$ACC3,$ACC3
  1019. vpmuludq $Bi,$TEMP1,$TEMP1
  1020. vmovdqu -8+32*6-128($ap),$TEMP0
  1021. vpaddq $TEMP1,$ACC4,$ACC4
  1022. vpmuludq $Bi,$TEMP2,$TEMP2
  1023. vmovdqu -8+32*7-128($ap),$TEMP1
  1024. vpaddq $TEMP2,$ACC5,$ACC5
  1025. vpmuludq $Bi,$TEMP0,$TEMP0
  1026. vmovdqu -8+32*8-128($ap),$TEMP2
  1027. vpaddq $TEMP0,$ACC6,$ACC6
  1028. vpmuludq $Bi,$TEMP1,$TEMP1
  1029. vmovdqu -8+32*9-128($ap),$ACC9
  1030. vpaddq $TEMP1,$ACC7,$ACC7
  1031. vpmuludq $Bi,$TEMP2,$TEMP2
  1032. vpaddq $TEMP2,$ACC8,$ACC8
  1033. vpmuludq $Bi,$ACC9,$ACC9
  1034. vpbroadcastq 16($bp), $Bi
  1035. mov %rax,%rdx
  1036. imulq -128($np),%rax
  1037. add %rax,$r1
  1038. vmovdqu -8+32*1-128($np),$TEMP0
  1039. mov %rdx,%rax
  1040. imulq 8-128($np),%rax
  1041. add %rax,$r2
  1042. vmovdqu -8+32*2-128($np),$TEMP1
  1043. shr \$29, $r1
  1044. imulq 16-128($np),%rdx
  1045. add %rdx,$r3
  1046. add $r1, $r2
  1047. vpmuludq $Yi,$TEMP0,$TEMP0
  1048. vmovq $Bi, %rbx
  1049. vmovdqu -8+32*3-128($np),$TEMP2
  1050. vpaddq $TEMP0,$ACC1,$ACC1
  1051. vpmuludq $Yi,$TEMP1,$TEMP1
  1052. vmovdqu -8+32*4-128($np),$TEMP0
  1053. vpaddq $TEMP1,$ACC2,$ACC2
  1054. vpmuludq $Yi,$TEMP2,$TEMP2
  1055. vmovdqu -8+32*5-128($np),$TEMP1
  1056. vpaddq $TEMP2,$ACC3,$ACC3
  1057. vpmuludq $Yi,$TEMP0,$TEMP0
  1058. vmovdqu -8+32*6-128($np),$TEMP2
  1059. vpaddq $TEMP0,$ACC4,$ACC4
  1060. vpmuludq $Yi,$TEMP1,$TEMP1
  1061. vmovdqu -8+32*7-128($np),$TEMP0
  1062. vpaddq $TEMP1,$ACC5,$ACC5
  1063. vpmuludq $Yi,$TEMP2,$TEMP2
  1064. vmovdqu -8+32*8-128($np),$TEMP1
  1065. vpaddq $TEMP2,$ACC6,$ACC6
  1066. vpmuludq $Yi,$TEMP0,$TEMP0
  1067. vmovdqu -8+32*9-128($np),$TEMP2
  1068. vpaddq $TEMP0,$ACC7,$ACC7
  1069. vpmuludq $Yi,$TEMP1,$TEMP1
  1070. vpaddq $TEMP1,$ACC8,$ACC8
  1071. vpmuludq $Yi,$TEMP2,$TEMP2
  1072. vpaddq $TEMP2,$ACC9,$ACC9
  # step 3 of 4: multiplier word 2; vector operands are loaded 16 bytes lower
  1073. vmovdqu -16+32*1-128($ap),$TEMP0
  1074. mov %rbx,%rax
  1075. imulq -128($ap),%rax
  1076. add $r2,%rax
  1077. vmovdqu -16+32*2-128($ap),$TEMP1
  1078. mov %rax,$r2
  1079. imull $n0, %eax
  1080. and \$0x1fffffff, %eax
  1081. imulq 8-128($ap),%rbx
  1082. add %rbx,$r3
  1083. vpmuludq $Bi,$TEMP0,$TEMP0
  1084. vmovd %eax, $Yi
  1085. vmovdqu -16+32*3-128($ap),$TEMP2
  1086. vpaddq $TEMP0,$ACC1,$ACC1
  1087. vpmuludq $Bi,$TEMP1,$TEMP1
  1088. vpbroadcastq $Yi, $Yi
  1089. vmovdqu -16+32*4-128($ap),$TEMP0
  1090. vpaddq $TEMP1,$ACC2,$ACC2
  1091. vpmuludq $Bi,$TEMP2,$TEMP2
  1092. vmovdqu -16+32*5-128($ap),$TEMP1
  1093. vpaddq $TEMP2,$ACC3,$ACC3
  1094. vpmuludq $Bi,$TEMP0,$TEMP0
  1095. vmovdqu -16+32*6-128($ap),$TEMP2
  1096. vpaddq $TEMP0,$ACC4,$ACC4
  1097. vpmuludq $Bi,$TEMP1,$TEMP1
  1098. vmovdqu -16+32*7-128($ap),$TEMP0
  1099. vpaddq $TEMP1,$ACC5,$ACC5
  1100. vpmuludq $Bi,$TEMP2,$TEMP2
  1101. vmovdqu -16+32*8-128($ap),$TEMP1
  1102. vpaddq $TEMP2,$ACC6,$ACC6
  1103. vpmuludq $Bi,$TEMP0,$TEMP0
  1104. vmovdqu -16+32*9-128($ap),$TEMP2
  1105. vpaddq $TEMP0,$ACC7,$ACC7
  1106. vpmuludq $Bi,$TEMP1,$TEMP1
  1107. vpaddq $TEMP1,$ACC8,$ACC8
  1108. vpmuludq $Bi,$TEMP2,$TEMP2
  1109. vpbroadcastq 24($bp), $Bi
  1110. vpaddq $TEMP2,$ACC9,$ACC9
  1111. vmovdqu -16+32*1-128($np),$TEMP0
  1112. mov %rax,%rdx
  1113. imulq -128($np),%rax
  1114. add %rax,$r2
  1115. vmovdqu -16+32*2-128($np),$TEMP1
  1116. imulq 8-128($np),%rdx
  1117. add %rdx,$r3
  1118. shr \$29, $r2
  1119. vpmuludq $Yi,$TEMP0,$TEMP0
  1120. vmovq $Bi, %rbx
  1121. vmovdqu -16+32*3-128($np),$TEMP2
  1122. vpaddq $TEMP0,$ACC1,$ACC1
  1123. vpmuludq $Yi,$TEMP1,$TEMP1
  1124. vmovdqu -16+32*4-128($np),$TEMP0
  1125. vpaddq $TEMP1,$ACC2,$ACC2
  1126. vpmuludq $Yi,$TEMP2,$TEMP2
  1127. vmovdqu -16+32*5-128($np),$TEMP1
  1128. vpaddq $TEMP2,$ACC3,$ACC3
  1129. vpmuludq $Yi,$TEMP0,$TEMP0
  1130. vmovdqu -16+32*6-128($np),$TEMP2
  1131. vpaddq $TEMP0,$ACC4,$ACC4
  1132. vpmuludq $Yi,$TEMP1,$TEMP1
  1133. vmovdqu -16+32*7-128($np),$TEMP0
  1134. vpaddq $TEMP1,$ACC5,$ACC5
  1135. vpmuludq $Yi,$TEMP2,$TEMP2
  1136. vmovdqu -16+32*8-128($np),$TEMP1
  1137. vpaddq $TEMP2,$ACC6,$ACC6
  1138. vpmuludq $Yi,$TEMP0,$TEMP0
  1139. vmovdqu -16+32*9-128($np),$TEMP2
  1140. vpaddq $TEMP0,$ACC7,$ACC7
  1141. vpmuludq $Yi,$TEMP1,$TEMP1
  1142. vmovdqu -24+32*1-128($ap),$TEMP0
  1143. vpaddq $TEMP1,$ACC8,$ACC8
  1144. vpmuludq $Yi,$TEMP2,$TEMP2
  1145. vmovdqu -24+32*2-128($ap),$TEMP1
  1146. vpaddq $TEMP2,$ACC9,$ACC9
  # step 4 of 4: multiplier word 3; vector operands are loaded 24 bytes lower (the first two loads were issued above); the final sums are written one register down (ACC1 into ACC0 and so on) ready for the next pass
  1147. add $r2, $r3
  1148. imulq -128($ap),%rbx
  1149. add %rbx,$r3
  1150. mov $r3, %rax
  1151. imull $n0, %eax
  1152. and \$0x1fffffff, %eax
  1153. vpmuludq $Bi,$TEMP0,$TEMP0
  1154. vmovd %eax, $Yi
  1155. vmovdqu -24+32*3-128($ap),$TEMP2
  1156. vpaddq $TEMP0,$ACC1,$ACC1
  1157. vpmuludq $Bi,$TEMP1,$TEMP1
  1158. vpbroadcastq $Yi, $Yi
  1159. vmovdqu -24+32*4-128($ap),$TEMP0
  1160. vpaddq $TEMP1,$ACC2,$ACC2
  1161. vpmuludq $Bi,$TEMP2,$TEMP2
  1162. vmovdqu -24+32*5-128($ap),$TEMP1
  1163. vpaddq $TEMP2,$ACC3,$ACC3
  1164. vpmuludq $Bi,$TEMP0,$TEMP0
  1165. vmovdqu -24+32*6-128($ap),$TEMP2
  1166. vpaddq $TEMP0,$ACC4,$ACC4
  1167. vpmuludq $Bi,$TEMP1,$TEMP1
  1168. vmovdqu -24+32*7-128($ap),$TEMP0
  1169. vpaddq $TEMP1,$ACC5,$ACC5
  1170. vpmuludq $Bi,$TEMP2,$TEMP2
  1171. vmovdqu -24+32*8-128($ap),$TEMP1
  1172. vpaddq $TEMP2,$ACC6,$ACC6
  1173. vpmuludq $Bi,$TEMP0,$TEMP0
  1174. vmovdqu -24+32*9-128($ap),$TEMP2
  1175. vpaddq $TEMP0,$ACC7,$ACC7
  1176. vpmuludq $Bi,$TEMP1,$TEMP1
  1177. vpaddq $TEMP1,$ACC8,$ACC8
  1178. vpmuludq $Bi,$TEMP2,$TEMP2
  1179. vpbroadcastq 32($bp), $Bi
  1180. vpaddq $TEMP2,$ACC9,$ACC9
  1181. add \$32, $bp # $bp++
  1182. vmovdqu -24+32*1-128($np),$TEMP0
  1183. imulq -128($np),%rax
  1184. add %rax,$r3
  1185. shr \$29, $r3
  1186. vmovdqu -24+32*2-128($np),$TEMP1
  1187. vpmuludq $Yi,$TEMP0,$TEMP0
  1188. vmovq $Bi, %rbx
  1189. vmovdqu -24+32*3-128($np),$TEMP2
  1190. vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0
  1191. vpmuludq $Yi,$TEMP1,$TEMP1
  1192. vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
  1193. vpaddq $TEMP1,$ACC2,$ACC1
  1194. vmovdqu -24+32*4-128($np),$TEMP0
  1195. vpmuludq $Yi,$TEMP2,$TEMP2
  1196. vmovdqu -24+32*5-128($np),$TEMP1
  1197. vpaddq $TEMP2,$ACC3,$ACC2
  1198. vpmuludq $Yi,$TEMP0,$TEMP0
  1199. vmovdqu -24+32*6-128($np),$TEMP2
  1200. vpaddq $TEMP0,$ACC4,$ACC3
  1201. vpmuludq $Yi,$TEMP1,$TEMP1
  1202. vmovdqu -24+32*7-128($np),$TEMP0
  1203. vpaddq $TEMP1,$ACC5,$ACC4
  1204. vpmuludq $Yi,$TEMP2,$TEMP2
  1205. vmovdqu -24+32*8-128($np),$TEMP1
  1206. vpaddq $TEMP2,$ACC6,$ACC5
  1207. vpmuludq $Yi,$TEMP0,$TEMP0
  1208. vmovdqu -24+32*9-128($np),$TEMP2
  1209. mov $r3, $r0
  1210. vpaddq $TEMP0,$ACC7,$ACC6
  1211. vpmuludq $Yi,$TEMP1,$TEMP1
  1212. add (%rsp), $r0
  1213. vpaddq $TEMP1,$ACC8,$ACC7
  1214. vpmuludq $Yi,$TEMP2,$TEMP2
  1215. vmovq $r3, $TEMP1
  1216. vpaddq $TEMP2,$ACC9,$ACC8
  1217. dec $i
  1218. jnz .Loop_mul_1024
  1219. ___
  1220. # (*) Original implementation was correcting ACC1-ACC3 for overflow
  1221. # after 7 loop runs, or after 28 iterations, or 56 additions.
  1222. # But as we underutilize resources, it's possible to correct in
  1223. # each iteration with marginal performance loss. But then, as
  1224. # we do it in each iteration, we can correct fewer digits, and
  1225. # avoid performance penalties completely.
  # Registers that are no longer needed once the loop is done are recycled as
  # additional temporaries for the carry propagation below.
  1226. $TEMP0 = $ACC9;
  1227. $TEMP3 = $Bi;
  1228. $TEMP4 = $Yi;
  # Carry propagation for $ACC0-$ACC3, performed twice.  Each pass splits every
  # lane into its low 29 bits and the carry above them, rotates the carries up
  # by one lane (vpermq 0x93) and uses vpblendd so that the carry leaving lane 3
  # of one vector enters lane 0 of the next vector; $ACC4 receives the carry
  # out of $ACC3.  The first instruction folds the scalar carry saved in $TEMP1
  # at the end of the loop into the low vector read back from the stack.
  1229. $code.=<<___;
  1230. vpaddq (%rsp), $TEMP1, $ACC0
  1231. vpsrlq \$29, $ACC0, $TEMP1
  1232. vpand $AND_MASK, $ACC0, $ACC0
  1233. vpsrlq \$29, $ACC1, $TEMP2
  1234. vpand $AND_MASK, $ACC1, $ACC1
  1235. vpsrlq \$29, $ACC2, $TEMP3
  1236. vpermq \$0x93, $TEMP1, $TEMP1
  1237. vpand $AND_MASK, $ACC2, $ACC2
  1238. vpsrlq \$29, $ACC3, $TEMP4
  1239. vpermq \$0x93, $TEMP2, $TEMP2
  1240. vpand $AND_MASK, $ACC3, $ACC3
  1241. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  1242. vpermq \$0x93, $TEMP3, $TEMP3
  1243. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  1244. vpermq \$0x93, $TEMP4, $TEMP4
  1245. vpaddq $TEMP0, $ACC0, $ACC0
  1246. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  1247. vpaddq $TEMP1, $ACC1, $ACC1
  1248. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  1249. vpaddq $TEMP2, $ACC2, $ACC2
  1250. vpblendd \$3, $TEMP4, $ZERO, $TEMP4
  1251. vpaddq $TEMP3, $ACC3, $ACC3
  1252. vpaddq $TEMP4, $ACC4, $ACC4
  1253. vpsrlq \$29, $ACC0, $TEMP1
  1254. vpand $AND_MASK, $ACC0, $ACC0
  1255. vpsrlq \$29, $ACC1, $TEMP2
  1256. vpand $AND_MASK, $ACC1, $ACC1
  1257. vpsrlq \$29, $ACC2, $TEMP3
  1258. vpermq \$0x93, $TEMP1, $TEMP1
  1259. vpand $AND_MASK, $ACC2, $ACC2
  1260. vpsrlq \$29, $ACC3, $TEMP4
  1261. vpermq \$0x93, $TEMP2, $TEMP2
  1262. vpand $AND_MASK, $ACC3, $ACC3
  1263. vpermq \$0x93, $TEMP3, $TEMP3
  1264. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  1265. vpermq \$0x93, $TEMP4, $TEMP4
  1266. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  1267. vpaddq $TEMP0, $ACC0, $ACC0
  1268. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  1269. vpaddq $TEMP1, $ACC1, $ACC1
  1270. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  1271. vpaddq $TEMP2, $ACC2, $ACC2
  1272. vpblendd \$3, $TEMP4, $ZERO, $TEMP4
  1273. vpaddq $TEMP3, $ACC3, $ACC3
  1274. vpaddq $TEMP4, $ACC4, $ACC4
  1275. vmovdqu $ACC0, 0-128($rp)
  1276. vmovdqu $ACC1, 32-128($rp)
  1277. vmovdqu $ACC2, 64-128($rp)
  1278. vmovdqu $ACC3, 96-128($rp)
  1279. ___
  # $ACC0 has just been stored, so its register can serve as a fifth temporary.
  1280. $TEMP5=$ACC0;
  # The same two-pass carry propagation for $ACC4-$ACC8, followed by the stores
  # of the upper five result vectors and vzeroupper before returning.
  1281. $code.=<<___;
  1282. vpsrlq \$29, $ACC4, $TEMP1
  1283. vpand $AND_MASK, $ACC4, $ACC4
  1284. vpsrlq \$29, $ACC5, $TEMP2
  1285. vpand $AND_MASK, $ACC5, $ACC5
  1286. vpsrlq \$29, $ACC6, $TEMP3
  1287. vpermq \$0x93, $TEMP1, $TEMP1
  1288. vpand $AND_MASK, $ACC6, $ACC6
  1289. vpsrlq \$29, $ACC7, $TEMP4
  1290. vpermq \$0x93, $TEMP2, $TEMP2
  1291. vpand $AND_MASK, $ACC7, $ACC7
  1292. vpsrlq \$29, $ACC8, $TEMP5
  1293. vpermq \$0x93, $TEMP3, $TEMP3
  1294. vpand $AND_MASK, $ACC8, $ACC8
  1295. vpermq \$0x93, $TEMP4, $TEMP4
  1296. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  1297. vpermq \$0x93, $TEMP5, $TEMP5
  1298. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  1299. vpaddq $TEMP0, $ACC4, $ACC4
  1300. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  1301. vpaddq $TEMP1, $ACC5, $ACC5
  1302. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  1303. vpaddq $TEMP2, $ACC6, $ACC6
  1304. vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
  1305. vpaddq $TEMP3, $ACC7, $ACC7
  1306. vpaddq $TEMP4, $ACC8, $ACC8
  1307. vpsrlq \$29, $ACC4, $TEMP1
  1308. vpand $AND_MASK, $ACC4, $ACC4
  1309. vpsrlq \$29, $ACC5, $TEMP2
  1310. vpand $AND_MASK, $ACC5, $ACC5
  1311. vpsrlq \$29, $ACC6, $TEMP3
  1312. vpermq \$0x93, $TEMP1, $TEMP1
  1313. vpand $AND_MASK, $ACC6, $ACC6
  1314. vpsrlq \$29, $ACC7, $TEMP4
  1315. vpermq \$0x93, $TEMP2, $TEMP2
  1316. vpand $AND_MASK, $ACC7, $ACC7
  1317. vpsrlq \$29, $ACC8, $TEMP5
  1318. vpermq \$0x93, $TEMP3, $TEMP3
  1319. vpand $AND_MASK, $ACC8, $ACC8
  1320. vpermq \$0x93, $TEMP4, $TEMP4
  1321. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  1322. vpermq \$0x93, $TEMP5, $TEMP5
  1323. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  1324. vpaddq $TEMP0, $ACC4, $ACC4
  1325. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  1326. vpaddq $TEMP1, $ACC5, $ACC5
  1327. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  1328. vpaddq $TEMP2, $ACC6, $ACC6
  1329. vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
  1330. vpaddq $TEMP3, $ACC7, $ACC7
  1331. vpaddq $TEMP4, $ACC8, $ACC8
  1332. vmovdqu $ACC4, 128-128($rp)
  1333. vmovdqu $ACC5, 160-128($rp)
  1334. vmovdqu $ACC6, 192-128($rp)
  1335. vmovdqu $ACC7, 224-128($rp)
  1336. vmovdqu $ACC8, 256-128($rp)
  1337. vzeroupper
  1338. mov %rbp, %rax
  1339. .cfi_def_cfa_register %rax
  1340. ___
  # Win64 only: reload %xmm6-%xmm15 from the slots written by the prologue.
  1341. $code.=<<___ if ($win64);
  1342. .Lmul_1024_in_tail:
  1343. movaps -0xd8(%rax),%xmm6
  1344. movaps -0xc8(%rax),%xmm7
  1345. movaps -0xb8(%rax),%xmm8
  1346. movaps -0xa8(%rax),%xmm9
  1347. movaps -0x98(%rax),%xmm10
  1348. movaps -0x88(%rax),%xmm11
  1349. movaps -0x78(%rax),%xmm12
  1350. movaps -0x68(%rax),%xmm13
  1351. movaps -0x58(%rax),%xmm14
  1352. movaps -0x48(%rax),%xmm15
  1353. ___
  # Epilogue: restore the callee-saved GPRs relative to the entry %rsp (now in
  # %rax), restore %rsp itself and return.
  1354. $code.=<<___;
  1355. mov -48(%rax),%r15
  1356. .cfi_restore %r15
  1357. mov -40(%rax),%r14
  1358. .cfi_restore %r14
  1359. mov -32(%rax),%r13
  1360. .cfi_restore %r13
  1361. mov -24(%rax),%r12
  1362. .cfi_restore %r12
  1363. mov -16(%rax),%rbp
  1364. .cfi_restore %rbp
  1365. mov -8(%rax),%rbx
  1366. .cfi_restore %rbx
  1367. lea (%rax),%rsp # restore %rsp
  1368. .cfi_def_cfa_register %rsp
  1369. .Lmul_1024_epilogue:
  1370. ret
  1371. .cfi_endproc
  1372. .size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
  1373. ___
  1374. }
  1375. {
  # Conversion helpers between the redundant form (29-bit digits, one per
  # 64-bit slot) and ordinary 64-bit words.  Both routines are unrolled by the
  # Perl loops below.  @T is a rotating pool of scratch registers %r8-%r11; it
  # is not reset between the two generators, so its rotation state carries over
  # from the first one to the second.
  1376. my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
  1377. my @T = map("%r$_",(8..11));
  # rsaz_1024_red2norm_avx2(out, inp): packs the digits at inp into sixteen
  # 64-bit words at out.  %rax accumulates the word currently being assembled.
  1378. $code.=<<___;
  1379. .globl rsaz_1024_red2norm_avx2
  1380. .type rsaz_1024_red2norm_avx2,\@abi-omnipotent
  1381. .align 32
  1382. rsaz_1024_red2norm_avx2:
  1383. .cfi_startproc
  1384. sub \$-128,$inp # size optimization
  1385. xor %rax,%rax
  1386. ___
  # $j indexes input digits, $i indexes output words; $k/$l count the digits
  # loaded for the current output word.
  1387. for ($j=0,$i=0; $i<16; $i++) {
  1388. my $k=0;
  1389. while (29*$j<64*($i+1)) { # load data till boundary
  1390. $code.=" mov `8*$j-128`($inp), @T[0]\n";
  1391. $j++; $k++; push(@T,shift(@T));
  1392. }
  1393. $l=$k;
  1394. while ($k>1) { # shift loaded data but last value
  1395. $code.=" shl \$`29*($j-$k)`,@T[-$k]\n";
  1396. $k--;
  1397. }
  # The last digit loaded may straddle the 64-bit boundary: a copy is shifted
  # left to contribute its low part to this word, while the original is shifted
  # right to keep the spill-over, which later seeds %rax for the next word.
  # NOTE(review): shift counts are emitted as absolute bit positions (they can
  # exceed 63 or be negative); presumably a later pass elsewhere in the file
  # reduces them modulo 64 - confirm.
  1398. $code.=<<___; # shift last value
  1399. mov @T[-1], @T[0]
  1400. shl \$`29*($j-1)`, @T[-1]
  1401. shr \$`-29*($j-1)`, @T[0]
  1402. ___
  1403. while ($l) { # accumulate all values
  1404. $code.=" add @T[-$l], %rax\n";
  1405. $l--;
  1406. }
  # Fold the carry of the final add into the spill-over, store the finished
  # word and start the next one from the spill-over.
  1407. $code.=<<___;
  1408. adc \$0, @T[0] # consume eventual carry
  1409. mov %rax, 8*$i($out)
  1410. mov @T[0], %rax
  1411. ___
  1412. push(@T,shift(@T));
  1413. }
  # End of red2norm.  rsaz_1024_norm2red_avx2(out, inp) follows: it splits the
  # sixteen 64-bit words at inp into 29-bit digits stored in 64-bit slots at
  # out (out is biased by 128).  %eax is loaded with the digit mask; writing
  # %eax zero-extends into %rax, which is what the "and" instructions use.
  1414. $code.=<<___;
  1415. ret
  1416. .cfi_endproc
  1417. .size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
  1418. .globl rsaz_1024_norm2red_avx2
  1419. .type rsaz_1024_norm2red_avx2,\@abi-omnipotent
  1420. .align 32
  1421. rsaz_1024_norm2red_avx2:
  1422. .cfi_startproc
  1423. sub \$-128,$out # size optimization
  1424. mov ($inp),@T[0]
  1425. mov \$0x1fffffff,%eax
  1426. ___
  # T[0] holds the current input word and T[1] the next one (zeroed after the
  # last word).  Digits lying entirely inside the current word are extracted
  # with shr/and into scratch registers taken from the end of @T; the digit
  # that straddles two words is extracted with shrd.
  1427. for ($j=0,$i=0; $i<16; $i++) {
  1428. $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15);
  1429. $code.=" xor @T[1],@T[1]\n" if ($i==15);
  1430. my $k=1;
  1431. while (29*($j+1)<64*($i+1)) {
  1432. $code.=<<___;
  1433. mov @T[0],@T[-$k]
  1434. shr \$`29*$j`,@T[-$k]
  1435. and %rax,@T[-$k] # &0x1fffffff
  1436. mov @T[-$k],`8*$j-128`($out)
  1437. ___
  1438. $j++; $k++;
  1439. }
  1440. $code.=<<___;
  1441. shrd \$`29*$j`,@T[1],@T[0]
  1442. and %rax,@T[0]
  1443. mov @T[0],`8*$j-128`($out)
  1444. ___
  1445. $j++;
  1446. push(@T,shift(@T));
  1447. }
  # 36 digits have been written at this point; four more zero slots are stored
  # (T[0] is the register that was zeroed in the last iteration), giving 40
  # slots, i.e. ten 32-byte vectors.
  1448. $code.=<<___;
  1449. mov @T[0],`8*$j-128`($out) # zero
  1450. mov @T[0],`8*($j+1)-128`($out)
  1451. mov @T[0],`8*($j+2)-128`($out)
  1452. mov @T[0],`8*($j+3)-128`($out)
  1453. ret
  1454. .cfi_endproc
  1455. .size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
  1456. ___
  1457. }
  1458. {
  # Table scatter/gather helpers.  The table is organised as 9 rows of 512
  # bytes; a row holds 32 entries of 16 bytes, so entry "power" of a row lives
  # at byte offset 16*power inside that row.
  1459. my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
  # rsaz_1024_scatter5_avx2(out, inp, power): for each of 9 input vectors keep
  # the low dword of every qword (vpermd with .Lscatter_permd) and store the
  # resulting 16 bytes at out + 16*power + 512*row.
  # The same heredoc also begins rsaz_1024_gather5_avx2(out, inp, power), whose
  # entry %rsp is kept in %r11 for the epilogue.
  1460. $code.=<<___;
  1461. .globl rsaz_1024_scatter5_avx2
  1462. .type rsaz_1024_scatter5_avx2,\@abi-omnipotent
  1463. .align 32
  1464. rsaz_1024_scatter5_avx2:
  1465. .cfi_startproc
  1466. vzeroupper
  1467. vmovdqu .Lscatter_permd(%rip),%ymm5
  1468. shl \$4,$power
  1469. lea ($out,$power),$out
  1470. mov \$9,%eax
  1471. jmp .Loop_scatter_1024
  1472. .align 32
  1473. .Loop_scatter_1024:
  1474. vmovdqu ($inp),%ymm0
  1475. lea 32($inp),$inp
  1476. vpermd %ymm0,%ymm5,%ymm0
  1477. vmovdqu %xmm0,($out)
  1478. lea 16*32($out),$out
  1479. dec %eax
  1480. jnz .Loop_scatter_1024
  1481. vzeroupper
  1482. ret
  1483. .cfi_endproc
  1484. .size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
  1485. .globl rsaz_1024_gather5_avx2
  1486. .type rsaz_1024_gather5_avx2,\@abi-omnipotent
  1487. .align 32
  1488. rsaz_1024_gather5_avx2:
  1489. .cfi_startproc
  1490. vzeroupper
  1491. mov %rsp,%r11
  1492. .cfi_def_cfa_register %r11
  1493. ___
  # Win64 only: allocate the save area and store %xmm6-%xmm15 using fixed,
  # hand-written instruction encodings.  Relative to the entry %rsp the slots
  # run from -0xa8 (%xmm6) to -0x18 (%xmm15), matching the restore code below.
  1494. $code.=<<___ if ($win64);
  1495. lea -0x88(%rsp),%rax
  1496. .LSEH_begin_rsaz_1024_gather5:
  1497. # I can't trust assembler to use specific encoding:-(
  1498. .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp
  1499. .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax)
  1500. .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax)
  1501. .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax)
  1502. .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax)
  1503. .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax)
  1504. .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax)
  1505. .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax)
  1506. .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax)
  1507. .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax)
  1508. .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax)
  1509. ___
  # Gather body.
  #  1. Build 16 selection masks, each covering two 16-byte entries (indices
  #     0..31): the index vectors from .Linc are advanced by 4 per step and
  #     compared with the broadcast power.  Masks 0-7 are spilled to the
  #     32-byte-aligned stack frame, masks 8-15 stay in %ymm8-%ymm15.
  #  2. For each of the 9 rows, AND all 16 vectors of the row with their masks
  #     and OR the results together, so every entry is read whatever the value
  #     of power.  The two 128-bit halves are then merged and vpermd with
  #     .Lgather_permd spreads the four dwords into four 64-bit slots (index 7
  #     selects a zero dword).
  #  3. After the 9 rows a tenth, all-zero vector is stored at out.
  # The power register is reused as the row counter once it has been broadcast.
  # The vmovdqa loads from inp require the table to be 32-byte aligned.
  1510. $code.=<<___;
  1511. lea -0x100(%rsp),%rsp
  1512. and \$-32, %rsp
  1513. lea .Linc(%rip), %r10
  1514. lea -128(%rsp),%rax # control u-op density
  1515. vmovd $power, %xmm4
  1516. vmovdqa (%r10),%ymm0
  1517. vmovdqa 32(%r10),%ymm1
  1518. vmovdqa 64(%r10),%ymm5
  1519. vpbroadcastd %xmm4,%ymm4
  1520. vpaddd %ymm5, %ymm0, %ymm2
  1521. vpcmpeqd %ymm4, %ymm0, %ymm0
  1522. vpaddd %ymm5, %ymm1, %ymm3
  1523. vpcmpeqd %ymm4, %ymm1, %ymm1
  1524. vmovdqa %ymm0, 32*0+128(%rax)
  1525. vpaddd %ymm5, %ymm2, %ymm0
  1526. vpcmpeqd %ymm4, %ymm2, %ymm2
  1527. vmovdqa %ymm1, 32*1+128(%rax)
  1528. vpaddd %ymm5, %ymm3, %ymm1
  1529. vpcmpeqd %ymm4, %ymm3, %ymm3
  1530. vmovdqa %ymm2, 32*2+128(%rax)
  1531. vpaddd %ymm5, %ymm0, %ymm2
  1532. vpcmpeqd %ymm4, %ymm0, %ymm0
  1533. vmovdqa %ymm3, 32*3+128(%rax)
  1534. vpaddd %ymm5, %ymm1, %ymm3
  1535. vpcmpeqd %ymm4, %ymm1, %ymm1
  1536. vmovdqa %ymm0, 32*4+128(%rax)
  1537. vpaddd %ymm5, %ymm2, %ymm8
  1538. vpcmpeqd %ymm4, %ymm2, %ymm2
  1539. vmovdqa %ymm1, 32*5+128(%rax)
  1540. vpaddd %ymm5, %ymm3, %ymm9
  1541. vpcmpeqd %ymm4, %ymm3, %ymm3
  1542. vmovdqa %ymm2, 32*6+128(%rax)
  1543. vpaddd %ymm5, %ymm8, %ymm10
  1544. vpcmpeqd %ymm4, %ymm8, %ymm8
  1545. vmovdqa %ymm3, 32*7+128(%rax)
  1546. vpaddd %ymm5, %ymm9, %ymm11
  1547. vpcmpeqd %ymm4, %ymm9, %ymm9
  1548. vpaddd %ymm5, %ymm10, %ymm12
  1549. vpcmpeqd %ymm4, %ymm10, %ymm10
  1550. vpaddd %ymm5, %ymm11, %ymm13
  1551. vpcmpeqd %ymm4, %ymm11, %ymm11
  1552. vpaddd %ymm5, %ymm12, %ymm14
  1553. vpcmpeqd %ymm4, %ymm12, %ymm12
  1554. vpaddd %ymm5, %ymm13, %ymm15
  1555. vpcmpeqd %ymm4, %ymm13, %ymm13
  1556. vpcmpeqd %ymm4, %ymm14, %ymm14
  1557. vpcmpeqd %ymm4, %ymm15, %ymm15
  1558. vmovdqa -32(%r10),%ymm7 # .Lgather_permd
  1559. lea 128($inp), $inp
  1560. mov \$9,$power
  1561. .Loop_gather_1024:
  1562. vmovdqa 32*0-128($inp), %ymm0
  1563. vmovdqa 32*1-128($inp), %ymm1
  1564. vmovdqa 32*2-128($inp), %ymm2
  1565. vmovdqa 32*3-128($inp), %ymm3
  1566. vpand 32*0+128(%rax), %ymm0, %ymm0
  1567. vpand 32*1+128(%rax), %ymm1, %ymm1
  1568. vpand 32*2+128(%rax), %ymm2, %ymm2
  1569. vpor %ymm0, %ymm1, %ymm4
  1570. vpand 32*3+128(%rax), %ymm3, %ymm3
  1571. vmovdqa 32*4-128($inp), %ymm0
  1572. vmovdqa 32*5-128($inp), %ymm1
  1573. vpor %ymm2, %ymm3, %ymm5
  1574. vmovdqa 32*6-128($inp), %ymm2
  1575. vmovdqa 32*7-128($inp), %ymm3
  1576. vpand 32*4+128(%rax), %ymm0, %ymm0
  1577. vpand 32*5+128(%rax), %ymm1, %ymm1
  1578. vpand 32*6+128(%rax), %ymm2, %ymm2
  1579. vpor %ymm0, %ymm4, %ymm4
  1580. vpand 32*7+128(%rax), %ymm3, %ymm3
  1581. vpand 32*8-128($inp), %ymm8, %ymm0
  1582. vpor %ymm1, %ymm5, %ymm5
  1583. vpand 32*9-128($inp), %ymm9, %ymm1
  1584. vpor %ymm2, %ymm4, %ymm4
  1585. vpand 32*10-128($inp),%ymm10, %ymm2
  1586. vpor %ymm3, %ymm5, %ymm5
  1587. vpand 32*11-128($inp),%ymm11, %ymm3
  1588. vpor %ymm0, %ymm4, %ymm4
  1589. vpand 32*12-128($inp),%ymm12, %ymm0
  1590. vpor %ymm1, %ymm5, %ymm5
  1591. vpand 32*13-128($inp),%ymm13, %ymm1
  1592. vpor %ymm2, %ymm4, %ymm4
  1593. vpand 32*14-128($inp),%ymm14, %ymm2
  1594. vpor %ymm3, %ymm5, %ymm5
  1595. vpand 32*15-128($inp),%ymm15, %ymm3
  1596. lea 32*16($inp), $inp
  1597. vpor %ymm0, %ymm4, %ymm4
  1598. vpor %ymm1, %ymm5, %ymm5
  1599. vpor %ymm2, %ymm4, %ymm4
  1600. vpor %ymm3, %ymm5, %ymm5
  1601. vpor %ymm5, %ymm4, %ymm4
  1602. vextracti128 \$1, %ymm4, %xmm5 # upper half is cleared
  1603. vpor %xmm4, %xmm5, %xmm5
  1604. vpermd %ymm5,%ymm7,%ymm5
  1605. vmovdqu %ymm5,($out)
  1606. lea 32($out),$out
  1607. dec $power
  1608. jnz .Loop_gather_1024
  1609. vpxor %ymm0,%ymm0,%ymm0
  1610. vmovdqu %ymm0,($out)
  1611. vzeroupper
  1612. ___
  # Win64 only: restore %xmm6-%xmm15 from the prologue's slots, now addressed
  # from the entry %rsp held in %r11.
  1613. $code.=<<___ if ($win64);
  1614. movaps -0xa8(%r11),%xmm6
  1615. movaps -0x98(%r11),%xmm7
  1616. movaps -0x88(%r11),%xmm8
  1617. movaps -0x78(%r11),%xmm9
  1618. movaps -0x68(%r11),%xmm10
  1619. movaps -0x58(%r11),%xmm11
  1620. movaps -0x48(%r11),%xmm12
  1621. movaps -0x38(%r11),%xmm13
  1622. movaps -0x28(%r11),%xmm14
  1623. movaps -0x18(%r11),%xmm15
  1624. ___
  # Drop the frame by restoring the entry %rsp and return.
  1625. $code.=<<___;
  1626. lea (%r11),%rsp
  1627. .cfi_def_cfa_register %rsp
  1628. ret
  1629. .cfi_endproc
  1630. .LSEH_end_rsaz_1024_gather5:
  1631. .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
  1632. ___
  1633. }
  # rsaz_avx2_eligible(): returns bit 5 of the dword at OPENSSL_ia32cap_P+8,
  # normalised to 0 or 1.  When $addx is set at generation time and bits 8 and
  # 19 of that dword (BMI2 and ADX according to the comment below) are both
  # present, the value is cleared first, so the function returns 0.
  # NOTE(review): bit 5 is presumably the AVX2 feature flag - confirm against
  # the OPENSSL_ia32cap documentation.
  1634. $code.=<<___;
  1635. .extern OPENSSL_ia32cap_P
  1636. .globl rsaz_avx2_eligible
  1637. .type rsaz_avx2_eligible,\@abi-omnipotent
  1638. .align 32
  1639. rsaz_avx2_eligible:
  1640. mov OPENSSL_ia32cap_P+8(%rip),%eax
  1641. ___
  1642. $code.=<<___ if ($addx);
  1643. mov \$`1<<8|1<<19`,%ecx
  1644. mov \$0,%edx
  1645. and %eax,%ecx
  1646. cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X
  1647. cmove %edx,%eax
  1648. ___
  # Tail of the function followed by the constant tables:
  #   .Land_mask       four 64-bit lanes holding the 29-bit digit mask
  #   .Lscatter_permd  vpermd indices that pick the low dword of each qword
  #   .Lgather_permd   vpermd indices that spread four dwords into four qwords
  #   .Linc            index vectors {0,1}, {2,3} and the per-step increment 4
  # The gather routine addresses .Lgather_permd as .Linc-32 and uses aligned
  # 32-byte loads on these tables, so their order and the 64-byte alignment of
  # the group must be preserved.
  1649. $code.=<<___;
  1650. and \$`1<<5`,%eax
  1651. shr \$5,%eax
  1652. ret
  1653. .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
  1654. .align 64
  1655. .Land_mask:
  1656. .quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
  1657. .Lscatter_permd:
  1658. .long 0,2,4,6,7,7,7,7
  1659. .Lgather_permd:
  1660. .long 0,7,1,7,2,7,3,7
  1661. .Linc:
  1662. .long 0,0,0,0, 1,1,1,1
  1663. .long 2,2,2,2, 3,3,3,3
  1664. .long 4,4,4,4, 4,4,4,4
  1665. .align 64
  1666. ___
  # Win64 only: emit the exception handler rsaz_se_handler together with the
  # .pdata/.xdata records that describe rsaz_1024_sqr_avx2,
  # rsaz_1024_mul_avx2 and rsaz_1024_gather5 to the unwinder.
  1667. if ($win64) {
  # Aliases for the four argument registers of the handler.  Only $context
  # and $disp are referenced by the assembly emitted below.
  1668. $rec="%rcx";
  1669. $frame="%rdx";
  1670. $context="%r8";
  1671. $disp="%r9";
  # rsaz_se_handler, step by step:
  #  1. Push the non-volatile general-purpose registers and the flags, then
  #     reserve 64 bytes of stack for the outgoing call.
  #  2. Load context->Rax and context->Rip.  HandlerData[0] and [1] are
  #     image-relative offsets (added to disp->ImageBase) of the body and
  #     epilogue labels; when Rip lies outside [body, epilogue) control goes
  #     straight to .Lcommon_seh_tail with %rax = context->Rax.
  #  3. Otherwise HandlerData[2] (the "in tail" label) selects the frame
  #     base: cmovc replaces %rax with context->Rbp when Rip is below that
  #     label, and leaves context->Rax in place when it is not.
  #  4. %r15,%r14,%r13,%r12,%rbp,%rbx are reloaded from -48..-8(%rax) and
  #     written back to the context; 20 quadwords starting at -0xd8(%rax)
  #     are copied to offset 512 of the context (the Xmm6 save area).
  #  5. .Lcommon_seh_tail stores %rax as context->Rsp, takes Rdi/Rsi from
  #     8(%rax)/16(%rax), copies 154 quadwords of the context to
  #     disp->ContextRecord, calls RtlVirtualUnwind and returns 1
  #     (ExceptionContinueSearch) after restoring the pushed registers.
  #
  # .pdata holds one (begin, end, info) triple per function.  The sqr and mul
  # info records reference rsaz_se_handler and supply the three labels read
  # as HandlerData above; the gather5 record does not reference the handler
  # and instead lists raw unwind codes, each annotated on its own line.
  #
  # Note that the heredoc statement has no trailing ';' -- it is the last
  # statement of this block, so Perl accepts it.
  1672. $code.=<<___
  1673. .extern __imp_RtlVirtualUnwind
  1674. .type rsaz_se_handler,\@abi-omnipotent
  1675. .align 16
  1676. rsaz_se_handler:
  1677. push %rsi
  1678. push %rdi
  1679. push %rbx
  1680. push %rbp
  1681. push %r12
  1682. push %r13
  1683. push %r14
  1684. push %r15
  1685. pushfq
  1686. sub \$64,%rsp
  1687. mov 120($context),%rax # pull context->Rax
  1688. mov 248($context),%rbx # pull context->Rip
  1689. mov 8($disp),%rsi # disp->ImageBase
  1690. mov 56($disp),%r11 # disp->HandlerData
  1691. mov 0(%r11),%r10d # HandlerData[0]
  1692. lea (%rsi,%r10),%r10 # prologue label
  1693. cmp %r10,%rbx # context->Rip<prologue label
  1694. jb .Lcommon_seh_tail
  1695. mov 4(%r11),%r10d # HandlerData[1]
  1696. lea (%rsi,%r10),%r10 # epilogue label
  1697. cmp %r10,%rbx # context->Rip>=epilogue label
  1698. jae .Lcommon_seh_tail
  1699. mov 160($context),%rbp # pull context->Rbp
  1700. mov 8(%r11),%r10d # HandlerData[2]
  1701. lea (%rsi,%r10),%r10 # "in tail" label
  1702. cmp %r10,%rbx # context->Rip>="in tail" label
  1703. cmovc %rbp,%rax
  1704. mov -48(%rax),%r15
  1705. mov -40(%rax),%r14
  1706. mov -32(%rax),%r13
  1707. mov -24(%rax),%r12
  1708. mov -16(%rax),%rbp
  1709. mov -8(%rax),%rbx
  1710. mov %r15,240($context)
  1711. mov %r14,232($context)
  1712. mov %r13,224($context)
  1713. mov %r12,216($context)
  1714. mov %rbp,160($context)
  1715. mov %rbx,144($context)
  1716. lea -0xd8(%rax),%rsi # %xmm save area
  1717. lea 512($context),%rdi # & context.Xmm6
  1718. mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
  1719. .long 0xa548f3fc # cld; rep movsq
  1720. .Lcommon_seh_tail:
  1721. mov 8(%rax),%rdi
  1722. mov 16(%rax),%rsi
  1723. mov %rax,152($context) # restore context->Rsp
  1724. mov %rsi,168($context) # restore context->Rsi
  1725. mov %rdi,176($context) # restore context->Rdi
  1726. mov 40($disp),%rdi # disp->ContextRecord
  1727. mov $context,%rsi # context
  1728. mov \$154,%ecx # sizeof(CONTEXT)
  1729. .long 0xa548f3fc # cld; rep movsq
  1730. mov $disp,%rsi
  1731. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1732. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1733. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1734. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1735. mov 40(%rsi),%r10 # disp->ContextRecord
  1736. lea 56(%rsi),%r11 # &disp->HandlerData
  1737. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1738. mov %r10,32(%rsp) # arg5
  1739. mov %r11,40(%rsp) # arg6
  1740. mov %r12,48(%rsp) # arg7
  1741. mov %rcx,56(%rsp) # arg8, (NULL)
  1742. call *__imp_RtlVirtualUnwind(%rip)
  1743. mov \$1,%eax # ExceptionContinueSearch
  1744. add \$64,%rsp
  1745. popfq
  1746. pop %r15
  1747. pop %r14
  1748. pop %r13
  1749. pop %r12
  1750. pop %rbp
  1751. pop %rbx
  1752. pop %rdi
  1753. pop %rsi
  1754. ret
  1755. .size rsaz_se_handler,.-rsaz_se_handler
  1756. .section .pdata
  1757. .align 4
  1758. .rva .LSEH_begin_rsaz_1024_sqr_avx2
  1759. .rva .LSEH_end_rsaz_1024_sqr_avx2
  1760. .rva .LSEH_info_rsaz_1024_sqr_avx2
  1761. .rva .LSEH_begin_rsaz_1024_mul_avx2
  1762. .rva .LSEH_end_rsaz_1024_mul_avx2
  1763. .rva .LSEH_info_rsaz_1024_mul_avx2
  1764. .rva .LSEH_begin_rsaz_1024_gather5
  1765. .rva .LSEH_end_rsaz_1024_gather5
  1766. .rva .LSEH_info_rsaz_1024_gather5
  1767. .section .xdata
  1768. .align 8
  1769. .LSEH_info_rsaz_1024_sqr_avx2:
  1770. .byte 9,0,0,0
  1771. .rva rsaz_se_handler
  1772. .rva .Lsqr_1024_body,.Lsqr_1024_epilogue,.Lsqr_1024_in_tail
  1773. .long 0
  1774. .LSEH_info_rsaz_1024_mul_avx2:
  1775. .byte 9,0,0,0
  1776. .rva rsaz_se_handler
  1777. .rva .Lmul_1024_body,.Lmul_1024_epilogue,.Lmul_1024_in_tail
  1778. .long 0
  1779. .LSEH_info_rsaz_1024_gather5:
  1780. .byte 0x01,0x36,0x17,0x0b
  1781. .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
  1782. .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
  1783. .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
  1784. .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
  1785. .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
  1786. .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
  1787. .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
  1788. .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
  1789. .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
  1790. .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
  1791. .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8
  1792. .byte 0x00,0xb3,0x00,0x00 # set_frame r11
  1793. ___
  1794. }
  # Emit the accumulated assembly text one line at a time, after two
  # rewriting passes over each line (held in $_).
  for (split(/\n/,$code)) {
	# Pass 1: every `expr` span is replaced by the value of evaluating
	# expr as Perl.
	s/\`([^\`]*)\`/eval($1)/ge;

	# Pass 2: the alternatives are tried in order and the chain stops at
	# the first substitution that matches, so at most one applies:
	#   - shr/shl/shrd/shld with an immediate: count reduced modulo 64
	#   - vmovd/vmovq:          last %ymmN on the line becomes %xmmN
	#   - vmovdqu:              last "%x%ymmN" on the line becomes %xmmN
	#   - vpinsrq/vpinsrd:      last %ymmN on the line becomes %xmmN
	#   - vpextrq/vpextrd:      last %ymmN on the line becomes %xmmN
	#   - vpbroadcastq/d:       %ymmN right after the mnemonic becomes %xmmN
	   s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge
	|| s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go
	|| s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go
	|| s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go
	|| s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go
	|| s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
  }
  1805. }}} else {{{
  # Fallback output for the branch taken when the AVX2 code is not generated:
  # rsaz_avx2_eligible always returns 0 (xor %eax,%eax), and the six
  # rsaz_1024_* entry points all share one address whose first instruction is
  # ud2.  Only rsaz_1024_sqr_avx2 is given .type/.size directives.
  1806. print <<___; # assembler is too old
  1807. .text
  1808. .globl rsaz_avx2_eligible
  1809. .type rsaz_avx2_eligible,\@abi-omnipotent
  1810. rsaz_avx2_eligible:
  1811. xor %eax,%eax
  1812. ret
  1813. .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
  1814. .globl rsaz_1024_sqr_avx2
  1815. .globl rsaz_1024_mul_avx2
  1816. .globl rsaz_1024_norm2red_avx2
  1817. .globl rsaz_1024_red2norm_avx2
  1818. .globl rsaz_1024_scatter5_avx2
  1819. .globl rsaz_1024_gather5_avx2
  1820. .type rsaz_1024_sqr_avx2,\@abi-omnipotent
  1821. rsaz_1024_sqr_avx2:
  1822. rsaz_1024_mul_avx2:
  1823. rsaz_1024_norm2red_avx2:
  1824. rsaz_1024_red2norm_avx2:
  1825. rsaz_1024_scatter5_avx2:
  1826. rsaz_1024_gather5_avx2:
  1827. .byte 0x0f,0x0b # ud2
  1828. ret
  1829. .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
  1830. ___
  1831. }}}
  # Close the generated output; a failed close (e.g. an unflushed write
  # error) terminates the script with the system error text.
  close(STDOUT) || die "error closing STDOUT: $!";