ecp_nistz256-avx2.pl

  1. #! /usr/bin/env perl
  2. # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. # Copyright (c) 2014, Intel Corporation. All Rights Reserved.
  4. #
  5. # Licensed under the OpenSSL license (the "License"). You may not use
  6. # this file except in compliance with the License. You can obtain a copy
  7. # in the file LICENSE in the source distribution or at
  8. # https://www.openssl.org/source/license.html
  9. #
  10. # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
  11. # (1) Intel Corporation, Israel Development Center, Haifa, Israel
  12. # (2) University of Haifa, Israel
  13. #
  14. # Reference:
  15. # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
  16. # 256 Bit Primes"
  17. $flavour = shift;
  18. $output = shift;
  19. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  20. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  21. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  22. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  23. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  24. die "can't locate x86_64-xlate.pl";
  25. open OUT,"| \"$^X\" $xlate $flavour $output";
  26. *STDOUT=*OUT;
  27. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  28. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  29. $avx = ($1>=2.19) + ($1>=2.22);
  30. $addx = ($1>=2.23);
  31. }
  32. if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  33. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  34. $avx = ($1>=2.09) + ($1>=2.10);
  35. $addx = ($1>=2.10);
  36. }
  37. if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  38. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  39. $avx = ($1>=10) + ($1>=11);
  40. $addx = ($1>=12);
  41. }
  42. if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) {
  43. my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
  44. $avx = ($ver>=3.0) + ($ver>=3.01);
  45. $addx = ($ver>=3.03);
  46. }
  47. if ($avx>=2) {{
  48. $digit_size = "\$29";
  49. $n_digits = "\$9";
  50. $code.=<<___;
  51. .text
  52. .align 64
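# Note: the 29-bit AND mask aliases the first digit row of the polynomial
# below; the two labels point at the same 0x1fffffff row.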
  53. .LAVX2_AND_MASK:
  54. .LAVX2_POLY:
  55. .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
  56. .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
  57. .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
  58. .quad 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff
  59. .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
  60. .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
  61. .quad 0x00040000, 0x00040000, 0x00040000, 0x00040000
  62. .quad 0x1fe00000, 0x1fe00000, 0x1fe00000, 0x1fe00000
  63. .quad 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff
  64. .LAVX2_POLY_x2:
  65. .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
  66. .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
  67. .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
  68. .quad 0x400007FC, 0x400007FC, 0x400007FC, 0x400007FC
  69. .quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
  70. .quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
  71. .quad 0x400FFFFE, 0x400FFFFE, 0x400FFFFE, 0x400FFFFE
  72. .quad 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE
  73. .quad 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC
  74. .LAVX2_POLY_x8:
  75. .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
  76. .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
  77. .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
  78. .quad 0x80000FF8, 0x80000FF8, 0x80000FF8, 0x80000FF8
  79. .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
  80. .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
  81. .quad 0x801FFFFC, 0x801FFFFC, 0x801FFFFC, 0x801FFFFC
  82. .quad 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC
  83. .quad 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8
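# ONE below is 2^261 mod p, i.e. the value 1 in the *2^261 format used by
# this file, broadcast across the four lanes.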
  84. .LONE:
  85. .quad 0x00000020, 0x00000020, 0x00000020, 0x00000020
  86. .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
  87. .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
  88. .quad 0x1fffc000, 0x1fffc000, 0x1fffc000, 0x1fffc000
  89. .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
  90. .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
  91. .quad 0x1f7fffff, 0x1f7fffff, 0x1f7fffff, 0x1f7fffff
  92. .quad 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff
  93. .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
  94. # RR = 2^266 mod p in AVX2 format, to transform from the native OpenSSL
  95. # Montgomery form (*2^256) to our format (*2^261)
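# Each AVX2 Montgomery multiplication below divides by 2^261 (nine 29-bit
# digits), so (a*2^256)*RR*2^-261 == a*2^261 (mod p) requires RR == 2^266 (mod p).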
  96. .LTO_MONT_AVX2:
  97. .quad 0x00000400, 0x00000400, 0x00000400, 0x00000400
  98. .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
  99. .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
  100. .quad 0x1ff80000, 0x1ff80000, 0x1ff80000, 0x1ff80000
  101. .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
  102. .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
  103. .quad 0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff
  104. .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
  105. .quad 0x00000003, 0x00000003, 0x00000003, 0x00000003
  106. .LFROM_MONT_AVX2:
  107. .quad 0x00000001, 0x00000001, 0x00000001, 0x00000001
  108. .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
  109. .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
  110. .quad 0x1ffffe00, 0x1ffffe00, 0x1ffffe00, 0x1ffffe00
  111. .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
  112. .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
  113. .quad 0x1ffbffff, 0x1ffbffff, 0x1ffbffff, 0x1ffbffff
  114. .quad 0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff
  115. .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
  116. .LIntOne:
  117. .long 1,1,1,1,1,1,1,1
  118. ___
  119. {
  120. # This function receives a pointer to an array of four affine points
  121. # (X, Y, <1>) and rearranges the data for AVX2 execution, while
  122. # converting it to 2^29 radix redundant form
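# Scalar reference sketch (illustrative only, never called from this file) of
# the radix conversion done below with vpsrlq/vpsllq/vpand: split four 64-bit
# limbs into nine 29-bit digits, so out[i] holds bits 29*i .. 29*i+28 of the
# 256-bit value.  Assumes a 64-bit perl.
sub ref_to_radix29 {
	my @in = @_;			# four 64-bit limbs, least significant first
	my $mask = (1 << 29) - 1;
	my @out;
	$out[0] =   $in[0]                          & $mask;
	$out[1] =  ($in[0] >> 29)                   & $mask;
	$out[2] = (($in[0] >> 58) | ($in[1] <<  6)) & $mask;
	$out[3] =  ($in[1] >> 23)                   & $mask;
	$out[4] = (($in[1] >> 52) | ($in[2] << 12)) & $mask;
	$out[5] =  ($in[2] >> 17)                   & $mask;
	$out[6] = (($in[2] >> 46) | ($in[3] << 18)) & $mask;
	$out[7] =  ($in[3] >> 11)                   & $mask;
	$out[8] =   $in[3] >> 40;	# top digit, at most 24 bits
	return @out;
}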
  123. my ($X0,$X1,$X2,$X3, $Y0,$Y1,$Y2,$Y3,
  124. $T0,$T1,$T2,$T3, $T4,$T5,$T6,$T7)=map("%ymm$_",(0..15));
  125. $code.=<<___;
  126. .globl ecp_nistz256_avx2_transpose_convert
  127. .type ecp_nistz256_avx2_transpose_convert,\@function,2
  128. .align 64
  129. ecp_nistz256_avx2_transpose_convert:
  130. vzeroupper
  131. ___
  132. $code.=<<___ if ($win64);
  133. lea -8-16*10(%rsp), %rsp
  134. vmovaps %xmm6, -8-16*10(%rax)
  135. vmovaps %xmm7, -8-16*9(%rax)
  136. vmovaps %xmm8, -8-16*8(%rax)
  137. vmovaps %xmm9, -8-16*7(%rax)
  138. vmovaps %xmm10, -8-16*6(%rax)
  139. vmovaps %xmm11, -8-16*5(%rax)
  140. vmovaps %xmm12, -8-16*4(%rax)
  141. vmovaps %xmm13, -8-16*3(%rax)
  142. vmovaps %xmm14, -8-16*2(%rax)
  143. vmovaps %xmm15, -8-16*1(%rax)
  144. ___
  145. $code.=<<___;
  146. # Load the data
  147. vmovdqa 32*0(%rsi), $X0
  148. lea 112(%rsi), %rax # size optimization
  149. vmovdqa 32*1(%rsi), $Y0
  150. lea .LAVX2_AND_MASK(%rip), %rdx
  151. vmovdqa 32*2(%rsi), $X1
  152. vmovdqa 32*3(%rsi), $Y1
  153. vmovdqa 32*4-112(%rax), $X2
  154. vmovdqa 32*5-112(%rax), $Y2
  155. vmovdqa 32*6-112(%rax), $X3
  156. vmovdqa 32*7-112(%rax), $Y3
  157. # Transpose X and Y independently
  158. vpunpcklqdq $X1, $X0, $T0 # T0 = [B2 A2 B0 A0]
  159. vpunpcklqdq $X3, $X2, $T1 # T1 = [D2 C2 D0 C0]
  160. vpunpckhqdq $X1, $X0, $T2 # T2 = [B3 A3 B1 A1]
  161. vpunpckhqdq $X3, $X2, $T3 # T3 = [D3 C3 D1 C1]
  162. vpunpcklqdq $Y1, $Y0, $T4
  163. vpunpcklqdq $Y3, $Y2, $T5
  164. vpunpckhqdq $Y1, $Y0, $T6
  165. vpunpckhqdq $Y3, $Y2, $T7
  166. vperm2i128 \$0x20, $T1, $T0, $X0 # X0 = [D0 C0 B0 A0]
  167. vperm2i128 \$0x20, $T3, $T2, $X1 # X1 = [D1 C1 B1 A1]
  168. vperm2i128 \$0x31, $T1, $T0, $X2 # X2 = [D2 C2 B2 A2]
  169. vperm2i128 \$0x31, $T3, $T2, $X3 # X3 = [D3 C3 B3 A3]
  170. vperm2i128 \$0x20, $T5, $T4, $Y0
  171. vperm2i128 \$0x20, $T7, $T6, $Y1
  172. vperm2i128 \$0x31, $T5, $T4, $Y2
  173. vperm2i128 \$0x31, $T7, $T6, $Y3
  174. vmovdqa (%rdx), $T7
  175. vpand (%rdx), $X0, $T0 # out[0] = in[0] & mask;
  176. vpsrlq \$29, $X0, $X0
  177. vpand $T7, $X0, $T1 # out[1] = (in[0] >> shift) & mask;
  178. vpsrlq \$29, $X0, $X0
  179. vpsllq \$6, $X1, $T2
  180. vpxor $X0, $T2, $T2
  181. vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
  182. vpsrlq \$23, $X1, $X1
  183. vpand $T7, $X1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask;
  184. vpsrlq \$29, $X1, $X1
  185. vpsllq \$12, $X2, $T4
  186. vpxor $X1, $T4, $T4
  187. vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
  188. vpsrlq \$17, $X2, $X2
  189. vpand $T7, $X2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask;
  190. vpsrlq \$29, $X2, $X2
  191. vpsllq \$18, $X3, $T6
  192. vpxor $X2, $T6, $T6
  193. vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
  194. vpsrlq \$11, $X3, $X3
  195. vmovdqa $T0, 32*0(%rdi)
  196. lea 112(%rdi), %rax # size optimization
  197. vpand $T7, $X3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask;
  198. vpsrlq \$29, $X3, $X3 # out[8] = (in[3] >> ((shift*8)%64)) & mask;
  199. vmovdqa $T1, 32*1(%rdi)
  200. vmovdqa $T2, 32*2(%rdi)
  201. vmovdqa $T3, 32*3(%rdi)
  202. vmovdqa $T4, 32*4-112(%rax)
  203. vmovdqa $T5, 32*5-112(%rax)
  204. vmovdqa $T6, 32*6-112(%rax)
  205. vmovdqa $T0, 32*7-112(%rax)
  206. vmovdqa $X3, 32*8-112(%rax)
  207. lea 448(%rdi), %rax # size optimization
  208. vpand $T7, $Y0, $T0 # out[0] = in[0] & mask;
  209. vpsrlq \$29, $Y0, $Y0
  210. vpand $T7, $Y0, $T1 # out[1] = (in[0] >> shift) & mask;
  211. vpsrlq \$29, $Y0, $Y0
  212. vpsllq \$6, $Y1, $T2
  213. vpxor $Y0, $T2, $T2
  214. vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
  215. vpsrlq \$23, $Y1, $Y1
  216. vpand $T7, $Y1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask;
  217. vpsrlq \$29, $Y1, $Y1
  218. vpsllq \$12, $Y2, $T4
  219. vpxor $Y1, $T4, $T4
  220. vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
  221. vpsrlq \$17, $Y2, $Y2
  222. vpand $T7, $Y2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask;
  223. vpsrlq \$29, $Y2, $Y2
  224. vpsllq \$18, $Y3, $T6
  225. vpxor $Y2, $T6, $T6
  226. vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
  227. vpsrlq \$11, $Y3, $Y3
  228. vmovdqa $T0, 32*9-448(%rax)
  229. vpand $T7, $Y3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask;
  230. vpsrlq \$29, $Y3, $Y3 # out[8] = (in[3] >> ((shift*8)%64)) & mask;
  231. vmovdqa $T1, 32*10-448(%rax)
  232. vmovdqa $T2, 32*11-448(%rax)
  233. vmovdqa $T3, 32*12-448(%rax)
  234. vmovdqa $T4, 32*13-448(%rax)
  235. vmovdqa $T5, 32*14-448(%rax)
  236. vmovdqa $T6, 32*15-448(%rax)
  237. vmovdqa $T0, 32*16-448(%rax)
  238. vmovdqa $Y3, 32*17-448(%rax)
  239. vzeroupper
  240. ___
  241. $code.=<<___ if ($win64);
  242. movaps 16*0(%rsp), %xmm6
  243. movaps 16*1(%rsp), %xmm7
  244. movaps 16*2(%rsp), %xmm8
  245. movaps 16*3(%rsp), %xmm9
  246. movaps 16*4(%rsp), %xmm10
  247. movaps 16*5(%rsp), %xmm11
  248. movaps 16*6(%rsp), %xmm12
  249. movaps 16*7(%rsp), %xmm13
  250. movaps 16*8(%rsp), %xmm14
  251. movaps 16*9(%rsp), %xmm15
  252. lea 8+16*10(%rsp), %rsp
  253. ___
  254. $code.=<<___;
  255. ret
  256. .size ecp_nistz256_avx2_transpose_convert,.-ecp_nistz256_avx2_transpose_convert
  257. ___
  258. }
  259. {
  260. ################################################################################
261. # This function receives a pointer to an array of four points in AVX2 format
262. # (X, Y, Z), converts the data back to normal representation, and rearranges it
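# Scalar reference sketch of the inverse conversion (illustrative only, never
# called from this file): reassemble four 64-bit limbs from nine 29-bit
# digits, mirroring the vpsrlq/vpsllq/vpaddq schedule below.  Assumes a fully
# normalized input (every digit below 2^29), where OR and ADD coincide, and a
# 64-bit perl, where the shifts wrap modulo 2^64 like the 64-bit vector lanes.
sub ref_from_radix29 {
	my @d = @_;			# nine 29-bit digits, least significant first
	my @out;
	$out[0] =  $d[0]        | ($d[1] << 29) | ($d[2] << 58);
	$out[1] = ($d[2] >>  6) | ($d[3] << 23) | ($d[4] << 52);
	$out[2] = ($d[4] >> 12) | ($d[5] << 17) | ($d[6] << 46);
	$out[3] = ($d[6] >> 18) | ($d[7] << 11) | ($d[8] << 40);
	return @out;			# four 64-bit limbs
}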
  263. my ($D0,$D1,$D2,$D3, $D4,$D5,$D6,$D7, $D8)=map("%ymm$_",(0..8));
  264. my ($T0,$T1,$T2,$T3, $T4,$T5,$T6)=map("%ymm$_",(9..15));
  265. $code.=<<___;
  266. .globl ecp_nistz256_avx2_convert_transpose_back
  267. .type ecp_nistz256_avx2_convert_transpose_back,\@function,2
  268. .align 32
  269. ecp_nistz256_avx2_convert_transpose_back:
  270. vzeroupper
  271. ___
  272. $code.=<<___ if ($win64);
  273. lea -8-16*10(%rsp), %rsp
  274. vmovaps %xmm6, -8-16*10(%rax)
  275. vmovaps %xmm7, -8-16*9(%rax)
  276. vmovaps %xmm8, -8-16*8(%rax)
  277. vmovaps %xmm9, -8-16*7(%rax)
  278. vmovaps %xmm10, -8-16*6(%rax)
  279. vmovaps %xmm11, -8-16*5(%rax)
  280. vmovaps %xmm12, -8-16*4(%rax)
  281. vmovaps %xmm13, -8-16*3(%rax)
  282. vmovaps %xmm14, -8-16*2(%rax)
  283. vmovaps %xmm15, -8-16*1(%rax)
  284. ___
  285. $code.=<<___;
  286. mov \$3, %ecx
  287. .Lconv_loop:
  288. vmovdqa 32*0(%rsi), $D0
  289. lea 160(%rsi), %rax # size optimization
  290. vmovdqa 32*1(%rsi), $D1
  291. vmovdqa 32*2(%rsi), $D2
  292. vmovdqa 32*3(%rsi), $D3
  293. vmovdqa 32*4-160(%rax), $D4
  294. vmovdqa 32*5-160(%rax), $D5
  295. vmovdqa 32*6-160(%rax), $D6
  296. vmovdqa 32*7-160(%rax), $D7
  297. vmovdqa 32*8-160(%rax), $D8
  298. vpsllq \$29, $D1, $D1
  299. vpsllq \$58, $D2, $T0
  300. vpaddq $D1, $D0, $D0
  301. vpaddq $T0, $D0, $D0 # out[0] = (in[0]) ^ (in[1] << shift*1) ^ (in[2] << shift*2);
  302. vpsrlq \$6, $D2, $D2
  303. vpsllq \$23, $D3, $D3
  304. vpsllq \$52, $D4, $T1
  305. vpaddq $D2, $D3, $D3
  306. vpaddq $D3, $T1, $D1 # out[1] = (in[2] >> (64*1-shift*2)) ^ (in[3] << shift*3%64) ^ (in[4] << shift*4%64);
  307. vpsrlq \$12, $D4, $D4
  308. vpsllq \$17, $D5, $D5
  309. vpsllq \$46, $D6, $T2
  310. vpaddq $D4, $D5, $D5
  311. vpaddq $D5, $T2, $D2 # out[2] = (in[4] >> (64*2-shift*4)) ^ (in[5] << shift*5%64) ^ (in[6] << shift*6%64);
  312. vpsrlq \$18, $D6, $D6
  313. vpsllq \$11, $D7, $D7
  314. vpsllq \$40, $D8, $T3
  315. vpaddq $D6, $D7, $D7
  316. vpaddq $D7, $T3, $D3 # out[3] = (in[6] >> (64*3-shift*6)) ^ (in[7] << shift*7%64) ^ (in[8] << shift*8%64);
  317. vpunpcklqdq $D1, $D0, $T0 # T0 = [B2 A2 B0 A0]
  318. vpunpcklqdq $D3, $D2, $T1 # T1 = [D2 C2 D0 C0]
  319. vpunpckhqdq $D1, $D0, $T2 # T2 = [B3 A3 B1 A1]
  320. vpunpckhqdq $D3, $D2, $T3 # T3 = [D3 C3 D1 C1]
  321. vperm2i128 \$0x20, $T1, $T0, $D0 # X0 = [D0 C0 B0 A0]
  322. vperm2i128 \$0x20, $T3, $T2, $D1 # X1 = [D1 C1 B1 A1]
  323. vperm2i128 \$0x31, $T1, $T0, $D2 # X2 = [D2 C2 B2 A2]
  324. vperm2i128 \$0x31, $T3, $T2, $D3 # X3 = [D3 C3 B3 A3]
  325. vmovdqa $D0, 32*0(%rdi)
  326. vmovdqa $D1, 32*3(%rdi)
  327. vmovdqa $D2, 32*6(%rdi)
  328. vmovdqa $D3, 32*9(%rdi)
  329. lea 32*9(%rsi), %rsi
  330. lea 32*1(%rdi), %rdi
  331. dec %ecx
  332. jnz .Lconv_loop
  333. vzeroupper
  334. ___
  335. $code.=<<___ if ($win64);
  336. movaps 16*0(%rsp), %xmm6
  337. movaps 16*1(%rsp), %xmm7
  338. movaps 16*2(%rsp), %xmm8
  339. movaps 16*3(%rsp), %xmm9
  340. movaps 16*4(%rsp), %xmm10
  341. movaps 16*5(%rsp), %xmm11
  342. movaps 16*6(%rsp), %xmm12
  343. movaps 16*7(%rsp), %xmm13
  344. movaps 16*8(%rsp), %xmm14
  345. movaps 16*9(%rsp), %xmm15
  346. lea 8+16*10(%rsp), %rsp
  347. ___
  348. $code.=<<___;
  349. ret
  350. .size ecp_nistz256_avx2_convert_transpose_back,.-ecp_nistz256_avx2_convert_transpose_back
  351. ___
  352. }
  353. {
  354. my ($r_ptr,$a_ptr,$b_ptr,$itr)=("%rdi","%rsi","%rdx","%ecx");
  355. my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4,$ACC5,$ACC6,$ACC7,$ACC8)=map("%ymm$_",(0..8));
  356. my ($B,$Y,$T0,$AND_MASK,$OVERFLOW)=map("%ymm$_",(9..13));
  357. sub NORMALIZE {
  358. my $ret=<<___;
  359. vpsrlq $digit_size, $ACC0, $T0
  360. vpand $AND_MASK, $ACC0, $ACC0
  361. vpaddq $T0, $ACC1, $ACC1
  362. vpsrlq $digit_size, $ACC1, $T0
  363. vpand $AND_MASK, $ACC1, $ACC1
  364. vpaddq $T0, $ACC2, $ACC2
  365. vpsrlq $digit_size, $ACC2, $T0
  366. vpand $AND_MASK, $ACC2, $ACC2
  367. vpaddq $T0, $ACC3, $ACC3
  368. vpsrlq $digit_size, $ACC3, $T0
  369. vpand $AND_MASK, $ACC3, $ACC3
  370. vpaddq $T0, $ACC4, $ACC4
  371. vpsrlq $digit_size, $ACC4, $T0
  372. vpand $AND_MASK, $ACC4, $ACC4
  373. vpaddq $T0, $ACC5, $ACC5
  374. vpsrlq $digit_size, $ACC5, $T0
  375. vpand $AND_MASK, $ACC5, $ACC5
  376. vpaddq $T0, $ACC6, $ACC6
  377. vpsrlq $digit_size, $ACC6, $T0
  378. vpand $AND_MASK, $ACC6, $ACC6
  379. vpaddq $T0, $ACC7, $ACC7
  380. vpsrlq $digit_size, $ACC7, $T0
  381. vpand $AND_MASK, $ACC7, $ACC7
  382. vpaddq $T0, $ACC8, $ACC8
  383. #vpand $AND_MASK, $ACC8, $ACC8
  384. ___
  385. $ret;
  386. }
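# Scalar reference sketch (illustrative only, never called from this file) of
# one carry-propagation pass, matching the vpsrlq/vpand/vpaddq chain emitted
# by NORMALIZE and avx2_normalize: each digit keeps its low 29 bits and pushes
# the excess into the next one; the top digit is left unmasked, as in the
# assembly.
sub ref_normalize {
	my @d = @_;			# nine redundant digits
	my $mask = (1 << 29) - 1;
	for my $i (0 .. 7) {
		$d[$i+1] += $d[$i] >> 29;
		$d[$i]   &= $mask;
	}
	return @d;
}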
  387. sub STORE {
  388. my $ret=<<___;
  389. vmovdqa $ACC0, 32*0(%rdi)
  390. lea 160(%rdi), %rax # size optimization
  391. vmovdqa $ACC1, 32*1(%rdi)
  392. vmovdqa $ACC2, 32*2(%rdi)
  393. vmovdqa $ACC3, 32*3(%rdi)
  394. vmovdqa $ACC4, 32*4-160(%rax)
  395. vmovdqa $ACC5, 32*5-160(%rax)
  396. vmovdqa $ACC6, 32*6-160(%rax)
  397. vmovdqa $ACC7, 32*7-160(%rax)
  398. vmovdqa $ACC8, 32*8-160(%rax)
  399. ___
  400. $ret;
  401. }
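# Scalar, single-lane reference sketch (illustrative only, never called from
# this file) of the Montgomery-style multiplication that avx2_mul_x4 and
# avx2_sqr_x4 below perform on four lanes in parallel.  Operands are nine
# redundant 29-bit digits; each iteration accumulates b[j]*a, cancels the
# lowest accumulator digit by adding a multiple of the modulus (relying on
# digit 0 of the modulus being 2^29-1), and divides by 2^29.  The result is
# congruent to a*b*2^-261 (mod p) and still needs a normalization pass.
sub ref_mont_mul_x1 {
	my ($a, $b, $poly) = @_;		# array refs, nine digits each
	my $mask = (1 << 29) - 1;
	my @acc = (0) x 10;
	for my $j (0 .. 8) {
		$acc[$_] += $b->[$j] * $a->[$_] for 0 .. 8;
		my $y = $acc[0] & $mask;		# digit to eliminate
		$acc[$_] += $y * $poly->[$_] for 0 .. 8;# acc[0] is now 0 mod 2^29
		$acc[1] += $acc[0] >> 29;		# carry out of digit 0
		shift @acc;				# divide by 2^29
		push @acc, 0;
	}
	return @acc[0 .. 8];
}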
  402. $code.=<<___;
  403. .type avx2_normalize,\@abi-omnipotent
  404. .align 32
  405. avx2_normalize:
  406. vpsrlq $digit_size, $ACC0, $T0
  407. vpand $AND_MASK, $ACC0, $ACC0
  408. vpaddq $T0, $ACC1, $ACC1
  409. vpsrlq $digit_size, $ACC1, $T0
  410. vpand $AND_MASK, $ACC1, $ACC1
  411. vpaddq $T0, $ACC2, $ACC2
  412. vpsrlq $digit_size, $ACC2, $T0
  413. vpand $AND_MASK, $ACC2, $ACC2
  414. vpaddq $T0, $ACC3, $ACC3
  415. vpsrlq $digit_size, $ACC3, $T0
  416. vpand $AND_MASK, $ACC3, $ACC3
  417. vpaddq $T0, $ACC4, $ACC4
  418. vpsrlq $digit_size, $ACC4, $T0
  419. vpand $AND_MASK, $ACC4, $ACC4
  420. vpaddq $T0, $ACC5, $ACC5
  421. vpsrlq $digit_size, $ACC5, $T0
  422. vpand $AND_MASK, $ACC5, $ACC5
  423. vpaddq $T0, $ACC6, $ACC6
  424. vpsrlq $digit_size, $ACC6, $T0
  425. vpand $AND_MASK, $ACC6, $ACC6
  426. vpaddq $T0, $ACC7, $ACC7
  427. vpsrlq $digit_size, $ACC7, $T0
  428. vpand $AND_MASK, $ACC7, $ACC7
  429. vpaddq $T0, $ACC8, $ACC8
  430. #vpand $AND_MASK, $ACC8, $ACC8
  431. ret
  432. .size avx2_normalize,.-avx2_normalize
  433. .type avx2_normalize_n_store,\@abi-omnipotent
  434. .align 32
  435. avx2_normalize_n_store:
  436. vpsrlq $digit_size, $ACC0, $T0
  437. vpand $AND_MASK, $ACC0, $ACC0
  438. vpaddq $T0, $ACC1, $ACC1
  439. vpsrlq $digit_size, $ACC1, $T0
  440. vpand $AND_MASK, $ACC1, $ACC1
  441. vmovdqa $ACC0, 32*0(%rdi)
  442. lea 160(%rdi), %rax # size optimization
  443. vpaddq $T0, $ACC2, $ACC2
  444. vpsrlq $digit_size, $ACC2, $T0
  445. vpand $AND_MASK, $ACC2, $ACC2
  446. vmovdqa $ACC1, 32*1(%rdi)
  447. vpaddq $T0, $ACC3, $ACC3
  448. vpsrlq $digit_size, $ACC3, $T0
  449. vpand $AND_MASK, $ACC3, $ACC3
  450. vmovdqa $ACC2, 32*2(%rdi)
  451. vpaddq $T0, $ACC4, $ACC4
  452. vpsrlq $digit_size, $ACC4, $T0
  453. vpand $AND_MASK, $ACC4, $ACC4
  454. vmovdqa $ACC3, 32*3(%rdi)
  455. vpaddq $T0, $ACC5, $ACC5
  456. vpsrlq $digit_size, $ACC5, $T0
  457. vpand $AND_MASK, $ACC5, $ACC5
  458. vmovdqa $ACC4, 32*4-160(%rax)
  459. vpaddq $T0, $ACC6, $ACC6
  460. vpsrlq $digit_size, $ACC6, $T0
  461. vpand $AND_MASK, $ACC6, $ACC6
  462. vmovdqa $ACC5, 32*5-160(%rax)
  463. vpaddq $T0, $ACC7, $ACC7
  464. vpsrlq $digit_size, $ACC7, $T0
  465. vpand $AND_MASK, $ACC7, $ACC7
  466. vmovdqa $ACC6, 32*6-160(%rax)
  467. vpaddq $T0, $ACC8, $ACC8
  468. #vpand $AND_MASK, $ACC8, $ACC8
  469. vmovdqa $ACC7, 32*7-160(%rax)
  470. vmovdqa $ACC8, 32*8-160(%rax)
  471. ret
  472. .size avx2_normalize_n_store,.-avx2_normalize_n_store
  473. ################################################################################
  474. # void avx2_mul_x4(void* RESULTx4, void *Ax4, void *Bx4);
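# In the reduction step, digits 0-2 of the polynomial are all 0x1fffffff and
# share a single vpmuludq with the 29-bit mask; digit 3 (0x1ff) uses
# 32*3(%rax); digits 4-5 are zero and reduce to register moves; digit 6
# (0x40000 = 2^18) becomes a shift by 18; digits 7-8 use the preloaded
# %ymm14/%ymm15.  That is the "skip some multiplications" trick noted in the
# loop below.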
  475. .type avx2_mul_x4,\@abi-omnipotent
  476. .align 32
  477. avx2_mul_x4:
  478. lea .LAVX2_POLY(%rip), %rax
  479. vpxor $ACC0, $ACC0, $ACC0
  480. vpxor $ACC1, $ACC1, $ACC1
  481. vpxor $ACC2, $ACC2, $ACC2
  482. vpxor $ACC3, $ACC3, $ACC3
  483. vpxor $ACC4, $ACC4, $ACC4
  484. vpxor $ACC5, $ACC5, $ACC5
  485. vpxor $ACC6, $ACC6, $ACC6
  486. vpxor $ACC7, $ACC7, $ACC7
  487. vmovdqa 32*7(%rax), %ymm14
  488. vmovdqa 32*8(%rax), %ymm15
  489. mov $n_digits, $itr
  490. lea -512($a_ptr), $a_ptr # strategic bias to control u-op density
  491. jmp .Lavx2_mul_x4_loop
  492. .align 32
  493. .Lavx2_mul_x4_loop:
  494. vmovdqa 32*0($b_ptr), $B
  495. lea 32*1($b_ptr), $b_ptr
  496. vpmuludq 32*0+512($a_ptr), $B, $T0
  497. vpmuludq 32*1+512($a_ptr), $B, $OVERFLOW # borrow $OVERFLOW
  498. vpaddq $T0, $ACC0, $ACC0
  499. vpmuludq 32*2+512($a_ptr), $B, $T0
  500. vpaddq $OVERFLOW, $ACC1, $ACC1
  501. vpand $AND_MASK, $ACC0, $Y
  502. vpmuludq 32*3+512($a_ptr), $B, $OVERFLOW
  503. vpaddq $T0, $ACC2, $ACC2
  504. vpmuludq 32*4+512($a_ptr), $B, $T0
  505. vpaddq $OVERFLOW, $ACC3, $ACC3
  506. vpmuludq 32*5+512($a_ptr), $B, $OVERFLOW
  507. vpaddq $T0, $ACC4, $ACC4
  508. vpmuludq 32*6+512($a_ptr), $B, $T0
  509. vpaddq $OVERFLOW, $ACC5, $ACC5
  510. vpmuludq 32*7+512($a_ptr), $B, $OVERFLOW
  511. vpaddq $T0, $ACC6, $ACC6
  512. # Skip some multiplications, optimizing for the constant poly
  513. vpmuludq $AND_MASK, $Y, $T0
  514. vpaddq $OVERFLOW, $ACC7, $ACC7
  515. vpmuludq 32*8+512($a_ptr), $B, $ACC8
  516. vpaddq $T0, $ACC0, $OVERFLOW
  517. vpaddq $T0, $ACC1, $ACC0
  518. vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
  519. vpaddq $T0, $ACC2, $ACC1
  520. vpmuludq 32*3(%rax), $Y, $T0
  521. vpaddq $OVERFLOW, $ACC0, $ACC0
  522. vpaddq $T0, $ACC3, $ACC2
  523. .byte 0x67
  524. vmovdqa $ACC4, $ACC3
  525. vpsllq \$18, $Y, $OVERFLOW
  526. .byte 0x67
  527. vmovdqa $ACC5, $ACC4
  528. vpmuludq %ymm14, $Y, $T0
  529. vpaddq $OVERFLOW, $ACC6, $ACC5
  530. vpmuludq %ymm15, $Y, $OVERFLOW
  531. vpaddq $T0, $ACC7, $ACC6
  532. vpaddq $OVERFLOW, $ACC8, $ACC7
  533. dec $itr
  534. jnz .Lavx2_mul_x4_loop
  535. vpxor $ACC8, $ACC8, $ACC8
  536. ret
  537. .size avx2_mul_x4,.-avx2_mul_x4
  538. # Function optimized for the constant 1
  539. ################################################################################
  540. # void avx2_mul_by1_x4(void* RESULTx4, void *Ax4);
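# Multiplying by .LONE (2^261 mod p, i.e. 1 in this format) and reducing by
# 2^261 returns a value congruent to the input mod p, so this routine also
# serves as a cheap modular reduction of a single redundant operand.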
  541. .type avx2_mul_by1_x4,\@abi-omnipotent
  542. .align 32
  543. avx2_mul_by1_x4:
  544. lea .LAVX2_POLY(%rip), %rax
  545. vpxor $ACC0, $ACC0, $ACC0
  546. vpxor $ACC1, $ACC1, $ACC1
  547. vpxor $ACC2, $ACC2, $ACC2
  548. vpxor $ACC3, $ACC3, $ACC3
  549. vpxor $ACC4, $ACC4, $ACC4
  550. vpxor $ACC5, $ACC5, $ACC5
  551. vpxor $ACC6, $ACC6, $ACC6
  552. vpxor $ACC7, $ACC7, $ACC7
  553. vpxor $ACC8, $ACC8, $ACC8
  554. vmovdqa 32*3+.LONE(%rip), %ymm14
  555. vmovdqa 32*7+.LONE(%rip), %ymm15
  556. mov $n_digits, $itr
  557. jmp .Lavx2_mul_by1_x4_loop
  558. .align 32
  559. .Lavx2_mul_by1_x4_loop:
  560. vmovdqa 32*0($a_ptr), $B
  561. .byte 0x48,0x8d,0xb6,0x20,0,0,0 # lea 32*1($a_ptr), $a_ptr
  562. vpsllq \$5, $B, $OVERFLOW
  563. vpmuludq %ymm14, $B, $T0
  564. vpaddq $OVERFLOW, $ACC0, $ACC0
  565. vpaddq $T0, $ACC3, $ACC3
  566. .byte 0x67
  567. vpmuludq $AND_MASK, $B, $T0
  568. vpand $AND_MASK, $ACC0, $Y
  569. vpaddq $T0, $ACC4, $ACC4
  570. vpaddq $T0, $ACC5, $ACC5
  571. vpaddq $T0, $ACC6, $ACC6
  572. vpsllq \$23, $B, $T0
  573. .byte 0x67,0x67
  574. vpmuludq %ymm15, $B, $OVERFLOW
  575. vpsubq $T0, $ACC6, $ACC6
  576. vpmuludq $AND_MASK, $Y, $T0
  577. vpaddq $OVERFLOW, $ACC7, $ACC7
  578. vpaddq $T0, $ACC0, $OVERFLOW
  579. vpaddq $T0, $ACC1, $ACC0
  580. .byte 0x67,0x67
  581. vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
  582. vpaddq $T0, $ACC2, $ACC1
  583. vpmuludq 32*3(%rax), $Y, $T0
  584. vpaddq $OVERFLOW, $ACC0, $ACC0
  585. vpaddq $T0, $ACC3, $ACC2
  586. vmovdqa $ACC4, $ACC3
  587. vpsllq \$18, $Y, $OVERFLOW
  588. vmovdqa $ACC5, $ACC4
  589. vpmuludq 32*7(%rax), $Y, $T0
  590. vpaddq $OVERFLOW, $ACC6, $ACC5
  591. vpaddq $T0, $ACC7, $ACC6
  592. vpmuludq 32*8(%rax), $Y, $ACC7
  593. dec $itr
  594. jnz .Lavx2_mul_by1_x4_loop
  595. ret
  596. .size avx2_mul_by1_x4,.-avx2_mul_by1_x4
  597. ################################################################################
598. # void avx2_sqr_x4(void* RESULTx4, void *Ax4);
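# Digits 1..8 of the input are pre-doubled into the temporary at %rcx so that
# every cross product a[i]*a[j], i < j, is counted twice, as squaring requires.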
  599. .type avx2_sqr_x4,\@abi-omnipotent
  600. .align 32
  601. avx2_sqr_x4:
  602. lea .LAVX2_POLY(%rip), %rax
  603. vmovdqa 32*7(%rax), %ymm14
  604. vmovdqa 32*8(%rax), %ymm15
  605. vmovdqa 32*0($a_ptr), $B
  606. vmovdqa 32*1($a_ptr), $ACC1
  607. vmovdqa 32*2($a_ptr), $ACC2
  608. vmovdqa 32*3($a_ptr), $ACC3
  609. vmovdqa 32*4($a_ptr), $ACC4
  610. vmovdqa 32*5($a_ptr), $ACC5
  611. vmovdqa 32*6($a_ptr), $ACC6
  612. vmovdqa 32*7($a_ptr), $ACC7
  613. vpaddq $ACC1, $ACC1, $ACC1 # 2*$ACC0..7
  614. vmovdqa 32*8($a_ptr), $ACC8
  615. vpaddq $ACC2, $ACC2, $ACC2
  616. vmovdqa $ACC1, 32*0(%rcx)
  617. vpaddq $ACC3, $ACC3, $ACC3
  618. vmovdqa $ACC2, 32*1(%rcx)
  619. vpaddq $ACC4, $ACC4, $ACC4
  620. vmovdqa $ACC3, 32*2(%rcx)
  621. vpaddq $ACC5, $ACC5, $ACC5
  622. vmovdqa $ACC4, 32*3(%rcx)
  623. vpaddq $ACC6, $ACC6, $ACC6
  624. vmovdqa $ACC5, 32*4(%rcx)
  625. vpaddq $ACC7, $ACC7, $ACC7
  626. vmovdqa $ACC6, 32*5(%rcx)
  627. vpaddq $ACC8, $ACC8, $ACC8
  628. vmovdqa $ACC7, 32*6(%rcx)
  629. vmovdqa $ACC8, 32*7(%rcx)
  630. #itr 1
  631. vpmuludq $B, $B, $ACC0
  632. vpmuludq $B, $ACC1, $ACC1
  633. vpand $AND_MASK, $ACC0, $Y
  634. vpmuludq $B, $ACC2, $ACC2
  635. vpmuludq $B, $ACC3, $ACC3
  636. vpmuludq $B, $ACC4, $ACC4
  637. vpmuludq $B, $ACC5, $ACC5
  638. vpmuludq $B, $ACC6, $ACC6
  639. vpmuludq $AND_MASK, $Y, $T0
  640. vpmuludq $B, $ACC7, $ACC7
  641. vpmuludq $B, $ACC8, $ACC8
  642. vmovdqa 32*1($a_ptr), $B
  643. vpaddq $T0, $ACC0, $OVERFLOW
  644. vpaddq $T0, $ACC1, $ACC0
  645. vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
  646. vpaddq $T0, $ACC2, $ACC1
  647. vpmuludq 32*3(%rax), $Y, $T0
  648. vpaddq $OVERFLOW, $ACC0, $ACC0
  649. vpaddq $T0, $ACC3, $ACC2
  650. vmovdqa $ACC4, $ACC3
  651. vpsllq \$18, $Y, $T0
  652. vmovdqa $ACC5, $ACC4
  653. vpmuludq %ymm14, $Y, $OVERFLOW
  654. vpaddq $T0, $ACC6, $ACC5
  655. vpmuludq %ymm15, $Y, $T0
  656. vpaddq $OVERFLOW, $ACC7, $ACC6
  657. vpaddq $T0, $ACC8, $ACC7
  658. #itr 2
  659. vpmuludq $B, $B, $OVERFLOW
  660. vpand $AND_MASK, $ACC0, $Y
  661. vpmuludq 32*1(%rcx), $B, $T0
  662. vpaddq $OVERFLOW, $ACC1, $ACC1
  663. vpmuludq 32*2(%rcx), $B, $OVERFLOW
  664. vpaddq $T0, $ACC2, $ACC2
  665. vpmuludq 32*3(%rcx), $B, $T0
  666. vpaddq $OVERFLOW, $ACC3, $ACC3
  667. vpmuludq 32*4(%rcx), $B, $OVERFLOW
  668. vpaddq $T0, $ACC4, $ACC4
  669. vpmuludq 32*5(%rcx), $B, $T0
  670. vpaddq $OVERFLOW, $ACC5, $ACC5
  671. vpmuludq 32*6(%rcx), $B, $OVERFLOW
  672. vpaddq $T0, $ACC6, $ACC6
  673. vpmuludq $AND_MASK, $Y, $T0
  674. vpaddq $OVERFLOW, $ACC7, $ACC7
  675. vpmuludq 32*7(%rcx), $B, $ACC8
  676. vmovdqa 32*2($a_ptr), $B
  677. vpaddq $T0, $ACC0, $OVERFLOW
  678. vpaddq $T0, $ACC1, $ACC0
  679. vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
  680. vpaddq $T0, $ACC2, $ACC1
  681. vpmuludq 32*3(%rax), $Y, $T0
  682. vpaddq $OVERFLOW, $ACC0, $ACC0
  683. vpaddq $T0, $ACC3, $ACC2
  684. vmovdqa $ACC4, $ACC3
  685. vpsllq \$18, $Y, $T0
  686. vmovdqa $ACC5, $ACC4
  687. vpmuludq %ymm14, $Y, $OVERFLOW
  688. vpaddq $T0, $ACC6, $ACC5
  689. vpmuludq %ymm15, $Y, $T0
  690. vpaddq $OVERFLOW, $ACC7, $ACC6
  691. vpaddq $T0, $ACC8, $ACC7
  692. #itr 3
  693. vpmuludq $B, $B, $T0
  694. vpand $AND_MASK, $ACC0, $Y
  695. vpmuludq 32*2(%rcx), $B, $OVERFLOW
  696. vpaddq $T0, $ACC2, $ACC2
  697. vpmuludq 32*3(%rcx), $B, $T0
  698. vpaddq $OVERFLOW, $ACC3, $ACC3
  699. vpmuludq 32*4(%rcx), $B, $OVERFLOW
  700. vpaddq $T0, $ACC4, $ACC4
  701. vpmuludq 32*5(%rcx), $B, $T0
  702. vpaddq $OVERFLOW, $ACC5, $ACC5
  703. vpmuludq 32*6(%rcx), $B, $OVERFLOW
  704. vpaddq $T0, $ACC6, $ACC6
  705. vpmuludq $AND_MASK, $Y, $T0
  706. vpaddq $OVERFLOW, $ACC7, $ACC7
  707. vpmuludq 32*7(%rcx), $B, $ACC8
  708. vmovdqa 32*3($a_ptr), $B
  709. vpaddq $T0, $ACC0, $OVERFLOW
  710. vpaddq $T0, $ACC1, $ACC0
  711. vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
  712. vpaddq $T0, $ACC2, $ACC1
  713. vpmuludq 32*3(%rax), $Y, $T0
  714. vpaddq $OVERFLOW, $ACC0, $ACC0
  715. vpaddq $T0, $ACC3, $ACC2
  716. vmovdqa $ACC4, $ACC3
  717. vpsllq \$18, $Y, $T0
  718. vmovdqa $ACC5, $ACC4
  719. vpmuludq %ymm14, $Y, $OVERFLOW
  720. vpaddq $T0, $ACC6, $ACC5
  721. vpmuludq %ymm15, $Y, $T0
  722. vpand $AND_MASK, $ACC0, $Y
  723. vpaddq $OVERFLOW, $ACC7, $ACC6
  724. vpaddq $T0, $ACC8, $ACC7
  725. #itr 4
  726. vpmuludq $B, $B, $OVERFLOW
  727. vpmuludq 32*3(%rcx), $B, $T0
  728. vpaddq $OVERFLOW, $ACC3, $ACC3
  729. vpmuludq 32*4(%rcx), $B, $OVERFLOW
  730. vpaddq $T0, $ACC4, $ACC4
  731. vpmuludq 32*5(%rcx), $B, $T0
  732. vpaddq $OVERFLOW, $ACC5, $ACC5
  733. vpmuludq 32*6(%rcx), $B, $OVERFLOW
  734. vpaddq $T0, $ACC6, $ACC6
  735. vpmuludq $AND_MASK, $Y, $T0
  736. vpaddq $OVERFLOW, $ACC7, $ACC7
  737. vpmuludq 32*7(%rcx), $B, $ACC8
  738. vmovdqa 32*4($a_ptr), $B
  739. vpaddq $T0, $ACC0, $OVERFLOW
  740. vpaddq $T0, $ACC1, $ACC0
  741. vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
  742. vpaddq $T0, $ACC2, $ACC1
  743. vpmuludq 32*3(%rax), $Y, $T0
  744. vpaddq $OVERFLOW, $ACC0, $ACC0
  745. vpaddq $T0, $ACC3, $ACC2
  746. vmovdqa $ACC4, $ACC3
  747. vpsllq \$18, $Y, $T0
  748. vmovdqa $ACC5, $ACC4
  749. vpmuludq %ymm14, $Y, $OVERFLOW
  750. vpaddq $T0, $ACC6, $ACC5
  751. vpmuludq %ymm15, $Y, $T0
  752. vpand $AND_MASK, $ACC0, $Y
  753. vpaddq $OVERFLOW, $ACC7, $ACC6
  754. vpaddq $T0, $ACC8, $ACC7
  755. #itr 5
  756. vpmuludq $B, $B, $T0
  757. vpmuludq 32*4(%rcx), $B, $OVERFLOW
  758. vpaddq $T0, $ACC4, $ACC4
  759. vpmuludq 32*5(%rcx), $B, $T0
  760. vpaddq $OVERFLOW, $ACC5, $ACC5
  761. vpmuludq 32*6(%rcx), $B, $OVERFLOW
  762. vpaddq $T0, $ACC6, $ACC6
  763. vpmuludq $AND_MASK, $Y, $T0
  764. vpaddq $OVERFLOW, $ACC7, $ACC7
  765. vpmuludq 32*7(%rcx), $B, $ACC8
  766. vmovdqa 32*5($a_ptr), $B
  767. vpaddq $T0, $ACC0, $OVERFLOW
  768. vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
  769. vpaddq $T0, $ACC1, $ACC0
  770. vpaddq $T0, $ACC2, $ACC1
  771. vpmuludq 32*3+.LAVX2_POLY(%rip), $Y, $T0
  772. vpaddq $OVERFLOW, $ACC0, $ACC0
  773. vpaddq $T0, $ACC3, $ACC2
  774. vmovdqa $ACC4, $ACC3
  775. vpsllq \$18, $Y, $T0
  776. vmovdqa $ACC5, $ACC4
  777. vpmuludq %ymm14, $Y, $OVERFLOW
  778. vpaddq $T0, $ACC6, $ACC5
  779. vpmuludq %ymm15, $Y, $T0
  780. vpand $AND_MASK, $ACC0, $Y
  781. vpaddq $OVERFLOW, $ACC7, $ACC6
  782. vpaddq $T0, $ACC8, $ACC7
  783. #itr 6
  784. vpmuludq $B, $B, $OVERFLOW
  785. vpmuludq 32*5(%rcx), $B, $T0
  786. vpaddq $OVERFLOW, $ACC5, $ACC5
  787. vpmuludq 32*6(%rcx), $B, $OVERFLOW
  788. vpaddq $T0, $ACC6, $ACC6
  789. vpmuludq $AND_MASK, $Y, $T0
  790. vpaddq $OVERFLOW, $ACC7, $ACC7
  791. vpmuludq 32*7(%rcx), $B, $ACC8
  792. vmovdqa 32*6($a_ptr), $B
  793. vpaddq $T0, $ACC0, $OVERFLOW
  794. vpaddq $T0, $ACC1, $ACC0
  795. vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
  796. vpaddq $T0, $ACC2, $ACC1
  797. vpmuludq 32*3(%rax), $Y, $T0
  798. vpaddq $OVERFLOW, $ACC0, $ACC0
  799. vpaddq $T0, $ACC3, $ACC2
  800. vmovdqa $ACC4, $ACC3
  801. vpsllq \$18, $Y, $T0
  802. vmovdqa $ACC5, $ACC4
  803. vpmuludq %ymm14, $Y, $OVERFLOW
  804. vpaddq $T0, $ACC6, $ACC5
  805. vpmuludq %ymm15, $Y, $T0
  806. vpand $AND_MASK, $ACC0, $Y
  807. vpaddq $OVERFLOW, $ACC7, $ACC6
  808. vpaddq $T0, $ACC8, $ACC7
  809. #itr 7
  810. vpmuludq $B, $B, $T0
  811. vpmuludq 32*6(%rcx), $B, $OVERFLOW
  812. vpaddq $T0, $ACC6, $ACC6
  813. vpmuludq $AND_MASK, $Y, $T0
  814. vpaddq $OVERFLOW, $ACC7, $ACC7
  815. vpmuludq 32*7(%rcx), $B, $ACC8
  816. vmovdqa 32*7($a_ptr), $B
  817. vpaddq $T0, $ACC0, $OVERFLOW
  818. vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
  819. vpaddq $T0, $ACC1, $ACC0
  820. vpaddq $T0, $ACC2, $ACC1
  821. vpmuludq 32*3(%rax), $Y, $T0
  822. vpaddq $OVERFLOW, $ACC0, $ACC0
  823. vpaddq $T0, $ACC3, $ACC2
  824. vmovdqa $ACC4, $ACC3
  825. vpsllq \$18, $Y, $T0
  826. vmovdqa $ACC5, $ACC4
  827. vpmuludq %ymm14, $Y, $OVERFLOW
  828. vpaddq $T0, $ACC6, $ACC5
  829. vpmuludq %ymm15, $Y, $T0
  830. vpand $AND_MASK, $ACC0, $Y
  831. vpaddq $OVERFLOW, $ACC7, $ACC6
  832. vpaddq $T0, $ACC8, $ACC7
  833. #itr 8
  834. vpmuludq $B, $B, $OVERFLOW
  835. vpmuludq $AND_MASK, $Y, $T0
  836. vpaddq $OVERFLOW, $ACC7, $ACC7
  837. vpmuludq 32*7(%rcx), $B, $ACC8
  838. vmovdqa 32*8($a_ptr), $B
  839. vpaddq $T0, $ACC0, $OVERFLOW
  840. vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
  841. vpaddq $T0, $ACC1, $ACC0
  842. vpaddq $T0, $ACC2, $ACC1
  843. vpmuludq 32*3(%rax), $Y, $T0
  844. vpaddq $OVERFLOW, $ACC0, $ACC0
  845. vpaddq $T0, $ACC3, $ACC2
  846. vmovdqa $ACC4, $ACC3
  847. vpsllq \$18, $Y, $T0
  848. vmovdqa $ACC5, $ACC4
  849. vpmuludq %ymm14, $Y, $OVERFLOW
  850. vpaddq $T0, $ACC6, $ACC5
  851. vpmuludq %ymm15, $Y, $T0
  852. vpand $AND_MASK, $ACC0, $Y
  853. vpaddq $OVERFLOW, $ACC7, $ACC6
  854. vpaddq $T0, $ACC8, $ACC7
  855. #itr 9
  856. vpmuludq $B, $B, $ACC8
  857. vpmuludq $AND_MASK, $Y, $T0
  858. vpaddq $T0, $ACC0, $OVERFLOW
  859. vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
  860. vpaddq $T0, $ACC1, $ACC0
  861. vpaddq $T0, $ACC2, $ACC1
  862. vpmuludq 32*3(%rax), $Y, $T0
  863. vpaddq $OVERFLOW, $ACC0, $ACC0
  864. vpaddq $T0, $ACC3, $ACC2
  865. vmovdqa $ACC4, $ACC3
  866. vpsllq \$18, $Y, $T0
  867. vmovdqa $ACC5, $ACC4
  868. vpmuludq %ymm14, $Y, $OVERFLOW
  869. vpaddq $T0, $ACC6, $ACC5
  870. vpmuludq %ymm15, $Y, $T0
  871. vpaddq $OVERFLOW, $ACC7, $ACC6
  872. vpaddq $T0, $ACC8, $ACC7
  873. vpxor $ACC8, $ACC8, $ACC8
  874. ret
  875. .size avx2_sqr_x4,.-avx2_sqr_x4
  876. ################################################################################
  877. # void avx2_sub_x4(void* RESULTx4, void *Ax4, void *Bx4);
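# 8*p (.LAVX2_POLY_x8) is added to the minuend first so that every digit of
# the difference stays non-negative in the redundant representation.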
  878. .type avx2_sub_x4,\@abi-omnipotent
  879. .align 32
  880. avx2_sub_x4:
  881. vmovdqa 32*0($a_ptr), $ACC0
  882. lea 160($a_ptr), $a_ptr
  883. lea .LAVX2_POLY_x8+128(%rip), %rax
  884. lea 128($b_ptr), $b_ptr
  885. vmovdqa 32*1-160($a_ptr), $ACC1
  886. vmovdqa 32*2-160($a_ptr), $ACC2
  887. vmovdqa 32*3-160($a_ptr), $ACC3
  888. vmovdqa 32*4-160($a_ptr), $ACC4
  889. vmovdqa 32*5-160($a_ptr), $ACC5
  890. vmovdqa 32*6-160($a_ptr), $ACC6
  891. vmovdqa 32*7-160($a_ptr), $ACC7
  892. vmovdqa 32*8-160($a_ptr), $ACC8
  893. vpaddq 32*0-128(%rax), $ACC0, $ACC0
  894. vpaddq 32*1-128(%rax), $ACC1, $ACC1
  895. vpaddq 32*2-128(%rax), $ACC2, $ACC2
  896. vpaddq 32*3-128(%rax), $ACC3, $ACC3
  897. vpaddq 32*4-128(%rax), $ACC4, $ACC4
  898. vpaddq 32*5-128(%rax), $ACC5, $ACC5
  899. vpaddq 32*6-128(%rax), $ACC6, $ACC6
  900. vpaddq 32*7-128(%rax), $ACC7, $ACC7
  901. vpaddq 32*8-128(%rax), $ACC8, $ACC8
  902. vpsubq 32*0-128($b_ptr), $ACC0, $ACC0
  903. vpsubq 32*1-128($b_ptr), $ACC1, $ACC1
  904. vpsubq 32*2-128($b_ptr), $ACC2, $ACC2
  905. vpsubq 32*3-128($b_ptr), $ACC3, $ACC3
  906. vpsubq 32*4-128($b_ptr), $ACC4, $ACC4
  907. vpsubq 32*5-128($b_ptr), $ACC5, $ACC5
  908. vpsubq 32*6-128($b_ptr), $ACC6, $ACC6
  909. vpsubq 32*7-128($b_ptr), $ACC7, $ACC7
  910. vpsubq 32*8-128($b_ptr), $ACC8, $ACC8
  911. ret
  912. .size avx2_sub_x4,.-avx2_sub_x4
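# avx2_select_n_store patches the just-computed result using the two
# "is the point at infinity" masks saved on the stack: lanes where the first
# input was at infinity take the value at %rsi, lanes where the second input
# was at infinity take the value at %rdx, and everything is then written out
# with the STORE sequence.  This keeps the x4 point addition branch-free.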
  913. .type avx2_select_n_store,\@abi-omnipotent
  914. .align 32
  915. avx2_select_n_store:
  916. vmovdqa `8+32*9*8`(%rsp), $Y
  917. vpor `8+32*9*8+32`(%rsp), $Y, $Y
  918. vpandn $ACC0, $Y, $ACC0
  919. vpandn $ACC1, $Y, $ACC1
  920. vpandn $ACC2, $Y, $ACC2
  921. vpandn $ACC3, $Y, $ACC3
  922. vpandn $ACC4, $Y, $ACC4
  923. vpandn $ACC5, $Y, $ACC5
  924. vpandn $ACC6, $Y, $ACC6
  925. vmovdqa `8+32*9*8+32`(%rsp), $B
  926. vpandn $ACC7, $Y, $ACC7
  927. vpandn `8+32*9*8`(%rsp), $B, $B
  928. vpandn $ACC8, $Y, $ACC8
  929. vpand 32*0(%rsi), $B, $T0
  930. lea 160(%rsi), %rax
  931. vpand 32*1(%rsi), $B, $Y
  932. vpxor $T0, $ACC0, $ACC0
  933. vpand 32*2(%rsi), $B, $T0
  934. vpxor $Y, $ACC1, $ACC1
  935. vpand 32*3(%rsi), $B, $Y
  936. vpxor $T0, $ACC2, $ACC2
  937. vpand 32*4-160(%rax), $B, $T0
  938. vpxor $Y, $ACC3, $ACC3
  939. vpand 32*5-160(%rax), $B, $Y
  940. vpxor $T0, $ACC4, $ACC4
  941. vpand 32*6-160(%rax), $B, $T0
  942. vpxor $Y, $ACC5, $ACC5
  943. vpand 32*7-160(%rax), $B, $Y
  944. vpxor $T0, $ACC6, $ACC6
  945. vpand 32*8-160(%rax), $B, $T0
  946. vmovdqa `8+32*9*8+32`(%rsp), $B
  947. vpxor $Y, $ACC7, $ACC7
  948. vpand 32*0(%rdx), $B, $Y
  949. lea 160(%rdx), %rax
  950. vpxor $T0, $ACC8, $ACC8
  951. vpand 32*1(%rdx), $B, $T0
  952. vpxor $Y, $ACC0, $ACC0
  953. vpand 32*2(%rdx), $B, $Y
  954. vpxor $T0, $ACC1, $ACC1
  955. vpand 32*3(%rdx), $B, $T0
  956. vpxor $Y, $ACC2, $ACC2
  957. vpand 32*4-160(%rax), $B, $Y
  958. vpxor $T0, $ACC3, $ACC3
  959. vpand 32*5-160(%rax), $B, $T0
  960. vpxor $Y, $ACC4, $ACC4
  961. vpand 32*6-160(%rax), $B, $Y
  962. vpxor $T0, $ACC5, $ACC5
  963. vpand 32*7-160(%rax), $B, $T0
  964. vpxor $Y, $ACC6, $ACC6
  965. vpand 32*8-160(%rax), $B, $Y
  966. vpxor $T0, $ACC7, $ACC7
  967. vpxor $Y, $ACC8, $ACC8
  968. `&STORE`
  969. ret
  970. .size avx2_select_n_store,.-avx2_select_n_store
  971. ___
  972. $code.=<<___ if (0); # inlined
  973. ################################################################################
  974. # void avx2_mul_by2_x4(void* RESULTx4, void *Ax4);
  975. .type avx2_mul_by2_x4,\@abi-omnipotent
  976. .align 32
  977. avx2_mul_by2_x4:
  978. vmovdqa 32*0($a_ptr), $ACC0
  979. lea 160($a_ptr), %rax
  980. vmovdqa 32*1($a_ptr), $ACC1
  981. vmovdqa 32*2($a_ptr), $ACC2
  982. vmovdqa 32*3($a_ptr), $ACC3
  983. vmovdqa 32*4-160(%rax), $ACC4
  984. vmovdqa 32*5-160(%rax), $ACC5
  985. vmovdqa 32*6-160(%rax), $ACC6
  986. vmovdqa 32*7-160(%rax), $ACC7
  987. vmovdqa 32*8-160(%rax), $ACC8
  988. vpaddq $ACC0, $ACC0, $ACC0
  989. vpaddq $ACC1, $ACC1, $ACC1
  990. vpaddq $ACC2, $ACC2, $ACC2
  991. vpaddq $ACC3, $ACC3, $ACC3
  992. vpaddq $ACC4, $ACC4, $ACC4
  993. vpaddq $ACC5, $ACC5, $ACC5
  994. vpaddq $ACC6, $ACC6, $ACC6
  995. vpaddq $ACC7, $ACC7, $ACC7
  996. vpaddq $ACC8, $ACC8, $ACC8
  997. ret
  998. .size avx2_mul_by2_x4,.-avx2_mul_by2_x4
  999. ___
  1000. my ($r_ptr_in,$a_ptr_in,$b_ptr_in)=("%rdi","%rsi","%rdx");
  1001. my ($r_ptr,$a_ptr,$b_ptr)=("%r8","%r9","%r10");
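# The x4 mixed addition below follows the standard Jacobian+affine formulas,
# one avx2_* call per step (see the per-step comments in the code):
#   U2 = X2*Z1^2,  S2 = Y2*Z1^3
#   H  = U2 - X1,  R  = S2 - Y1
#   Z3 = H*Z1
#   X3 = R^2 - H^3 - 2*X1*H^2
#   Y3 = R*(X1*H^2 - X3) - Y1*H^3
# Lanes whose input is the point at infinity are fixed up afterwards by
# avx2_select_n_store.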
  1002. $code.=<<___;
  1003. ################################################################################
  1004. # void ecp_nistz256_avx2_point_add_affine_x4(void* RESULTx4, void *Ax4, void *Bx4);
  1005. .globl ecp_nistz256_avx2_point_add_affine_x4
  1006. .type ecp_nistz256_avx2_point_add_affine_x4,\@function,3
  1007. .align 32
  1008. ecp_nistz256_avx2_point_add_affine_x4:
  1009. mov %rsp, %rax
  1010. push %rbp
  1011. vzeroupper
  1012. ___
  1013. $code.=<<___ if ($win64);
  1014. lea -16*10(%rsp), %rsp
  1015. vmovaps %xmm6, -8-16*10(%rax)
  1016. vmovaps %xmm7, -8-16*9(%rax)
  1017. vmovaps %xmm8, -8-16*8(%rax)
  1018. vmovaps %xmm9, -8-16*7(%rax)
  1019. vmovaps %xmm10, -8-16*6(%rax)
  1020. vmovaps %xmm11, -8-16*5(%rax)
  1021. vmovaps %xmm12, -8-16*4(%rax)
  1022. vmovaps %xmm13, -8-16*3(%rax)
  1023. vmovaps %xmm14, -8-16*2(%rax)
  1024. vmovaps %xmm15, -8-16*1(%rax)
  1025. ___
  1026. $code.=<<___;
  1027. lea -8(%rax), %rbp
  1028. # Result + 32*0 = Result.X
  1029. # Result + 32*9 = Result.Y
  1030. # Result + 32*18 = Result.Z
  1031. # A + 32*0 = A.X
  1032. # A + 32*9 = A.Y
  1033. # A + 32*18 = A.Z
  1034. # B + 32*0 = B.X
  1035. # B + 32*9 = B.Y
  1036. sub \$`32*9*8+32*2+32*8`, %rsp
  1037. and \$-64, %rsp
  1038. mov $r_ptr_in, $r_ptr
  1039. mov $a_ptr_in, $a_ptr
  1040. mov $b_ptr_in, $b_ptr
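# Build the per-lane "input is the point at infinity" masks: OR together all
# 18 digit vectors of each input and compare the result with zero.  The masks
# are kept at 32*9*8(%rsp) (for A) and 32*9*8+32(%rsp) (for B) and are
# consumed later by avx2_select_n_store.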
  1041. vmovdqa 32*0($a_ptr_in), %ymm0
  1042. vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
  1043. vpxor %ymm1, %ymm1, %ymm1
  1044. lea 256($a_ptr_in), %rax # size optimization
  1045. vpor 32*1($a_ptr_in), %ymm0, %ymm0
  1046. vpor 32*2($a_ptr_in), %ymm0, %ymm0
  1047. vpor 32*3($a_ptr_in), %ymm0, %ymm0
  1048. vpor 32*4-256(%rax), %ymm0, %ymm0
  1049. lea 256(%rax), %rcx # size optimization
  1050. vpor 32*5-256(%rax), %ymm0, %ymm0
  1051. vpor 32*6-256(%rax), %ymm0, %ymm0
  1052. vpor 32*7-256(%rax), %ymm0, %ymm0
  1053. vpor 32*8-256(%rax), %ymm0, %ymm0
  1054. vpor 32*9-256(%rax), %ymm0, %ymm0
  1055. vpor 32*10-256(%rax), %ymm0, %ymm0
  1056. vpor 32*11-256(%rax), %ymm0, %ymm0
  1057. vpor 32*12-512(%rcx), %ymm0, %ymm0
  1058. vpor 32*13-512(%rcx), %ymm0, %ymm0
  1059. vpor 32*14-512(%rcx), %ymm0, %ymm0
  1060. vpor 32*15-512(%rcx), %ymm0, %ymm0
  1061. vpor 32*16-512(%rcx), %ymm0, %ymm0
  1062. vpor 32*17-512(%rcx), %ymm0, %ymm0
  1063. vpcmpeqq %ymm1, %ymm0, %ymm0
  1064. vmovdqa %ymm0, `32*9*8`(%rsp)
  1065. vpxor %ymm1, %ymm1, %ymm1
  1066. vmovdqa 32*0($b_ptr), %ymm0
  1067. lea 256($b_ptr), %rax # size optimization
  1068. vpor 32*1($b_ptr), %ymm0, %ymm0
  1069. vpor 32*2($b_ptr), %ymm0, %ymm0
  1070. vpor 32*3($b_ptr), %ymm0, %ymm0
  1071. vpor 32*4-256(%rax), %ymm0, %ymm0
  1072. lea 256(%rax), %rcx # size optimization
  1073. vpor 32*5-256(%rax), %ymm0, %ymm0
  1074. vpor 32*6-256(%rax), %ymm0, %ymm0
  1075. vpor 32*7-256(%rax), %ymm0, %ymm0
  1076. vpor 32*8-256(%rax), %ymm0, %ymm0
  1077. vpor 32*9-256(%rax), %ymm0, %ymm0
  1078. vpor 32*10-256(%rax), %ymm0, %ymm0
  1079. vpor 32*11-256(%rax), %ymm0, %ymm0
  1080. vpor 32*12-512(%rcx), %ymm0, %ymm0
  1081. vpor 32*13-512(%rcx), %ymm0, %ymm0
  1082. vpor 32*14-512(%rcx), %ymm0, %ymm0
  1083. vpor 32*15-512(%rcx), %ymm0, %ymm0
  1084. vpor 32*16-512(%rcx), %ymm0, %ymm0
  1085. vpor 32*17-512(%rcx), %ymm0, %ymm0
  1086. vpcmpeqq %ymm1, %ymm0, %ymm0
  1087. vmovdqa %ymm0, `32*9*8+32`(%rsp)
  1088. # Z1^2 = Z1*Z1
  1089. lea `32*9*2`($a_ptr), %rsi
  1090. lea `32*9*2`(%rsp), %rdi
  1091. lea `32*9*8+32*2`(%rsp), %rcx # temporary vector
  1092. call avx2_sqr_x4
  1093. call avx2_normalize_n_store
  1094. # U2 = X2*Z1^2
  1095. lea `32*9*0`($b_ptr), %rsi
  1096. lea `32*9*2`(%rsp), %rdx
  1097. lea `32*9*0`(%rsp), %rdi
  1098. call avx2_mul_x4
  1099. #call avx2_normalize
  1100. `&STORE`
  1101. # S2 = Z1*Z1^2 = Z1^3
  1102. lea `32*9*2`($a_ptr), %rsi
  1103. lea `32*9*2`(%rsp), %rdx
  1104. lea `32*9*1`(%rsp), %rdi
  1105. call avx2_mul_x4
  1106. call avx2_normalize_n_store
  1107. # S2 = S2*Y2 = Y2*Z1^3
  1108. lea `32*9*1`($b_ptr), %rsi
  1109. lea `32*9*1`(%rsp), %rdx
  1110. lea `32*9*1`(%rsp), %rdi
  1111. call avx2_mul_x4
  1112. call avx2_normalize_n_store
  1113. # H = U2 - U1 = U2 - X1
  1114. lea `32*9*0`(%rsp), %rsi
  1115. lea `32*9*0`($a_ptr), %rdx
  1116. lea `32*9*3`(%rsp), %rdi
  1117. call avx2_sub_x4
  1118. call avx2_normalize_n_store
  1119. # R = S2 - S1 = S2 - Y1
  1120. lea `32*9*1`(%rsp), %rsi
  1121. lea `32*9*1`($a_ptr), %rdx
  1122. lea `32*9*4`(%rsp), %rdi
  1123. call avx2_sub_x4
  1124. call avx2_normalize_n_store
  1125. # Z3 = H*Z1*Z2
  1126. lea `32*9*3`(%rsp), %rsi
  1127. lea `32*9*2`($a_ptr), %rdx
  1128. lea `32*9*2`($r_ptr), %rdi
  1129. call avx2_mul_x4
  1130. call avx2_normalize
  1131. lea .LONE(%rip), %rsi
  1132. lea `32*9*2`($a_ptr), %rdx
  1133. call avx2_select_n_store
1134. # Rsqr = R^2
  1135. lea `32*9*4`(%rsp), %rsi
  1136. lea `32*9*6`(%rsp), %rdi
  1137. lea `32*9*8+32*2`(%rsp), %rcx # temporary vector
  1138. call avx2_sqr_x4
  1139. call avx2_normalize_n_store
1140. # Hsqr = H^2
  1141. lea `32*9*3`(%rsp), %rsi
  1142. lea `32*9*5`(%rsp), %rdi
  1143. call avx2_sqr_x4
  1144. call avx2_normalize_n_store
  1145. # H^3 = H^2*H
  1146. lea `32*9*3`(%rsp), %rsi
  1147. lea `32*9*5`(%rsp), %rdx
  1148. lea `32*9*7`(%rsp), %rdi
  1149. call avx2_mul_x4
  1150. call avx2_normalize_n_store
  1151. # U2 = U1*H^2
  1152. lea `32*9*0`($a_ptr), %rsi
  1153. lea `32*9*5`(%rsp), %rdx
  1154. lea `32*9*0`(%rsp), %rdi
  1155. call avx2_mul_x4
  1156. #call avx2_normalize
  1157. `&STORE`
  1158. # Hsqr = U2*2
  1159. #lea 32*9*0(%rsp), %rsi
  1160. #lea 32*9*5(%rsp), %rdi
  1161. #call avx2_mul_by2_x4
  1162. vpaddq $ACC0, $ACC0, $ACC0 # inlined avx2_mul_by2_x4
  1163. lea `32*9*5`(%rsp), %rdi
  1164. vpaddq $ACC1, $ACC1, $ACC1
  1165. vpaddq $ACC2, $ACC2, $ACC2
  1166. vpaddq $ACC3, $ACC3, $ACC3
  1167. vpaddq $ACC4, $ACC4, $ACC4
  1168. vpaddq $ACC5, $ACC5, $ACC5
  1169. vpaddq $ACC6, $ACC6, $ACC6
  1170. vpaddq $ACC7, $ACC7, $ACC7
  1171. vpaddq $ACC8, $ACC8, $ACC8
  1172. call avx2_normalize_n_store
  1173. # X3 = R^2 - H^3
  1174. #lea 32*9*6(%rsp), %rsi
  1175. #lea 32*9*7(%rsp), %rdx
  1176. #lea 32*9*5(%rsp), %rcx
  1177. #lea 32*9*0($r_ptr), %rdi
  1178. #call avx2_sub_x4
  1179. #NORMALIZE
  1180. #STORE
  1181. # X3 = X3 - U2*2
  1182. #lea 32*9*0($r_ptr), %rsi
  1183. #lea 32*9*0($r_ptr), %rdi
  1184. #call avx2_sub_x4
  1185. #NORMALIZE
  1186. #STORE
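# Inlined, fused version of the two subtractions sketched above:
# X3 = Rsqr + 2*p - Hcub - 2*U2, with 2*p (.LAVX2_POLY_x2) added up front so
# that no digit goes negative before normalization.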
  1187. lea `32*9*6+128`(%rsp), %rsi
  1188. lea .LAVX2_POLY_x2+128(%rip), %rax
  1189. lea `32*9*7+128`(%rsp), %rdx
  1190. lea `32*9*5+128`(%rsp), %rcx
  1191. lea `32*9*0`($r_ptr), %rdi
  1192. vmovdqa 32*0-128(%rsi), $ACC0
  1193. vmovdqa 32*1-128(%rsi), $ACC1
  1194. vmovdqa 32*2-128(%rsi), $ACC2
  1195. vmovdqa 32*3-128(%rsi), $ACC3
  1196. vmovdqa 32*4-128(%rsi), $ACC4
  1197. vmovdqa 32*5-128(%rsi), $ACC5
  1198. vmovdqa 32*6-128(%rsi), $ACC6
  1199. vmovdqa 32*7-128(%rsi), $ACC7
  1200. vmovdqa 32*8-128(%rsi), $ACC8
  1201. vpaddq 32*0-128(%rax), $ACC0, $ACC0
  1202. vpaddq 32*1-128(%rax), $ACC1, $ACC1
  1203. vpaddq 32*2-128(%rax), $ACC2, $ACC2
  1204. vpaddq 32*3-128(%rax), $ACC3, $ACC3
  1205. vpaddq 32*4-128(%rax), $ACC4, $ACC4
  1206. vpaddq 32*5-128(%rax), $ACC5, $ACC5
  1207. vpaddq 32*6-128(%rax), $ACC6, $ACC6
  1208. vpaddq 32*7-128(%rax), $ACC7, $ACC7
  1209. vpaddq 32*8-128(%rax), $ACC8, $ACC8
  1210. vpsubq 32*0-128(%rdx), $ACC0, $ACC0
  1211. vpsubq 32*1-128(%rdx), $ACC1, $ACC1
  1212. vpsubq 32*2-128(%rdx), $ACC2, $ACC2
  1213. vpsubq 32*3-128(%rdx), $ACC3, $ACC3
  1214. vpsubq 32*4-128(%rdx), $ACC4, $ACC4
  1215. vpsubq 32*5-128(%rdx), $ACC5, $ACC5
  1216. vpsubq 32*6-128(%rdx), $ACC6, $ACC6
  1217. vpsubq 32*7-128(%rdx), $ACC7, $ACC7
  1218. vpsubq 32*8-128(%rdx), $ACC8, $ACC8
  1219. vpsubq 32*0-128(%rcx), $ACC0, $ACC0
  1220. vpsubq 32*1-128(%rcx), $ACC1, $ACC1
  1221. vpsubq 32*2-128(%rcx), $ACC2, $ACC2
  1222. vpsubq 32*3-128(%rcx), $ACC3, $ACC3
  1223. vpsubq 32*4-128(%rcx), $ACC4, $ACC4
  1224. vpsubq 32*5-128(%rcx), $ACC5, $ACC5
  1225. vpsubq 32*6-128(%rcx), $ACC6, $ACC6
  1226. vpsubq 32*7-128(%rcx), $ACC7, $ACC7
  1227. vpsubq 32*8-128(%rcx), $ACC8, $ACC8
  1228. call avx2_normalize
  1229. lea 32*0($b_ptr), %rsi
  1230. lea 32*0($a_ptr), %rdx
  1231. call avx2_select_n_store
  1232. # H = U2 - X3
  1233. lea `32*9*0`(%rsp), %rsi
  1234. lea `32*9*0`($r_ptr), %rdx
  1235. lea `32*9*3`(%rsp), %rdi
  1236. call avx2_sub_x4
  1237. call avx2_normalize_n_store
1238. # H = H*R = R*(U2 - X3)
  1239. lea `32*9*3`(%rsp), %rsi
  1240. lea `32*9*4`(%rsp), %rdx
  1241. lea `32*9*3`(%rsp), %rdi
  1242. call avx2_mul_x4
  1243. call avx2_normalize_n_store
1244. # S2 = Y1*H^3
  1245. lea `32*9*7`(%rsp), %rsi
  1246. lea `32*9*1`($a_ptr), %rdx
  1247. lea `32*9*1`(%rsp), %rdi
  1248. call avx2_mul_x4
  1249. call avx2_normalize_n_store
1250. # Y3 = R*(U2 - X3) - Y1*H^3
  1251. lea `32*9*3`(%rsp), %rsi
  1252. lea `32*9*1`(%rsp), %rdx
  1253. lea `32*9*1`($r_ptr), %rdi
  1254. call avx2_sub_x4
  1255. call avx2_normalize
  1256. lea 32*9($b_ptr), %rsi
  1257. lea 32*9($a_ptr), %rdx
  1258. call avx2_select_n_store
  1259. #lea 32*9*0($r_ptr), %rsi
  1260. #lea 32*9*0($r_ptr), %rdi
  1261. #call avx2_mul_by1_x4
  1262. #NORMALIZE
  1263. #STORE
  1264. lea `32*9*1`($r_ptr), %rsi
  1265. lea `32*9*1`($r_ptr), %rdi
  1266. call avx2_mul_by1_x4
  1267. call avx2_normalize_n_store
  1268. vzeroupper
  1269. ___
  1270. $code.=<<___ if ($win64);
  1271. movaps %xmm6, -16*10(%rbp)
  1272. movaps %xmm7, -16*9(%rbp)
  1273. movaps %xmm8, -16*8(%rbp)
  1274. movaps %xmm9, -16*7(%rbp)
  1275. movaps %xmm10, -16*6(%rbp)
  1276. movaps %xmm11, -16*5(%rbp)
  1277. movaps %xmm12, -16*4(%rbp)
  1278. movaps %xmm13, -16*3(%rbp)
  1279. movaps %xmm14, -16*2(%rbp)
  1280. movaps %xmm15, -16*1(%rbp)
  1281. ___
  1282. $code.=<<___;
  1283. mov %rbp, %rsp
  1284. pop %rbp
  1285. ret
  1286. .size ecp_nistz256_avx2_point_add_affine_x4,.-ecp_nistz256_avx2_point_add_affine_x4
  1287. ################################################################################
  1288. # void ecp_nistz256_avx2_point_add_affines_x4(void* RESULTx4, void *Ax4, void *Bx4);
  1289. .globl ecp_nistz256_avx2_point_add_affines_x4
  1290. .type ecp_nistz256_avx2_point_add_affines_x4,\@function,3
  1291. .align 32
  1292. ecp_nistz256_avx2_point_add_affines_x4:
  1293. mov %rsp, %rax
  1294. push %rbp
  1295. vzeroupper
  1296. ___
  1297. $code.=<<___ if ($win64);
  1298. lea -16*10(%rsp), %rsp
  1299. vmovaps %xmm6, -8-16*10(%rax)
  1300. vmovaps %xmm7, -8-16*9(%rax)
  1301. vmovaps %xmm8, -8-16*8(%rax)
  1302. vmovaps %xmm9, -8-16*7(%rax)
  1303. vmovaps %xmm10, -8-16*6(%rax)
  1304. vmovaps %xmm11, -8-16*5(%rax)
  1305. vmovaps %xmm12, -8-16*4(%rax)
  1306. vmovaps %xmm13, -8-16*3(%rax)
  1307. vmovaps %xmm14, -8-16*2(%rax)
  1308. vmovaps %xmm15, -8-16*1(%rax)
  1309. ___
  1310. $code.=<<___;
  1311. lea -8(%rax), %rbp
  1312. # Result + 32*0 = Result.X
  1313. # Result + 32*9 = Result.Y
  1314. # Result + 32*18 = Result.Z
  1315. # A + 32*0 = A.X
  1316. # A + 32*9 = A.Y
  1317. # B + 32*0 = B.X
  1318. # B + 32*9 = B.Y
  1319. sub \$`32*9*8+32*2+32*8`, %rsp
  1320. and \$-64, %rsp
  1321. mov $r_ptr_in, $r_ptr
  1322. mov $a_ptr_in, $a_ptr
  1323. mov $b_ptr_in, $b_ptr
	vmovdqa	32*0($a_ptr_in), %ymm0
	vmovdqa	.LAVX2_AND_MASK(%rip), $AND_MASK
	vpxor	%ymm1, %ymm1, %ymm1
	lea	256($a_ptr_in), %rax		# size optimization
	vpor	32*1($a_ptr_in), %ymm0, %ymm0
	vpor	32*2($a_ptr_in), %ymm0, %ymm0
	vpor	32*3($a_ptr_in), %ymm0, %ymm0
	vpor	32*4-256(%rax), %ymm0, %ymm0
	lea	256(%rax), %rcx			# size optimization
	vpor	32*5-256(%rax), %ymm0, %ymm0
	vpor	32*6-256(%rax), %ymm0, %ymm0
	vpor	32*7-256(%rax), %ymm0, %ymm0
	vpor	32*8-256(%rax), %ymm0, %ymm0
	vpor	32*9-256(%rax), %ymm0, %ymm0
	vpor	32*10-256(%rax), %ymm0, %ymm0
	vpor	32*11-256(%rax), %ymm0, %ymm0
	vpor	32*12-512(%rcx), %ymm0, %ymm0
	vpor	32*13-512(%rcx), %ymm0, %ymm0
	vpor	32*14-512(%rcx), %ymm0, %ymm0
	vpor	32*15-512(%rcx), %ymm0, %ymm0
	vpor	32*16-512(%rcx), %ymm0, %ymm0
	vpor	32*17-512(%rcx), %ymm0, %ymm0
	vpcmpeqq	%ymm1, %ymm0, %ymm0
	vmovdqa	%ymm0, `32*9*8`(%rsp)
	vpxor	%ymm1, %ymm1, %ymm1
	vmovdqa	32*0($b_ptr), %ymm0
	lea	256($b_ptr), %rax		# size optimization
	vpor	32*1($b_ptr), %ymm0, %ymm0
	vpor	32*2($b_ptr), %ymm0, %ymm0
	vpor	32*3($b_ptr), %ymm0, %ymm0
	vpor	32*4-256(%rax), %ymm0, %ymm0
	lea	256(%rax), %rcx			# size optimization
	vpor	32*5-256(%rax), %ymm0, %ymm0
	vpor	32*6-256(%rax), %ymm0, %ymm0
	vpor	32*7-256(%rax), %ymm0, %ymm0
	vpor	32*8-256(%rax), %ymm0, %ymm0
	vpor	32*9-256(%rax), %ymm0, %ymm0
	vpor	32*10-256(%rax), %ymm0, %ymm0
	vpor	32*11-256(%rax), %ymm0, %ymm0
	vpor	32*12-512(%rcx), %ymm0, %ymm0
	vpor	32*13-512(%rcx), %ymm0, %ymm0
	vpor	32*14-512(%rcx), %ymm0, %ymm0
	vpor	32*15-512(%rcx), %ymm0, %ymm0
	vpor	32*16-512(%rcx), %ymm0, %ymm0
	vpor	32*17-512(%rcx), %ymm0, %ymm0
	vpcmpeqq	%ymm1, %ymm0, %ymm0
	vmovdqa	%ymm0, `32*9*8+32`(%rsp)
	# H = U2 - U1 = X2 - X1
	lea	`32*9*0`($b_ptr), %rsi
	lea	`32*9*0`($a_ptr), %rdx
	lea	`32*9*3`(%rsp), %rdi
	call	avx2_sub_x4
	call	avx2_normalize_n_store
	# R = S2 - S1 = Y2 - Y1
	lea	`32*9*1`($b_ptr), %rsi
	lea	`32*9*1`($a_ptr), %rdx
	lea	`32*9*4`(%rsp), %rdi
	call	avx2_sub_x4
	call	avx2_normalize_n_store
	# Z3 = H*Z1*Z2 = H
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*2`($r_ptr), %rdi
	call	avx2_mul_by1_x4
	call	avx2_normalize
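	# ACC0..ACC8 still hold the normalized H. If A or B is the point at
	# infinity (either saved mask is all-ones), Z3 is replaced with the
	# stored representation of one (.LONE): H is cleared with vpandn and
	# the .LONE limbs are merged in with vpand/vpxor, branch-free so the
	# selection stays constant-time.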
	vmovdqa	`32*9*8`(%rsp), $B
	vpor	`32*9*8+32`(%rsp), $B, $B
	vpandn	$ACC0, $B, $ACC0
	lea	.LONE+128(%rip), %rax
	vpandn	$ACC1, $B, $ACC1
	vpandn	$ACC2, $B, $ACC2
	vpandn	$ACC3, $B, $ACC3
	vpandn	$ACC4, $B, $ACC4
	vpandn	$ACC5, $B, $ACC5
	vpandn	$ACC6, $B, $ACC6
	vpandn	$ACC7, $B, $ACC7
	vpand	32*0-128(%rax), $B, $T0
	vpandn	$ACC8, $B, $ACC8
	vpand	32*1-128(%rax), $B, $Y
	vpxor	$T0, $ACC0, $ACC0
	vpand	32*2-128(%rax), $B, $T0
	vpxor	$Y, $ACC1, $ACC1
	vpand	32*3-128(%rax), $B, $Y
	vpxor	$T0, $ACC2, $ACC2
	vpand	32*4-128(%rax), $B, $T0
	vpxor	$Y, $ACC3, $ACC3
	vpand	32*5-128(%rax), $B, $Y
	vpxor	$T0, $ACC4, $ACC4
	vpand	32*6-128(%rax), $B, $T0
	vpxor	$Y, $ACC5, $ACC5
	vpand	32*7-128(%rax), $B, $Y
	vpxor	$T0, $ACC6, $ACC6
	vpand	32*8-128(%rax), $B, $T0
	vpxor	$Y, $ACC7, $ACC7
	vpxor	$T0, $ACC8, $ACC8
	`&STORE`
	# R^2 = R^2
	lea	`32*9*4`(%rsp), %rsi
	lea	`32*9*6`(%rsp), %rdi
	lea	`32*9*8+32*2`(%rsp), %rcx	# temporary vector
	call	avx2_sqr_x4
	call	avx2_normalize_n_store
	# H^2 = H^2
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*5`(%rsp), %rdi
	call	avx2_sqr_x4
	call	avx2_normalize_n_store
	# H^3 = H^2*H
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*5`(%rsp), %rdx
	lea	`32*9*7`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store
	# U2 = U1*H^2
	lea	`32*9*0`($a_ptr), %rsi
	lea	`32*9*5`(%rsp), %rdx
	lea	`32*9*0`(%rsp), %rdi
	call	avx2_mul_x4
	#call	avx2_normalize
	`&STORE`
	# Hsqr = U2*2
	#lea	32*9*0(%rsp), %rsi
	#lea	32*9*5(%rsp), %rdi
	#call	avx2_mul_by2_x4
	vpaddq	$ACC0, $ACC0, $ACC0	# inlined avx2_mul_by2_x4
	lea	`32*9*5`(%rsp), %rdi
	vpaddq	$ACC1, $ACC1, $ACC1
	vpaddq	$ACC2, $ACC2, $ACC2
	vpaddq	$ACC3, $ACC3, $ACC3
	vpaddq	$ACC4, $ACC4, $ACC4
	vpaddq	$ACC5, $ACC5, $ACC5
	vpaddq	$ACC6, $ACC6, $ACC6
	vpaddq	$ACC7, $ACC7, $ACC7
	vpaddq	$ACC8, $ACC8, $ACC8
	call	avx2_normalize_n_store
	# X3 = R^2 - H^3
	#lea	32*9*6(%rsp), %rsi
	#lea	32*9*7(%rsp), %rdx
	#lea	32*9*5(%rsp), %rcx
	#lea	32*9*0($r_ptr), %rdi
	#call	avx2_sub_x4
	#NORMALIZE
	#STORE
	# X3 = X3 - U2*2
	#lea	32*9*0($r_ptr), %rsi
	#lea	32*9*0($r_ptr), %rdi
	#call	avx2_sub_x4
	#NORMALIZE
	#STORE
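	# X3 = R^2 + 2*p - H^3 - 2*U2, computed in one fused pass: the doubled
	# modulus (.LAVX2_POLY_x2) is added first so every limb stays
	# non-negative across the two subtractions, and avx2_normalize then
	# reduces the result.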
	lea	`32*9*6+128`(%rsp), %rsi
	lea	.LAVX2_POLY_x2+128(%rip), %rax
	lea	`32*9*7+128`(%rsp), %rdx
	lea	`32*9*5+128`(%rsp), %rcx
	lea	`32*9*0`($r_ptr), %rdi
	vmovdqa	32*0-128(%rsi), $ACC0
	vmovdqa	32*1-128(%rsi), $ACC1
	vmovdqa	32*2-128(%rsi), $ACC2
	vmovdqa	32*3-128(%rsi), $ACC3
	vmovdqa	32*4-128(%rsi), $ACC4
	vmovdqa	32*5-128(%rsi), $ACC5
	vmovdqa	32*6-128(%rsi), $ACC6
	vmovdqa	32*7-128(%rsi), $ACC7
	vmovdqa	32*8-128(%rsi), $ACC8
	vpaddq	32*0-128(%rax), $ACC0, $ACC0
	vpaddq	32*1-128(%rax), $ACC1, $ACC1
	vpaddq	32*2-128(%rax), $ACC2, $ACC2
	vpaddq	32*3-128(%rax), $ACC3, $ACC3
	vpaddq	32*4-128(%rax), $ACC4, $ACC4
	vpaddq	32*5-128(%rax), $ACC5, $ACC5
	vpaddq	32*6-128(%rax), $ACC6, $ACC6
	vpaddq	32*7-128(%rax), $ACC7, $ACC7
	vpaddq	32*8-128(%rax), $ACC8, $ACC8
	vpsubq	32*0-128(%rdx), $ACC0, $ACC0
	vpsubq	32*1-128(%rdx), $ACC1, $ACC1
	vpsubq	32*2-128(%rdx), $ACC2, $ACC2
	vpsubq	32*3-128(%rdx), $ACC3, $ACC3
	vpsubq	32*4-128(%rdx), $ACC4, $ACC4
	vpsubq	32*5-128(%rdx), $ACC5, $ACC5
	vpsubq	32*6-128(%rdx), $ACC6, $ACC6
	vpsubq	32*7-128(%rdx), $ACC7, $ACC7
	vpsubq	32*8-128(%rdx), $ACC8, $ACC8
	vpsubq	32*0-128(%rcx), $ACC0, $ACC0
	vpsubq	32*1-128(%rcx), $ACC1, $ACC1
	vpsubq	32*2-128(%rcx), $ACC2, $ACC2
	vpsubq	32*3-128(%rcx), $ACC3, $ACC3
	vpsubq	32*4-128(%rcx), $ACC4, $ACC4
	vpsubq	32*5-128(%rcx), $ACC5, $ACC5
	vpsubq	32*6-128(%rcx), $ACC6, $ACC6
	vpsubq	32*7-128(%rcx), $ACC7, $ACC7
	vpsubq	32*8-128(%rcx), $ACC8, $ACC8
	call	avx2_normalize
	lea	32*0($b_ptr), %rsi
	lea	32*0($a_ptr), %rdx
	call	avx2_select_n_store
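	# avx2_select_n_store (defined earlier in this file) presumably uses
	# the two infinity masks saved above to override the just-computed X3
	# with B.X when A is at infinity and with A.X when B is at infinity,
	# before storing the result.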
	# H = U2 - X3
	lea	`32*9*0`(%rsp), %rsi
	lea	`32*9*0`($r_ptr), %rdx
	lea	`32*9*3`(%rsp), %rdi
	call	avx2_sub_x4
	call	avx2_normalize_n_store
	# H = H*R
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*4`(%rsp), %rdx
	lea	`32*9*3`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store
	# S2 = S1 * H^3
	lea	`32*9*7`(%rsp), %rsi
	lea	`32*9*1`($a_ptr), %rdx
	lea	`32*9*1`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store
	#
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*1`(%rsp), %rdx
	lea	`32*9*1`($r_ptr), %rdi
	call	avx2_sub_x4
	call	avx2_normalize
	lea	32*9($b_ptr), %rsi
	lea	32*9($a_ptr), %rdx
	call	avx2_select_n_store
	#lea	32*9*0($r_ptr), %rsi
	#lea	32*9*0($r_ptr), %rdi
	#call	avx2_mul_by1_x4
	#NORMALIZE
	#STORE
	lea	`32*9*1`($r_ptr), %rsi
	lea	`32*9*1`($r_ptr), %rdi
	call	avx2_mul_by1_x4
	call	avx2_normalize_n_store
	vzeroupper
___
$code.=<<___	if ($win64);
	# restore the non-volatile xmm registers spilled in the prologue
	movaps	-16*10(%rbp), %xmm6
	movaps	-16*9(%rbp), %xmm7
	movaps	-16*8(%rbp), %xmm8
	movaps	-16*7(%rbp), %xmm9
	movaps	-16*6(%rbp), %xmm10
	movaps	-16*5(%rbp), %xmm11
	movaps	-16*4(%rbp), %xmm12
	movaps	-16*3(%rbp), %xmm13
	movaps	-16*2(%rbp), %xmm14
	movaps	-16*1(%rbp), %xmm15
___
$code.=<<___;
	mov	%rbp, %rsp
	pop	%rbp
	ret
.size	ecp_nistz256_avx2_point_add_affines_x4,.-ecp_nistz256_avx2_point_add_affines_x4
################################################################################
# void ecp_nistz256_avx2_to_mont(void* RESULTx4, void *Ax4);
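# Converts four 256-bit inputs to Montgomery form in parallel: each lane is
# multiplied by the precomputed conversion constant stored at .LTO_MONT_AVX2.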
.globl	ecp_nistz256_avx2_to_mont
.type	ecp_nistz256_avx2_to_mont,\@function,2
.align	32
ecp_nistz256_avx2_to_mont:
	vzeroupper
___
$code.=<<___	if ($win64);
	mov	%rsp, %rax		# the spills below and the epilogue expect the entry %rsp in %rax
	lea	-8-16*10(%rsp), %rsp
	vmovaps	%xmm6, -8-16*10(%rax)
	vmovaps	%xmm7, -8-16*9(%rax)
	vmovaps	%xmm8, -8-16*8(%rax)
	vmovaps	%xmm9, -8-16*7(%rax)
	vmovaps	%xmm10, -8-16*6(%rax)
	vmovaps	%xmm11, -8-16*5(%rax)
	vmovaps	%xmm12, -8-16*4(%rax)
	vmovaps	%xmm13, -8-16*3(%rax)
	vmovaps	%xmm14, -8-16*2(%rax)
	vmovaps	%xmm15, -8-16*1(%rax)
___
$code.=<<___;
	vmovdqa	.LAVX2_AND_MASK(%rip), $AND_MASK
	lea	.LTO_MONT_AVX2(%rip), %rdx
	call	avx2_mul_x4
	call	avx2_normalize_n_store
	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	16*0(%rsp), %xmm6
	movaps	16*1(%rsp), %xmm7
	movaps	16*2(%rsp), %xmm8
	movaps	16*3(%rsp), %xmm9
	movaps	16*4(%rsp), %xmm10
	movaps	16*5(%rsp), %xmm11
	movaps	16*6(%rsp), %xmm12
	movaps	16*7(%rsp), %xmm13
	movaps	16*8(%rsp), %xmm14
	movaps	16*9(%rsp), %xmm15
	lea	8+16*10(%rsp), %rsp
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_to_mont,.-ecp_nistz256_avx2_to_mont
################################################################################
# void ecp_nistz256_avx2_from_mont(void* RESULTx4, void *Ax4);
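# Inverse of ecp_nistz256_avx2_to_mont: each lane is multiplied by the
# precomputed constant at .LFROM_MONT_AVX2, taking it back out of
# Montgomery form.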
.globl	ecp_nistz256_avx2_from_mont
.type	ecp_nistz256_avx2_from_mont,\@function,2
.align	32
ecp_nistz256_avx2_from_mont:
	vzeroupper
___
$code.=<<___	if ($win64);
	mov	%rsp, %rax		# the spills below and the epilogue expect the entry %rsp in %rax
	lea	-8-16*10(%rsp), %rsp
	vmovaps	%xmm6, -8-16*10(%rax)
	vmovaps	%xmm7, -8-16*9(%rax)
	vmovaps	%xmm8, -8-16*8(%rax)
	vmovaps	%xmm9, -8-16*7(%rax)
	vmovaps	%xmm10, -8-16*6(%rax)
	vmovaps	%xmm11, -8-16*5(%rax)
	vmovaps	%xmm12, -8-16*4(%rax)
	vmovaps	%xmm13, -8-16*3(%rax)
	vmovaps	%xmm14, -8-16*2(%rax)
	vmovaps	%xmm15, -8-16*1(%rax)
___
$code.=<<___;
	vmovdqa	.LAVX2_AND_MASK(%rip), $AND_MASK
	lea	.LFROM_MONT_AVX2(%rip), %rdx
	call	avx2_mul_x4
	call	avx2_normalize_n_store
	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	16*0(%rsp), %xmm6
	movaps	16*1(%rsp), %xmm7
	movaps	16*2(%rsp), %xmm8
	movaps	16*3(%rsp), %xmm9
	movaps	16*4(%rsp), %xmm10
	movaps	16*5(%rsp), %xmm11
	movaps	16*6(%rsp), %xmm12
	movaps	16*7(%rsp), %xmm13
	movaps	16*8(%rsp), %xmm14
	movaps	16*9(%rsp), %xmm15
	lea	8+16*10(%rsp), %rsp
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_from_mont,.-ecp_nistz256_avx2_from_mont
################################################################################
# void ecp_nistz256_avx2_set1(void* RESULTx4);
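# Sets all four lanes of RESULTx4 to the stored representation of one by
# copying the nine limb vectors of .LONE.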
.globl	ecp_nistz256_avx2_set1
.type	ecp_nistz256_avx2_set1,\@function,1
.align	32
ecp_nistz256_avx2_set1:
	lea	.LONE+128(%rip), %rax
	lea	128(%rdi), %rdi
	vzeroupper
	vmovdqa	32*0-128(%rax), %ymm0
	vmovdqa	32*1-128(%rax), %ymm1
	vmovdqa	32*2-128(%rax), %ymm2
	vmovdqa	32*3-128(%rax), %ymm3
	vmovdqa	32*4-128(%rax), %ymm4
	vmovdqa	32*5-128(%rax), %ymm5
	vmovdqa	%ymm0, 32*0-128(%rdi)
	vmovdqa	32*6-128(%rax), %ymm0
	vmovdqa	%ymm1, 32*1-128(%rdi)
	vmovdqa	32*7-128(%rax), %ymm1
	vmovdqa	%ymm2, 32*2-128(%rdi)
	vmovdqa	32*8-128(%rax), %ymm2
	vmovdqa	%ymm3, 32*3-128(%rdi)
	vmovdqa	%ymm4, 32*4-128(%rdi)
	vmovdqa	%ymm5, 32*5-128(%rdi)
	vmovdqa	%ymm0, 32*6-128(%rdi)
	vmovdqa	%ymm1, 32*7-128(%rdi)
	vmovdqa	%ymm2, 32*8-128(%rdi)
	vzeroupper
	ret
.size	ecp_nistz256_avx2_set1,.-ecp_nistz256_avx2_set1
___
}
{
################################################################################
# void ecp_nistz256_avx2_multi_gather_w7(void* RESULT, void *in,
#			int index0, int index1, int index2, int index3);
################################################################################
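# Constant-time 4-way gather: for each of the four tables the loop below
# walks all 64 window-7 entries, masks every entry with vpcmpeqd against the
# requested index and XOR-accumulates it, so the memory access pattern is
# independent of the (secret) indices. An index of 0 matches nothing, which
# leaves the accumulator at (0,0), the encoding of the point at infinity.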
my ($val,$in_t,$index0,$index1,$index2,$index3)=("%rdi","%rsi","%edx","%ecx","%r8d","%r9d");
my ($INDEX0,$INDEX1,$INDEX2,$INDEX3)=map("%ymm$_",(0..3));
my ($R0a,$R0b,$R1a,$R1b,$R2a,$R2b,$R3a,$R3b)=map("%ymm$_",(4..11));
my ($M0,$T0,$T1,$TMP0)=map("%ymm$_",(12..15));

$code.=<<___;
.globl	ecp_nistz256_avx2_multi_gather_w7
.type	ecp_nistz256_avx2_multi_gather_w7,\@function,6
.align	32
ecp_nistz256_avx2_multi_gather_w7:
	vzeroupper
___
$code.=<<___	if ($win64);
	mov	%rsp, %rax		# the spills below and the epilogue expect the entry %rsp in %rax
	lea	-8-16*10(%rsp), %rsp
	vmovaps	%xmm6, -8-16*10(%rax)
	vmovaps	%xmm7, -8-16*9(%rax)
	vmovaps	%xmm8, -8-16*8(%rax)
	vmovaps	%xmm9, -8-16*7(%rax)
	vmovaps	%xmm10, -8-16*6(%rax)
	vmovaps	%xmm11, -8-16*5(%rax)
	vmovaps	%xmm12, -8-16*4(%rax)
	vmovaps	%xmm13, -8-16*3(%rax)
	vmovaps	%xmm14, -8-16*2(%rax)
	vmovaps	%xmm15, -8-16*1(%rax)
___
$code.=<<___;
	lea	.LIntOne(%rip), %rax
	vmovd	$index0, %xmm0
	vmovd	$index1, %xmm1
	vmovd	$index2, %xmm2
	vmovd	$index3, %xmm3
	vpxor	$R0a, $R0a, $R0a
	vpxor	$R0b, $R0b, $R0b
	vpxor	$R1a, $R1a, $R1a
	vpxor	$R1b, $R1b, $R1b
	vpxor	$R2a, $R2a, $R2a
	vpxor	$R2b, $R2b, $R2b
	vpxor	$R3a, $R3a, $R3a
	vpxor	$R3b, $R3b, $R3b
	vmovdqa	(%rax), $M0
	vpermd	$INDEX0, $R0a, $INDEX0
	vpermd	$INDEX1, $R0a, $INDEX1
	vpermd	$INDEX2, $R0a, $INDEX2
	vpermd	$INDEX3, $R0a, $INDEX3
	mov	\$64, %ecx
	lea	112($val), $val		# size optimization
	jmp	.Lmulti_select_loop_avx2
	# INDEX=0, corresponds to the point at infty (0,0)
.align	32
.Lmulti_select_loop_avx2:
	vpcmpeqd	$INDEX0, $M0, $TMP0
	vmovdqa	`32*0+32*64*2*0`($in_t), $T0
	vmovdqa	`32*1+32*64*2*0`($in_t), $T1
	vpand	$TMP0, $T0, $T0
	vpand	$TMP0, $T1, $T1
	vpxor	$T0, $R0a, $R0a
	vpxor	$T1, $R0b, $R0b
	vpcmpeqd	$INDEX1, $M0, $TMP0
	vmovdqa	`32*0+32*64*2*1`($in_t), $T0
	vmovdqa	`32*1+32*64*2*1`($in_t), $T1
	vpand	$TMP0, $T0, $T0
	vpand	$TMP0, $T1, $T1
	vpxor	$T0, $R1a, $R1a
	vpxor	$T1, $R1b, $R1b
	vpcmpeqd	$INDEX2, $M0, $TMP0
	vmovdqa	`32*0+32*64*2*2`($in_t), $T0
	vmovdqa	`32*1+32*64*2*2`($in_t), $T1
	vpand	$TMP0, $T0, $T0
	vpand	$TMP0, $T1, $T1
	vpxor	$T0, $R2a, $R2a
	vpxor	$T1, $R2b, $R2b
	vpcmpeqd	$INDEX3, $M0, $TMP0
	vmovdqa	`32*0+32*64*2*3`($in_t), $T0
	vmovdqa	`32*1+32*64*2*3`($in_t), $T1
	vpand	$TMP0, $T0, $T0
	vpand	$TMP0, $T1, $T1
	vpxor	$T0, $R3a, $R3a
	vpxor	$T1, $R3b, $R3b
	vpaddd	(%rax), $M0, $M0	# increment
	lea	32*2($in_t), $in_t
	dec	%ecx
	jnz	.Lmulti_select_loop_avx2
	vmovdqu	$R0a, 32*0-112($val)
	vmovdqu	$R0b, 32*1-112($val)
	vmovdqu	$R1a, 32*2-112($val)
	vmovdqu	$R1b, 32*3-112($val)
	vmovdqu	$R2a, 32*4-112($val)
	vmovdqu	$R2b, 32*5-112($val)
	vmovdqu	$R3a, 32*6-112($val)
	vmovdqu	$R3b, 32*7-112($val)
	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	16*0(%rsp), %xmm6
	movaps	16*1(%rsp), %xmm7
	movaps	16*2(%rsp), %xmm8
	movaps	16*3(%rsp), %xmm9
	movaps	16*4(%rsp), %xmm10
	movaps	16*5(%rsp), %xmm11
	movaps	16*6(%rsp), %xmm12
	movaps	16*7(%rsp), %xmm13
	movaps	16*8(%rsp), %xmm14
	movaps	16*9(%rsp), %xmm15
	lea	8+16*10(%rsp), %rsp
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7

.extern	OPENSSL_ia32cap_P
.globl	ecp_nistz_avx2_eligible
.type	ecp_nistz_avx2_eligible,\@abi-omnipotent
.align	32
ecp_nistz_avx2_eligible:
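	# OPENSSL_ia32cap_P+8 is the third capability word, which holds
	# CPUID(EAX=7,ECX=0).EBX; bit 5 of that word is the AVX2 feature flag,
	# so %eax ends up as 1 iff the CPU supports AVX2.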
	mov	OPENSSL_ia32cap_P+8(%rip),%eax
	shr	\$5,%eax
	and	\$1,%eax
	ret
.size	ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
___
}
}} else {{	# assembler is too old
$code.=<<___;
.text

.globl	ecp_nistz256_avx2_transpose_convert
.globl	ecp_nistz256_avx2_convert_transpose_back
.globl	ecp_nistz256_avx2_point_add_affine_x4
.globl	ecp_nistz256_avx2_point_add_affines_x4
.globl	ecp_nistz256_avx2_to_mont
.globl	ecp_nistz256_avx2_from_mont
.globl	ecp_nistz256_avx2_set1
.globl	ecp_nistz256_avx2_multi_gather_w7
.type	ecp_nistz256_avx2_multi_gather_w7,\@abi-omnipotent
ecp_nistz256_avx2_transpose_convert:
ecp_nistz256_avx2_convert_transpose_back:
ecp_nistz256_avx2_point_add_affine_x4:
ecp_nistz256_avx2_point_add_affines_x4:
ecp_nistz256_avx2_to_mont:
ecp_nistz256_avx2_from_mont:
ecp_nistz256_avx2_set1:
ecp_nistz256_avx2_multi_gather_w7:
	.byte	0x0f,0x0b	# ud2
	ret
.size	ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7

.globl	ecp_nistz_avx2_eligible
.type	ecp_nistz_avx2_eligible,\@abi-omnipotent
ecp_nistz_avx2_eligible:
	xor	%eax,%eax
	ret
.size	ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
___
}}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;
	print $_,"\n";
}
close STDOUT;