ecp_nistz256-ppc64.pl

  1. #! /usr/bin/env perl
  2. # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # ECP_NISTZ256 module for PPC64.
  17. #
  18. # August 2016.
  19. #
  20. # Original ECP_NISTZ256 submission targeting x86_64 is detailed in
  21. # http://eprint.iacr.org/2013/816.
  22. #
  23. # with/without -DECP_NISTZ256_ASM
  24. # POWER7 +260-530%
  25. # POWER8 +220-340%
  26. # $output is the last argument if it looks like a file (it has an extension)
  27. # $flavour is the first argument if it doesn't look like a file
  28. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  29. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  30. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  31. ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  32. ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  33. die "can't locate ppc-xlate.pl";
  34. open OUT,"| \"$^X\" $xlate $flavour \"$output\""
  35. or die "can't call $xlate: $!";
  36. *STDOUT=*OUT;
  37. my $sp="r1";
  38. {
  39. my ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3,
  40. $acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) =
  41. map("r$_",(3..12,22..31));
  42. my ($acc6,$acc7)=($bp,$bi); # used in __ecp_nistz256_sqr_mont
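# The map() above binds $rp..$poly3 to the volatile registers r3-r12 and
# $acc4..$t3 to the callee-saved registers r22-r31 (hence the std/ld of
# r22-r31, or subsets of them, in the wrappers below).  $acc6/$acc7 reuse
# $bp/$bi, which are free inside __ecp_nistz256_sqr_mont because squaring
# takes no second operand pointer.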
  43. $code.=<<___;
  44. .machine "any"
  45. .text
  46. ___
  47. ########################################################################
  48. # Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
  49. #
  50. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  51. open TABLE,"<ecp_nistz256_table.c" or
  52. open TABLE,"<${dir}../ecp_nistz256_table.c" or
  53. die "failed to open ecp_nistz256_table.c:",$!;
  54. use integer;
  55. foreach(<TABLE>) {
  56. s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
  57. }
  58. close TABLE;
  59. # See ecp_nistz256_table.c for an explanation of why it's 64*16*37.
  60. # 64*16*37-1 is because $#arr returns the last valid index of @arr, not
  61. # the number of elements.
  62. die "insane number of elements" if ($#arr != 64*16*37-1);
  63. $code.=<<___;
  64. .type ecp_nistz256_precomputed,\@object
  65. .globl ecp_nistz256_precomputed
  66. .align 12
  67. ecp_nistz256_precomputed:
  68. ___
  69. ########################################################################
  70. # this conversion smashes P256_POINT_AFFINE by individual bytes with
  71. # 64 byte interval, similar to
  72. # 1111222233334444
  73. # 1234123412341234
  74. for(1..37) {
  75. @tbl = splice(@arr,0,64*16);
  76. for($i=0;$i<64;$i++) {
  77. undef @line;
  78. for($j=0;$j<64;$j++) {
  79. push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
  80. }
  81. $code.=".byte\t";
  82. $code.=join(',',map { sprintf "0x%02x",$_} @line);
  83. $code.="\n";
  84. }
  85. }
  86. $code.=<<___;
  87. .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
  88. .asciz "ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
  89. # void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
  90. # const BN_ULONG x2[4]);
  91. .globl ecp_nistz256_mul_mont
  92. .align 5
  93. ecp_nistz256_mul_mont:
  94. stdu $sp,-128($sp)
  95. mflr r0
  96. std r22,48($sp)
  97. std r23,56($sp)
  98. std r24,64($sp)
  99. std r25,72($sp)
  100. std r26,80($sp)
  101. std r27,88($sp)
  102. std r28,96($sp)
  103. std r29,104($sp)
  104. std r30,112($sp)
  105. std r31,120($sp)
  106. ld $a0,0($ap)
  107. ld $bi,0($bp)
  108. ld $a1,8($ap)
  109. ld $a2,16($ap)
  110. ld $a3,24($ap)
  111. li $poly1,-1
  112. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  113. li $poly3,1
  114. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  115. bl __ecp_nistz256_mul_mont
  116. mtlr r0
  117. ld r22,48($sp)
  118. ld r23,56($sp)
  119. ld r24,64($sp)
  120. ld r25,72($sp)
  121. ld r26,80($sp)
  122. ld r27,88($sp)
  123. ld r28,96($sp)
  124. ld r29,104($sp)
  125. ld r30,112($sp)
  126. ld r31,120($sp)
  127. addi $sp,$sp,128
  128. blr
  129. .long 0
  130. .byte 0,12,4,0,0x80,10,3,0
  131. .long 0
  132. .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
  133. # void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
  134. .globl ecp_nistz256_sqr_mont
  135. .align 4
  136. ecp_nistz256_sqr_mont:
  137. stdu $sp,-128($sp)
  138. mflr r0
  139. std r22,48($sp)
  140. std r23,56($sp)
  141. std r24,64($sp)
  142. std r25,72($sp)
  143. std r26,80($sp)
  144. std r27,88($sp)
  145. std r28,96($sp)
  146. std r29,104($sp)
  147. std r30,112($sp)
  148. std r31,120($sp)
  149. ld $a0,0($ap)
  150. ld $a1,8($ap)
  151. ld $a2,16($ap)
  152. ld $a3,24($ap)
  153. li $poly1,-1
  154. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  155. li $poly3,1
  156. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  157. bl __ecp_nistz256_sqr_mont
  158. mtlr r0
  159. ld r22,48($sp)
  160. ld r23,56($sp)
  161. ld r24,64($sp)
  162. ld r25,72($sp)
  163. ld r26,80($sp)
  164. ld r27,88($sp)
  165. ld r28,96($sp)
  166. ld r29,104($sp)
  167. ld r30,112($sp)
  168. ld r31,120($sp)
  169. addi $sp,$sp,128
  170. blr
  171. .long 0
  172. .byte 0,12,4,0,0x80,10,2,0
  173. .long 0
  174. .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
  175. # void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
  176. # const BN_ULONG x2[4]);
  177. .globl ecp_nistz256_add
  178. .align 4
  179. ecp_nistz256_add:
  180. stdu $sp,-128($sp)
  181. mflr r0
  182. std r28,96($sp)
  183. std r29,104($sp)
  184. std r30,112($sp)
  185. std r31,120($sp)
  186. ld $acc0,0($ap)
  187. ld $t0, 0($bp)
  188. ld $acc1,8($ap)
  189. ld $t1, 8($bp)
  190. ld $acc2,16($ap)
  191. ld $t2, 16($bp)
  192. ld $acc3,24($ap)
  193. ld $t3, 24($bp)
  194. li $poly1,-1
  195. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  196. li $poly3,1
  197. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  198. bl __ecp_nistz256_add
  199. mtlr r0
  200. ld r28,96($sp)
  201. ld r29,104($sp)
  202. ld r30,112($sp)
  203. ld r31,120($sp)
  204. addi $sp,$sp,128
  205. blr
  206. .long 0
  207. .byte 0,12,4,0,0x80,4,3,0
  208. .long 0
  209. .size ecp_nistz256_add,.-ecp_nistz256_add
  210. # void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
  211. .globl ecp_nistz256_div_by_2
  212. .align 4
  213. ecp_nistz256_div_by_2:
  214. stdu $sp,-128($sp)
  215. mflr r0
  216. std r28,96($sp)
  217. std r29,104($sp)
  218. std r30,112($sp)
  219. std r31,120($sp)
  220. ld $acc0,0($ap)
  221. ld $acc1,8($ap)
  222. ld $acc2,16($ap)
  223. ld $acc3,24($ap)
  224. li $poly1,-1
  225. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  226. li $poly3,1
  227. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  228. bl __ecp_nistz256_div_by_2
  229. mtlr r0
  230. ld r28,96($sp)
  231. ld r29,104($sp)
  232. ld r30,112($sp)
  233. ld r31,120($sp)
  234. addi $sp,$sp,128
  235. blr
  236. .long 0
  237. .byte 0,12,4,0,0x80,4,2,0
  238. .long 0
  239. .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
  240. # void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
  241. .globl ecp_nistz256_mul_by_2
  242. .align 4
  243. ecp_nistz256_mul_by_2:
  244. stdu $sp,-128($sp)
  245. mflr r0
  246. std r28,96($sp)
  247. std r29,104($sp)
  248. std r30,112($sp)
  249. std r31,120($sp)
  250. ld $acc0,0($ap)
  251. ld $acc1,8($ap)
  252. ld $acc2,16($ap)
  253. ld $acc3,24($ap)
  254. mr $t0,$acc0
  255. mr $t1,$acc1
  256. mr $t2,$acc2
  257. mr $t3,$acc3
  258. li $poly1,-1
  259. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  260. li $poly3,1
  261. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  262. bl __ecp_nistz256_add # ret = a+a // 2*a
  263. mtlr r0
  264. ld r28,96($sp)
  265. ld r29,104($sp)
  266. ld r30,112($sp)
  267. ld r31,120($sp)
  268. addi $sp,$sp,128
  269. blr
  270. .long 0
  271. .byte 0,12,4,0,0x80,4,3,0
  272. .long 0
  273. .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
  274. # void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
  275. .globl ecp_nistz256_mul_by_3
  276. .align 4
  277. ecp_nistz256_mul_by_3:
  278. stdu $sp,-128($sp)
  279. mflr r0
  280. std r28,96($sp)
  281. std r29,104($sp)
  282. std r30,112($sp)
  283. std r31,120($sp)
  284. ld $acc0,0($ap)
  285. ld $acc1,8($ap)
  286. ld $acc2,16($ap)
  287. ld $acc3,24($ap)
  288. mr $t0,$acc0
  289. std $acc0,64($sp)
  290. mr $t1,$acc1
  291. std $acc1,72($sp)
  292. mr $t2,$acc2
  293. std $acc2,80($sp)
  294. mr $t3,$acc3
  295. std $acc3,88($sp)
  296. li $poly1,-1
  297. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  298. li $poly3,1
  299. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  300. bl __ecp_nistz256_add # ret = a+a // 2*a
  301. ld $t0,64($sp)
  302. ld $t1,72($sp)
  303. ld $t2,80($sp)
  304. ld $t3,88($sp)
  305. bl __ecp_nistz256_add # ret += a // 2*a+a=3*a
  306. mtlr r0
  307. ld r28,96($sp)
  308. ld r29,104($sp)
  309. ld r30,112($sp)
  310. ld r31,120($sp)
  311. addi $sp,$sp,128
  312. blr
  313. .long 0
  314. .byte 0,12,4,0,0x80,4,2,0
  315. .long 0
  316. .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
  317. # void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
  318. # const BN_ULONG x2[4]);
  319. .globl ecp_nistz256_sub
  320. .align 4
  321. ecp_nistz256_sub:
  322. stdu $sp,-128($sp)
  323. mflr r0
  324. std r28,96($sp)
  325. std r29,104($sp)
  326. std r30,112($sp)
  327. std r31,120($sp)
  328. ld $acc0,0($ap)
  329. ld $acc1,8($ap)
  330. ld $acc2,16($ap)
  331. ld $acc3,24($ap)
  332. li $poly1,-1
  333. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  334. li $poly3,1
  335. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  336. bl __ecp_nistz256_sub_from
  337. mtlr r0
  338. ld r28,96($sp)
  339. ld r29,104($sp)
  340. ld r30,112($sp)
  341. ld r31,120($sp)
  342. addi $sp,$sp,128
  343. blr
  344. .long 0
  345. .byte 0,12,4,0,0x80,4,3,0
  346. .long 0
  347. .size ecp_nistz256_sub,.-ecp_nistz256_sub
  348. # void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
  349. .globl ecp_nistz256_neg
  350. .align 4
  351. ecp_nistz256_neg:
  352. stdu $sp,-128($sp)
  353. mflr r0
  354. std r28,96($sp)
  355. std r29,104($sp)
  356. std r30,112($sp)
  357. std r31,120($sp)
  358. mr $bp,$ap
  359. li $acc0,0
  360. li $acc1,0
  361. li $acc2,0
  362. li $acc3,0
  363. li $poly1,-1
  364. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  365. li $poly3,1
  366. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  367. bl __ecp_nistz256_sub_from
  368. mtlr r0
  369. ld r28,96($sp)
  370. ld r29,104($sp)
  371. ld r30,112($sp)
  372. ld r31,120($sp)
  373. addi $sp,$sp,128
  374. blr
  375. .long 0
  376. .byte 0,12,4,0,0x80,4,2,0
  377. .long 0
  378. .size ecp_nistz256_neg,.-ecp_nistz256_neg
  379. # note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
  380. # to $a0-$a3 and b[0] - to $bi
  381. .type __ecp_nistz256_mul_mont,\@function
  382. .align 4
  383. __ecp_nistz256_mul_mont:
  384. mulld $acc0,$a0,$bi # a[0]*b[0]
  385. mulhdu $t0,$a0,$bi
  386. mulld $acc1,$a1,$bi # a[1]*b[0]
  387. mulhdu $t1,$a1,$bi
  388. mulld $acc2,$a2,$bi # a[2]*b[0]
  389. mulhdu $t2,$a2,$bi
  390. mulld $acc3,$a3,$bi # a[3]*b[0]
  391. mulhdu $t3,$a3,$bi
  392. ld $bi,8($bp) # b[1]
  393. addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
  394. sldi $t0,$acc0,32
  395. adde $acc2,$acc2,$t1
  396. srdi $t1,$acc0,32
  397. adde $acc3,$acc3,$t2
  398. addze $acc4,$t3
  399. li $acc5,0
  400. ___
  401. for($i=1;$i<4;$i++) {
  402. ################################################################
  403. # Reduction iteration is normally performed by accumulating
  404. # result of multiplication of modulus by "magic" digit [and
  405. # omitting least significant word, which is guaranteed to
  406. # be 0], but thanks to special form of modulus and "magic"
  407. # digit being equal to least significant word, it can be
  408. # performed with additions and subtractions alone. Indeed:
  409. #
  410. # ffff0001.00000000.0000ffff.ffffffff
  411. # * abcdefgh
  412. # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
  413. #
  414. # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
  415. # rewrite above as:
  416. #
  417. # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
  418. # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
  419. # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
  420. #
  421. # or marking redundant operations:
  422. #
  423. # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
  424. # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
  425. # - 0000abcd.efgh0000.--------.--------.--------
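# A hedged standalone check of the word-level identity used by the
# subfc/subfe pair below, i.e. that {t3,t2} holds acc[0]*0xffffffff00000001
# computed as (d<<64)-(d<<32)+d (illustrative only, using core Math::BigInt;
# the if (0) guard keeps it out of the build):
if (0) {
	require Math::BigInt;
	my $d   = Math::BigInt->new("0x0123456789abcdef");	# any 64-bit digit
	my $hi  = Math::BigInt->new("0xffffffff00000001");	# top word of the modulus
	my $lhs = $d * $hi;
	my $rhs = ($d << 64) - ($d << 32) + $d;
	die "reduction identity broken" unless $lhs == $rhs;
}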
  426. $code.=<<___;
  427. subfc $t2,$t0,$acc0 # "*0xffff0001"
  428. subfe $t3,$t1,$acc0
  429. addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
  430. adde $acc1,$acc2,$t1
  431. adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
  432. adde $acc3,$acc4,$t3
  433. addze $acc4,$acc5
  434. mulld $t0,$a0,$bi # lo(a[0]*b[i])
  435. mulld $t1,$a1,$bi # lo(a[1]*b[i])
  436. mulld $t2,$a2,$bi # lo(a[2]*b[i])
  437. mulld $t3,$a3,$bi # lo(a[3]*b[i])
  438. addc $acc0,$acc0,$t0 # accumulate low parts of multiplication
  439. mulhdu $t0,$a0,$bi # hi(a[0]*b[i])
  440. adde $acc1,$acc1,$t1
  441. mulhdu $t1,$a1,$bi # hi(a[1]*b[i])
  442. adde $acc2,$acc2,$t2
  443. mulhdu $t2,$a2,$bi # hi(a[2]*b[i])
  444. adde $acc3,$acc3,$t3
  445. mulhdu $t3,$a3,$bi # hi(a[3]*b[i])
  446. addze $acc4,$acc4
  447. ___
  448. $code.=<<___ if ($i<3);
  449. ld $bi,8*($i+1)($bp) # b[$i+1]
  450. ___
  451. $code.=<<___;
  452. addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
  453. sldi $t0,$acc0,32
  454. adde $acc2,$acc2,$t1
  455. srdi $t1,$acc0,32
  456. adde $acc3,$acc3,$t2
  457. adde $acc4,$acc4,$t3
  458. li $acc5,0
  459. addze $acc5,$acc5
  460. ___
  461. }
  462. $code.=<<___;
  463. # last reduction
  464. subfc $t2,$t0,$acc0 # "*0xffff0001"
  465. subfe $t3,$t1,$acc0
  466. addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
  467. adde $acc1,$acc2,$t1
  468. adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
  469. adde $acc3,$acc4,$t3
  470. addze $acc4,$acc5
  471. li $t2,0
  472. addic $acc0,$acc0,1 # ret -= modulus
  473. subfe $acc1,$poly1,$acc1
  474. subfe $acc2,$t2,$acc2
  475. subfe $acc3,$poly3,$acc3
  476. subfe $acc4,$t2,$acc4
  477. addc $acc0,$acc0,$acc4 # ret += modulus if borrow
  478. and $t1,$poly1,$acc4
  479. and $t3,$poly3,$acc4
  480. adde $acc1,$acc1,$t1
  481. addze $acc2,$acc2
  482. adde $acc3,$acc3,$t3
  483. std $acc0,0($rp)
  484. std $acc1,8($rp)
  485. std $acc2,16($rp)
  486. std $acc3,24($rp)
  487. blr
  488. .long 0
  489. .byte 0,12,0x14,0,0,0,1,0
  490. .long 0
  491. .size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
  492. # note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
  493. # to $a0-$a3
  494. .type __ecp_nistz256_sqr_mont,\@function
  495. .align 4
  496. __ecp_nistz256_sqr_mont:
  497. ################################################################
  498. #  |  |  |  |  |  |a1*a0|  |
  499. #  |  |  |  |  |a2*a0|  |  |
  500. #  |  |a3*a2|a3*a0|  |  |  |
  501. #  |  |  |a2*a1|  |  |  |  |
  502. #  |  |a3*a1|  |  |  |  |  |
  503. # *|  |  |  |  |  |  |  | 2|
  504. # +|a3*a3|a2*a2|a1*a1|a0*a0|
  505. #  |--+--+--+--+--+--+--+--|
  506. #  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
  507. #
  508. # "can't overflow" below mark carrying into high part of
  509. # multiplication result, which can't overflow, because it
  510. # can never be all ones.
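# (In formula form, a hedged restatement of the diagram above:
#   a^2 = sum_i a[i]^2 * 2^(128*i)  +  2 * sum_{i<j} a[i]*a[j] * 2^(64*(i+j)),
# which is why the cross products are computed once, doubled, and the
# squares a[i]*a[i] then added on top.)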
  511. mulld $acc1,$a1,$a0 # a[1]*a[0]
  512. mulhdu $t1,$a1,$a0
  513. mulld $acc2,$a2,$a0 # a[2]*a[0]
  514. mulhdu $t2,$a2,$a0
  515. mulld $acc3,$a3,$a0 # a[3]*a[0]
  516. mulhdu $acc4,$a3,$a0
  517. addc $acc2,$acc2,$t1 # accumulate high parts of multiplication
  518. mulld $t0,$a2,$a1 # a[2]*a[1]
  519. mulhdu $t1,$a2,$a1
  520. adde $acc3,$acc3,$t2
  521. mulld $t2,$a3,$a1 # a[3]*a[1]
  522. mulhdu $t3,$a3,$a1
  523. addze $acc4,$acc4 # can't overflow
  524. mulld $acc5,$a3,$a2 # a[3]*a[2]
  525. mulhdu $acc6,$a3,$a2
  526. addc $t1,$t1,$t2 # accumulate high parts of multiplication
  527. addze $t2,$t3 # can't overflow
  528. addc $acc3,$acc3,$t0 # accumulate low parts of multiplication
  529. adde $acc4,$acc4,$t1
  530. adde $acc5,$acc5,$t2
  531. addze $acc6,$acc6 # can't overflow
  532. addc $acc1,$acc1,$acc1 # acc[1-6]*=2
  533. adde $acc2,$acc2,$acc2
  534. adde $acc3,$acc3,$acc3
  535. adde $acc4,$acc4,$acc4
  536. adde $acc5,$acc5,$acc5
  537. adde $acc6,$acc6,$acc6
  538. li $acc7,0
  539. addze $acc7,$acc7
  540. mulld $acc0,$a0,$a0 # a[0]*a[0]
  541. mulhdu $a0,$a0,$a0
  542. mulld $t1,$a1,$a1 # a[1]*a[1]
  543. mulhdu $a1,$a1,$a1
  544. mulld $t2,$a2,$a2 # a[2]*a[2]
  545. mulhdu $a2,$a2,$a2
  546. mulld $t3,$a3,$a3 # a[3]*a[3]
  547. mulhdu $a3,$a3,$a3
  548. addc $acc1,$acc1,$a0 # +a[i]*a[i]
  549. sldi $t0,$acc0,32
  550. adde $acc2,$acc2,$t1
  551. srdi $t1,$acc0,32
  552. adde $acc3,$acc3,$a1
  553. adde $acc4,$acc4,$t2
  554. adde $acc5,$acc5,$a2
  555. adde $acc6,$acc6,$t3
  556. adde $acc7,$acc7,$a3
  557. ___
  558. for($i=0;$i<3;$i++) { # reductions, see commentary in
  559. # multiplication for details
  560. $code.=<<___;
  561. subfc $t2,$t0,$acc0 # "*0xffff0001"
  562. subfe $t3,$t1,$acc0
  563. addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
  564. sldi $t0,$acc0,32
  565. adde $acc1,$acc2,$t1
  566. srdi $t1,$acc0,32
  567. adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
  568. addze $acc3,$t3 # can't overflow
  569. ___
  570. }
  571. $code.=<<___;
  572. subfc $t2,$t0,$acc0 # "*0xffff0001"
  573. subfe $t3,$t1,$acc0
  574. addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
  575. adde $acc1,$acc2,$t1
  576. adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
  577. addze $acc3,$t3 # can't overflow
  578. addc $acc0,$acc0,$acc4 # accumulate upper half
  579. adde $acc1,$acc1,$acc5
  580. adde $acc2,$acc2,$acc6
  581. adde $acc3,$acc3,$acc7
  582. li $t2,0
  583. addze $acc4,$t2
  584. addic $acc0,$acc0,1 # ret -= modulus
  585. subfe $acc1,$poly1,$acc1
  586. subfe $acc2,$t2,$acc2
  587. subfe $acc3,$poly3,$acc3
  588. subfe $acc4,$t2,$acc4
  589. addc $acc0,$acc0,$acc4 # ret += modulus if borrow
  590. and $t1,$poly1,$acc4
  591. and $t3,$poly3,$acc4
  592. adde $acc1,$acc1,$t1
  593. addze $acc2,$acc2
  594. adde $acc3,$acc3,$t3
  595. std $acc0,0($rp)
  596. std $acc1,8($rp)
  597. std $acc2,16($rp)
  598. std $acc3,24($rp)
  599. blr
  600. .long 0
  601. .byte 0,12,0x14,0,0,0,1,0
  602. .long 0
  603. .size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
  604. # Note that __ecp_nistz256_add expects both input vectors pre-loaded to
  605. # $acc0-$acc3 and $t0-$t3. This is done because it's used in multiple
  606. # contexts, e.g. in multiplication by 2 and 3...
  607. .type __ecp_nistz256_add,\@function
  608. .align 4
  609. __ecp_nistz256_add:
  610. addc $acc0,$acc0,$t0 # ret = a+b
  611. adde $acc1,$acc1,$t1
  612. adde $acc2,$acc2,$t2
  613. li $t2,0
  614. adde $acc3,$acc3,$t3
  615. addze $t0,$t2
  616. # if a+b >= modulus, subtract modulus
  617. #
  618. # But since comparison implies subtraction, we subtract
  619. # modulus and then add it back if subtraction borrowed.
  620. subic $acc0,$acc0,-1
  621. subfe $acc1,$poly1,$acc1
  622. subfe $acc2,$t2,$acc2
  623. subfe $acc3,$poly3,$acc3
  624. subfe $t0,$t2,$t0
  625. addc $acc0,$acc0,$t0
  626. and $t1,$poly1,$t0
  627. and $t3,$poly3,$t0
  628. adde $acc1,$acc1,$t1
  629. addze $acc2,$acc2
  630. adde $acc3,$acc3,$t3
  631. std $acc0,0($rp)
  632. std $acc1,8($rp)
  633. std $acc2,16($rp)
  634. std $acc3,24($rp)
  635. blr
  636. .long 0
  637. .byte 0,12,0x14,0,0,0,3,0
  638. .long 0
  639. .size __ecp_nistz256_add,.-__ecp_nistz256_add
  640. .type __ecp_nistz256_sub_from,\@function
  641. .align 4
  642. __ecp_nistz256_sub_from:
  643. ld $t0,0($bp)
  644. ld $t1,8($bp)
  645. ld $t2,16($bp)
  646. ld $t3,24($bp)
  647. subfc $acc0,$t0,$acc0 # ret = a-b
  648. subfe $acc1,$t1,$acc1
  649. subfe $acc2,$t2,$acc2
  650. subfe $acc3,$t3,$acc3
  651. subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0
  652. # if a-b borrowed, add modulus
  653. addc $acc0,$acc0,$t0 # ret += modulus & t0
  654. and $t1,$poly1,$t0
  655. and $t3,$poly3,$t0
  656. adde $acc1,$acc1,$t1
  657. addze $acc2,$acc2
  658. adde $acc3,$acc3,$t3
  659. std $acc0,0($rp)
  660. std $acc1,8($rp)
  661. std $acc2,16($rp)
  662. std $acc3,24($rp)
  663. blr
  664. .long 0
  665. .byte 0,12,0x14,0,0,0,3,0
  666. .long 0
  667. .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
  668. .type __ecp_nistz256_sub_morf,\@function
  669. .align 4
  670. __ecp_nistz256_sub_morf:
  671. ld $t0,0($bp)
  672. ld $t1,8($bp)
  673. ld $t2,16($bp)
  674. ld $t3,24($bp)
  675. subfc $acc0,$acc0,$t0 # ret = b-a
  676. subfe $acc1,$acc1,$t1
  677. subfe $acc2,$acc2,$t2
  678. subfe $acc3,$acc3,$t3
  679. subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0
  680. # if b-a borrowed, add modulus
  681. addc $acc0,$acc0,$t0 # ret += modulus & t0
  682. and $t1,$poly1,$t0
  683. and $t3,$poly3,$t0
  684. adde $acc1,$acc1,$t1
  685. addze $acc2,$acc2
  686. adde $acc3,$acc3,$t3
  687. std $acc0,0($rp)
  688. std $acc1,8($rp)
  689. std $acc2,16($rp)
  690. std $acc3,24($rp)
  691. blr
  692. .long 0
  693. .byte 0,12,0x14,0,0,0,3,0
  694. .long 0
  695. .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
  696. .type __ecp_nistz256_div_by_2,\@function
  697. .align 4
  698. __ecp_nistz256_div_by_2:
  699. andi. $t0,$acc0,1
  700. addic $acc0,$acc0,-1 # a += modulus
  701. neg $t0,$t0
  702. adde $acc1,$acc1,$poly1
  703. not $t0,$t0
  704. addze $acc2,$acc2
  705. li $t2,0
  706. adde $acc3,$acc3,$poly3
  707. and $t1,$poly1,$t0
  708. addze $ap,$t2 # ap = carry
  709. and $t3,$poly3,$t0
  710. subfc $acc0,$t0,$acc0 # a -= modulus if a was even
  711. subfe $acc1,$t1,$acc1
  712. subfe $acc2,$t2,$acc2
  713. subfe $acc3,$t3,$acc3
  714. subfe $ap, $t2,$ap
  715. srdi $acc0,$acc0,1
  716. sldi $t0,$acc1,63
  717. srdi $acc1,$acc1,1
  718. sldi $t1,$acc2,63
  719. srdi $acc2,$acc2,1
  720. sldi $t2,$acc3,63
  721. srdi $acc3,$acc3,1
  722. sldi $t3,$ap,63
  723. or $acc0,$acc0,$t0
  724. or $acc1,$acc1,$t1
  725. or $acc2,$acc2,$t2
  726. or $acc3,$acc3,$t3
  727. std $acc0,0($rp)
  728. std $acc1,8($rp)
  729. std $acc2,16($rp)
  730. std $acc3,24($rp)
  731. blr
  732. .long 0
  733. .byte 0,12,0x14,0,0,0,1,0
  734. .long 0
  735. .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
  736. ___
  737. ########################################################################
  738. # following subroutines are "literal" implementation of those found in
  739. # ecp_nistz256.c
  740. #
  741. ########################################################################
  742. # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
  743. #
  744. if (1) {
  745. my $FRAME=64+32*4+12*8;
  746. my ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3));
  747. # above map() describes stack layout with 4 temporary
  748. # 256-bit vectors on top.
  749. my ($rp_real,$ap_real) = map("r$_",(20,21));
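# A hedged sketch of the resulting frame (offsets from $sp right after the
# stdu below), assuming the usual PPC64 back-chain/reserved area:
#
#     0 ..  63   back chain / reserved
#    64 ..  95   S     (64+32*0)
#    96 .. 127   M     (64+32*1)
#   128 .. 159   Zsqr  (64+32*2)
#   160 .. 191   tmp0  (64+32*3)
#   192 .. 287   saved r20-r31 ($FRAME-8*12 .. $FRAME-1)
#
# The point_add and point_add_affine frames below follow the same pattern,
# with 12 and 10 temporaries respectively.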
  750. $code.=<<___;
  751. .globl ecp_nistz256_point_double
  752. .align 5
  753. ecp_nistz256_point_double:
  754. stdu $sp,-$FRAME($sp)
  755. mflr r0
  756. std r20,$FRAME-8*12($sp)
  757. std r21,$FRAME-8*11($sp)
  758. std r22,$FRAME-8*10($sp)
  759. std r23,$FRAME-8*9($sp)
  760. std r24,$FRAME-8*8($sp)
  761. std r25,$FRAME-8*7($sp)
  762. std r26,$FRAME-8*6($sp)
  763. std r27,$FRAME-8*5($sp)
  764. std r28,$FRAME-8*4($sp)
  765. std r29,$FRAME-8*3($sp)
  766. std r30,$FRAME-8*2($sp)
  767. std r31,$FRAME-8*1($sp)
  768. li $poly1,-1
  769. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  770. li $poly3,1
  771. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  772. .Ldouble_shortcut:
  773. ld $acc0,32($ap)
  774. ld $acc1,40($ap)
  775. ld $acc2,48($ap)
  776. ld $acc3,56($ap)
  777. mr $t0,$acc0
  778. mr $t1,$acc1
  779. mr $t2,$acc2
  780. mr $t3,$acc3
  781. ld $a0,64($ap) # forward load for p256_sqr_mont
  782. ld $a1,72($ap)
  783. ld $a2,80($ap)
  784. ld $a3,88($ap)
  785. mr $rp_real,$rp
  786. mr $ap_real,$ap
  787. addi $rp,$sp,$S
  788. bl __ecp_nistz256_add # p256_mul_by_2(S, in_y);
  789. addi $rp,$sp,$Zsqr
  790. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Zsqr, in_z);
  791. ld $t0,0($ap_real)
  792. ld $t1,8($ap_real)
  793. ld $t2,16($ap_real)
  794. ld $t3,24($ap_real)
  795. mr $a0,$acc0 # put Zsqr aside for p256_sub
  796. mr $a1,$acc1
  797. mr $a2,$acc2
  798. mr $a3,$acc3
  799. addi $rp,$sp,$M
  800. bl __ecp_nistz256_add # p256_add(M, Zsqr, in_x);
  801. addi $bp,$ap_real,0
  802. mr $acc0,$a0 # restore Zsqr
  803. mr $acc1,$a1
  804. mr $acc2,$a2
  805. mr $acc3,$a3
  806. ld $a0,$S+0($sp) # forward load for p256_sqr_mont
  807. ld $a1,$S+8($sp)
  808. ld $a2,$S+16($sp)
  809. ld $a3,$S+24($sp)
  810. addi $rp,$sp,$Zsqr
  811. bl __ecp_nistz256_sub_morf # p256_sub(Zsqr, in_x, Zsqr);
  812. addi $rp,$sp,$S
  813. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(S, S);
  814. ld $bi,32($ap_real)
  815. ld $a0,64($ap_real)
  816. ld $a1,72($ap_real)
  817. ld $a2,80($ap_real)
  818. ld $a3,88($ap_real)
  819. addi $bp,$ap_real,32
  820. addi $rp,$sp,$tmp0
  821. bl __ecp_nistz256_mul_mont # p256_mul_mont(tmp0, in_z, in_y);
  822. mr $t0,$acc0
  823. mr $t1,$acc1
  824. mr $t2,$acc2
  825. mr $t3,$acc3
  826. ld $a0,$S+0($sp) # forward load for p256_sqr_mont
  827. ld $a1,$S+8($sp)
  828. ld $a2,$S+16($sp)
  829. ld $a3,$S+24($sp)
  830. addi $rp,$rp_real,64
  831. bl __ecp_nistz256_add # p256_mul_by_2(res_z, tmp0);
  832. addi $rp,$sp,$tmp0
  833. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(tmp0, S);
  834. ld $bi,$Zsqr($sp) # forward load for p256_mul_mont
  835. ld $a0,$M+0($sp)
  836. ld $a1,$M+8($sp)
  837. ld $a2,$M+16($sp)
  838. ld $a3,$M+24($sp)
  839. addi $rp,$rp_real,32
  840. bl __ecp_nistz256_div_by_2 # p256_div_by_2(res_y, tmp0);
  841. addi $bp,$sp,$Zsqr
  842. addi $rp,$sp,$M
  843. bl __ecp_nistz256_mul_mont # p256_mul_mont(M, M, Zsqr);
  844. mr $t0,$acc0 # duplicate M
  845. mr $t1,$acc1
  846. mr $t2,$acc2
  847. mr $t3,$acc3
  848. mr $a0,$acc0 # put M aside
  849. mr $a1,$acc1
  850. mr $a2,$acc2
  851. mr $a3,$acc3
  852. addi $rp,$sp,$M
  853. bl __ecp_nistz256_add
  854. mr $t0,$a0 # restore M
  855. mr $t1,$a1
  856. mr $t2,$a2
  857. mr $t3,$a3
  858. ld $bi,0($ap_real) # forward load for p256_mul_mont
  859. ld $a0,$S+0($sp)
  860. ld $a1,$S+8($sp)
  861. ld $a2,$S+16($sp)
  862. ld $a3,$S+24($sp)
  863. bl __ecp_nistz256_add # p256_mul_by_3(M, M);
  864. addi $bp,$ap_real,0
  865. addi $rp,$sp,$S
  866. bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, in_x);
  867. mr $t0,$acc0
  868. mr $t1,$acc1
  869. mr $t2,$acc2
  870. mr $t3,$acc3
  871. ld $a0,$M+0($sp) # forward load for p256_sqr_mont
  872. ld $a1,$M+8($sp)
  873. ld $a2,$M+16($sp)
  874. ld $a3,$M+24($sp)
  875. addi $rp,$sp,$tmp0
  876. bl __ecp_nistz256_add # p256_mul_by_2(tmp0, S);
  877. addi $rp,$rp_real,0
  878. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(res_x, M);
  879. addi $bp,$sp,$tmp0
  880. bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, tmp0);
  881. addi $bp,$sp,$S
  882. addi $rp,$sp,$S
  883. bl __ecp_nistz256_sub_morf # p256_sub(S, S, res_x);
  884. ld $bi,$M($sp)
  885. mr $a0,$acc0 # copy S
  886. mr $a1,$acc1
  887. mr $a2,$acc2
  888. mr $a3,$acc3
  889. addi $bp,$sp,$M
  890. bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, M);
  891. addi $bp,$rp_real,32
  892. addi $rp,$rp_real,32
  893. bl __ecp_nistz256_sub_from # p256_sub(res_y, S, res_y);
  894. mtlr r0
  895. ld r20,$FRAME-8*12($sp)
  896. ld r21,$FRAME-8*11($sp)
  897. ld r22,$FRAME-8*10($sp)
  898. ld r23,$FRAME-8*9($sp)
  899. ld r24,$FRAME-8*8($sp)
  900. ld r25,$FRAME-8*7($sp)
  901. ld r26,$FRAME-8*6($sp)
  902. ld r27,$FRAME-8*5($sp)
  903. ld r28,$FRAME-8*4($sp)
  904. ld r29,$FRAME-8*3($sp)
  905. ld r30,$FRAME-8*2($sp)
  906. ld r31,$FRAME-8*1($sp)
  907. addi $sp,$sp,$FRAME
  908. blr
  909. .long 0
  910. .byte 0,12,4,0,0x80,12,2,0
  911. .long 0
  912. .size ecp_nistz256_point_double,.-ecp_nistz256_point_double
  913. ___
  914. }
  915. ########################################################################
  916. # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
  917. # const P256_POINT *in2);
  918. if (1) {
  919. my $FRAME = 64 + 32*12 + 16*8;
  920. my ($res_x,$res_y,$res_z,
  921. $H,$Hsqr,$R,$Rsqr,$Hcub,
  922. $U1,$U2,$S1,$S2)=map(64+32*$_,(0..11));
  923. my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
  924. # above map() describes stack layout with 12 temporary
  925. # 256-bit vectors on top.
  926. my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));
  927. $code.=<<___;
  928. .globl ecp_nistz256_point_add
  929. .align 5
  930. ecp_nistz256_point_add:
  931. stdu $sp,-$FRAME($sp)
  932. mflr r0
  933. std r16,$FRAME-8*16($sp)
  934. std r17,$FRAME-8*15($sp)
  935. std r18,$FRAME-8*14($sp)
  936. std r19,$FRAME-8*13($sp)
  937. std r20,$FRAME-8*12($sp)
  938. std r21,$FRAME-8*11($sp)
  939. std r22,$FRAME-8*10($sp)
  940. std r23,$FRAME-8*9($sp)
  941. std r24,$FRAME-8*8($sp)
  942. std r25,$FRAME-8*7($sp)
  943. std r26,$FRAME-8*6($sp)
  944. std r27,$FRAME-8*5($sp)
  945. std r28,$FRAME-8*4($sp)
  946. std r29,$FRAME-8*3($sp)
  947. std r30,$FRAME-8*2($sp)
  948. std r31,$FRAME-8*1($sp)
  949. li $poly1,-1
  950. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  951. li $poly3,1
  952. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  953. ld $a0,64($bp) # in2_z
  954. ld $a1,72($bp)
  955. ld $a2,80($bp)
  956. ld $a3,88($bp)
  957. mr $rp_real,$rp
  958. mr $ap_real,$ap
  959. mr $bp_real,$bp
  960. or $t0,$a0,$a1
  961. or $t2,$a2,$a3
  962. or $in2infty,$t0,$t2
  963. neg $t0,$in2infty
  964. or $in2infty,$in2infty,$t0
  965. sradi $in2infty,$in2infty,63 # !in2infty
  966. addi $rp,$sp,$Z2sqr
  967. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z2sqr, in2_z);
  968. ld $a0,64($ap_real) # in1_z
  969. ld $a1,72($ap_real)
  970. ld $a2,80($ap_real)
  971. ld $a3,88($ap_real)
  972. or $t0,$a0,$a1
  973. or $t2,$a2,$a3
  974. or $in1infty,$t0,$t2
  975. neg $t0,$in1infty
  976. or $in1infty,$in1infty,$t0
  977. sradi $in1infty,$in1infty,63 # !in1infty
  978. addi $rp,$sp,$Z1sqr
  979. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z);
  980. ld $bi,64($bp_real)
  981. ld $a0,$Z2sqr+0($sp)
  982. ld $a1,$Z2sqr+8($sp)
  983. ld $a2,$Z2sqr+16($sp)
  984. ld $a3,$Z2sqr+24($sp)
  985. addi $bp,$bp_real,64
  986. addi $rp,$sp,$S1
  987. bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, Z2sqr, in2_z);
  988. ld $bi,64($ap_real)
  989. ld $a0,$Z1sqr+0($sp)
  990. ld $a1,$Z1sqr+8($sp)
  991. ld $a2,$Z1sqr+16($sp)
  992. ld $a3,$Z1sqr+24($sp)
  993. addi $bp,$ap_real,64
  994. addi $rp,$sp,$S2
  995. bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z);
  996. ld $bi,32($ap_real)
  997. ld $a0,$S1+0($sp)
  998. ld $a1,$S1+8($sp)
  999. ld $a2,$S1+16($sp)
  1000. ld $a3,$S1+24($sp)
  1001. addi $bp,$ap_real,32
  1002. addi $rp,$sp,$S1
  1003. bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, S1, in1_y);
  1004. ld $bi,32($bp_real)
  1005. ld $a0,$S2+0($sp)
  1006. ld $a1,$S2+8($sp)
  1007. ld $a2,$S2+16($sp)
  1008. ld $a3,$S2+24($sp)
  1009. addi $bp,$bp_real,32
  1010. addi $rp,$sp,$S2
  1011. bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y);
  1012. addi $bp,$sp,$S1
  1013. ld $bi,$Z2sqr($sp) # forward load for p256_mul_mont
  1014. ld $a0,0($ap_real)
  1015. ld $a1,8($ap_real)
  1016. ld $a2,16($ap_real)
  1017. ld $a3,24($ap_real)
  1018. addi $rp,$sp,$R
  1019. bl __ecp_nistz256_sub_from # p256_sub(R, S2, S1);
  1020. or $acc0,$acc0,$acc1 # see if result is zero
  1021. or $acc2,$acc2,$acc3
  1022. or $temp,$acc0,$acc2
  1023. addi $bp,$sp,$Z2sqr
  1024. addi $rp,$sp,$U1
  1025. bl __ecp_nistz256_mul_mont # p256_mul_mont(U1, in1_x, Z2sqr);
  1026. ld $bi,$Z1sqr($sp)
  1027. ld $a0,0($bp_real)
  1028. ld $a1,8($bp_real)
  1029. ld $a2,16($bp_real)
  1030. ld $a3,24($bp_real)
  1031. addi $bp,$sp,$Z1sqr
  1032. addi $rp,$sp,$U2
  1033. bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in2_x, Z1sqr);
  1034. addi $bp,$sp,$U1
  1035. ld $a0,$R+0($sp) # forward load for p256_sqr_mont
  1036. ld $a1,$R+8($sp)
  1037. ld $a2,$R+16($sp)
  1038. ld $a3,$R+24($sp)
  1039. addi $rp,$sp,$H
  1040. bl __ecp_nistz256_sub_from # p256_sub(H, U2, U1);
  1041. or $acc0,$acc0,$acc1 # see if result is zero
  1042. or $acc2,$acc2,$acc3
  1043. or. $acc0,$acc0,$acc2
  1044. bne .Ladd_proceed # is_equal(U1,U2)?
  1045. and. $t0,$in1infty,$in2infty
  1046. beq .Ladd_proceed # (in1infty || in2infty)?
  1047. cmpldi $temp,0
  1048. beq .Ladd_double # is_equal(S1,S2)?
  1049. xor $a0,$a0,$a0
  1050. std $a0,0($rp_real)
  1051. std $a0,8($rp_real)
  1052. std $a0,16($rp_real)
  1053. std $a0,24($rp_real)
  1054. std $a0,32($rp_real)
  1055. std $a0,40($rp_real)
  1056. std $a0,48($rp_real)
  1057. std $a0,56($rp_real)
  1058. std $a0,64($rp_real)
  1059. std $a0,72($rp_real)
  1060. std $a0,80($rp_real)
  1061. std $a0,88($rp_real)
  1062. b .Ladd_done
  1063. .align 4
  1064. .Ladd_double:
  1065. ld $bp,0($sp) # back-link
  1066. mr $ap,$ap_real
  1067. mr $rp,$rp_real
  1068. ld r16,$FRAME-8*16($sp)
  1069. ld r17,$FRAME-8*15($sp)
  1070. ld r18,$FRAME-8*14($sp)
  1071. ld r19,$FRAME-8*13($sp)
  1072. stdu $bp,$FRAME-288($sp) # difference in stack frame sizes
  1073. b .Ldouble_shortcut
  1074. .align 4
  1075. .Ladd_proceed:
  1076. addi $rp,$sp,$Rsqr
  1077. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R);
  1078. ld $bi,64($ap_real)
  1079. ld $a0,$H+0($sp)
  1080. ld $a1,$H+8($sp)
  1081. ld $a2,$H+16($sp)
  1082. ld $a3,$H+24($sp)
  1083. addi $bp,$ap_real,64
  1084. addi $rp,$sp,$res_z
  1085. bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z);
  1086. ld $a0,$H+0($sp)
  1087. ld $a1,$H+8($sp)
  1088. ld $a2,$H+16($sp)
  1089. ld $a3,$H+24($sp)
  1090. addi $rp,$sp,$Hsqr
  1091. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H);
  1092. ld $bi,64($bp_real)
  1093. ld $a0,$res_z+0($sp)
  1094. ld $a1,$res_z+8($sp)
  1095. ld $a2,$res_z+16($sp)
  1096. ld $a3,$res_z+24($sp)
  1097. addi $bp,$bp_real,64
  1098. addi $rp,$sp,$res_z
  1099. bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, res_z, in2_z);
  1100. ld $bi,$H($sp)
  1101. ld $a0,$Hsqr+0($sp)
  1102. ld $a1,$Hsqr+8($sp)
  1103. ld $a2,$Hsqr+16($sp)
  1104. ld $a3,$Hsqr+24($sp)
  1105. addi $bp,$sp,$H
  1106. addi $rp,$sp,$Hcub
  1107. bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H);
  1108. ld $bi,$Hsqr($sp)
  1109. ld $a0,$U1+0($sp)
  1110. ld $a1,$U1+8($sp)
  1111. ld $a2,$U1+16($sp)
  1112. ld $a3,$U1+24($sp)
  1113. addi $bp,$sp,$Hsqr
  1114. addi $rp,$sp,$U2
  1115. bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, U1, Hsqr);
  1116. mr $t0,$acc0
  1117. mr $t1,$acc1
  1118. mr $t2,$acc2
  1119. mr $t3,$acc3
  1120. addi $rp,$sp,$Hsqr
  1121. bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2);
  1122. addi $bp,$sp,$Rsqr
  1123. addi $rp,$sp,$res_x
  1124. bl __ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr);
  1125. addi $bp,$sp,$Hcub
  1126. bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub);
  1127. addi $bp,$sp,$U2
  1128. ld $bi,$Hcub($sp) # forward load for p256_mul_mont
  1129. ld $a0,$S1+0($sp)
  1130. ld $a1,$S1+8($sp)
  1131. ld $a2,$S1+16($sp)
  1132. ld $a3,$S1+24($sp)
  1133. addi $rp,$sp,$res_y
  1134. bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x);
  1135. addi $bp,$sp,$Hcub
  1136. addi $rp,$sp,$S2
  1137. bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S1, Hcub);
  1138. ld $bi,$R($sp)
  1139. ld $a0,$res_y+0($sp)
  1140. ld $a1,$res_y+8($sp)
  1141. ld $a2,$res_y+16($sp)
  1142. ld $a3,$res_y+24($sp)
  1143. addi $bp,$sp,$R
  1144. addi $rp,$sp,$res_y
  1145. bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R);
  1146. addi $bp,$sp,$S2
  1147. bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2);
  1148. ld $t0,0($bp_real) # in2
  1149. ld $t1,8($bp_real)
  1150. ld $t2,16($bp_real)
  1151. ld $t3,24($bp_real)
  1152. ld $a0,$res_x+0($sp) # res
  1153. ld $a1,$res_x+8($sp)
  1154. ld $a2,$res_x+16($sp)
  1155. ld $a3,$res_x+24($sp)
  1156. ___
  1157. for($i=0;$i<64;$i+=32) { # conditional moves
  1158. $code.=<<___;
  1159. ld $acc0,$i+0($ap_real) # in1
  1160. ld $acc1,$i+8($ap_real)
  1161. ld $acc2,$i+16($ap_real)
  1162. ld $acc3,$i+24($ap_real)
  1163. andc $t0,$t0,$in1infty
  1164. andc $t1,$t1,$in1infty
  1165. andc $t2,$t2,$in1infty
  1166. andc $t3,$t3,$in1infty
  1167. and $a0,$a0,$in1infty
  1168. and $a1,$a1,$in1infty
  1169. and $a2,$a2,$in1infty
  1170. and $a3,$a3,$in1infty
  1171. or $t0,$t0,$a0
  1172. or $t1,$t1,$a1
  1173. or $t2,$t2,$a2
  1174. or $t3,$t3,$a3
  1175. andc $acc0,$acc0,$in2infty
  1176. andc $acc1,$acc1,$in2infty
  1177. andc $acc2,$acc2,$in2infty
  1178. andc $acc3,$acc3,$in2infty
  1179. and $t0,$t0,$in2infty
  1180. and $t1,$t1,$in2infty
  1181. and $t2,$t2,$in2infty
  1182. and $t3,$t3,$in2infty
  1183. or $acc0,$acc0,$t0
  1184. or $acc1,$acc1,$t1
  1185. or $acc2,$acc2,$t2
  1186. or $acc3,$acc3,$t3
  1187. ld $t0,$i+32($bp_real) # in2
  1188. ld $t1,$i+40($bp_real)
  1189. ld $t2,$i+48($bp_real)
  1190. ld $t3,$i+56($bp_real)
  1191. ld $a0,$res_x+$i+32($sp)
  1192. ld $a1,$res_x+$i+40($sp)
  1193. ld $a2,$res_x+$i+48($sp)
  1194. ld $a3,$res_x+$i+56($sp)
  1195. std $acc0,$i+0($rp_real)
  1196. std $acc1,$i+8($rp_real)
  1197. std $acc2,$i+16($rp_real)
  1198. std $acc3,$i+24($rp_real)
  1199. ___
  1200. }
  1201. $code.=<<___;
  1202. ld $acc0,$i+0($ap_real) # in1
  1203. ld $acc1,$i+8($ap_real)
  1204. ld $acc2,$i+16($ap_real)
  1205. ld $acc3,$i+24($ap_real)
  1206. andc $t0,$t0,$in1infty
  1207. andc $t1,$t1,$in1infty
  1208. andc $t2,$t2,$in1infty
  1209. andc $t3,$t3,$in1infty
  1210. and $a0,$a0,$in1infty
  1211. and $a1,$a1,$in1infty
  1212. and $a2,$a2,$in1infty
  1213. and $a3,$a3,$in1infty
  1214. or $t0,$t0,$a0
  1215. or $t1,$t1,$a1
  1216. or $t2,$t2,$a2
  1217. or $t3,$t3,$a3
  1218. andc $acc0,$acc0,$in2infty
  1219. andc $acc1,$acc1,$in2infty
  1220. andc $acc2,$acc2,$in2infty
  1221. andc $acc3,$acc3,$in2infty
  1222. and $t0,$t0,$in2infty
  1223. and $t1,$t1,$in2infty
  1224. and $t2,$t2,$in2infty
  1225. and $t3,$t3,$in2infty
  1226. or $acc0,$acc0,$t0
  1227. or $acc1,$acc1,$t1
  1228. or $acc2,$acc2,$t2
  1229. or $acc3,$acc3,$t3
  1230. std $acc0,$i+0($rp_real)
  1231. std $acc1,$i+8($rp_real)
  1232. std $acc2,$i+16($rp_real)
  1233. std $acc3,$i+24($rp_real)
  1234. .Ladd_done:
  1235. mtlr r0
  1236. ld r16,$FRAME-8*16($sp)
  1237. ld r17,$FRAME-8*15($sp)
  1238. ld r18,$FRAME-8*14($sp)
  1239. ld r19,$FRAME-8*13($sp)
  1240. ld r20,$FRAME-8*12($sp)
  1241. ld r21,$FRAME-8*11($sp)
  1242. ld r22,$FRAME-8*10($sp)
  1243. ld r23,$FRAME-8*9($sp)
  1244. ld r24,$FRAME-8*8($sp)
  1245. ld r25,$FRAME-8*7($sp)
  1246. ld r26,$FRAME-8*6($sp)
  1247. ld r27,$FRAME-8*5($sp)
  1248. ld r28,$FRAME-8*4($sp)
  1249. ld r29,$FRAME-8*3($sp)
  1250. ld r30,$FRAME-8*2($sp)
  1251. ld r31,$FRAME-8*1($sp)
  1252. addi $sp,$sp,$FRAME
  1253. blr
  1254. .long 0
  1255. .byte 0,12,4,0,0x80,16,3,0
  1256. .long 0
  1257. .size ecp_nistz256_point_add,.-ecp_nistz256_point_add
  1258. ___
  1259. }
  1260. ########################################################################
  1261. # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
  1262. # const P256_POINT_AFFINE *in2);
  1263. if (1) {
  1264. my $FRAME = 64 + 32*10 + 16*8;
  1265. my ($res_x,$res_y,$res_z,
  1266. $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9));
  1267. my $Z1sqr = $S2;
  1268. # above map() describes stack layout with 10 temporary
  1269. # 256-bit vectors on top.
  1270. my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));
  1271. $code.=<<___;
  1272. .globl ecp_nistz256_point_add_affine
  1273. .align 5
  1274. ecp_nistz256_point_add_affine:
  1275. stdu $sp,-$FRAME($sp)
  1276. mflr r0
  1277. std r16,$FRAME-8*16($sp)
  1278. std r17,$FRAME-8*15($sp)
  1279. std r18,$FRAME-8*14($sp)
  1280. std r19,$FRAME-8*13($sp)
  1281. std r20,$FRAME-8*12($sp)
  1282. std r21,$FRAME-8*11($sp)
  1283. std r22,$FRAME-8*10($sp)
  1284. std r23,$FRAME-8*9($sp)
  1285. std r24,$FRAME-8*8($sp)
  1286. std r25,$FRAME-8*7($sp)
  1287. std r26,$FRAME-8*6($sp)
  1288. std r27,$FRAME-8*5($sp)
  1289. std r28,$FRAME-8*4($sp)
  1290. std r29,$FRAME-8*3($sp)
  1291. std r30,$FRAME-8*2($sp)
  1292. std r31,$FRAME-8*1($sp)
  1293. li $poly1,-1
  1294. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  1295. li $poly3,1
  1296. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  1297. mr $rp_real,$rp
  1298. mr $ap_real,$ap
  1299. mr $bp_real,$bp
  1300. ld $a0,64($ap) # in1_z
  1301. ld $a1,72($ap)
  1302. ld $a2,80($ap)
  1303. ld $a3,88($ap)
  1304. or $t0,$a0,$a1
  1305. or $t2,$a2,$a3
  1306. or $in1infty,$t0,$t2
  1307. neg $t0,$in1infty
  1308. or $in1infty,$in1infty,$t0
  1309. sradi $in1infty,$in1infty,63 # !in1infty
  1310. ld $acc0,0($bp) # in2_x
  1311. ld $acc1,8($bp)
  1312. ld $acc2,16($bp)
  1313. ld $acc3,24($bp)
  1314. ld $t0,32($bp) # in2_y
  1315. ld $t1,40($bp)
  1316. ld $t2,48($bp)
  1317. ld $t3,56($bp)
  1318. or $acc0,$acc0,$acc1
  1319. or $acc2,$acc2,$acc3
  1320. or $acc0,$acc0,$acc2
  1321. or $t0,$t0,$t1
  1322. or $t2,$t2,$t3
  1323. or $t0,$t0,$t2
  1324. or $in2infty,$acc0,$t0
  1325. neg $t0,$in2infty
  1326. or $in2infty,$in2infty,$t0
  1327. sradi $in2infty,$in2infty,63 # !in2infty
  1328. addi $rp,$sp,$Z1sqr
  1329. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z);
  1330. mr $a0,$acc0
  1331. mr $a1,$acc1
  1332. mr $a2,$acc2
  1333. mr $a3,$acc3
  1334. ld $bi,0($bp_real)
  1335. addi $bp,$bp_real,0
  1336. addi $rp,$sp,$U2
  1337. bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, Z1sqr, in2_x);
  1338. addi $bp,$ap_real,0
  1339. ld $bi,64($ap_real) # forward load for p256_mul_mont
  1340. ld $a0,$Z1sqr+0($sp)
  1341. ld $a1,$Z1sqr+8($sp)
  1342. ld $a2,$Z1sqr+16($sp)
  1343. ld $a3,$Z1sqr+24($sp)
  1344. addi $rp,$sp,$H
  1345. bl __ecp_nistz256_sub_from # p256_sub(H, U2, in1_x);
  1346. addi $bp,$ap_real,64
  1347. addi $rp,$sp,$S2
  1348. bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z);
  1349. ld $bi,64($ap_real)
  1350. ld $a0,$H+0($sp)
  1351. ld $a1,$H+8($sp)
  1352. ld $a2,$H+16($sp)
  1353. ld $a3,$H+24($sp)
  1354. addi $bp,$ap_real,64
  1355. addi $rp,$sp,$res_z
  1356. bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z);
  1357. ld $bi,32($bp_real)
  1358. ld $a0,$S2+0($sp)
  1359. ld $a1,$S2+8($sp)
  1360. ld $a2,$S2+16($sp)
  1361. ld $a3,$S2+24($sp)
  1362. addi $bp,$bp_real,32
  1363. addi $rp,$sp,$S2
  1364. bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y);
  1365. addi $bp,$ap_real,32
  1366. ld $a0,$H+0($sp) # forward load for p256_sqr_mont
  1367. ld $a1,$H+8($sp)
  1368. ld $a2,$H+16($sp)
  1369. ld $a3,$H+24($sp)
  1370. addi $rp,$sp,$R
  1371. bl __ecp_nistz256_sub_from # p256_sub(R, S2, in1_y);
  1372. addi $rp,$sp,$Hsqr
  1373. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H);
  1374. ld $a0,$R+0($sp)
  1375. ld $a1,$R+8($sp)
  1376. ld $a2,$R+16($sp)
  1377. ld $a3,$R+24($sp)
  1378. addi $rp,$sp,$Rsqr
  1379. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R);
  1380. ld $bi,$H($sp)
  1381. ld $a0,$Hsqr+0($sp)
  1382. ld $a1,$Hsqr+8($sp)
  1383. ld $a2,$Hsqr+16($sp)
  1384. ld $a3,$Hsqr+24($sp)
  1385. addi $bp,$sp,$H
  1386. addi $rp,$sp,$Hcub
  1387. bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H);
  1388. ld $bi,0($ap_real)
  1389. ld $a0,$Hsqr+0($sp)
  1390. ld $a1,$Hsqr+8($sp)
  1391. ld $a2,$Hsqr+16($sp)
  1392. ld $a3,$Hsqr+24($sp)
  1393. addi $bp,$ap_real,0
  1394. addi $rp,$sp,$U2
  1395. bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in1_x, Hsqr);
  1396. mr $t0,$acc0
  1397. mr $t1,$acc1
  1398. mr $t2,$acc2
  1399. mr $t3,$acc3
  1400. addi $rp,$sp,$Hsqr
  1401. bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2);
  1402. addi $bp,$sp,$Rsqr
  1403. addi $rp,$sp,$res_x
  1404. bl __ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr);
  1405. addi $bp,$sp,$Hcub
  1406. bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub);
  1407. addi $bp,$sp,$U2
  1408. ld $bi,32($ap_real) # forward load for p256_mul_mont
  1409. ld $a0,$Hcub+0($sp)
  1410. ld $a1,$Hcub+8($sp)
  1411. ld $a2,$Hcub+16($sp)
  1412. ld $a3,$Hcub+24($sp)
  1413. addi $rp,$sp,$res_y
  1414. bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x);
  1415. addi $bp,$ap_real,32
  1416. addi $rp,$sp,$S2
  1417. bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, in1_y, Hcub);
  1418. ld $bi,$R($sp)
  1419. ld $a0,$res_y+0($sp)
  1420. ld $a1,$res_y+8($sp)
  1421. ld $a2,$res_y+16($sp)
  1422. ld $a3,$res_y+24($sp)
  1423. addi $bp,$sp,$R
  1424. addi $rp,$sp,$res_y
  1425. bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R);
  1426. addi $bp,$sp,$S2
  1427. bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2);
  1428. ld $t0,0($bp_real) # in2
  1429. ld $t1,8($bp_real)
  1430. ld $t2,16($bp_real)
  1431. ld $t3,24($bp_real)
  1432. ld $a0,$res_x+0($sp) # res
  1433. ld $a1,$res_x+8($sp)
  1434. ld $a2,$res_x+16($sp)
  1435. ld $a3,$res_x+24($sp)
  1436. ___
  1437. for($i=0;$i<64;$i+=32) { # conditional moves
  1438. $code.=<<___;
  1439. ld $acc0,$i+0($ap_real) # in1
  1440. ld $acc1,$i+8($ap_real)
  1441. ld $acc2,$i+16($ap_real)
  1442. ld $acc3,$i+24($ap_real)
  1443. andc $t0,$t0,$in1infty
  1444. andc $t1,$t1,$in1infty
  1445. andc $t2,$t2,$in1infty
  1446. andc $t3,$t3,$in1infty
  1447. and $a0,$a0,$in1infty
  1448. and $a1,$a1,$in1infty
  1449. and $a2,$a2,$in1infty
  1450. and $a3,$a3,$in1infty
  1451. or $t0,$t0,$a0
  1452. or $t1,$t1,$a1
  1453. or $t2,$t2,$a2
  1454. or $t3,$t3,$a3
  1455. andc $acc0,$acc0,$in2infty
  1456. andc $acc1,$acc1,$in2infty
  1457. andc $acc2,$acc2,$in2infty
  1458. andc $acc3,$acc3,$in2infty
  1459. and $t0,$t0,$in2infty
  1460. and $t1,$t1,$in2infty
  1461. and $t2,$t2,$in2infty
  1462. and $t3,$t3,$in2infty
  1463. or $acc0,$acc0,$t0
  1464. or $acc1,$acc1,$t1
  1465. or $acc2,$acc2,$t2
  1466. or $acc3,$acc3,$t3
  1467. ___
  1468. $code.=<<___ if ($i==0);
  1469. ld $t0,32($bp_real) # in2
  1470. ld $t1,40($bp_real)
  1471. ld $t2,48($bp_real)
  1472. ld $t3,56($bp_real)
  1473. ___
  1474. $code.=<<___ if ($i==32);
  1475. li $t0,1 # Lone_mont
  1476. not $t1,$poly1
  1477. li $t2,-1
  1478. not $t3,$poly3
  1479. ___
  1480. $code.=<<___;
  1481. ld $a0,$res_x+$i+32($sp)
  1482. ld $a1,$res_x+$i+40($sp)
  1483. ld $a2,$res_x+$i+48($sp)
  1484. ld $a3,$res_x+$i+56($sp)
  1485. std $acc0,$i+0($rp_real)
  1486. std $acc1,$i+8($rp_real)
  1487. std $acc2,$i+16($rp_real)
  1488. std $acc3,$i+24($rp_real)
  1489. ___
  1490. }
$code.=<<___;
	ld $acc0,$i+0($ap_real) # in1
	ld $acc1,$i+8($ap_real)
	ld $acc2,$i+16($ap_real)
	ld $acc3,$i+24($ap_real)
	andc $t0,$t0,$in1infty
	andc $t1,$t1,$in1infty
	andc $t2,$t2,$in1infty
	andc $t3,$t3,$in1infty
	and $a0,$a0,$in1infty
	and $a1,$a1,$in1infty
	and $a2,$a2,$in1infty
	and $a3,$a3,$in1infty
	or $t0,$t0,$a0
	or $t1,$t1,$a1
	or $t2,$t2,$a2
	or $t3,$t3,$a3
	andc $acc0,$acc0,$in2infty
	andc $acc1,$acc1,$in2infty
	andc $acc2,$acc2,$in2infty
	andc $acc3,$acc3,$in2infty
	and $t0,$t0,$in2infty
	and $t1,$t1,$in2infty
	and $t2,$t2,$in2infty
	and $t3,$t3,$in2infty
	or $acc0,$acc0,$t0
	or $acc1,$acc1,$t1
	or $acc2,$acc2,$t2
	or $acc3,$acc3,$t3
	std $acc0,$i+0($rp_real)
	std $acc1,$i+8($rp_real)
	std $acc2,$i+16($rp_real)
	std $acc3,$i+24($rp_real)
	mtlr r0
	ld r16,$FRAME-8*16($sp)
	ld r17,$FRAME-8*15($sp)
	ld r18,$FRAME-8*14($sp)
	ld r19,$FRAME-8*13($sp)
	ld r20,$FRAME-8*12($sp)
	ld r21,$FRAME-8*11($sp)
	ld r22,$FRAME-8*10($sp)
	ld r23,$FRAME-8*9($sp)
	ld r24,$FRAME-8*8($sp)
	ld r25,$FRAME-8*7($sp)
	ld r26,$FRAME-8*6($sp)
	ld r27,$FRAME-8*5($sp)
	ld r28,$FRAME-8*4($sp)
	ld r29,$FRAME-8*3($sp)
	ld r30,$FRAME-8*2($sp)
	ld r31,$FRAME-8*1($sp)
	addi $sp,$sp,$FRAME
	blr
	.long 0
	.byte 0,12,4,0,0x80,16,3,0
	.long 0
.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
if (1) {
my ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21));
my ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0");

$code.=<<___;
########################################################################
# void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
#                                uint64_t b[4]);
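#
# Computes the Montgomery product res = a*b*2^-256 mod n, where n is the
# group order of the P-256 curve. The 64-bit constant 0xccd1c8aaee00bc4f
# loaded below is -n^-1 mod 2^64, the per-word Montgomery reduction
# factor.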
.globl ecp_nistz256_ord_mul_mont
.align 5
ecp_nistz256_ord_mul_mont:
	stdu $sp,-160($sp)
	std r18,48($sp)
	std r19,56($sp)
	std r20,64($sp)
	std r21,72($sp)
	std r22,80($sp)
	std r23,88($sp)
	std r24,96($sp)
	std r25,104($sp)
	std r26,112($sp)
	std r27,120($sp)
	std r28,128($sp)
	std r29,136($sp)
	std r30,144($sp)
	std r31,152($sp)
	ld $a0,0($ap)
	ld $bi,0($bp)
	ld $a1,8($ap)
	ld $a2,16($ap)
	ld $a3,24($ap)
	lis $ordk,0xccd1
	lis $ord0,0xf3b9
	lis $ord1,0xbce6
	ori $ordk,$ordk,0xc8aa
	ori $ord0,$ord0,0xcac2
	ori $ord1,$ord1,0xfaad
	sldi $ordk,$ordk,32
	sldi $ord0,$ord0,32
	sldi $ord1,$ord1,32
	oris $ordk,$ordk,0xee00
	oris $ord0,$ord0,0xfc63
	oris $ord1,$ord1,0xa717
	ori $ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f
	ori $ord0,$ord0,0x2551 # 0xf3b9cac2fc632551
	ori $ord1,$ord1,0x9e84 # 0xbce6faada7179e84
	li $ord2,-1 # 0xffffffffffffffff
	sldi $ord3,$ord2,32 # 0xffffffff00000000
	li $zr,0
	mulld $acc0,$a0,$bi # a[0]*b[0]
	mulhdu $t0,$a0,$bi
	mulld $acc1,$a1,$bi # a[1]*b[0]
	mulhdu $t1,$a1,$bi
	mulld $acc2,$a2,$bi # a[2]*b[0]
	mulhdu $t2,$a2,$bi
	mulld $acc3,$a3,$bi # a[3]*b[0]
	mulhdu $acc4,$a3,$bi
	mulld $t4,$acc0,$ordk
	addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
	adde $acc2,$acc2,$t1
	adde $acc3,$acc3,$t2
	addze $acc4,$acc4
	li $acc5,0
___
for ($i=1;$i<4;$i++) {
	################################################################
	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
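	#
	# In other words, each iteration picks $t4 = $acc0*$ordk mod 2^64 so
	# that acc + $t4*n becomes divisible by 2^64 (one step of word-by-word
	# Montgomery reduction), adds $t4*n, drops the now-zero low limb, and
	# accumulates a[0..3]*b[i] on top. The subfc/subfe group together with
	# the later "adde ...,$t4" pair implements the contribution of the two
	# topmost limbs of n via the 2^n*x-x identity above; the mulld/mulhdu
	# on $ord0/$ord1 handle the two irregular low limbs. A rough high-level
	# sketch of one iteration (not part of the generated code, names are
	# illustrative):
	#
	#   t4   = (acc0 * ordk) mod 2^64;   # makes the low limb cancel
	#   acc += t4 * n;                   # acc mod 2^64 is now 0
	#   acc  = acc >> 64;                # drop the zero limb
	#   acc += b[i] * a;                 # next column of the product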
$code.=<<___;
	ld $bi,8*$i($bp) # b[i]
	sldi $t0,$t4,32
	subfc $acc2,$t4,$acc2
	srdi $t1,$t4,32
	subfe $acc3,$t0,$acc3
	subfe $acc4,$t1,$acc4
	subfe $acc5,$zr,$acc5
	addic $t0,$acc0,-1 # discarded
	mulhdu $t1,$ord0,$t4
	mulld $t2,$ord1,$t4
	mulhdu $t3,$ord1,$t4
	adde $t2,$t2,$t1
	mulld $t0,$a0,$bi
	addze $t3,$t3
	mulld $t1,$a1,$bi
	addc $acc0,$acc1,$t2
	mulld $t2,$a2,$bi
	adde $acc1,$acc2,$t3
	mulld $t3,$a3,$bi
	adde $acc2,$acc3,$t4
	adde $acc3,$acc4,$t4
	addze $acc4,$acc5
	addc $acc0,$acc0,$t0 # accumulate low parts
	mulhdu $t0,$a0,$bi
	adde $acc1,$acc1,$t1
	mulhdu $t1,$a1,$bi
	adde $acc2,$acc2,$t2
	mulhdu $t2,$a2,$bi
	adde $acc3,$acc3,$t3
	mulhdu $t3,$a3,$bi
	addze $acc4,$acc4
	mulld $t4,$acc0,$ordk
	addc $acc1,$acc1,$t0 # accumulate high parts
	adde $acc2,$acc2,$t1
	adde $acc3,$acc3,$t2
	adde $acc4,$acc4,$t3
	addze $acc5,$zr
___
}
$code.=<<___;
	sldi $t0,$t4,32 # last reduction
	subfc $acc2,$t4,$acc2
	srdi $t1,$t4,32
	subfe $acc3,$t0,$acc3
	subfe $acc4,$t1,$acc4
	subfe $acc5,$zr,$acc5
	addic $t0,$acc0,-1 # discarded
	mulhdu $t1,$ord0,$t4
	mulld $t2,$ord1,$t4
	mulhdu $t3,$ord1,$t4
	adde $t2,$t2,$t1
	addze $t3,$t3
	addc $acc0,$acc1,$t2
	adde $acc1,$acc2,$t3
	adde $acc2,$acc3,$t4
	adde $acc3,$acc4,$t4
	addze $acc4,$acc5
	subfc $acc0,$ord0,$acc0 # ret -= modulus
	subfe $acc1,$ord1,$acc1
	subfe $acc2,$ord2,$acc2
	subfe $acc3,$ord3,$acc3
	subfe $acc4,$zr,$acc4
	and $t0,$ord0,$acc4
	and $t1,$ord1,$acc4
	addc $acc0,$acc0,$t0 # ret += modulus if borrow
	and $t3,$ord3,$acc4
	adde $acc1,$acc1,$t1
	adde $acc2,$acc2,$acc4
	adde $acc3,$acc3,$t3
	std $acc0,0($rp)
	std $acc1,8($rp)
	std $acc2,16($rp)
	std $acc3,24($rp)
	ld r18,48($sp)
	ld r19,56($sp)
	ld r20,64($sp)
	ld r21,72($sp)
	ld r22,80($sp)
	ld r23,88($sp)
	ld r24,96($sp)
	ld r25,104($sp)
	ld r26,112($sp)
	ld r27,120($sp)
	ld r28,128($sp)
	ld r29,136($sp)
	ld r30,144($sp)
	ld r31,152($sp)
	addi $sp,$sp,160
	blr
	.long 0
	.byte 0,12,4,0,0x80,14,3,0
	.long 0
.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
################################################################################
# void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
#                                uint64_t rep);
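#
# Performs rep consecutive Montgomery squarings of a modulo the group
# order n: for an input in Montgomery form the result is its rep-fold
# squaring, still in Montgomery form.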
.globl ecp_nistz256_ord_sqr_mont
.align 5
ecp_nistz256_ord_sqr_mont:
	stdu $sp,-160($sp)
	std r18,48($sp)
	std r19,56($sp)
	std r20,64($sp)
	std r21,72($sp)
	std r22,80($sp)
	std r23,88($sp)
	std r24,96($sp)
	std r25,104($sp)
	std r26,112($sp)
	std r27,120($sp)
	std r28,128($sp)
	std r29,136($sp)
	std r30,144($sp)
	std r31,152($sp)
	mtctr $bp
	ld $a0,0($ap)
	ld $a1,8($ap)
	ld $a2,16($ap)
	ld $a3,24($ap)
	lis $ordk,0xccd1
	lis $ord0,0xf3b9
	lis $ord1,0xbce6
	ori $ordk,$ordk,0xc8aa
	ori $ord0,$ord0,0xcac2
	ori $ord1,$ord1,0xfaad
	sldi $ordk,$ordk,32
	sldi $ord0,$ord0,32
	sldi $ord1,$ord1,32
	oris $ordk,$ordk,0xee00
	oris $ord0,$ord0,0xfc63
	oris $ord1,$ord1,0xa717
	ori $ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f
	ori $ord0,$ord0,0x2551 # 0xf3b9cac2fc632551
	ori $ord1,$ord1,0x9e84 # 0xbce6faada7179e84
	li $ord2,-1 # 0xffffffffffffffff
	sldi $ord3,$ord2,32 # 0xffffffff00000000
	li $zr,0
	b .Loop_ord_sqr
.align 5
.Loop_ord_sqr:
	################################################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |  |a2*a0|  |  |
	#  |  |a3*a2|a3*a0|  |  |  |
	#  |  |  |  |a2*a1|  |  |  |
	#  |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
	# "can't overflow" below marks carries into the high part of a
	# multiplication result, which cannot overflow because the high
	# part can never be all ones.
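	# (The high 64 bits of a 64x64-bit product are at most
	# 0xfffffffffffffffe, so adding a single carry bit to them can
	# never wrap around.)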
	mulld $acc1,$a1,$a0 # a[1]*a[0]
	mulhdu $t1,$a1,$a0
	mulld $acc2,$a2,$a0 # a[2]*a[0]
	mulhdu $t2,$a2,$a0
	mulld $acc3,$a3,$a0 # a[3]*a[0]
	mulhdu $acc4,$a3,$a0
	addc $acc2,$acc2,$t1 # accumulate high parts of multiplication
	mulld $t0,$a2,$a1 # a[2]*a[1]
	mulhdu $t1,$a2,$a1
	adde $acc3,$acc3,$t2
	mulld $t2,$a3,$a1 # a[3]*a[1]
	mulhdu $t3,$a3,$a1
	addze $acc4,$acc4 # can't overflow
	mulld $acc5,$a3,$a2 # a[3]*a[2]
	mulhdu $acc6,$a3,$a2
	addc $t1,$t1,$t2 # accumulate high parts of multiplication
	mulld $acc0,$a0,$a0 # a[0]*a[0]
	addze $t2,$t3 # can't overflow
	addc $acc3,$acc3,$t0 # accumulate low parts of multiplication
	mulhdu $a0,$a0,$a0
	adde $acc4,$acc4,$t1
	mulld $t1,$a1,$a1 # a[1]*a[1]
	adde $acc5,$acc5,$t2
	mulhdu $a1,$a1,$a1
	addze $acc6,$acc6 # can't overflow
	addc $acc1,$acc1,$acc1 # acc[1-6]*=2
	mulld $t2,$a2,$a2 # a[2]*a[2]
	adde $acc2,$acc2,$acc2
	mulhdu $a2,$a2,$a2
	adde $acc3,$acc3,$acc3
	mulld $t3,$a3,$a3 # a[3]*a[3]
	adde $acc4,$acc4,$acc4
	mulhdu $a3,$a3,$a3
	adde $acc5,$acc5,$acc5
	adde $acc6,$acc6,$acc6
	addze $acc7,$zr
	addc $acc1,$acc1,$a0 # +a[i]*a[i]
	mulld $t4,$acc0,$ordk
	adde $acc2,$acc2,$t1
	adde $acc3,$acc3,$a1
	adde $acc4,$acc4,$t2
	adde $acc5,$acc5,$a2
	adde $acc6,$acc6,$t3
	adde $acc7,$acc7,$a3
___
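# Four word-by-word Montgomery reduction steps, same principle as in
# ecp_nistz256_ord_mul_mont above, applied to the low half of the 512-bit
# square; the upper half (acc4-acc7) is folded in afterwards.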
for($i=0; $i<4; $i++) { # reductions
$code.=<<___;
	addic $t0,$acc0,-1 # discarded
	mulhdu $t1,$ord0,$t4
	mulld $t2,$ord1,$t4
	mulhdu $t3,$ord1,$t4
	adde $t2,$t2,$t1
	addze $t3,$t3
	addc $acc0,$acc1,$t2
	adde $acc1,$acc2,$t3
	adde $acc2,$acc3,$t4
	adde $acc3,$zr,$t4 # can't overflow
___
$code.=<<___ if ($i<3);
	mulld $t3,$acc0,$ordk
___
$code.=<<___;
	sldi $t0,$t4,32
	subfc $acc1,$t4,$acc1
	srdi $t1,$t4,32
	subfe $acc2,$t0,$acc2
	subfe $acc3,$t1,$acc3 # can't borrow
___
($t3,$t4) = ($t4,$t3);
}
$code.=<<___;
	addc $acc0,$acc0,$acc4 # accumulate upper half
	adde $acc1,$acc1,$acc5
	adde $acc2,$acc2,$acc6
	adde $acc3,$acc3,$acc7
	addze $acc4,$zr
	subfc $acc0,$ord0,$acc0 # ret -= modulus
	subfe $acc1,$ord1,$acc1
	subfe $acc2,$ord2,$acc2
	subfe $acc3,$ord3,$acc3
	subfe $acc4,$zr,$acc4
	and $t0,$ord0,$acc4
	and $t1,$ord1,$acc4
	addc $a0,$acc0,$t0 # ret += modulus if borrow
	and $t3,$ord3,$acc4
	adde $a1,$acc1,$t1
	adde $a2,$acc2,$acc4
	adde $a3,$acc3,$t3
	bdnz .Loop_ord_sqr
	std $a0,0($rp)
	std $a1,8($rp)
	std $a2,16($rp)
	std $a3,24($rp)
	ld r18,48($sp)
	ld r19,56($sp)
	ld r20,64($sp)
	ld r21,72($sp)
	ld r22,80($sp)
	ld r23,88($sp)
	ld r24,96($sp)
	ld r25,104($sp)
	ld r26,112($sp)
	ld r27,120($sp)
	ld r28,128($sp)
	ld r29,136($sp)
	ld r30,144($sp)
	ld r31,152($sp)
	addi $sp,$sp,160
	blr
	.long 0
	.byte 0,12,4,0,0x80,14,3,0
	.long 0
.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
} }
########################################################################
# scatter-gather subroutines
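#
# The lookup tables are stored interleaved, so that one table entry
# occupies a narrow lane within each 64-byte row: scatter_w5 splits every
# 64-bit coordinate word of a projective point into two 32-bit halves and
# stores them 64 bytes apart, while scatter_w7 stores affine points one
# byte per row. The matching gather routines reassemble an entry from its
# lanes and return all zeroes when the requested index is 0 (the point at
# infinity).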
{
my ($out,$inp,$index,$mask)=map("r$_",(3..7));

$code.=<<___;
########################################################################
# void ecp_nistz256_scatter_w5(void *out, const P256_POINT *inp,
#                              int index);
.globl ecp_nistz256_scatter_w5
.align 4
ecp_nistz256_scatter_w5:
	slwi $index,$index,2
	add $out,$out,$index
	ld r8, 0($inp) # X
	ld r9, 8($inp)
	ld r10,16($inp)
	ld r11,24($inp)
	stw r8, 64*0-4($out)
	srdi r8, r8, 32
	stw r9, 64*1-4($out)
	srdi r9, r9, 32
	stw r10,64*2-4($out)
	srdi r10,r10,32
	stw r11,64*3-4($out)
	srdi r11,r11,32
	stw r8, 64*4-4($out)
	stw r9, 64*5-4($out)
	stw r10,64*6-4($out)
	stw r11,64*7-4($out)
	addi $out,$out,64*8
	ld r8, 32($inp) # Y
	ld r9, 40($inp)
	ld r10,48($inp)
	ld r11,56($inp)
	stw r8, 64*0-4($out)
	srdi r8, r8, 32
	stw r9, 64*1-4($out)
	srdi r9, r9, 32
	stw r10,64*2-4($out)
	srdi r10,r10,32
	stw r11,64*3-4($out)
	srdi r11,r11,32
	stw r8, 64*4-4($out)
	stw r9, 64*5-4($out)
	stw r10,64*6-4($out)
	stw r11,64*7-4($out)
	addi $out,$out,64*8
	ld r8, 64($inp) # Z
	ld r9, 72($inp)
	ld r10,80($inp)
	ld r11,88($inp)
	stw r8, 64*0-4($out)
	srdi r8, r8, 32
	stw r9, 64*1-4($out)
	srdi r9, r9, 32
	stw r10,64*2-4($out)
	srdi r10,r10,32
	stw r11,64*3-4($out)
	srdi r11,r11,32
	stw r8, 64*4-4($out)
	stw r9, 64*5-4($out)
	stw r10,64*6-4($out)
	stw r11,64*7-4($out)
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,3,0
	.long 0
.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
########################################################################
# void ecp_nistz256_gather_w5(P256_POINT *out, const void *inp,
#                             int index);
.globl ecp_nistz256_gather_w5
.align 4
ecp_nistz256_gather_w5:
	neg r0,$index
	sradi r0,r0,63
	add $index,$index,r0
	slwi $index,$index,2
	add $inp,$inp,$index
	lwz r5, 64*0($inp)
	lwz r6, 64*1($inp)
	lwz r7, 64*2($inp)
	lwz r8, 64*3($inp)
	lwz r9, 64*4($inp)
	lwz r10,64*5($inp)
	lwz r11,64*6($inp)
	lwz r12,64*7($inp)
	addi $inp,$inp,64*8
	sldi r9, r9, 32
	sldi r10,r10,32
	sldi r11,r11,32
	sldi r12,r12,32
	or r5,r5,r9
	or r6,r6,r10
	or r7,r7,r11
	or r8,r8,r12
	and r5,r5,r0
	and r6,r6,r0
	and r7,r7,r0
	and r8,r8,r0
	std r5,0($out) # X
	std r6,8($out)
	std r7,16($out)
	std r8,24($out)
	lwz r5, 64*0($inp)
	lwz r6, 64*1($inp)
	lwz r7, 64*2($inp)
	lwz r8, 64*3($inp)
	lwz r9, 64*4($inp)
	lwz r10,64*5($inp)
	lwz r11,64*6($inp)
	lwz r12,64*7($inp)
	addi $inp,$inp,64*8
	sldi r9, r9, 32
	sldi r10,r10,32
	sldi r11,r11,32
	sldi r12,r12,32
	or r5,r5,r9
	or r6,r6,r10
	or r7,r7,r11
	or r8,r8,r12
	and r5,r5,r0
	and r6,r6,r0
	and r7,r7,r0
	and r8,r8,r0
	std r5,32($out) # Y
	std r6,40($out)
	std r7,48($out)
	std r8,56($out)
	lwz r5, 64*0($inp)
	lwz r6, 64*1($inp)
	lwz r7, 64*2($inp)
	lwz r8, 64*3($inp)
	lwz r9, 64*4($inp)
	lwz r10,64*5($inp)
	lwz r11,64*6($inp)
	lwz r12,64*7($inp)
	sldi r9, r9, 32
	sldi r10,r10,32
	sldi r11,r11,32
	sldi r12,r12,32
	or r5,r5,r9
	or r6,r6,r10
	or r7,r7,r11
	or r8,r8,r12
	and r5,r5,r0
	and r6,r6,r0
	and r7,r7,r0
	and r8,r8,r0
	std r5,64($out) # Z
	std r6,72($out)
	std r7,80($out)
	std r8,88($out)
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,3,0
	.long 0
.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
########################################################################
# void ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp,
#                              int index);
.globl ecp_nistz256_scatter_w7
.align 4
ecp_nistz256_scatter_w7:
	li r0,8
	mtctr r0
	add $out,$out,$index
	subi $inp,$inp,8
.Loop_scatter_w7:
	ldu r0,8($inp)
	stb r0,64*0($out)
	srdi r0,r0,8
	stb r0,64*1($out)
	srdi r0,r0,8
	stb r0,64*2($out)
	srdi r0,r0,8
	stb r0,64*3($out)
	srdi r0,r0,8
	stb r0,64*4($out)
	srdi r0,r0,8
	stb r0,64*5($out)
	srdi r0,r0,8
	stb r0,64*6($out)
	srdi r0,r0,8
	stb r0,64*7($out)
	addi $out,$out,64*8
	bdnz .Loop_scatter_w7
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,3,0
	.long 0
.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
########################################################################
# void ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const void *inp,
#                             int index);
.globl ecp_nistz256_gather_w7
.align 4
ecp_nistz256_gather_w7:
	li r0,8
	mtctr r0
	neg r0,$index
	sradi r0,r0,63
	add $index,$index,r0
	add $inp,$inp,$index
	subi $out,$out,8
.Loop_gather_w7:
	lbz r5, 64*0($inp)
	lbz r6, 64*1($inp)
	lbz r7, 64*2($inp)
	lbz r8, 64*3($inp)
	lbz r9, 64*4($inp)
	lbz r10,64*5($inp)
	lbz r11,64*6($inp)
	lbz r12,64*7($inp)
	addi $inp,$inp,64*8
	sldi r6, r6, 8
	sldi r7, r7, 16
	sldi r8, r8, 24
	sldi r9, r9, 32
	sldi r10,r10,40
	sldi r11,r11,48
	sldi r12,r12,56
	or r5,r5,r6
	or r7,r7,r8
	or r9,r9,r10
	or r11,r11,r12
	or r5,r5,r7
	or r9,r9,r11
	or r5,r5,r9
	and r5,r5,r0
	stdu r5,8($out)
	bdnz .Loop_gather_w7
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,3,0
	.long 0
.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;
	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush