#! /usr/bin/env perl
# Author: Min Zhou <zhoumin@loongson.cn>
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

use strict;

my $code;

# Here is the scalar register layout for LoongArch.
my ($zero,$ra,$tp,$sp,$fp)=map("\$r$_",(0..3,22));
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$r$_",(4..11));
my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$x)=map("\$r$_",(12..21));
my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8)=map("\$r$_",(23..31));

# Here is the 128-bit vector register layout for the LSX extension.
my ($vr0,$vr1,$vr2,$vr3,$vr4,$vr5,$vr6,$vr7,$vr8,$vr9,$vr10,
    $vr11,$vr12,$vr13,$vr14,$vr15,$vr16,$vr17,$vr18,$vr19,
    $vr20,$vr21,$vr22,$vr23,$vr24,$vr25,$vr26,$vr27,$vr28,
    $vr29,$vr30,$vr31)=map("\$vr$_",(0..31));

# Here is the 256-bit vector register layout for the LASX extension.
my ($xr0,$xr1,$xr2,$xr3,$xr4,$xr5,$xr6,$xr7,$xr8,$xr9,$xr10,
    $xr11,$xr12,$xr13,$xr14,$xr15,$xr16,$xr17,$xr18,$xr19,
    $xr20,$xr21,$xr22,$xr23,$xr24,$xr25,$xr26,$xr27,$xr28,
    $xr29,$xr30,$xr31)=map("\$xr$_",(0..31));

my $output;
for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); }
open STDOUT,">$output";

# Input parameter block
my ($out, $inp, $len, $key, $counter) = ($a0, $a1, $a2, $a3, $a4);
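# Note: the corresponding C prototype, as used by OpenSSL's ChaCha code on
# other platforms, is expected to look roughly like
#   void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#                       size_t len, const unsigned int key[8],
#                       const unsigned int counter[4]);
# i.e. $key points at eight 32-bit key words and $counter at the 32-bit
# block counter followed by the three 32-bit nonce words.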
$code .= <<EOF;
#include "loongarch_arch.h"

.text

.extern OPENSSL_loongarch_hwcap_P

.align 6
.Lsigma:
    .ascii "expand 32-byte k"
.Linc8x:
    .long 0,1,2,3,4,5,6,7
.Linc4x:
    .long 0,1,2,3

.globl ChaCha20_ctr32
.type ChaCha20_ctr32 function
.align 6
ChaCha20_ctr32:
    # $a0 = arg #1 (out pointer)
    # $a1 = arg #2 (inp pointer)
    # $a2 = arg #3 (len)
    # $a3 = arg #4 (key array)
    # $a4 = arg #5 (counter array)
    beqz $len,.Lno_data
    la.pcrel $t0,OPENSSL_loongarch_hwcap_P
    ld.w $t0,$t0,0
    andi $t1,$t0,LOONGARCH_HWCAP_LASX
    bnez $t1,.LChaCha20_8x
    andi $t2,$t0,LOONGARCH_HWCAP_LSX
    bnez $t2,.LChaCha20_4x
    b .LChaCha20_1x
EOF
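# Dispatch: OPENSSL_loongarch_hwcap_P is probed at run time.  The 256-bit LASX
# path is preferred when available, then the 128-bit LSX path, and otherwise
# the scalar path is taken.  All three paths handle arbitrary lengths.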
########################################################################
# Scalar code path that handles all lengths.
{
    # Load the initial states in array @x[*] and update directly
    my @x = ($t0, $t1, $t2, $t3, $t4, $t5, $t6, $t7,
             $s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7);

    sub ROUND {
        my ($a0,$b0,$c0,$d0) = @_;
        my ($a1,$b1,$c1,$d1) = map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
        my ($a2,$b2,$c2,$d2) = map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
        my ($a3,$b3,$c3,$d3) = map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
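        # Each map above keeps the row (index & ~3) and advances the column by
        # one, so ROUND(0, 4, 8, 12) expands to the four column quarter-rounds
        # (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15), while
        # ROUND(0, 5, 10, 15) expands to the four diagonal quarter-rounds
        # (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14).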
        $code .= <<EOF;
    add.w @x[$a0],@x[$a0],@x[$b0]
    xor @x[$d0],@x[$d0],@x[$a0]
    rotri.w @x[$d0],@x[$d0],16 # rotate left 16 bits
    add.w @x[$a1],@x[$a1],@x[$b1]
    xor @x[$d1],@x[$d1],@x[$a1]
    rotri.w @x[$d1],@x[$d1],16
    add.w @x[$c0],@x[$c0],@x[$d0]
    xor @x[$b0],@x[$b0],@x[$c0]
    rotri.w @x[$b0],@x[$b0],20 # rotate left 12 bits
    add.w @x[$c1],@x[$c1],@x[$d1]
    xor @x[$b1],@x[$b1],@x[$c1]
    rotri.w @x[$b1],@x[$b1],20
    add.w @x[$a0],@x[$a0],@x[$b0]
    xor @x[$d0],@x[$d0],@x[$a0]
    rotri.w @x[$d0],@x[$d0],24 # rotate left 8 bits
    add.w @x[$a1],@x[$a1],@x[$b1]
    xor @x[$d1],@x[$d1],@x[$a1]
    rotri.w @x[$d1],@x[$d1],24
    add.w @x[$c0],@x[$c0],@x[$d0]
    xor @x[$b0],@x[$b0],@x[$c0]
    rotri.w @x[$b0],@x[$b0],25 # rotate left 7 bits
    add.w @x[$c1],@x[$c1],@x[$d1]
    xor @x[$b1],@x[$b1],@x[$c1]
    rotri.w @x[$b1],@x[$b1],25
    add.w @x[$a2],@x[$a2],@x[$b2]
    xor @x[$d2],@x[$d2],@x[$a2]
    rotri.w @x[$d2],@x[$d2],16
    add.w @x[$a3],@x[$a3],@x[$b3]
    xor @x[$d3],@x[$d3],@x[$a3]
    rotri.w @x[$d3],@x[$d3],16
    add.w @x[$c2],@x[$c2],@x[$d2]
    xor @x[$b2],@x[$b2],@x[$c2]
    rotri.w @x[$b2],@x[$b2],20
    add.w @x[$c3],@x[$c3],@x[$d3]
    xor @x[$b3],@x[$b3],@x[$c3]
    rotri.w @x[$b3],@x[$b3],20
    add.w @x[$a2],@x[$a2],@x[$b2]
    xor @x[$d2],@x[$d2],@x[$a2]
    rotri.w @x[$d2],@x[$d2],24
    add.w @x[$a3],@x[$a3],@x[$b3]
    xor @x[$d3],@x[$d3],@x[$a3]
    rotri.w @x[$d3],@x[$d3],24
    add.w @x[$c2],@x[$c2],@x[$d2]
    xor @x[$b2],@x[$b2],@x[$c2]
    rotri.w @x[$b2],@x[$b2],25
    add.w @x[$c3],@x[$c3],@x[$d3]
    xor @x[$b3],@x[$b3],@x[$c3]
    rotri.w @x[$b3],@x[$b3],25
EOF
    }
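    # For reference, each group of twelve instructions emitted above is one
    # ChaCha20 quarter-round on state words (a, b, c, d); in C-like pseudocode:
    #
    #   a += b; d ^= a; d = ROTL32(d, 16);
    #   c += d; b ^= c; b = ROTL32(b, 12);
    #   a += b; d ^= a; d = ROTL32(d,  8);
    #   c += d; b ^= c; b = ROTL32(b,  7);
    #
    # rotri.w rotates right, so a left rotate by n is emitted as a right
    # rotate by 32-n.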
    $code .= <<EOF;
.align 6
.LChaCha20_1x:
    addi.d $sp,$sp,-256
    st.d $s0,$sp,0
    st.d $s1,$sp,8
    st.d $s2,$sp,16
    st.d $s3,$sp,24
    st.d $s4,$sp,32
    st.d $s5,$sp,40
    st.d $s6,$sp,48
    st.d $s7,$sp,56
    st.d $s8,$sp,64
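    # The 256-byte frame holds the nine callee-saved registers s0-s8 at stack
    # offsets 0..71; the tail code below additionally uses offsets 72..135 as
    # a 64-byte scratch buffer for the final partial keystream block.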
    # Save the initial block counter in $s8
    ld.w $s8,$counter,0
    b .Loop_outer_1x

.align 5
.Loop_outer_1x:
    # Load constants
    la.local $t8,.Lsigma
    ld.w @x[0],$t8,0 # 'expa'
    ld.w @x[1],$t8,4 # 'nd 3'
    ld.w @x[2],$t8,8 # '2-by'
    ld.w @x[3],$t8,12 # 'te k'
    # Load key
    ld.w @x[4],$key,4*0
    ld.w @x[5],$key,4*1
    ld.w @x[6],$key,4*2
    ld.w @x[7],$key,4*3
    ld.w @x[8],$key,4*4
    ld.w @x[9],$key,4*5
    ld.w @x[10],$key,4*6
    ld.w @x[11],$key,4*7
    # Load block counter
    move @x[12],$s8
    # Load nonce
    ld.w @x[13],$counter,4*1
    ld.w @x[14],$counter,4*2
    ld.w @x[15],$counter,4*3
    # Update states in \@x[*] for 20 rounds
    ori $t8,$zero,10
    b .Loop_1x

.align 5
.Loop_1x:
EOF
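    # One pass through .Loop_1x is one "double round": a column round followed
    # by a diagonal round.  With $t8 preloaded with 10, the loop therefore
    # performs the 20 rounds of ChaCha20.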
    &ROUND (0, 4, 8, 12);
    &ROUND (0, 5, 10, 15);
    $code .= <<EOF;
    addi.w $t8,$t8,-1
    bnez $t8,.Loop_1x

    # Get the final states by adding the initial states
    la.local $t8,.Lsigma
    ld.w $a7,$t8,4*0
    ld.w $a6,$t8,4*1
    ld.w $a5,$t8,4*2
    add.w @x[0],@x[0],$a7
    add.w @x[1],@x[1],$a6
    add.w @x[2],@x[2],$a5
    ld.w $a7,$t8,4*3
    add.w @x[3],@x[3],$a7
    ld.w $t8,$key,4*0
    ld.w $a7,$key,4*1
    ld.w $a6,$key,4*2
    ld.w $a5,$key,4*3
    add.w @x[4],@x[4],$t8
    add.w @x[5],@x[5],$a7
    add.w @x[6],@x[6],$a6
    add.w @x[7],@x[7],$a5
    ld.w $t8,$key,4*4
    ld.w $a7,$key,4*5
    ld.w $a6,$key,4*6
    ld.w $a5,$key,4*7
    add.w @x[8],@x[8],$t8
    add.w @x[9],@x[9],$a7
    add.w @x[10],@x[10],$a6
    add.w @x[11],@x[11],$a5
    add.w @x[12],@x[12],$s8
    ld.w $t8,$counter,4*1
    ld.w $a7,$counter,4*2
    ld.w $a6,$counter,4*3
    add.w @x[13],@x[13],$t8
    add.w @x[14],@x[14],$a7
    add.w @x[15],@x[15],$a6

    ori $t8,$zero,64
    bltu $len,$t8,.Ltail_1x

    # Get the encrypted message by XORing the states with the plaintext
    ld.w $t8,$inp,4*0
    ld.w $a7,$inp,4*1
    ld.w $a6,$inp,4*2
    ld.w $a5,$inp,4*3
    xor $t8,$t8,@x[0]
    xor $a7,$a7,@x[1]
    xor $a6,$a6,@x[2]
    xor $a5,$a5,@x[3]
    st.w $t8,$out,4*0
    st.w $a7,$out,4*1
    st.w $a6,$out,4*2
    st.w $a5,$out,4*3
    ld.w $t8,$inp,4*4
    ld.w $a7,$inp,4*5
    ld.w $a6,$inp,4*6
    ld.w $a5,$inp,4*7
    xor $t8,$t8,@x[4]
    xor $a7,$a7,@x[5]
    xor $a6,$a6,@x[6]
    xor $a5,$a5,@x[7]
    st.w $t8,$out,4*4
    st.w $a7,$out,4*5
    st.w $a6,$out,4*6
    st.w $a5,$out,4*7
    ld.w $t8,$inp,4*8
    ld.w $a7,$inp,4*9
    ld.w $a6,$inp,4*10
    ld.w $a5,$inp,4*11
    xor $t8,$t8,@x[8]
    xor $a7,$a7,@x[9]
    xor $a6,$a6,@x[10]
    xor $a5,$a5,@x[11]
    st.w $t8,$out,4*8
    st.w $a7,$out,4*9
    st.w $a6,$out,4*10
    st.w $a5,$out,4*11
    ld.w $t8,$inp,4*12
    ld.w $a7,$inp,4*13
    ld.w $a6,$inp,4*14
    ld.w $a5,$inp,4*15
    xor $t8,$t8,@x[12]
    xor $a7,$a7,@x[13]
    xor $a6,$a6,@x[14]
    xor $a5,$a5,@x[15]
    st.w $t8,$out,4*12
    st.w $a7,$out,4*13
    st.w $a6,$out,4*14
    st.w $a5,$out,4*15

    addi.d $len,$len,-64
    beqz $len,.Ldone_1x
    addi.d $inp,$inp,64
    addi.d $out,$out,64
    addi.w $s8,$s8,1
    b .Loop_outer_1x

.align 4
.Ltail_1x:
    # Handle the tail for 1x (1 <= tail_len <= 63)
    addi.d $a7,$sp,72
    st.w @x[0],$a7,4*0
    st.w @x[1],$a7,4*1
    st.w @x[2],$a7,4*2
    st.w @x[3],$a7,4*3
    st.w @x[4],$a7,4*4
    st.w @x[5],$a7,4*5
    st.w @x[6],$a7,4*6
    st.w @x[7],$a7,4*7
    st.w @x[8],$a7,4*8
    st.w @x[9],$a7,4*9
    st.w @x[10],$a7,4*10
    st.w @x[11],$a7,4*11
    st.w @x[12],$a7,4*12
    st.w @x[13],$a7,4*13
    st.w @x[14],$a7,4*14
    st.w @x[15],$a7,4*15
    move $t8,$zero

.Loop_tail_1x:
    # XOR the input with the saved keystream byte by byte
    ldx.bu $a6,$inp,$t8
    ldx.bu $a5,$a7,$t8
    xor $a6,$a6,$a5
    stx.b $a6,$out,$t8
    addi.w $t8,$t8,1
    addi.d $len,$len,-1
    bnez $len,.Loop_tail_1x
    b .Ldone_1x

.Ldone_1x:
    ld.d $s0,$sp,0
    ld.d $s1,$sp,8
    ld.d $s2,$sp,16
    ld.d $s3,$sp,24
    ld.d $s4,$sp,32
    ld.d $s5,$sp,40
    ld.d $s6,$sp,48
    ld.d $s7,$sp,56
    ld.d $s8,$sp,64
    addi.d $sp,$sp,256
    b .Lend
EOF
}
########################################################################
# 128-bit LSX code path that handles all lengths.
{
    # Load the initial states in array @x[*] and update directly.
    my @x = ($vr0, $vr1, $vr2, $vr3, $vr4, $vr5, $vr6, $vr7,
             $vr8, $vr9, $vr10, $vr11, $vr12, $vr13, $vr14, $vr15);

    # Save the initial states in array @y[*]
    my @y = ($vr16, $vr17, $vr18, $vr19, $vr20, $vr21, $vr22, $vr23,
             $vr24, $vr25, $vr26, $vr27, $vr28, $vr29, $vr30, $vr31);

    sub ROUND_4x {
        my ($a0,$b0,$c0,$d0) = @_;
        my ($a1,$b1,$c1,$d1) = map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
        my ($a2,$b2,$c2,$d2) = map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
        my ($a3,$b3,$c3,$d3) = map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
        $code .= <<EOF;
    vadd.w @x[$a0],@x[$a0],@x[$b0]
    vxor.v @x[$d0],@x[$d0],@x[$a0]
    vrotri.w @x[$d0],@x[$d0],16 # rotate left 16 bits
    vadd.w @x[$a1],@x[$a1],@x[$b1]
    vxor.v @x[$d1],@x[$d1],@x[$a1]
    vrotri.w @x[$d1],@x[$d1],16
    vadd.w @x[$c0],@x[$c0],@x[$d0]
    vxor.v @x[$b0],@x[$b0],@x[$c0]
    vrotri.w @x[$b0],@x[$b0],20 # rotate left 12 bits
    vadd.w @x[$c1],@x[$c1],@x[$d1]
    vxor.v @x[$b1],@x[$b1],@x[$c1]
    vrotri.w @x[$b1],@x[$b1],20
    vadd.w @x[$a0],@x[$a0],@x[$b0]
    vxor.v @x[$d0],@x[$d0],@x[$a0]
    vrotri.w @x[$d0],@x[$d0],24 # rotate left 8 bits
    vadd.w @x[$a1],@x[$a1],@x[$b1]
    vxor.v @x[$d1],@x[$d1],@x[$a1]
    vrotri.w @x[$d1],@x[$d1],24
    vadd.w @x[$c0],@x[$c0],@x[$d0]
    vxor.v @x[$b0],@x[$b0],@x[$c0]
    vrotri.w @x[$b0],@x[$b0],25 # rotate left 7 bits
    vadd.w @x[$c1],@x[$c1],@x[$d1]
    vxor.v @x[$b1],@x[$b1],@x[$c1]
    vrotri.w @x[$b1],@x[$b1],25
    vadd.w @x[$a2],@x[$a2],@x[$b2]
    vxor.v @x[$d2],@x[$d2],@x[$a2]
    vrotri.w @x[$d2],@x[$d2],16
    vadd.w @x[$a3],@x[$a3],@x[$b3]
    vxor.v @x[$d3],@x[$d3],@x[$a3]
    vrotri.w @x[$d3],@x[$d3],16
    vadd.w @x[$c2],@x[$c2],@x[$d2]
    vxor.v @x[$b2],@x[$b2],@x[$c2]
    vrotri.w @x[$b2],@x[$b2],20
    vadd.w @x[$c3],@x[$c3],@x[$d3]
    vxor.v @x[$b3],@x[$b3],@x[$c3]
    vrotri.w @x[$b3],@x[$b3],20
    vadd.w @x[$a2],@x[$a2],@x[$b2]
    vxor.v @x[$d2],@x[$d2],@x[$a2]
    vrotri.w @x[$d2],@x[$d2],24
    vadd.w @x[$a3],@x[$a3],@x[$b3]
    vxor.v @x[$d3],@x[$d3],@x[$a3]
    vrotri.w @x[$d3],@x[$d3],24
    vadd.w @x[$c2],@x[$c2],@x[$d2]
    vxor.v @x[$b2],@x[$b2],@x[$c2]
    vrotri.w @x[$b2],@x[$b2],25
    vadd.w @x[$c3],@x[$c3],@x[$d3]
    vxor.v @x[$b3],@x[$b3],@x[$c3]
    vrotri.w @x[$b3],@x[$b3],25
EOF
    }
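    # ROUND_4x mirrors the scalar ROUND above, but each LSX register holds the
    # same state word for four independent blocks (one block per 32-bit lane),
    # so one pass updates four blocks in parallel.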
    $code .= <<EOF;
.align 6
.LChaCha20_4x:
    ori $t3,$zero,64
    bleu $len,$t3,.LChaCha20_1x # goto 1x when len <= 64
    addi.d $sp,$sp,-128
    # Save the initial block counter in $t4
    ld.w $t4,$counter,0
    b .Loop_outer_4x

.align 5
.Loop_outer_4x:
    # Load constants
    la.local $t8,.Lsigma
    vldrepl.w @x[0],$t8,4*0 # 'expa'
    vldrepl.w @x[1],$t8,4*1 # 'nd 3'
    vldrepl.w @x[2],$t8,4*2 # '2-by'
    vldrepl.w @x[3],$t8,4*3 # 'te k'
    # Load key
    vldrepl.w @x[4],$key,4*0
    vldrepl.w @x[5],$key,4*1
    vldrepl.w @x[6],$key,4*2
    vldrepl.w @x[7],$key,4*3
    vldrepl.w @x[8],$key,4*4
    vldrepl.w @x[9],$key,4*5
    vldrepl.w @x[10],$key,4*6
    vldrepl.w @x[11],$key,4*7
    # Load block counter
    vreplgr2vr.w @x[12],$t4
    # Load nonce
    vldrepl.w @x[13],$counter,4*1
    vldrepl.w @x[14],$counter,4*2
    vldrepl.w @x[15],$counter,4*3
    # Get the correct block counter for each block
    la.local $t8,.Linc4x
    vld @y[0],$t8,0
    vadd.w @x[12],@x[12],@y[0]
    # Copy the initial states from \@x[*] to \@y[*]
    vori.b @y[0],@x[0],0
    vori.b @y[1],@x[1],0
    vori.b @y[2],@x[2],0
    vori.b @y[3],@x[3],0
    vori.b @y[4],@x[4],0
    vori.b @y[5],@x[5],0
    vori.b @y[6],@x[6],0
    vori.b @y[7],@x[7],0
    vori.b @y[8],@x[8],0
    vori.b @y[9],@x[9],0
    vori.b @y[10],@x[10],0
    vori.b @y[11],@x[11],0
    vori.b @y[12],@x[12],0
    vori.b @y[13],@x[13],0
    vori.b @y[14],@x[14],0
    vori.b @y[15],@x[15],0
    # Update states in \@x[*] for 20 rounds
    ori $t8,$zero,10
    b .Loop_4x

.align 5
.Loop_4x:
EOF
    &ROUND_4x (0, 4, 8, 12);
    &ROUND_4x (0, 5, 10, 15);
    $code .= <<EOF;
    addi.w $t8,$t8,-1
    bnez $t8,.Loop_4x

    # Get the final states by adding the initial states
    vadd.w @x[0],@x[0],@y[0]
    vadd.w @x[1],@x[1],@y[1]
    vadd.w @x[2],@x[2],@y[2]
    vadd.w @x[3],@x[3],@y[3]
    vadd.w @x[4],@x[4],@y[4]
    vadd.w @x[5],@x[5],@y[5]
    vadd.w @x[6],@x[6],@y[6]
    vadd.w @x[7],@x[7],@y[7]
    vadd.w @x[8],@x[8],@y[8]
    vadd.w @x[9],@x[9],@y[9]
    vadd.w @x[10],@x[10],@y[10]
    vadd.w @x[11],@x[11],@y[11]
    vadd.w @x[12],@x[12],@y[12]
    vadd.w @x[13],@x[13],@y[13]
    vadd.w @x[14],@x[14],@y[14]
    vadd.w @x[15],@x[15],@y[15]

    # Get the transpose of \@x[*] and save them in \@x[*]
    vilvl.w @y[0],@x[1],@x[0]
    vilvh.w @y[1],@x[1],@x[0]
    vilvl.w @y[2],@x[3],@x[2]
    vilvh.w @y[3],@x[3],@x[2]
    vilvl.w @y[4],@x[5],@x[4]
    vilvh.w @y[5],@x[5],@x[4]
    vilvl.w @y[6],@x[7],@x[6]
    vilvh.w @y[7],@x[7],@x[6]
    vilvl.w @y[8],@x[9],@x[8]
    vilvh.w @y[9],@x[9],@x[8]
    vilvl.w @y[10],@x[11],@x[10]
    vilvh.w @y[11],@x[11],@x[10]
    vilvl.w @y[12],@x[13],@x[12]
    vilvh.w @y[13],@x[13],@x[12]
    vilvl.w @y[14],@x[15],@x[14]
    vilvh.w @y[15],@x[15],@x[14]
    vilvl.d @x[0],@y[2],@y[0]
    vilvh.d @x[1],@y[2],@y[0]
    vilvl.d @x[2],@y[3],@y[1]
    vilvh.d @x[3],@y[3],@y[1]
    vilvl.d @x[4],@y[6],@y[4]
    vilvh.d @x[5],@y[6],@y[4]
    vilvl.d @x[6],@y[7],@y[5]
    vilvh.d @x[7],@y[7],@y[5]
    vilvl.d @x[8],@y[10],@y[8]
    vilvh.d @x[9],@y[10],@y[8]
    vilvl.d @x[10],@y[11],@y[9]
    vilvh.d @x[11],@y[11],@y[9]
    vilvl.d @x[12],@y[14],@y[12]
    vilvh.d @x[13],@y[14],@y[12]
    vilvl.d @x[14],@y[15],@y[13]
    vilvh.d @x[15],@y[15],@y[13]
EOF
    # Adjust the order of elements in @x[*] for ease of use.
    @x = (@x[0],@x[4],@x[8],@x[12],@x[1],@x[5],@x[9],@x[13],
          @x[2],@x[6],@x[10],@x[14],@x[3],@x[7],@x[11],@x[15]);
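    # After the transpose and the reordering above, @x[4*j .. 4*j+3] hold the
    # 64-byte keystream for block j (j = 0..3) in memory order, ready to be
    # XORed against four consecutive 64-byte chunks of input.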
    $code .= <<EOF;
    ori $t8,$zero,64*4
    bltu $len,$t8,.Ltail_4x

    # Get the encrypted message by XORing the states with the plaintext
    vld @y[0],$inp,16*0
    vld @y[1],$inp,16*1
    vld @y[2],$inp,16*2
    vld @y[3],$inp,16*3
    vxor.v @y[0],@y[0],@x[0]
    vxor.v @y[1],@y[1],@x[1]
    vxor.v @y[2],@y[2],@x[2]
    vxor.v @y[3],@y[3],@x[3]
    vst @y[0],$out,16*0
    vst @y[1],$out,16*1
    vst @y[2],$out,16*2
    vst @y[3],$out,16*3
    vld @y[0],$inp,16*4
    vld @y[1],$inp,16*5
    vld @y[2],$inp,16*6
    vld @y[3],$inp,16*7
    vxor.v @y[0],@y[0],@x[4]
    vxor.v @y[1],@y[1],@x[5]
    vxor.v @y[2],@y[2],@x[6]
    vxor.v @y[3],@y[3],@x[7]
    vst @y[0],$out,16*4
    vst @y[1],$out,16*5
    vst @y[2],$out,16*6
    vst @y[3],$out,16*7
    vld @y[0],$inp,16*8
    vld @y[1],$inp,16*9
    vld @y[2],$inp,16*10
    vld @y[3],$inp,16*11
    vxor.v @y[0],@y[0],@x[8]
    vxor.v @y[1],@y[1],@x[9]
    vxor.v @y[2],@y[2],@x[10]
    vxor.v @y[3],@y[3],@x[11]
    vst @y[0],$out,16*8
    vst @y[1],$out,16*9
    vst @y[2],$out,16*10
    vst @y[3],$out,16*11
    vld @y[0],$inp,16*12
    vld @y[1],$inp,16*13
    vld @y[2],$inp,16*14
    vld @y[3],$inp,16*15
    vxor.v @y[0],@y[0],@x[12]
    vxor.v @y[1],@y[1],@x[13]
    vxor.v @y[2],@y[2],@x[14]
    vxor.v @y[3],@y[3],@x[15]
    vst @y[0],$out,16*12
    vst @y[1],$out,16*13
    vst @y[2],$out,16*14
    vst @y[3],$out,16*15

    addi.d $len,$len,-64*4
    beqz $len,.Ldone_4x
    addi.d $inp,$inp,64*4
    addi.d $out,$out,64*4
    addi.w $t4,$t4,4
    b .Loop_outer_4x

.Ltail_4x:
    # Handle the tail for 4x (1 <= tail_len <= 255)
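    # Strategy: XOR and store as many whole 64-byte blocks as remain, spill the
    # next 64 bytes of unused keystream (four vector registers) to the 128-byte
    # stack scratch area, and let .Loop_tail_4x finish the last partial block
    # byte by byte.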
    ori $t8,$zero,192
    bgeu $len,$t8,.L192_or_more4x
    ori $t8,$zero,128
    bgeu $len,$t8,.L128_or_more4x
    ori $t8,$zero,64
    bgeu $len,$t8,.L64_or_more4x
    vst @x[0],$sp,16*0
    vst @x[1],$sp,16*1
    vst @x[2],$sp,16*2
    vst @x[3],$sp,16*3
    move $t8,$zero
    b .Loop_tail_4x

.align 5
.L64_or_more4x:
    vld @y[0],$inp,16*0
    vld @y[1],$inp,16*1
    vld @y[2],$inp,16*2
    vld @y[3],$inp,16*3
    vxor.v @y[0],@y[0],@x[0]
    vxor.v @y[1],@y[1],@x[1]
    vxor.v @y[2],@y[2],@x[2]
    vxor.v @y[3],@y[3],@x[3]
    vst @y[0],$out,16*0
    vst @y[1],$out,16*1
    vst @y[2],$out,16*2
    vst @y[3],$out,16*3
    addi.d $len,$len,-64
    beqz $len,.Ldone_4x
    addi.d $inp,$inp,64
    addi.d $out,$out,64
    vst @x[4],$sp,16*0
    vst @x[5],$sp,16*1
    vst @x[6],$sp,16*2
    vst @x[7],$sp,16*3
    move $t8,$zero
    b .Loop_tail_4x

.align 5
.L128_or_more4x:
    vld @y[0],$inp,16*0
    vld @y[1],$inp,16*1
    vld @y[2],$inp,16*2
    vld @y[3],$inp,16*3
    vxor.v @y[0],@y[0],@x[0]
    vxor.v @y[1],@y[1],@x[1]
    vxor.v @y[2],@y[2],@x[2]
    vxor.v @y[3],@y[3],@x[3]
    vst @y[0],$out,16*0
    vst @y[1],$out,16*1
    vst @y[2],$out,16*2
    vst @y[3],$out,16*3
    vld @y[0],$inp,16*4
    vld @y[1],$inp,16*5
    vld @y[2],$inp,16*6
    vld @y[3],$inp,16*7
    vxor.v @y[0],@y[0],@x[4]
    vxor.v @y[1],@y[1],@x[5]
    vxor.v @y[2],@y[2],@x[6]
    vxor.v @y[3],@y[3],@x[7]
    vst @y[0],$out,16*4
    vst @y[1],$out,16*5
    vst @y[2],$out,16*6
    vst @y[3],$out,16*7
    addi.d $len,$len,-128
    beqz $len,.Ldone_4x
    addi.d $inp,$inp,128
    addi.d $out,$out,128
    vst @x[8],$sp,16*0
    vst @x[9],$sp,16*1
    vst @x[10],$sp,16*2
    vst @x[11],$sp,16*3
    move $t8,$zero
    b .Loop_tail_4x

.align 5
.L192_or_more4x:
    vld @y[0],$inp,16*0
    vld @y[1],$inp,16*1
    vld @y[2],$inp,16*2
    vld @y[3],$inp,16*3
    vxor.v @y[0],@y[0],@x[0]
    vxor.v @y[1],@y[1],@x[1]
    vxor.v @y[2],@y[2],@x[2]
    vxor.v @y[3],@y[3],@x[3]
    vst @y[0],$out,16*0
    vst @y[1],$out,16*1
    vst @y[2],$out,16*2
    vst @y[3],$out,16*3
    vld @y[0],$inp,16*4
    vld @y[1],$inp,16*5
    vld @y[2],$inp,16*6
    vld @y[3],$inp,16*7
    vxor.v @y[0],@y[0],@x[4]
    vxor.v @y[1],@y[1],@x[5]
    vxor.v @y[2],@y[2],@x[6]
    vxor.v @y[3],@y[3],@x[7]
    vst @y[0],$out,16*4
    vst @y[1],$out,16*5
    vst @y[2],$out,16*6
    vst @y[3],$out,16*7
    vld @y[0],$inp,16*8
    vld @y[1],$inp,16*9
    vld @y[2],$inp,16*10
    vld @y[3],$inp,16*11
    vxor.v @y[0],@y[0],@x[8]
    vxor.v @y[1],@y[1],@x[9]
    vxor.v @y[2],@y[2],@x[10]
    vxor.v @y[3],@y[3],@x[11]
    vst @y[0],$out,16*8
    vst @y[1],$out,16*9
    vst @y[2],$out,16*10
    vst @y[3],$out,16*11
    addi.d $len,$len,-192
    beqz $len,.Ldone_4x
    addi.d $inp,$inp,192
    addi.d $out,$out,192
    vst @x[12],$sp,16*0
    vst @x[13],$sp,16*1
    vst @x[14],$sp,16*2
    vst @x[15],$sp,16*3
    move $t8,$zero
    b .Loop_tail_4x

.Loop_tail_4x:
    # XOR the input with the saved keystream byte by byte
    ldx.bu $t5,$inp,$t8
    ldx.bu $t6,$sp,$t8
    xor $t5,$t5,$t6
    stx.b $t5,$out,$t8
    addi.w $t8,$t8,1
    addi.d $len,$len,-1
    bnez $len,.Loop_tail_4x
    b .Ldone_4x

.Ldone_4x:
    addi.d $sp,$sp,128
    b .Lend
EOF
}
########################################################################
# 256-bit LASX code path that handles all lengths.
{
    # Load the initial states in array @x[*] and update directly.
    my @x = ($xr0, $xr1, $xr2, $xr3, $xr4, $xr5, $xr6, $xr7,
             $xr8, $xr9, $xr10, $xr11, $xr12, $xr13, $xr14, $xr15);

    # Save the initial states in array @y[*]
    my @y = ($xr16, $xr17, $xr18, $xr19, $xr20, $xr21, $xr22, $xr23,
             $xr24, $xr25, $xr26, $xr27, $xr28, $xr29, $xr30, $xr31);

    sub ROUND_8x {
        my ($a0,$b0,$c0,$d0) = @_;
        my ($a1,$b1,$c1,$d1) = map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
        my ($a2,$b2,$c2,$d2) = map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
        my ($a3,$b3,$c3,$d3) = map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
        $code .= <<EOF;
    xvadd.w @x[$a0],@x[$a0],@x[$b0]
    xvxor.v @x[$d0],@x[$d0],@x[$a0]
    xvrotri.w @x[$d0],@x[$d0],16 # rotate left 16 bits
    xvadd.w @x[$a1],@x[$a1],@x[$b1]
    xvxor.v @x[$d1],@x[$d1],@x[$a1]
    xvrotri.w @x[$d1],@x[$d1],16
    xvadd.w @x[$c0],@x[$c0],@x[$d0]
    xvxor.v @x[$b0],@x[$b0],@x[$c0]
    xvrotri.w @x[$b0],@x[$b0],20 # rotate left 12 bits
    xvadd.w @x[$c1],@x[$c1],@x[$d1]
    xvxor.v @x[$b1],@x[$b1],@x[$c1]
    xvrotri.w @x[$b1],@x[$b1],20
    xvadd.w @x[$a0],@x[$a0],@x[$b0]
    xvxor.v @x[$d0],@x[$d0],@x[$a0]
    xvrotri.w @x[$d0],@x[$d0],24 # rotate left 8 bits
    xvadd.w @x[$a1],@x[$a1],@x[$b1]
    xvxor.v @x[$d1],@x[$d1],@x[$a1]
    xvrotri.w @x[$d1],@x[$d1],24
    xvadd.w @x[$c0],@x[$c0],@x[$d0]
    xvxor.v @x[$b0],@x[$b0],@x[$c0]
    xvrotri.w @x[$b0],@x[$b0],25 # rotate left 7 bits
    xvadd.w @x[$c1],@x[$c1],@x[$d1]
    xvxor.v @x[$b1],@x[$b1],@x[$c1]
    xvrotri.w @x[$b1],@x[$b1],25
    xvadd.w @x[$a2],@x[$a2],@x[$b2]
    xvxor.v @x[$d2],@x[$d2],@x[$a2]
    xvrotri.w @x[$d2],@x[$d2],16
    xvadd.w @x[$a3],@x[$a3],@x[$b3]
    xvxor.v @x[$d3],@x[$d3],@x[$a3]
    xvrotri.w @x[$d3],@x[$d3],16
    xvadd.w @x[$c2],@x[$c2],@x[$d2]
    xvxor.v @x[$b2],@x[$b2],@x[$c2]
    xvrotri.w @x[$b2],@x[$b2],20
    xvadd.w @x[$c3],@x[$c3],@x[$d3]
    xvxor.v @x[$b3],@x[$b3],@x[$c3]
    xvrotri.w @x[$b3],@x[$b3],20
    xvadd.w @x[$a2],@x[$a2],@x[$b2]
    xvxor.v @x[$d2],@x[$d2],@x[$a2]
    xvrotri.w @x[$d2],@x[$d2],24
    xvadd.w @x[$a3],@x[$a3],@x[$b3]
    xvxor.v @x[$d3],@x[$d3],@x[$a3]
    xvrotri.w @x[$d3],@x[$d3],24
    xvadd.w @x[$c2],@x[$c2],@x[$d2]
    xvxor.v @x[$b2],@x[$b2],@x[$c2]
    xvrotri.w @x[$b2],@x[$b2],25
    xvadd.w @x[$c3],@x[$c3],@x[$d3]
    xvxor.v @x[$b3],@x[$b3],@x[$c3]
    xvrotri.w @x[$b3],@x[$b3],25
EOF
    }
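    # ROUND_8x is the LASX analogue of ROUND_4x: each 256-bit register carries
    # one state word for eight independent blocks (one block per 32-bit lane),
    # so a single pass updates eight blocks in parallel.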
    $code .= <<EOF;
.align 6
.LChaCha20_8x:
    ori $t3,$zero,64
    bleu $len,$t3,.LChaCha20_1x # goto 1x when len <= 64
    addi.d $sp,$sp,-128
    # Save the initial block counter in $t4
    ld.w $t4,$counter,0
    b .Loop_outer_8x

.align 5
.Loop_outer_8x:
    # Load constants
    la.local $t8,.Lsigma
    xvldrepl.w @x[0],$t8,4*0 # 'expa'
    xvldrepl.w @x[1],$t8,4*1 # 'nd 3'
    xvldrepl.w @x[2],$t8,4*2 # '2-by'
    xvldrepl.w @x[3],$t8,4*3 # 'te k'
    # Load key
    xvldrepl.w @x[4],$key,4*0
    xvldrepl.w @x[5],$key,4*1
    xvldrepl.w @x[6],$key,4*2
    xvldrepl.w @x[7],$key,4*3
    xvldrepl.w @x[8],$key,4*4
    xvldrepl.w @x[9],$key,4*5
    xvldrepl.w @x[10],$key,4*6
    xvldrepl.w @x[11],$key,4*7
    # Load block counter
    xvreplgr2vr.w @x[12],$t4
    # Load nonce
    xvldrepl.w @x[13],$counter,4*1
    xvldrepl.w @x[14],$counter,4*2
    xvldrepl.w @x[15],$counter,4*3
    # Get the correct block counter for each block
    la.local $t8,.Linc8x
    xvld @y[0],$t8,0
    xvadd.w @x[12],@x[12],@y[0]
    # Copy the initial states from \@x[*] to \@y[*]
    xvori.b @y[0],@x[0],0
    xvori.b @y[1],@x[1],0
    xvori.b @y[2],@x[2],0
    xvori.b @y[3],@x[3],0
    xvori.b @y[4],@x[4],0
    xvori.b @y[5],@x[5],0
    xvori.b @y[6],@x[6],0
    xvori.b @y[7],@x[7],0
    xvori.b @y[8],@x[8],0
    xvori.b @y[9],@x[9],0
    xvori.b @y[10],@x[10],0
    xvori.b @y[11],@x[11],0
    xvori.b @y[12],@x[12],0
    xvori.b @y[13],@x[13],0
    xvori.b @y[14],@x[14],0
    xvori.b @y[15],@x[15],0
    # Update states in \@x[*] for 20 rounds
    ori $t8,$zero,10
    b .Loop_8x

.align 5
.Loop_8x:
EOF
    &ROUND_8x (0, 4, 8, 12);
    &ROUND_8x (0, 5, 10, 15);
    $code .= <<EOF;
    addi.w $t8,$t8,-1
    bnez $t8,.Loop_8x

    # Get the final states by adding the initial states
    xvadd.w @x[0],@x[0],@y[0]
    xvadd.w @x[1],@x[1],@y[1]
    xvadd.w @x[2],@x[2],@y[2]
    xvadd.w @x[3],@x[3],@y[3]
    xvadd.w @x[4],@x[4],@y[4]
    xvadd.w @x[5],@x[5],@y[5]
    xvadd.w @x[6],@x[6],@y[6]
    xvadd.w @x[7],@x[7],@y[7]
    xvadd.w @x[8],@x[8],@y[8]
    xvadd.w @x[9],@x[9],@y[9]
    xvadd.w @x[10],@x[10],@y[10]
    xvadd.w @x[11],@x[11],@y[11]
    xvadd.w @x[12],@x[12],@y[12]
    xvadd.w @x[13],@x[13],@y[13]
    xvadd.w @x[14],@x[14],@y[14]
    xvadd.w @x[15],@x[15],@y[15]

    # Get the transpose of \@x[*] and save them in \@y[*]
    xvilvl.w @y[0],@x[1],@x[0]
    xvilvh.w @y[1],@x[1],@x[0]
    xvilvl.w @y[2],@x[3],@x[2]
    xvilvh.w @y[3],@x[3],@x[2]
    xvilvl.w @y[4],@x[5],@x[4]
    xvilvh.w @y[5],@x[5],@x[4]
    xvilvl.w @y[6],@x[7],@x[6]
    xvilvh.w @y[7],@x[7],@x[6]
    xvilvl.w @y[8],@x[9],@x[8]
    xvilvh.w @y[9],@x[9],@x[8]
    xvilvl.w @y[10],@x[11],@x[10]
    xvilvh.w @y[11],@x[11],@x[10]
    xvilvl.w @y[12],@x[13],@x[12]
    xvilvh.w @y[13],@x[13],@x[12]
    xvilvl.w @y[14],@x[15],@x[14]
    xvilvh.w @y[15],@x[15],@x[14]
    xvilvl.d @x[0],@y[2],@y[0]
    xvilvh.d @x[1],@y[2],@y[0]
    xvilvl.d @x[2],@y[3],@y[1]
    xvilvh.d @x[3],@y[3],@y[1]
    xvilvl.d @x[4],@y[6],@y[4]
    xvilvh.d @x[5],@y[6],@y[4]
    xvilvl.d @x[6],@y[7],@y[5]
    xvilvh.d @x[7],@y[7],@y[5]
    xvilvl.d @x[8],@y[10],@y[8]
    xvilvh.d @x[9],@y[10],@y[8]
    xvilvl.d @x[10],@y[11],@y[9]
    xvilvh.d @x[11],@y[11],@y[9]
    xvilvl.d @x[12],@y[14],@y[12]
    xvilvh.d @x[13],@y[14],@y[12]
    xvilvl.d @x[14],@y[15],@y[13]
    xvilvh.d @x[15],@y[15],@y[13]
    xvori.b @y[0],@x[4],0
    xvpermi.q @y[0],@x[0],0x20
    xvori.b @y[1],@x[5],0
    xvpermi.q @y[1],@x[1],0x20
    xvori.b @y[2],@x[6],0
    xvpermi.q @y[2],@x[2],0x20
    xvori.b @y[3],@x[7],0
    xvpermi.q @y[3],@x[3],0x20
    xvori.b @y[4],@x[4],0
    xvpermi.q @y[4],@x[0],0x31
    xvori.b @y[5],@x[5],0
    xvpermi.q @y[5],@x[1],0x31
    xvori.b @y[6],@x[6],0
    xvpermi.q @y[6],@x[2],0x31
    xvori.b @y[7],@x[7],0
    xvpermi.q @y[7],@x[3],0x31
    xvori.b @y[8],@x[12],0
    xvpermi.q @y[8],@x[8],0x20
    xvori.b @y[9],@x[13],0
    xvpermi.q @y[9],@x[9],0x20
    xvori.b @y[10],@x[14],0
    xvpermi.q @y[10],@x[10],0x20
    xvori.b @y[11],@x[15],0
    xvpermi.q @y[11],@x[11],0x20
    xvori.b @y[12],@x[12],0
    xvpermi.q @y[12],@x[8],0x31
    xvori.b @y[13],@x[13],0
    xvpermi.q @y[13],@x[9],0x31
    xvori.b @y[14],@x[14],0
    xvpermi.q @y[14],@x[10],0x31
    xvori.b @y[15],@x[15],0
    xvpermi.q @y[15],@x[11],0x31
EOF
    # Adjust the order of elements in @y[*] for ease of use.
    @y = (@y[0],@y[8],@y[1],@y[9],@y[2],@y[10],@y[3],@y[11],
          @y[4],@y[12],@y[5],@y[13],@y[6],@y[14],@y[7],@y[15]);
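    # The xvpermi.q pairs above recombine the 128-bit halves so that, after the
    # reordering, @y[2*j] and @y[2*j+1] hold the 64-byte keystream for block j
    # (blocks 0..3 in @y[0..7], blocks 4..7 in @y[8..15]).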
    $code .= <<EOF;
    ori $t8,$zero,64*8
    bltu $len,$t8,.Ltail_8x

    # Get the encrypted message by XORing the states with the plaintext
    xvld @x[0],$inp,32*0
    xvld @x[1],$inp,32*1
    xvld @x[2],$inp,32*2
    xvld @x[3],$inp,32*3
    xvxor.v @x[0],@x[0],@y[0]
    xvxor.v @x[1],@x[1],@y[1]
    xvxor.v @x[2],@x[2],@y[2]
    xvxor.v @x[3],@x[3],@y[3]
    xvst @x[0],$out,32*0
    xvst @x[1],$out,32*1
    xvst @x[2],$out,32*2
    xvst @x[3],$out,32*3
    xvld @x[0],$inp,32*4
    xvld @x[1],$inp,32*5
    xvld @x[2],$inp,32*6
    xvld @x[3],$inp,32*7
    xvxor.v @x[0],@x[0],@y[4]
    xvxor.v @x[1],@x[1],@y[5]
    xvxor.v @x[2],@x[2],@y[6]
    xvxor.v @x[3],@x[3],@y[7]
    xvst @x[0],$out,32*4
    xvst @x[1],$out,32*5
    xvst @x[2],$out,32*6
    xvst @x[3],$out,32*7
    xvld @x[0],$inp,32*8
    xvld @x[1],$inp,32*9
    xvld @x[2],$inp,32*10
    xvld @x[3],$inp,32*11
    xvxor.v @x[0],@x[0],@y[8]
    xvxor.v @x[1],@x[1],@y[9]
    xvxor.v @x[2],@x[2],@y[10]
    xvxor.v @x[3],@x[3],@y[11]
    xvst @x[0],$out,32*8
    xvst @x[1],$out,32*9
    xvst @x[2],$out,32*10
    xvst @x[3],$out,32*11
    xvld @x[0],$inp,32*12
    xvld @x[1],$inp,32*13
    xvld @x[2],$inp,32*14
    xvld @x[3],$inp,32*15
    xvxor.v @x[0],@x[0],@y[12]
    xvxor.v @x[1],@x[1],@y[13]
    xvxor.v @x[2],@x[2],@y[14]
    xvxor.v @x[3],@x[3],@y[15]
    xvst @x[0],$out,32*12
    xvst @x[1],$out,32*13
    xvst @x[2],$out,32*14
    xvst @x[3],$out,32*15

    addi.d $len,$len,-64*8
    beqz $len,.Ldone_8x
    addi.d $inp,$inp,64*8
    addi.d $out,$out,64*8
    addi.w $t4,$t4,8
    b .Loop_outer_8x

.Ltail_8x:
    # Handle the tail for 8x (1 <= tail_len <= 511)
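    # Same strategy as the 4x tail: XOR whole 64-byte blocks while they last,
    # spill the next 64 bytes of unused keystream (two 256-bit registers) to
    # the stack scratch area, then finish byte by byte in .Loop_tail_8x.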
    ori $t8,$zero,448
    bgeu $len,$t8,.L448_or_more8x
    ori $t8,$zero,384
    bgeu $len,$t8,.L384_or_more8x
    ori $t8,$zero,320
    bgeu $len,$t8,.L320_or_more8x
    ori $t8,$zero,256
    bgeu $len,$t8,.L256_or_more8x
    ori $t8,$zero,192
    bgeu $len,$t8,.L192_or_more8x
    ori $t8,$zero,128
    bgeu $len,$t8,.L128_or_more8x
    ori $t8,$zero,64
    bgeu $len,$t8,.L64_or_more8x
    xvst @y[0],$sp,32*0
    xvst @y[1],$sp,32*1
    move $t8,$zero
    b .Loop_tail_8x

.align 5
.L64_or_more8x:
    xvld @x[0],$inp,32*0
    xvld @x[1],$inp,32*1
    xvxor.v @x[0],@x[0],@y[0]
    xvxor.v @x[1],@x[1],@y[1]
    xvst @x[0],$out,32*0
    xvst @x[1],$out,32*1
    addi.d $len,$len,-64
    beqz $len,.Ldone_8x
    addi.d $inp,$inp,64
    addi.d $out,$out,64
    xvst @y[2],$sp,32*0
    xvst @y[3],$sp,32*1
    move $t8,$zero
    b .Loop_tail_8x

.align 5
.L128_or_more8x:
    xvld @x[0],$inp,32*0
    xvld @x[1],$inp,32*1
    xvld @x[2],$inp,32*2
    xvld @x[3],$inp,32*3
    xvxor.v @x[0],@x[0],@y[0]
    xvxor.v @x[1],@x[1],@y[1]
    xvxor.v @x[2],@x[2],@y[2]
    xvxor.v @x[3],@x[3],@y[3]
    xvst @x[0],$out,32*0
    xvst @x[1],$out,32*1
    xvst @x[2],$out,32*2
    xvst @x[3],$out,32*3
    addi.d $len,$len,-128
    beqz $len,.Ldone_8x
    addi.d $inp,$inp,128
    addi.d $out,$out,128
    xvst @y[4],$sp,32*0
    xvst @y[5],$sp,32*1
    move $t8,$zero
    b .Loop_tail_8x

.align 5
.L192_or_more8x:
    xvld @x[0],$inp,32*0
    xvld @x[1],$inp,32*1
    xvld @x[2],$inp,32*2
    xvld @x[3],$inp,32*3
    xvxor.v @x[0],@x[0],@y[0]
    xvxor.v @x[1],@x[1],@y[1]
    xvxor.v @x[2],@x[2],@y[2]
    xvxor.v @x[3],@x[3],@y[3]
    xvst @x[0],$out,32*0
    xvst @x[1],$out,32*1
    xvst @x[2],$out,32*2
    xvst @x[3],$out,32*3
    xvld @x[0],$inp,32*4
    xvld @x[1],$inp,32*5
    xvxor.v @x[0],@x[0],@y[4]
    xvxor.v @x[1],@x[1],@y[5]
    xvst @x[0],$out,32*4
    xvst @x[1],$out,32*5
    addi.d $len,$len,-192
    beqz $len,.Ldone_8x
    addi.d $inp,$inp,192
    addi.d $out,$out,192
    xvst @y[6],$sp,32*0
    xvst @y[7],$sp,32*1
    move $t8,$zero
    b .Loop_tail_8x

.align 5
.L256_or_more8x:
    xvld @x[0],$inp,32*0
    xvld @x[1],$inp,32*1
    xvld @x[2],$inp,32*2
    xvld @x[3],$inp,32*3
    xvxor.v @x[0],@x[0],@y[0]
    xvxor.v @x[1],@x[1],@y[1]
    xvxor.v @x[2],@x[2],@y[2]
    xvxor.v @x[3],@x[3],@y[3]
    xvst @x[0],$out,32*0
    xvst @x[1],$out,32*1
    xvst @x[2],$out,32*2
    xvst @x[3],$out,32*3
    xvld @x[0],$inp,32*4
    xvld @x[1],$inp,32*5
    xvld @x[2],$inp,32*6
    xvld @x[3],$inp,32*7
    xvxor.v @x[0],@x[0],@y[4]
    xvxor.v @x[1],@x[1],@y[5]
    xvxor.v @x[2],@x[2],@y[6]
    xvxor.v @x[3],@x[3],@y[7]
    xvst @x[0],$out,32*4
    xvst @x[1],$out,32*5
    xvst @x[2],$out,32*6
    xvst @x[3],$out,32*7
    addi.d $len,$len,-256
    beqz $len,.Ldone_8x
    addi.d $inp,$inp,256
    addi.d $out,$out,256
    xvst @y[8],$sp,32*0
    xvst @y[9],$sp,32*1
    move $t8,$zero
    b .Loop_tail_8x

.align 5
.L320_or_more8x:
    xvld @x[0],$inp,32*0
    xvld @x[1],$inp,32*1
    xvld @x[2],$inp,32*2
    xvld @x[3],$inp,32*3
    xvxor.v @x[0],@x[0],@y[0]
    xvxor.v @x[1],@x[1],@y[1]
    xvxor.v @x[2],@x[2],@y[2]
    xvxor.v @x[3],@x[3],@y[3]
    xvst @x[0],$out,32*0
    xvst @x[1],$out,32*1
    xvst @x[2],$out,32*2
    xvst @x[3],$out,32*3
    xvld @x[0],$inp,32*4
    xvld @x[1],$inp,32*5
    xvld @x[2],$inp,32*6
    xvld @x[3],$inp,32*7
    xvxor.v @x[0],@x[0],@y[4]
    xvxor.v @x[1],@x[1],@y[5]
    xvxor.v @x[2],@x[2],@y[6]
    xvxor.v @x[3],@x[3],@y[7]
    xvst @x[0],$out,32*4
    xvst @x[1],$out,32*5
    xvst @x[2],$out,32*6
    xvst @x[3],$out,32*7
    xvld @x[0],$inp,32*8
    xvld @x[1],$inp,32*9
    xvxor.v @x[0],@x[0],@y[8]
    xvxor.v @x[1],@x[1],@y[9]
    xvst @x[0],$out,32*8
    xvst @x[1],$out,32*9
    addi.d $len,$len,-320
    beqz $len,.Ldone_8x
    addi.d $inp,$inp,320
    addi.d $out,$out,320
    xvst @y[10],$sp,32*0
    xvst @y[11],$sp,32*1
    move $t8,$zero
    b .Loop_tail_8x

.align 5
.L384_or_more8x:
    xvld @x[0],$inp,32*0
    xvld @x[1],$inp,32*1
    xvld @x[2],$inp,32*2
    xvld @x[3],$inp,32*3
    xvxor.v @x[0],@x[0],@y[0]
    xvxor.v @x[1],@x[1],@y[1]
    xvxor.v @x[2],@x[2],@y[2]
    xvxor.v @x[3],@x[3],@y[3]
    xvst @x[0],$out,32*0
    xvst @x[1],$out,32*1
    xvst @x[2],$out,32*2
    xvst @x[3],$out,32*3
    xvld @x[0],$inp,32*4
    xvld @x[1],$inp,32*5
    xvld @x[2],$inp,32*6
    xvld @x[3],$inp,32*7
    xvxor.v @x[0],@x[0],@y[4]
    xvxor.v @x[1],@x[1],@y[5]
    xvxor.v @x[2],@x[2],@y[6]
    xvxor.v @x[3],@x[3],@y[7]
    xvst @x[0],$out,32*4
    xvst @x[1],$out,32*5
    xvst @x[2],$out,32*6
    xvst @x[3],$out,32*7
    xvld @x[0],$inp,32*8
    xvld @x[1],$inp,32*9
    xvld @x[2],$inp,32*10
    xvld @x[3],$inp,32*11
    xvxor.v @x[0],@x[0],@y[8]
    xvxor.v @x[1],@x[1],@y[9]
    xvxor.v @x[2],@x[2],@y[10]
    xvxor.v @x[3],@x[3],@y[11]
    xvst @x[0],$out,32*8
    xvst @x[1],$out,32*9
    xvst @x[2],$out,32*10
    xvst @x[3],$out,32*11
    addi.d $len,$len,-384
    beqz $len,.Ldone_8x
    addi.d $inp,$inp,384
    addi.d $out,$out,384
    xvst @y[12],$sp,32*0
    xvst @y[13],$sp,32*1
    move $t8,$zero
    b .Loop_tail_8x

.align 5
.L448_or_more8x:
    xvld @x[0],$inp,32*0
    xvld @x[1],$inp,32*1
    xvld @x[2],$inp,32*2
    xvld @x[3],$inp,32*3
    xvxor.v @x[0],@x[0],@y[0]
    xvxor.v @x[1],@x[1],@y[1]
    xvxor.v @x[2],@x[2],@y[2]
    xvxor.v @x[3],@x[3],@y[3]
    xvst @x[0],$out,32*0
    xvst @x[1],$out,32*1
    xvst @x[2],$out,32*2
    xvst @x[3],$out,32*3
    xvld @x[0],$inp,32*4
    xvld @x[1],$inp,32*5
    xvld @x[2],$inp,32*6
    xvld @x[3],$inp,32*7
    xvxor.v @x[0],@x[0],@y[4]
    xvxor.v @x[1],@x[1],@y[5]
    xvxor.v @x[2],@x[2],@y[6]
    xvxor.v @x[3],@x[3],@y[7]
    xvst @x[0],$out,32*4
    xvst @x[1],$out,32*5
    xvst @x[2],$out,32*6
    xvst @x[3],$out,32*7
    xvld @x[0],$inp,32*8
    xvld @x[1],$inp,32*9
    xvld @x[2],$inp,32*10
    xvld @x[3],$inp,32*11
    xvxor.v @x[0],@x[0],@y[8]
    xvxor.v @x[1],@x[1],@y[9]
    xvxor.v @x[2],@x[2],@y[10]
    xvxor.v @x[3],@x[3],@y[11]
    xvst @x[0],$out,32*8
    xvst @x[1],$out,32*9
    xvst @x[2],$out,32*10
    xvst @x[3],$out,32*11
    xvld @x[0],$inp,32*12
    xvld @x[1],$inp,32*13
    xvxor.v @x[0],@x[0],@y[12]
    xvxor.v @x[1],@x[1],@y[13]
    xvst @x[0],$out,32*12
    xvst @x[1],$out,32*13
    addi.d $len,$len,-448
    beqz $len,.Ldone_8x
    addi.d $inp,$inp,448
    addi.d $out,$out,448
    xvst @y[14],$sp,32*0
    xvst @y[15],$sp,32*1
    move $t8,$zero
    b .Loop_tail_8x

.Loop_tail_8x:
    # XOR the input with the saved keystream byte by byte
    ldx.bu $t5,$inp,$t8
    ldx.bu $t6,$sp,$t8
    xor $t5,$t5,$t6
    stx.b $t5,$out,$t8
    addi.w $t8,$t8,1
    addi.d $len,$len,-1
    bnez $len,.Loop_tail_8x
    b .Ldone_8x

.Ldone_8x:
    addi.d $sp,$sp,128
    b .Lend
EOF
}

$code .= <<EOF;
.Lno_data:
.Lend:
    jr $ra
.size ChaCha20_ctr32,.-ChaCha20_ctr32
EOF

$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;