chacha-armv8.pl

  1. #! /usr/bin/env perl
  2. # Copyright 2016-2022 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # June 2015
  17. #
  18. # ChaCha20 for ARMv8.
  19. #
  20. # April 2019
  21. #
  22. # Replace the 3xNEON+1xIALU code path with 4+1. 4+1 is actually the
  23. # fastest option on most(*), but not all, processors, yet 6+2 is
  24. # retained, because its penalties are considered tolerable next to the
  25. # improvement on processors where 6+2 helps, most notably +37% on
  26. # ThunderX2. That is a server-oriented processor which has to serve as
  27. # many requests as possible, while the others are mostly client parts,
  28. # where performance only has to be fast enough, since the majority of
  29. # time is spent "entertaining" a relatively slow human.
  30. #
  31. # Performance in cycles per byte out of large buffer.
  32. #
  33. # IALU/gcc-4.9 4xNEON+1xIALU 6xNEON+2xIALU
  34. #
  35. # Apple A7 5.50/+49% 2.72 1.60
  36. # Cortex-A53 8.40/+80% 4.06 4.45(*)
  37. # Cortex-A57 8.06/+43% 4.15 4.40(*)
  38. # Denver 4.50/+82% 2.30 2.70(*)
  39. # X-Gene 9.50/+46% 8.20 8.90(*)
  40. # Mongoose 8.00/+44% 2.74 3.12(*)
  41. # Kryo 8.17/+50% 4.47 4.65(*)
  42. # ThunderX2 7.22/+48% 5.64 4.10
  43. #
  44. # (*) slower than 4+1:-(
  45. # $output is the last argument if it looks like a file (it has an extension)
  46. # $flavour is the first argument if it doesn't look like a file
  47. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  48. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  49. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  50. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  51. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  52. die "can't locate arm-xlate.pl";
  53. open OUT,"| \"$^X\" $xlate $flavour \"$output\""
  54. or die "can't call $xlate: $!";
  55. *STDOUT=*OUT;
  56. sub AUTOLOAD() # thunk [simplified] x86-style perlasm
  57. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  58. my $arg = pop;
  59. $arg = "#$arg" if ($arg*1 eq $arg);
  60. $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
  61. }
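# A call to any otherwise-undefined sub, e.g. &add_32(@x[0],@x[0],@x[1]),
# falls through to AUTOLOAD above, which emits the literal line
# "add.32 x5,x5,x6" (the first underscore becomes a dot, and a purely
# numeric last argument gains a '#' immediate prefix, so
# &ror_32(@x[0],@x[0],16) yields "ror.32 x5,x5,#16"). The post-processing
# loop at the bottom of the file later rewrites these pseudo-mnemonics
# into real AArch64 instructions.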
  62. my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));
  63. my @x=map("x$_",(5..17,19..21));
  64. my @d=map("x$_",(22..28,30));
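# Register layout of the scalar (IALU) path: x0-x4 carry the function
# arguments (out, inp, len, key, counter), @x maps the sixteen 32-bit
# ChaCha state words onto x5-x17 and x19-x21, and @d holds the "key block"
# as 64-bit pairs (sigma, key and counter, two words per register) in
# x22-x28 and x30.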
  65. sub ROUND {
  66. my ($a0,$b0,$c0,$d0)=@_;
  67. my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
  68. my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
  69. my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
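# Derive the other three column/diagonal index tuples from the first one:
# ($_&~3) keeps the row (a's are 0-3, b's 4-7, c's 8-11, d's 12-15) and
# (($_+1)&3) steps to the next position within that row, so (0,4,8,12)
# yields (1,5,9,13), (2,6,10,14), (3,7,11,15), while (0,5,10,15) yields
# (1,6,11,12), (2,7,8,13), (3,4,9,14).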
  70. (
  71. "&add_32 (@x[$a0],@x[$a0],@x[$b0])",
  72. "&add_32 (@x[$a1],@x[$a1],@x[$b1])",
  73. "&add_32 (@x[$a2],@x[$a2],@x[$b2])",
  74. "&add_32 (@x[$a3],@x[$a3],@x[$b3])",
  75. "&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
  76. "&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
  77. "&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
  78. "&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
  79. "&ror_32 (@x[$d0],@x[$d0],16)",
  80. "&ror_32 (@x[$d1],@x[$d1],16)",
  81. "&ror_32 (@x[$d2],@x[$d2],16)",
  82. "&ror_32 (@x[$d3],@x[$d3],16)",
  83. "&add_32 (@x[$c0],@x[$c0],@x[$d0])",
  84. "&add_32 (@x[$c1],@x[$c1],@x[$d1])",
  85. "&add_32 (@x[$c2],@x[$c2],@x[$d2])",
  86. "&add_32 (@x[$c3],@x[$c3],@x[$d3])",
  87. "&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
  88. "&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
  89. "&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
  90. "&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
  91. "&ror_32 (@x[$b0],@x[$b0],20)",
  92. "&ror_32 (@x[$b1],@x[$b1],20)",
  93. "&ror_32 (@x[$b2],@x[$b2],20)",
  94. "&ror_32 (@x[$b3],@x[$b3],20)",
  95. "&add_32 (@x[$a0],@x[$a0],@x[$b0])",
  96. "&add_32 (@x[$a1],@x[$a1],@x[$b1])",
  97. "&add_32 (@x[$a2],@x[$a2],@x[$b2])",
  98. "&add_32 (@x[$a3],@x[$a3],@x[$b3])",
  99. "&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
  100. "&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
  101. "&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
  102. "&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
  103. "&ror_32 (@x[$d0],@x[$d0],24)",
  104. "&ror_32 (@x[$d1],@x[$d1],24)",
  105. "&ror_32 (@x[$d2],@x[$d2],24)",
  106. "&ror_32 (@x[$d3],@x[$d3],24)",
  107. "&add_32 (@x[$c0],@x[$c0],@x[$d0])",
  108. "&add_32 (@x[$c1],@x[$c1],@x[$d1])",
  109. "&add_32 (@x[$c2],@x[$c2],@x[$d2])",
  110. "&add_32 (@x[$c3],@x[$c3],@x[$d3])",
  111. "&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
  112. "&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
  113. "&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
  114. "&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
  115. "&ror_32 (@x[$b0],@x[$b0],25)",
  116. "&ror_32 (@x[$b1],@x[$b1],25)",
  117. "&ror_32 (@x[$b2],@x[$b2],25)",
  118. "&ror_32 (@x[$b3],@x[$b3],25)"
  119. );
  120. }
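# Each of the four interleaved sequences above is one ChaCha quarter-round;
# written out for a single (a,b,c,d) column it would read:
#
#     add.32 a,a,b;  eor.32 d,d,a;  ror.32 d,d,#16
#     add.32 c,c,d;  eor.32 b,b,c;  ror.32 b,b,#20
#     add.32 a,a,b;  eor.32 d,d,a;  ror.32 d,d,#24
#     add.32 c,c,d;  eor.32 b,b,c;  ror.32 b,b,#25
#
# ChaCha's left rotations by 16, 12, 8 and 7 appear as right rotations by
# 16, 20, 24 and 25, since AArch64 only provides a rotate-right instruction.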
  121. $code.=<<___;
  122. #include "arm_arch.h"
  123. #ifndef __KERNEL__
  124. .extern OPENSSL_armcap_P
  125. .hidden OPENSSL_armcap_P
  126. .extern ChaCha20_ctr32_sve
  127. #endif
  128. .text
  129. .align 5
  130. .Lsigma:
  131. .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
  132. .Lone:
  133. .long 1,2,3,4
  134. .Lrot24:
  135. .long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
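// .Lone supplies the per-lane counter increments {1,2,3,4} for the 4x NEON
// path, while .Lrot24 is a byte-shuffle table for tbl that rotates every
// 32-bit lane right by 24 bits (i.e. left by 8), saving the two-instruction
// shift-and-insert sequence otherwise needed for that rotation.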
  136. .asciz "ChaCha20 for ARMv8, CRYPTOGAMS by \@dot-asm"
  137. .globl ChaCha20_ctr32_dflt
  138. .type ChaCha20_ctr32_dflt,%function
  139. .align 5
  140. ChaCha20_ctr32_dflt:
  141. AARCH64_SIGN_LINK_REGISTER
  142. cmp $len,#192
  143. b.lo .Lshort
  144. #ifndef __KERNEL__
  145. adrp x17,OPENSSL_armcap_P
  146. ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
  147. .Lcheck_neon:
  148. tst w17,#ARMV7_NEON
  149. b.ne .LChaCha20_neon
  150. #endif
  151. .Lshort:
  152. stp x29,x30,[sp,#-96]!
  153. add x29,sp,#0
  154. adr @x[0],.Lsigma
  155. stp x19,x20,[sp,#16]
  156. stp x21,x22,[sp,#32]
  157. stp x23,x24,[sp,#48]
  158. stp x25,x26,[sp,#64]
  159. stp x27,x28,[sp,#80]
  160. sub sp,sp,#64
  161. ldp @d[0],@d[1],[@x[0]] // load sigma
  162. ldp @d[2],@d[3],[$key] // load key
  163. ldp @d[4],@d[5],[$key,#16]
  164. ldp @d[6],@d[7],[$ctr] // load counter
  165. #ifdef __AARCH64EB__
  166. ror @d[2],@d[2],#32
  167. ror @d[3],@d[3],#32
  168. ror @d[4],@d[4],#32
  169. ror @d[5],@d[5],#32
  170. ror @d[6],@d[6],#32
  171. ror @d[7],@d[7],#32
  172. #endif
  173. .Loop_outer:
  174. mov.32 @x[0],@d[0] // unpack key block
  175. lsr @x[1],@d[0],#32
  176. mov.32 @x[2],@d[1]
  177. lsr @x[3],@d[1],#32
  178. mov.32 @x[4],@d[2]
  179. lsr @x[5],@d[2],#32
  180. mov.32 @x[6],@d[3]
  181. lsr @x[7],@d[3],#32
  182. mov.32 @x[8],@d[4]
  183. lsr @x[9],@d[4],#32
  184. mov.32 @x[10],@d[5]
  185. lsr @x[11],@d[5],#32
  186. mov.32 @x[12],@d[6]
  187. lsr @x[13],@d[6],#32
  188. mov.32 @x[14],@d[7]
  189. lsr @x[15],@d[7],#32
  190. mov $ctr,#10
  191. subs $len,$len,#64
  192. .Loop:
  193. sub $ctr,$ctr,#1
  194. ___
  195. foreach (&ROUND(0, 4, 8,12)) { eval; }
  196. foreach (&ROUND(0, 5,10,15)) { eval; }
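# Each pass through .Loop is a full ChaCha double round: a column round on
# (0,4,8,12) and its rotations, then a diagonal round on (0,5,10,15) and
# its rotations; with the loop counter starting at 10 this gives the
# standard 20 rounds per 64-byte block.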
  197. $code.=<<___;
  198. cbnz $ctr,.Loop
  199. add.32 @x[0],@x[0],@d[0] // accumulate key block
  200. add @x[1],@x[1],@d[0],lsr#32
  201. add.32 @x[2],@x[2],@d[1]
  202. add @x[3],@x[3],@d[1],lsr#32
  203. add.32 @x[4],@x[4],@d[2]
  204. add @x[5],@x[5],@d[2],lsr#32
  205. add.32 @x[6],@x[6],@d[3]
  206. add @x[7],@x[7],@d[3],lsr#32
  207. add.32 @x[8],@x[8],@d[4]
  208. add @x[9],@x[9],@d[4],lsr#32
  209. add.32 @x[10],@x[10],@d[5]
  210. add @x[11],@x[11],@d[5],lsr#32
  211. add.32 @x[12],@x[12],@d[6]
  212. add @x[13],@x[13],@d[6],lsr#32
  213. add.32 @x[14],@x[14],@d[7]
  214. add @x[15],@x[15],@d[7],lsr#32
  215. b.lo .Ltail
  216. add @x[0],@x[0],@x[1],lsl#32 // pack
  217. add @x[2],@x[2],@x[3],lsl#32
  218. ldp @x[1],@x[3],[$inp,#0] // load input
  219. add @x[4],@x[4],@x[5],lsl#32
  220. add @x[6],@x[6],@x[7],lsl#32
  221. ldp @x[5],@x[7],[$inp,#16]
  222. add @x[8],@x[8],@x[9],lsl#32
  223. add @x[10],@x[10],@x[11],lsl#32
  224. ldp @x[9],@x[11],[$inp,#32]
  225. add @x[12],@x[12],@x[13],lsl#32
  226. add @x[14],@x[14],@x[15],lsl#32
  227. ldp @x[13],@x[15],[$inp,#48]
  228. add $inp,$inp,#64
  229. #ifdef __AARCH64EB__
  230. rev @x[0],@x[0]
  231. rev @x[2],@x[2]
  232. rev @x[4],@x[4]
  233. rev @x[6],@x[6]
  234. rev @x[8],@x[8]
  235. rev @x[10],@x[10]
  236. rev @x[12],@x[12]
  237. rev @x[14],@x[14]
  238. #endif
  239. eor @x[0],@x[0],@x[1]
  240. eor @x[2],@x[2],@x[3]
  241. eor @x[4],@x[4],@x[5]
  242. eor @x[6],@x[6],@x[7]
  243. eor @x[8],@x[8],@x[9]
  244. eor @x[10],@x[10],@x[11]
  245. eor @x[12],@x[12],@x[13]
  246. eor @x[14],@x[14],@x[15]
  247. stp @x[0],@x[2],[$out,#0] // store output
  248. add @d[6],@d[6],#1 // increment counter
  249. stp @x[4],@x[6],[$out,#16]
  250. stp @x[8],@x[10],[$out,#32]
  251. stp @x[12],@x[14],[$out,#48]
  252. add $out,$out,#64
  253. b.hi .Loop_outer
  254. ldp x19,x20,[x29,#16]
  255. add sp,sp,#64
  256. ldp x21,x22,[x29,#32]
  257. ldp x23,x24,[x29,#48]
  258. ldp x25,x26,[x29,#64]
  259. ldp x27,x28,[x29,#80]
  260. ldp x29,x30,[sp],#96
  261. .Labort:
  262. AARCH64_VALIDATE_LINK_REGISTER
  263. ret
  264. .align 4
  265. .Ltail:
  266. add $len,$len,#64
  267. .Less_than_64:
  268. sub $out,$out,#1
  269. add $inp,$inp,$len
  270. add $out,$out,$len
  271. add $ctr,sp,$len
  272. neg $len,$len
  273. add @x[0],@x[0],@x[1],lsl#32 // pack
  274. add @x[2],@x[2],@x[3],lsl#32
  275. add @x[4],@x[4],@x[5],lsl#32
  276. add @x[6],@x[6],@x[7],lsl#32
  277. add @x[8],@x[8],@x[9],lsl#32
  278. add @x[10],@x[10],@x[11],lsl#32
  279. add @x[12],@x[12],@x[13],lsl#32
  280. add @x[14],@x[14],@x[15],lsl#32
  281. #ifdef __AARCH64EB__
  282. rev @x[0],@x[0]
  283. rev @x[2],@x[2]
  284. rev @x[4],@x[4]
  285. rev @x[6],@x[6]
  286. rev @x[8],@x[8]
  287. rev @x[10],@x[10]
  288. rev @x[12],@x[12]
  289. rev @x[14],@x[14]
  290. #endif
  291. stp @x[0],@x[2],[sp,#0]
  292. stp @x[4],@x[6],[sp,#16]
  293. stp @x[8],@x[10],[sp,#32]
  294. stp @x[12],@x[14],[sp,#48]
  295. .Loop_tail:
  296. ldrb w10,[$inp,$len]
  297. ldrb w11,[$ctr,$len]
  298. add $len,$len,#1
  299. eor w10,w10,w11
  300. strb w10,[$out,$len]
  301. cbnz $len,.Loop_tail
  302. stp xzr,xzr,[sp,#0]
  303. stp xzr,xzr,[sp,#16]
  304. stp xzr,xzr,[sp,#32]
  305. stp xzr,xzr,[sp,#48]
  306. ldp x19,x20,[x29,#16]
  307. add sp,sp,#64
  308. ldp x21,x22,[x29,#32]
  309. ldp x23,x24,[x29,#48]
  310. ldp x25,x26,[x29,#64]
  311. ldp x27,x28,[x29,#80]
  312. ldp x29,x30,[sp],#96
  313. AARCH64_VALIDATE_LINK_REGISTER
  314. ret
  315. .size ChaCha20_ctr32_dflt,.-ChaCha20_ctr32_dflt
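// ChaCha20_ctr32 is the public entry point: it returns immediately for
// zero-length input and falls back to the scalar/NEON code for inputs
// shorter than 192 bytes. Otherwise (outside the kernel) it probes
// OPENSSL_armcap_P so that SVE-capable processors run ChaCha20_ctr32_sve
// first, with any leftover bytes handled by ChaCha20_ctr32_dflt using a
// writable stack copy of the counter.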
  316. .globl ChaCha20_ctr32
  317. .type ChaCha20_ctr32,%function
  318. .align 5
  319. ChaCha20_ctr32:
  320. AARCH64_SIGN_LINK_REGISTER
  321. cbz $len,.Labort
  322. cmp $len,#192
  323. b.lo .Lshort
  324. #ifndef __KERNEL__
  325. adrp x17,OPENSSL_armcap_P
  326. ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
  327. tst w17,#ARMV8_SVE
  328. b.eq .Lcheck_neon
  329. stp x29,x30,[sp,#-16]!
  330. sub sp,sp,#16
  331. // SVE handling will inevitably increment the counter, and the
  332. // NEON/scalar code that follows to process the tail data needs to
  333. // use the new counter value. Unfortunately, the input counter buffer
  334. // pointed to by ctr is meant to be read-only per the API contract,
  335. // so we copy it to the stack to make it writable for the SVE code.
  336. ldp x5,x6,[$ctr]
  337. stp x5,x6,[sp]
  338. mov $ctr,sp
  339. bl ChaCha20_ctr32_sve
  340. cbz $len,1f
  341. bl ChaCha20_ctr32_dflt
  342. 1:
  343. add sp,sp,#16
  344. ldp x29,x30,[sp],#16
  345. AARCH64_VALIDATE_LINK_REGISTER
  346. ret
  347. #endif
  348. b .Lshort
  349. .size ChaCha20_ctr32,.-ChaCha20_ctr32
  350. ___
  351. {{{
  352. my @K = map("v$_.4s",(0..3));
  353. my ($xt0,$xt1,$xt2,$xt3, $CTR,$ROT24) = map("v$_.4s",(4..9));
  354. my @X = map("v$_.4s",(16,20,24,28, 17,21,25,29, 18,22,26,30, 19,23,27,31));
  355. my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
  356. $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @X;
  357. sub NEON_lane_ROUND {
  358. my ($a0,$b0,$c0,$d0)=@_;
  359. my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
  360. my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
  361. my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
  362. my @x=map("'$_'",@X);
  363. (
  364. "&add (@x[$a0],@x[$a0],@x[$b0])", # Q1
  365. "&add (@x[$a1],@x[$a1],@x[$b1])", # Q2
  366. "&add (@x[$a2],@x[$a2],@x[$b2])", # Q3
  367. "&add (@x[$a3],@x[$a3],@x[$b3])", # Q4
  368. "&eor (@x[$d0],@x[$d0],@x[$a0])",
  369. "&eor (@x[$d1],@x[$d1],@x[$a1])",
  370. "&eor (@x[$d2],@x[$d2],@x[$a2])",
  371. "&eor (@x[$d3],@x[$d3],@x[$a3])",
  372. "&rev32_16 (@x[$d0],@x[$d0])",
  373. "&rev32_16 (@x[$d1],@x[$d1])",
  374. "&rev32_16 (@x[$d2],@x[$d2])",
  375. "&rev32_16 (@x[$d3],@x[$d3])",
  376. "&add (@x[$c0],@x[$c0],@x[$d0])",
  377. "&add (@x[$c1],@x[$c1],@x[$d1])",
  378. "&add (@x[$c2],@x[$c2],@x[$d2])",
  379. "&add (@x[$c3],@x[$c3],@x[$d3])",
  380. "&eor ('$xt0',@x[$b0],@x[$c0])",
  381. "&eor ('$xt1',@x[$b1],@x[$c1])",
  382. "&eor ('$xt2',@x[$b2],@x[$c2])",
  383. "&eor ('$xt3',@x[$b3],@x[$c3])",
  384. "&ushr (@x[$b0],'$xt0',20)",
  385. "&ushr (@x[$b1],'$xt1',20)",
  386. "&ushr (@x[$b2],'$xt2',20)",
  387. "&ushr (@x[$b3],'$xt3',20)",
  388. "&sli (@x[$b0],'$xt0',12)",
  389. "&sli (@x[$b1],'$xt1',12)",
  390. "&sli (@x[$b2],'$xt2',12)",
  391. "&sli (@x[$b3],'$xt3',12)",
  392. "&add (@x[$a0],@x[$a0],@x[$b0])",
  393. "&add (@x[$a1],@x[$a1],@x[$b1])",
  394. "&add (@x[$a2],@x[$a2],@x[$b2])",
  395. "&add (@x[$a3],@x[$a3],@x[$b3])",
  396. "&eor ('$xt0',@x[$d0],@x[$a0])",
  397. "&eor ('$xt1',@x[$d1],@x[$a1])",
  398. "&eor ('$xt2',@x[$d2],@x[$a2])",
  399. "&eor ('$xt3',@x[$d3],@x[$a3])",
  400. "&tbl (@x[$d0],'{$xt0}','$ROT24')",
  401. "&tbl (@x[$d1],'{$xt1}','$ROT24')",
  402. "&tbl (@x[$d2],'{$xt2}','$ROT24')",
  403. "&tbl (@x[$d3],'{$xt3}','$ROT24')",
  404. "&add (@x[$c0],@x[$c0],@x[$d0])",
  405. "&add (@x[$c1],@x[$c1],@x[$d1])",
  406. "&add (@x[$c2],@x[$c2],@x[$d2])",
  407. "&add (@x[$c3],@x[$c3],@x[$d3])",
  408. "&eor ('$xt0',@x[$b0],@x[$c0])",
  409. "&eor ('$xt1',@x[$b1],@x[$c1])",
  410. "&eor ('$xt2',@x[$b2],@x[$c2])",
  411. "&eor ('$xt3',@x[$b3],@x[$c3])",
  412. "&ushr (@x[$b0],'$xt0',25)",
  413. "&ushr (@x[$b1],'$xt1',25)",
  414. "&ushr (@x[$b2],'$xt2',25)",
  415. "&ushr (@x[$b3],'$xt3',25)",
  416. "&sli (@x[$b0],'$xt0',7)",
  417. "&sli (@x[$b1],'$xt1',7)",
  418. "&sli (@x[$b2],'$xt2',7)",
  419. "&sli (@x[$b3],'$xt3',7)"
  420. );
  421. }
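# NEON_lane_ROUND is the 4x lane-parallel counterpart of ROUND: each of the
# sixteen vector registers in @X holds one ChaCha state word for four
# consecutive blocks, one block per 32-bit lane. Rotations are realized as
# rev32 (by 16), ushr+sli pairs (by 12 and 7) and tbl with the .Lrot24 mask
# (by 8). The scalar ROUND instructions for a fifth block are interleaved
# with these at the call sites below, hence "4xNEON+1xIALU".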
  422. $code.=<<___;
  423. #ifdef __KERNEL__
  424. .globl ChaCha20_neon
  425. #endif
  426. .type ChaCha20_neon,%function
  427. .align 5
  428. ChaCha20_neon:
  429. AARCH64_SIGN_LINK_REGISTER
  430. .LChaCha20_neon:
  431. stp x29,x30,[sp,#-96]!
  432. add x29,sp,#0
  433. adr @x[0],.Lsigma
  434. stp x19,x20,[sp,#16]
  435. stp x21,x22,[sp,#32]
  436. stp x23,x24,[sp,#48]
  437. stp x25,x26,[sp,#64]
  438. stp x27,x28,[sp,#80]
  439. cmp $len,#512
  440. b.hs .L512_or_more_neon
  441. sub sp,sp,#64
  442. ldp @d[0],@d[1],[@x[0]] // load sigma
  443. ld1 {@K[0]},[@x[0]],#16
  444. ldp @d[2],@d[3],[$key] // load key
  445. ldp @d[4],@d[5],[$key,#16]
  446. ld1 {@K[1],@K[2]},[$key]
  447. ldp @d[6],@d[7],[$ctr] // load counter
  448. ld1 {@K[3]},[$ctr]
  449. stp d8,d9,[sp] // meet ABI requirements
  450. ld1 {$CTR,$ROT24},[@x[0]]
  451. #ifdef __AARCH64EB__
  452. rev64 @K[0],@K[0]
  453. ror @d[2],@d[2],#32
  454. ror @d[3],@d[3],#32
  455. ror @d[4],@d[4],#32
  456. ror @d[5],@d[5],#32
  457. ror @d[6],@d[6],#32
  458. ror @d[7],@d[7],#32
  459. #endif
  460. .Loop_outer_neon:
  461. dup $xa0,@{K[0]}[0] // unpack key block
  462. mov.32 @x[0],@d[0]
  463. dup $xa1,@{K[0]}[1]
  464. lsr @x[1],@d[0],#32
  465. dup $xa2,@{K[0]}[2]
  466. mov.32 @x[2],@d[1]
  467. dup $xa3,@{K[0]}[3]
  468. lsr @x[3],@d[1],#32
  469. dup $xb0,@{K[1]}[0]
  470. mov.32 @x[4],@d[2]
  471. dup $xb1,@{K[1]}[1]
  472. lsr @x[5],@d[2],#32
  473. dup $xb2,@{K[1]}[2]
  474. mov.32 @x[6],@d[3]
  475. dup $xb3,@{K[1]}[3]
  476. lsr @x[7],@d[3],#32
  477. dup $xd0,@{K[3]}[0]
  478. mov.32 @x[8],@d[4]
  479. dup $xd1,@{K[3]}[1]
  480. lsr @x[9],@d[4],#32
  481. dup $xd2,@{K[3]}[2]
  482. mov.32 @x[10],@d[5]
  483. dup $xd3,@{K[3]}[3]
  484. lsr @x[11],@d[5],#32
  485. add $xd0,$xd0,$CTR
  486. mov.32 @x[12],@d[6]
  487. dup $xc0,@{K[2]}[0]
  488. lsr @x[13],@d[6],#32
  489. dup $xc1,@{K[2]}[1]
  490. mov.32 @x[14],@d[7]
  491. dup $xc2,@{K[2]}[2]
  492. lsr @x[15],@d[7],#32
  493. dup $xc3,@{K[2]}[3]
  494. mov $ctr,#10
  495. subs $len,$len,#320
  496. .Loop_neon:
  497. sub $ctr,$ctr,#1
  498. ___
  499. my @plus_one=&ROUND(0,4,8,12);
  500. foreach (&NEON_lane_ROUND(0,4,8,12)) { eval; eval(shift(@plus_one)); }
  501. @plus_one=&ROUND(0,5,10,15);
  502. foreach (&NEON_lane_ROUND(0,5,10,15)) { eval; eval(shift(@plus_one)); }
  503. $code.=<<___;
  504. cbnz $ctr,.Loop_neon
  505. add $xd0,$xd0,$CTR
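// The state is still word-sliced (each vector register holds one word of
// four blocks); the zip1/zip2 sequences below transpose it so that the
// xa0-xd0, xa1-xd1, ... groups each hold one complete 64-byte block, while
// the interleaved scalar adds accumulate the key block for the fifth,
// integer-register block.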
  506. zip1 $xt0,$xa0,$xa1 // transpose data
  507. zip1 $xt1,$xa2,$xa3
  508. zip2 $xt2,$xa0,$xa1
  509. zip2 $xt3,$xa2,$xa3
  510. zip1.64 $xa0,$xt0,$xt1
  511. zip2.64 $xa1,$xt0,$xt1
  512. zip1.64 $xa2,$xt2,$xt3
  513. zip2.64 $xa3,$xt2,$xt3
  514. zip1 $xt0,$xb0,$xb1
  515. zip1 $xt1,$xb2,$xb3
  516. zip2 $xt2,$xb0,$xb1
  517. zip2 $xt3,$xb2,$xb3
  518. zip1.64 $xb0,$xt0,$xt1
  519. zip2.64 $xb1,$xt0,$xt1
  520. zip1.64 $xb2,$xt2,$xt3
  521. zip2.64 $xb3,$xt2,$xt3
  522. zip1 $xt0,$xc0,$xc1
  523. add.32 @x[0],@x[0],@d[0] // accumulate key block
  524. zip1 $xt1,$xc2,$xc3
  525. add @x[1],@x[1],@d[0],lsr#32
  526. zip2 $xt2,$xc0,$xc1
  527. add.32 @x[2],@x[2],@d[1]
  528. zip2 $xt3,$xc2,$xc3
  529. add @x[3],@x[3],@d[1],lsr#32
  530. zip1.64 $xc0,$xt0,$xt1
  531. add.32 @x[4],@x[4],@d[2]
  532. zip2.64 $xc1,$xt0,$xt1
  533. add @x[5],@x[5],@d[2],lsr#32
  534. zip1.64 $xc2,$xt2,$xt3
  535. add.32 @x[6],@x[6],@d[3]
  536. zip2.64 $xc3,$xt2,$xt3
  537. add @x[7],@x[7],@d[3],lsr#32
  538. zip1 $xt0,$xd0,$xd1
  539. add.32 @x[8],@x[8],@d[4]
  540. zip1 $xt1,$xd2,$xd3
  541. add @x[9],@x[9],@d[4],lsr#32
  542. zip2 $xt2,$xd0,$xd1
  543. add.32 @x[10],@x[10],@d[5]
  544. zip2 $xt3,$xd2,$xd3
  545. add @x[11],@x[11],@d[5],lsr#32
  546. zip1.64 $xd0,$xt0,$xt1
  547. add.32 @x[12],@x[12],@d[6]
  548. zip2.64 $xd1,$xt0,$xt1
  549. add @x[13],@x[13],@d[6],lsr#32
  550. zip1.64 $xd2,$xt2,$xt3
  551. add.32 @x[14],@x[14],@d[7]
  552. zip2.64 $xd3,$xt2,$xt3
  553. add @x[15],@x[15],@d[7],lsr#32
  554. b.lo .Ltail_neon
  555. add @x[0],@x[0],@x[1],lsl#32 // pack
  556. add @x[2],@x[2],@x[3],lsl#32
  557. ldp @x[1],@x[3],[$inp,#0] // load input
  558. add $xa0,$xa0,@K[0] // accumulate key block
  559. add @x[4],@x[4],@x[5],lsl#32
  560. add @x[6],@x[6],@x[7],lsl#32
  561. ldp @x[5],@x[7],[$inp,#16]
  562. add $xb0,$xb0,@K[1]
  563. add @x[8],@x[8],@x[9],lsl#32
  564. add @x[10],@x[10],@x[11],lsl#32
  565. ldp @x[9],@x[11],[$inp,#32]
  566. add $xc0,$xc0,@K[2]
  567. add @x[12],@x[12],@x[13],lsl#32
  568. add @x[14],@x[14],@x[15],lsl#32
  569. ldp @x[13],@x[15],[$inp,#48]
  570. add $xd0,$xd0,@K[3]
  571. add $inp,$inp,#64
  572. #ifdef __AARCH64EB__
  573. rev @x[0],@x[0]
  574. rev @x[2],@x[2]
  575. rev @x[4],@x[4]
  576. rev @x[6],@x[6]
  577. rev @x[8],@x[8]
  578. rev @x[10],@x[10]
  579. rev @x[12],@x[12]
  580. rev @x[14],@x[14]
  581. #endif
  582. ld1.8 {$xt0-$xt3},[$inp],#64
  583. eor @x[0],@x[0],@x[1]
  584. add $xa1,$xa1,@K[0]
  585. eor @x[2],@x[2],@x[3]
  586. add $xb1,$xb1,@K[1]
  587. eor @x[4],@x[4],@x[5]
  588. add $xc1,$xc1,@K[2]
  589. eor @x[6],@x[6],@x[7]
  590. add $xd1,$xd1,@K[3]
  591. eor @x[8],@x[8],@x[9]
  592. eor $xa0,$xa0,$xt0
  593. movi $xt0,#5
  594. eor @x[10],@x[10],@x[11]
  595. eor $xb0,$xb0,$xt1
  596. eor @x[12],@x[12],@x[13]
  597. eor $xc0,$xc0,$xt2
  598. eor @x[14],@x[14],@x[15]
  599. eor $xd0,$xd0,$xt3
  600. add $CTR,$CTR,$xt0 // += 5
  601. ld1.8 {$xt0-$xt3},[$inp],#64
  602. stp @x[0],@x[2],[$out,#0] // store output
  603. add @d[6],@d[6],#5 // increment counter
  604. stp @x[4],@x[6],[$out,#16]
  605. stp @x[8],@x[10],[$out,#32]
  606. stp @x[12],@x[14],[$out,#48]
  607. add $out,$out,#64
  608. st1.8 {$xa0-$xd0},[$out],#64
  609. add $xa2,$xa2,@K[0]
  610. add $xb2,$xb2,@K[1]
  611. add $xc2,$xc2,@K[2]
  612. add $xd2,$xd2,@K[3]
  613. ld1.8 {$xa0-$xd0},[$inp],#64
  614. eor $xa1,$xa1,$xt0
  615. eor $xb1,$xb1,$xt1
  616. eor $xc1,$xc1,$xt2
  617. eor $xd1,$xd1,$xt3
  618. st1.8 {$xa1-$xd1},[$out],#64
  619. add $xa3,$xa3,@K[0]
  620. add $xb3,$xb3,@K[1]
  621. add $xc3,$xc3,@K[2]
  622. add $xd3,$xd3,@K[3]
  623. ld1.8 {$xa1-$xd1},[$inp],#64
  624. eor $xa2,$xa2,$xa0
  625. eor $xb2,$xb2,$xb0
  626. eor $xc2,$xc2,$xc0
  627. eor $xd2,$xd2,$xd0
  628. st1.8 {$xa2-$xd2},[$out],#64
  629. eor $xa3,$xa3,$xa1
  630. eor $xb3,$xb3,$xb1
  631. eor $xc3,$xc3,$xc1
  632. eor $xd3,$xd3,$xd1
  633. st1.8 {$xa3-$xd3},[$out],#64
  634. b.hi .Loop_outer_neon
  635. ldp d8,d9,[sp] // meet ABI requirements
  636. ldp x19,x20,[x29,#16]
  637. add sp,sp,#64
  638. ldp x21,x22,[x29,#32]
  639. ldp x23,x24,[x29,#48]
  640. ldp x25,x26,[x29,#64]
  641. ldp x27,x28,[x29,#80]
  642. ldp x29,x30,[sp],#96
  643. AARCH64_VALIDATE_LINK_REGISTER
  644. ret
  645. .align 4
  646. .Ltail_neon:
  647. add $len,$len,#320
  648. ldp d8,d9,[sp] // meet ABI requirements
  649. cmp $len,#64
  650. b.lo .Less_than_64
  651. add @x[0],@x[0],@x[1],lsl#32 // pack
  652. add @x[2],@x[2],@x[3],lsl#32
  653. ldp @x[1],@x[3],[$inp,#0] // load input
  654. add @x[4],@x[4],@x[5],lsl#32
  655. add @x[6],@x[6],@x[7],lsl#32
  656. ldp @x[5],@x[7],[$inp,#16]
  657. add @x[8],@x[8],@x[9],lsl#32
  658. add @x[10],@x[10],@x[11],lsl#32
  659. ldp @x[9],@x[11],[$inp,#32]
  660. add @x[12],@x[12],@x[13],lsl#32
  661. add @x[14],@x[14],@x[15],lsl#32
  662. ldp @x[13],@x[15],[$inp,#48]
  663. add $inp,$inp,#64
  664. #ifdef __AARCH64EB__
  665. rev @x[0],@x[0]
  666. rev @x[2],@x[2]
  667. rev @x[4],@x[4]
  668. rev @x[6],@x[6]
  669. rev @x[8],@x[8]
  670. rev @x[10],@x[10]
  671. rev @x[12],@x[12]
  672. rev @x[14],@x[14]
  673. #endif
  674. eor @x[0],@x[0],@x[1]
  675. eor @x[2],@x[2],@x[3]
  676. eor @x[4],@x[4],@x[5]
  677. eor @x[6],@x[6],@x[7]
  678. eor @x[8],@x[8],@x[9]
  679. eor @x[10],@x[10],@x[11]
  680. eor @x[12],@x[12],@x[13]
  681. eor @x[14],@x[14],@x[15]
  682. stp @x[0],@x[2],[$out,#0] // store output
  683. add $xa0,$xa0,@K[0] // accumulate key block
  684. stp @x[4],@x[6],[$out,#16]
  685. add $xb0,$xb0,@K[1]
  686. stp @x[8],@x[10],[$out,#32]
  687. add $xc0,$xc0,@K[2]
  688. stp @x[12],@x[14],[$out,#48]
  689. add $xd0,$xd0,@K[3]
  690. add $out,$out,#64
  691. b.eq .Ldone_neon
  692. sub $len,$len,#64
  693. cmp $len,#64
  694. b.lo .Last_neon
  695. ld1.8 {$xt0-$xt3},[$inp],#64
  696. eor $xa0,$xa0,$xt0
  697. eor $xb0,$xb0,$xt1
  698. eor $xc0,$xc0,$xt2
  699. eor $xd0,$xd0,$xt3
  700. st1.8 {$xa0-$xd0},[$out],#64
  701. b.eq .Ldone_neon
  702. add $xa0,$xa1,@K[0]
  703. add $xb0,$xb1,@K[1]
  704. sub $len,$len,#64
  705. add $xc0,$xc1,@K[2]
  706. cmp $len,#64
  707. add $xd0,$xd1,@K[3]
  708. b.lo .Last_neon
  709. ld1.8 {$xt0-$xt3},[$inp],#64
  710. eor $xa1,$xa0,$xt0
  711. eor $xb1,$xb0,$xt1
  712. eor $xc1,$xc0,$xt2
  713. eor $xd1,$xd0,$xt3
  714. st1.8 {$xa1-$xd1},[$out],#64
  715. b.eq .Ldone_neon
  716. add $xa0,$xa2,@K[0]
  717. add $xb0,$xb2,@K[1]
  718. sub $len,$len,#64
  719. add $xc0,$xc2,@K[2]
  720. cmp $len,#64
  721. add $xd0,$xd2,@K[3]
  722. b.lo .Last_neon
  723. ld1.8 {$xt0-$xt3},[$inp],#64
  724. eor $xa2,$xa0,$xt0
  725. eor $xb2,$xb0,$xt1
  726. eor $xc2,$xc0,$xt2
  727. eor $xd2,$xd0,$xt3
  728. st1.8 {$xa2-$xd2},[$out],#64
  729. b.eq .Ldone_neon
  730. add $xa0,$xa3,@K[0]
  731. add $xb0,$xb3,@K[1]
  732. add $xc0,$xc3,@K[2]
  733. add $xd0,$xd3,@K[3]
  734. sub $len,$len,#64
  735. .Last_neon:
  736. st1.8 {$xa0-$xd0},[sp]
  737. sub $out,$out,#1
  738. add $inp,$inp,$len
  739. add $out,$out,$len
  740. add $ctr,sp,$len
  741. neg $len,$len
  742. .Loop_tail_neon:
  743. ldrb w10,[$inp,$len]
  744. ldrb w11,[$ctr,$len]
  745. add $len,$len,#1
  746. eor w10,w10,w11
  747. strb w10,[$out,$len]
  748. cbnz $len,.Loop_tail_neon
  749. stp xzr,xzr,[sp,#0]
  750. stp xzr,xzr,[sp,#16]
  751. stp xzr,xzr,[sp,#32]
  752. stp xzr,xzr,[sp,#48]
  753. .Ldone_neon:
  754. ldp x19,x20,[x29,#16]
  755. add sp,sp,#64
  756. ldp x21,x22,[x29,#32]
  757. ldp x23,x24,[x29,#48]
  758. ldp x25,x26,[x29,#64]
  759. ldp x27,x28,[x29,#80]
  760. ldp x29,x30,[sp],#96
  761. AARCH64_VALIDATE_LINK_REGISTER
  762. ret
  763. .size ChaCha20_neon,.-ChaCha20_neon
  764. ___
  765. {
  766. my @K = map("v$_.4s",(0..6));
  767. my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
  768. my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
  769. $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(8..31));
  770. my $rot24 = @K[6];
  771. my $ONE = "v7.4s";
  772. sub NEONROUND {
  773. my $odd = pop;
  774. my ($a,$b,$c,$d,$t)=@_;
  775. (
  776. "&add ('$a','$a','$b')",
  777. "&eor ('$d','$d','$a')",
  778. "&rev32_16 ('$d','$d')", # vrot ($d,16)
  779. "&add ('$c','$c','$d')",
  780. "&eor ('$t','$b','$c')",
  781. "&ushr ('$b','$t',20)",
  782. "&sli ('$b','$t',12)",
  783. "&add ('$a','$a','$b')",
  784. "&eor ('$d','$d','$a')",
  785. "&tbl ('$d','{$d}','$rot24')",
  786. "&add ('$c','$c','$d')",
  787. "&eor ('$t','$b','$c')",
  788. "&ushr ('$b','$t',25)",
  789. "&sli ('$b','$t',7)",
  790. "&ext ('$c','$c','$c',8)",
  791. "&ext ('$d','$d','$d',$odd?4:12)",
  792. "&ext ('$b','$b','$b',$odd?12:4)"
  793. );
  794. }
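# NEONROUND is the building block of the 6xNEON+2xIALU (512-byte-and-up)
# path: a single vector register holds one full row of one block, a whole
# quarter-round is done with vector instructions, and the trailing ext
# instructions rotate the b/c/d rows so that the same code serves both the
# column step ($odd==0) and the diagonal step ($odd==1) of a double round.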
  795. $code.=<<___;
  796. .type ChaCha20_512_neon,%function
  797. .align 5
  798. ChaCha20_512_neon:
  799. AARCH64_SIGN_LINK_REGISTER
  800. stp x29,x30,[sp,#-96]!
  801. add x29,sp,#0
  802. adr @x[0],.Lsigma
  803. stp x19,x20,[sp,#16]
  804. stp x21,x22,[sp,#32]
  805. stp x23,x24,[sp,#48]
  806. stp x25,x26,[sp,#64]
  807. stp x27,x28,[sp,#80]
  808. .L512_or_more_neon:
  809. sub sp,sp,#128+64
  810. eor $ONE,$ONE,$ONE
  811. ldp @d[0],@d[1],[@x[0]] // load sigma
  812. ld1 {@K[0]},[@x[0]],#16
  813. ldp @d[2],@d[3],[$key] // load key
  814. ldp @d[4],@d[5],[$key,#16]
  815. ld1 {@K[1],@K[2]},[$key]
  816. ldp @d[6],@d[7],[$ctr] // load counter
  817. ld1 {@K[3]},[$ctr]
  818. ld1 {$ONE}[0],[@x[0]]
  819. add $key,@x[0],#16 // .Lrot24
  820. #ifdef __AARCH64EB__
  821. rev64 @K[0],@K[0]
  822. ror @d[2],@d[2],#32
  823. ror @d[3],@d[3],#32
  824. ror @d[4],@d[4],#32
  825. ror @d[5],@d[5],#32
  826. ror @d[6],@d[6],#32
  827. ror @d[7],@d[7],#32
  828. #endif
  829. add @K[3],@K[3],$ONE // += 1
  830. stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part
  831. add @K[3],@K[3],$ONE // not typo
  832. str @K[2],[sp,#32]
  833. add @K[4],@K[3],$ONE
  834. add @K[5],@K[4],$ONE
  835. add @K[6],@K[5],$ONE
  836. shl $ONE,$ONE,#2 // 1 -> 4
  837. stp d8,d9,[sp,#128+0] // meet ABI requirements
  838. stp d10,d11,[sp,#128+16]
  839. stp d12,d13,[sp,#128+32]
  840. stp d14,d15,[sp,#128+48]
  841. sub $len,$len,#512 // not typo
  842. .Loop_outer_512_neon:
  843. mov $A0,@K[0]
  844. mov $A1,@K[0]
  845. mov $A2,@K[0]
  846. mov $A3,@K[0]
  847. mov $A4,@K[0]
  848. mov $A5,@K[0]
  849. mov $B0,@K[1]
  850. mov.32 @x[0],@d[0] // unpack key block
  851. mov $B1,@K[1]
  852. lsr @x[1],@d[0],#32
  853. mov $B2,@K[1]
  854. mov.32 @x[2],@d[1]
  855. mov $B3,@K[1]
  856. lsr @x[3],@d[1],#32
  857. mov $B4,@K[1]
  858. mov.32 @x[4],@d[2]
  859. mov $B5,@K[1]
  860. lsr @x[5],@d[2],#32
  861. mov $D0,@K[3]
  862. mov.32 @x[6],@d[3]
  863. mov $D1,@K[4]
  864. lsr @x[7],@d[3],#32
  865. mov $D2,@K[5]
  866. mov.32 @x[8],@d[4]
  867. mov $D3,@K[6]
  868. lsr @x[9],@d[4],#32
  869. mov $C0,@K[2]
  870. mov.32 @x[10],@d[5]
  871. mov $C1,@K[2]
  872. lsr @x[11],@d[5],#32
  873. add $D4,$D0,$ONE // +4
  874. mov.32 @x[12],@d[6]
  875. add $D5,$D1,$ONE // +4
  876. lsr @x[13],@d[6],#32
  877. mov $C2,@K[2]
  878. mov.32 @x[14],@d[7]
  879. mov $C3,@K[2]
  880. lsr @x[15],@d[7],#32
  881. mov $C4,@K[2]
  882. stp @K[3],@K[4],[sp,#48] // off-load key block, variable part
  883. mov $C5,@K[2]
  884. stp @K[5],@K[6],[sp,#80]
  885. mov $ctr,#5
  886. ld1 {$rot24},[$key]
  887. subs $len,$len,#512
  888. .Loop_upper_neon:
  889. sub $ctr,$ctr,#1
  890. ___
  891. my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
  892. my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
  893. my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
  894. my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
  895. my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
  896. my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
  897. my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
  898. my $diff = ($#thread0+1)*6 - $#thread67 - 1;
  899. my $i = 0;
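# Interleave the six NEON "threads" (one 64-byte block each) with the two
# scalar rounds of a seventh, integer-register block, issuing roughly one
# instruction from each stream in turn so the vector and integer pipelines
# stay busy.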
  900. foreach (@thread0) {
  901. eval; eval(shift(@thread67));
  902. eval(shift(@thread1)); eval(shift(@thread67));
  903. eval(shift(@thread2)); eval(shift(@thread67));
  904. eval(shift(@thread3)); eval(shift(@thread67));
  905. eval(shift(@thread4)); eval(shift(@thread67));
  906. eval(shift(@thread5)); eval(shift(@thread67));
  907. }
  908. @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
  909. @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
  910. @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
  911. @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
  912. @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
  913. @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
  914. @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
  915. foreach (@thread0) {
  916. eval; eval(shift(@thread67));
  917. eval(shift(@thread1)); eval(shift(@thread67));
  918. eval(shift(@thread2)); eval(shift(@thread67));
  919. eval(shift(@thread3)); eval(shift(@thread67));
  920. eval(shift(@thread4)); eval(shift(@thread67));
  921. eval(shift(@thread5)); eval(shift(@thread67));
  922. }
  923. $code.=<<___;
  924. cbnz $ctr,.Loop_upper_neon
  925. add.32 @x[0],@x[0],@d[0] // accumulate key block
  926. add @x[1],@x[1],@d[0],lsr#32
  927. add.32 @x[2],@x[2],@d[1]
  928. add @x[3],@x[3],@d[1],lsr#32
  929. add.32 @x[4],@x[4],@d[2]
  930. add @x[5],@x[5],@d[2],lsr#32
  931. add.32 @x[6],@x[6],@d[3]
  932. add @x[7],@x[7],@d[3],lsr#32
  933. add.32 @x[8],@x[8],@d[4]
  934. add @x[9],@x[9],@d[4],lsr#32
  935. add.32 @x[10],@x[10],@d[5]
  936. add @x[11],@x[11],@d[5],lsr#32
  937. add.32 @x[12],@x[12],@d[6]
  938. add @x[13],@x[13],@d[6],lsr#32
  939. add.32 @x[14],@x[14],@d[7]
  940. add @x[15],@x[15],@d[7],lsr#32
  941. add @x[0],@x[0],@x[1],lsl#32 // pack
  942. add @x[2],@x[2],@x[3],lsl#32
  943. ldp @x[1],@x[3],[$inp,#0] // load input
  944. add @x[4],@x[4],@x[5],lsl#32
  945. add @x[6],@x[6],@x[7],lsl#32
  946. ldp @x[5],@x[7],[$inp,#16]
  947. add @x[8],@x[8],@x[9],lsl#32
  948. add @x[10],@x[10],@x[11],lsl#32
  949. ldp @x[9],@x[11],[$inp,#32]
  950. add @x[12],@x[12],@x[13],lsl#32
  951. add @x[14],@x[14],@x[15],lsl#32
  952. ldp @x[13],@x[15],[$inp,#48]
  953. add $inp,$inp,#64
  954. #ifdef __AARCH64EB__
  955. rev @x[0],@x[0]
  956. rev @x[2],@x[2]
  957. rev @x[4],@x[4]
  958. rev @x[6],@x[6]
  959. rev @x[8],@x[8]
  960. rev @x[10],@x[10]
  961. rev @x[12],@x[12]
  962. rev @x[14],@x[14]
  963. #endif
  964. eor @x[0],@x[0],@x[1]
  965. eor @x[2],@x[2],@x[3]
  966. eor @x[4],@x[4],@x[5]
  967. eor @x[6],@x[6],@x[7]
  968. eor @x[8],@x[8],@x[9]
  969. eor @x[10],@x[10],@x[11]
  970. eor @x[12],@x[12],@x[13]
  971. eor @x[14],@x[14],@x[15]
  972. stp @x[0],@x[2],[$out,#0] // store output
  973. add @d[6],@d[6],#1 // increment counter
  974. mov.32 @x[0],@d[0] // unpack key block
  975. lsr @x[1],@d[0],#32
  976. stp @x[4],@x[6],[$out,#16]
  977. mov.32 @x[2],@d[1]
  978. lsr @x[3],@d[1],#32
  979. stp @x[8],@x[10],[$out,#32]
  980. mov.32 @x[4],@d[2]
  981. lsr @x[5],@d[2],#32
  982. stp @x[12],@x[14],[$out,#48]
  983. add $out,$out,#64
  984. mov.32 @x[6],@d[3]
  985. lsr @x[7],@d[3],#32
  986. mov.32 @x[8],@d[4]
  987. lsr @x[9],@d[4],#32
  988. mov.32 @x[10],@d[5]
  989. lsr @x[11],@d[5],#32
  990. mov.32 @x[12],@d[6]
  991. lsr @x[13],@d[6],#32
  992. mov.32 @x[14],@d[7]
  993. lsr @x[15],@d[7],#32
  994. mov $ctr,#5
  995. .Loop_lower_neon:
  996. sub $ctr,$ctr,#1
  997. ___
  998. @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
  999. @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
  1000. @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
  1001. @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
  1002. @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
  1003. @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
  1004. @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
  1005. foreach (@thread0) {
  1006. eval; eval(shift(@thread67));
  1007. eval(shift(@thread1)); eval(shift(@thread67));
  1008. eval(shift(@thread2)); eval(shift(@thread67));
  1009. eval(shift(@thread3)); eval(shift(@thread67));
  1010. eval(shift(@thread4)); eval(shift(@thread67));
  1011. eval(shift(@thread5)); eval(shift(@thread67));
  1012. }
  1013. @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
  1014. @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
  1015. @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
  1016. @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
  1017. @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
  1018. @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
  1019. @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
  1020. foreach (@thread0) {
  1021. eval; eval(shift(@thread67));
  1022. eval(shift(@thread1)); eval(shift(@thread67));
  1023. eval(shift(@thread2)); eval(shift(@thread67));
  1024. eval(shift(@thread3)); eval(shift(@thread67));
  1025. eval(shift(@thread4)); eval(shift(@thread67));
  1026. eval(shift(@thread5)); eval(shift(@thread67));
  1027. }
  1028. $code.=<<___;
  1029. cbnz $ctr,.Loop_lower_neon
  1030. add.32 @x[0],@x[0],@d[0] // accumulate key block
  1031. ldp @K[0],@K[1],[sp,#0]
  1032. add @x[1],@x[1],@d[0],lsr#32
  1033. ldp @K[2],@K[3],[sp,#32]
  1034. add.32 @x[2],@x[2],@d[1]
  1035. ldp @K[4],@K[5],[sp,#64]
  1036. add @x[3],@x[3],@d[1],lsr#32
  1037. ldr @K[6],[sp,#96]
  1038. add $A0,$A0,@K[0]
  1039. add.32 @x[4],@x[4],@d[2]
  1040. add $A1,$A1,@K[0]
  1041. add @x[5],@x[5],@d[2],lsr#32
  1042. add $A2,$A2,@K[0]
  1043. add.32 @x[6],@x[6],@d[3]
  1044. add $A3,$A3,@K[0]
  1045. add @x[7],@x[7],@d[3],lsr#32
  1046. add $A4,$A4,@K[0]
  1047. add.32 @x[8],@x[8],@d[4]
  1048. add $A5,$A5,@K[0]
  1049. add @x[9],@x[9],@d[4],lsr#32
  1050. add $C0,$C0,@K[2]
  1051. add.32 @x[10],@x[10],@d[5]
  1052. add $C1,$C1,@K[2]
  1053. add @x[11],@x[11],@d[5],lsr#32
  1054. add $C2,$C2,@K[2]
  1055. add.32 @x[12],@x[12],@d[6]
  1056. add $C3,$C3,@K[2]
  1057. add @x[13],@x[13],@d[6],lsr#32
  1058. add $C4,$C4,@K[2]
  1059. add.32 @x[14],@x[14],@d[7]
  1060. add $C5,$C5,@K[2]
  1061. add @x[15],@x[15],@d[7],lsr#32
  1062. add $D4,$D4,$ONE // +4
  1063. add @x[0],@x[0],@x[1],lsl#32 // pack
  1064. add $D5,$D5,$ONE // +4
  1065. add @x[2],@x[2],@x[3],lsl#32
  1066. add $D0,$D0,@K[3]
  1067. ldp @x[1],@x[3],[$inp,#0] // load input
  1068. add $D1,$D1,@K[4]
  1069. add @x[4],@x[4],@x[5],lsl#32
  1070. add $D2,$D2,@K[5]
  1071. add @x[6],@x[6],@x[7],lsl#32
  1072. add $D3,$D3,@K[6]
  1073. ldp @x[5],@x[7],[$inp,#16]
  1074. add $D4,$D4,@K[3]
  1075. add @x[8],@x[8],@x[9],lsl#32
  1076. add $D5,$D5,@K[4]
  1077. add @x[10],@x[10],@x[11],lsl#32
  1078. add $B0,$B0,@K[1]
  1079. ldp @x[9],@x[11],[$inp,#32]
  1080. add $B1,$B1,@K[1]
  1081. add @x[12],@x[12],@x[13],lsl#32
  1082. add $B2,$B2,@K[1]
  1083. add @x[14],@x[14],@x[15],lsl#32
  1084. add $B3,$B3,@K[1]
  1085. ldp @x[13],@x[15],[$inp,#48]
  1086. add $B4,$B4,@K[1]
  1087. add $inp,$inp,#64
  1088. add $B5,$B5,@K[1]
  1089. #ifdef __AARCH64EB__
  1090. rev @x[0],@x[0]
  1091. rev @x[2],@x[2]
  1092. rev @x[4],@x[4]
  1093. rev @x[6],@x[6]
  1094. rev @x[8],@x[8]
  1095. rev @x[10],@x[10]
  1096. rev @x[12],@x[12]
  1097. rev @x[14],@x[14]
  1098. #endif
  1099. ld1.8 {$T0-$T3},[$inp],#64
  1100. eor @x[0],@x[0],@x[1]
  1101. eor @x[2],@x[2],@x[3]
  1102. eor @x[4],@x[4],@x[5]
  1103. eor @x[6],@x[6],@x[7]
  1104. eor @x[8],@x[8],@x[9]
  1105. eor $A0,$A0,$T0
  1106. eor @x[10],@x[10],@x[11]
  1107. eor $B0,$B0,$T1
  1108. eor @x[12],@x[12],@x[13]
  1109. eor $C0,$C0,$T2
  1110. eor @x[14],@x[14],@x[15]
  1111. eor $D0,$D0,$T3
  1112. ld1.8 {$T0-$T3},[$inp],#64
  1113. stp @x[0],@x[2],[$out,#0] // store output
  1114. add @d[6],@d[6],#7 // increment counter
  1115. stp @x[4],@x[6],[$out,#16]
  1116. stp @x[8],@x[10],[$out,#32]
  1117. stp @x[12],@x[14],[$out,#48]
  1118. add $out,$out,#64
  1119. st1.8 {$A0-$D0},[$out],#64
  1120. ld1.8 {$A0-$D0},[$inp],#64
  1121. eor $A1,$A1,$T0
  1122. eor $B1,$B1,$T1
  1123. eor $C1,$C1,$T2
  1124. eor $D1,$D1,$T3
  1125. st1.8 {$A1-$D1},[$out],#64
  1126. ld1.8 {$A1-$D1},[$inp],#64
  1127. eor $A2,$A2,$A0
  1128. ldp @K[0],@K[1],[sp,#0]
  1129. eor $B2,$B2,$B0
  1130. ldp @K[2],@K[3],[sp,#32]
  1131. eor $C2,$C2,$C0
  1132. eor $D2,$D2,$D0
  1133. st1.8 {$A2-$D2},[$out],#64
  1134. ld1.8 {$A2-$D2},[$inp],#64
  1135. eor $A3,$A3,$A1
  1136. eor $B3,$B3,$B1
  1137. eor $C3,$C3,$C1
  1138. eor $D3,$D3,$D1
  1139. st1.8 {$A3-$D3},[$out],#64
  1140. ld1.8 {$A3-$D3},[$inp],#64
  1141. eor $A4,$A4,$A2
  1142. eor $B4,$B4,$B2
  1143. eor $C4,$C4,$C2
  1144. eor $D4,$D4,$D2
  1145. st1.8 {$A4-$D4},[$out],#64
  1146. shl $A0,$ONE,#1 // 4 -> 8
  1147. eor $A5,$A5,$A3
  1148. eor $B5,$B5,$B3
  1149. eor $C5,$C5,$C3
  1150. eor $D5,$D5,$D3
  1151. st1.8 {$A5-$D5},[$out],#64
  1152. add @K[3],@K[3],$A0 // += 8
  1153. add @K[4],@K[4],$A0
  1154. add @K[5],@K[5],$A0
  1155. add @K[6],@K[6],$A0
  1156. b.hs .Loop_outer_512_neon
  1157. adds $len,$len,#512
  1158. ushr $ONE,$ONE,#1 // 4 -> 2
  1159. ldp d10,d11,[sp,#128+16] // meet ABI requirements
  1160. ldp d12,d13,[sp,#128+32]
  1161. ldp d14,d15,[sp,#128+48]
  1162. stp @K[0],@K[0],[sp,#0] // wipe off-load area
  1163. stp @K[0],@K[0],[sp,#32]
  1164. stp @K[0],@K[0],[sp,#64]
  1165. b.eq .Ldone_512_neon
  1166. sub $key,$key,#16 // .Lone
  1167. cmp $len,#192
  1168. add sp,sp,#128
  1169. sub @K[3],@K[3],$ONE // -= 2
  1170. ld1 {$CTR,$ROT24},[$key]
  1171. b.hs .Loop_outer_neon
  1172. ldp d8,d9,[sp,#0] // meet ABI requirements
  1173. eor @K[1],@K[1],@K[1]
  1174. eor @K[2],@K[2],@K[2]
  1175. eor @K[3],@K[3],@K[3]
  1176. eor @K[4],@K[4],@K[4]
  1177. eor @K[5],@K[5],@K[5]
  1178. eor @K[6],@K[6],@K[6]
  1179. b .Loop_outer
  1180. .Ldone_512_neon:
  1181. ldp d8,d9,[sp,#128+0] // meet ABI requirements
  1182. ldp x19,x20,[x29,#16]
  1183. add sp,sp,#128+64
  1184. ldp x21,x22,[x29,#32]
  1185. ldp x23,x24,[x29,#48]
  1186. ldp x25,x26,[x29,#64]
  1187. ldp x27,x28,[x29,#80]
  1188. ldp x29,x30,[sp],#96
  1189. AARCH64_VALIDATE_LINK_REGISTER
  1190. ret
  1191. .size ChaCha20_512_neon,.-ChaCha20_512_neon
  1192. ___
  1193. }
  1194. }}}
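# Final output filter: expand any `...` expressions and translate the
# pseudo-mnemonics used above into real AArch64 syntax, e.g. "add.32 x5,..."
# becomes "add w5,...", ld1.8/st1.8 get the .16b arrangement, zip1.64/zip2.64
# become zip1/zip2 on .2d elements, and rev32.16 becomes rev32 on .8h
# elements.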
  1195. foreach (split("\n",$code)) {
  1196. s/\`([^\`]*)\`/eval $1/geo;
  1197. (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or
  1198. (m/\b(eor|ext|mov|tbl)\b/ and (s/\.4s/\.16b/g or 1)) or
  1199. (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or
  1200. (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or
  1201. (m/\b(dup|ld1)\b/ and (s/\.4(s}?\[[0-3]\])/.$1/g or 1)) or
  1202. (s/\b(zip[12])\.64\b/$1/ and (s/\.4s/\.2d/g or 1)) or
  1203. (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));
  1204. #s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
  1205. print $_,"\n";
  1206. }
  1207. close STDOUT or die "error closing STDOUT: $!"; # flush