#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2015
#
# ChaCha20 for ARMv8.
#
# April 2019
#
# Replace the 3xNEON+1xIALU code path with 4+1. 4+1 is actually the
# fastest option on most(*), but not all, processors, yet 6+2 is
# retained. This is because its penalties are considered tolerable in
# comparison to the improvement on processors where 6+2 helps, most
# notably +37% on ThunderX2. That is a server-oriented processor which
# has to serve as many requests as possible, while the others are
# mostly clients, where performance doesn't have to be absolute
# top-notch, just fast enough, as the majority of time is spent
# "entertaining" a relatively slow human.
#
# Performance in cycles per byte out of large buffer.
#
#			IALU/gcc-4.9	4xNEON+1xIALU	6xNEON+2xIALU
#
# Apple A7		5.50/+49%	2.72		1.60
# Cortex-A53		8.40/+80%	4.06		4.45(*)
# Cortex-A57		8.06/+43%	4.15		4.40(*)
# Denver		4.50/+82%	2.30		2.70(*)
# X-Gene		9.50/+46%	8.20		8.90(*)
# Mongoose		8.00/+44%	2.74		3.12(*)
# Kryo			8.17/+50%	4.47		4.65(*)
# ThunderX2		7.22/+48%	5.64		4.10
#
# (*) slower than 4+1:-(
$flavour=shift;
$output=shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
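# AUTOLOAD turns any otherwise-undefined "&opcode_suffix(args...)" call
# into a line of assembly text: the underscore becomes a dot (so
# "&add_32(a,b,c)" emits "add.32 a,b,c", which the post-processing loop
# at the bottom later maps onto 32-bit "w" registers), and a purely
# numeric last argument is prefixed with "#" to mark it as an immediate.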
my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));

my @x=map("x$_",(5..17,19..21));
my @d=map("x$_",(22..28,30));
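# Scalar register map: @x[0..15] hold the 16 32-bit words of the working
# state in the low halves of 16 general-purpose registers (skipping x18,
# the platform register), while @d[0..7] hold the 16-word input block
# packed as eight 64-bit values: sigma in @d[0..1], the key in @d[2..5]
# and counter/nonce in @d[6..7].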
sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));

    (
	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	"&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	"&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	"&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	"&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	"&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	"&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],16)",
	"&ror_32	(@x[$d1],@x[$d1],16)",
	"&ror_32	(@x[$d2],@x[$d2],16)",
	"&ror_32	(@x[$d3],@x[$d3],16)",
	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	"&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	"&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	"&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	"&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	"&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	"&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],20)",
	"&ror_32	(@x[$b1],@x[$b1],20)",
	"&ror_32	(@x[$b2],@x[$b2],20)",
	"&ror_32	(@x[$b3],@x[$b3],20)",
	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	"&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	"&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	"&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	"&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	"&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	"&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],24)",
	"&ror_32	(@x[$d1],@x[$d1],24)",
	"&ror_32	(@x[$d2],@x[$d2],24)",
	"&ror_32	(@x[$d3],@x[$d3],24)",
	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	"&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	"&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	"&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	"&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	"&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	"&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],25)",
	"&ror_32	(@x[$b1],@x[$b1],25)",
	"&ror_32	(@x[$b2],@x[$b2],25)",
	"&ror_32	(@x[$b3],@x[$b3],25)"
    );
}
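# For reference, ROUND performs four ChaCha quarter-rounds in parallel,
# one per column (or diagonal) of the 4x4 state:
#
#	a += b; d ^= a; d <<<= 16;
#	c += d; b ^= c; b <<<= 12;
#	a += b; d ^= a; d <<<=  8;
#	c += d; b ^= c; b <<<=  7;
#
# A left-rotate by n of a 32-bit word is a right-rotate by 32-n, hence
# the ror amounts 16, 20, 24 and 25 above.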
$code.=<<___;
#ifndef	__KERNEL__
# include "arm_arch.h"
.extern	OPENSSL_armcap_P
#endif

.text

.align	5
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32	// endian-neutral
.Lone:
.long	1,2,3,4
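// .Lone above supplies the per-lane counter increments 1..4 for the
// four NEON blocks (lane 0 alone also serves as the scalar constant 1),
// and .Lrot24 below is a byte-shuffle table for tbl that rotates each
// 32-bit lane left by 8 bits, i.e. right by 24.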
.Lrot24:
.long	0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
.asciz	"ChaCha20 for ARMv8, CRYPTOGAMS by \@dot-asm"

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	cbz	$len,.Labort
	cmp	$len,#192
	b.lo	.Lshort
#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
	tst	w17,#ARMV7_NEON
	b.ne	.LChaCha20_neon
#endif

.Lshort:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ldp	@d[6],@d[7],[$ctr]		// load counter
#ifdef	__AARCH64EB__
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#64
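// 10 iterations of the loop below, each doing one column round and one
// diagonal round, give the 20 rounds of ChaCha20.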
.Loop:
	sub	$ctr,$ctr,#1
___
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	cbnz	$ctr,.Loop

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#1			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64

	b.hi	.Loop_outer

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
.Labort:
	ret

.align	4
.Ltail:
	add	$len,$len,#64
.Less_than_64:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	stp	@x[0],@x[2],[sp,#0]
	stp	@x[4],@x[6],[sp,#16]
	stp	@x[8],@x[10],[sp,#32]
	stp	@x[12],@x[14],[sp,#48]

.Loop_tail:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my @K = map("v$_.4s",(0..3));
my ($xt0,$xt1,$xt2,$xt3, $CTR,$ROT24) = map("v$_.4s",(4..9));
my @X = map("v$_.4s",(16,20,24,28, 17,21,25,29, 18,22,26,30, 19,23,27,31));
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @X;

sub NEON_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("'$_'",@X);

	(
	"&add		(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&add		(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&add		(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&add		(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&eor		(@x[$d0],@x[$d0],@x[$a0])",
	"&eor		(@x[$d1],@x[$d1],@x[$a1])",
	"&eor		(@x[$d2],@x[$d2],@x[$a2])",
	"&eor		(@x[$d3],@x[$d3],@x[$a3])",
	"&rev32_16	(@x[$d0],@x[$d0])",
	"&rev32_16	(@x[$d1],@x[$d1])",
	"&rev32_16	(@x[$d2],@x[$d2])",
	"&rev32_16	(@x[$d3],@x[$d3])",
	"&add		(@x[$c0],@x[$c0],@x[$d0])",
	"&add		(@x[$c1],@x[$c1],@x[$d1])",
	"&add		(@x[$c2],@x[$c2],@x[$d2])",
	"&add		(@x[$c3],@x[$c3],@x[$d3])",
	"&eor		('$xt0',@x[$b0],@x[$c0])",
	"&eor		('$xt1',@x[$b1],@x[$c1])",
	"&eor		('$xt2',@x[$b2],@x[$c2])",
	"&eor		('$xt3',@x[$b3],@x[$c3])",
	"&ushr		(@x[$b0],'$xt0',20)",
	"&ushr		(@x[$b1],'$xt1',20)",
	"&ushr		(@x[$b2],'$xt2',20)",
	"&ushr		(@x[$b3],'$xt3',20)",
	"&sli		(@x[$b0],'$xt0',12)",
	"&sli		(@x[$b1],'$xt1',12)",
	"&sli		(@x[$b2],'$xt2',12)",
	"&sli		(@x[$b3],'$xt3',12)",
	"&add		(@x[$a0],@x[$a0],@x[$b0])",
	"&add		(@x[$a1],@x[$a1],@x[$b1])",
	"&add		(@x[$a2],@x[$a2],@x[$b2])",
	"&add		(@x[$a3],@x[$a3],@x[$b3])",
	"&eor		('$xt0',@x[$d0],@x[$a0])",
	"&eor		('$xt1',@x[$d1],@x[$a1])",
	"&eor		('$xt2',@x[$d2],@x[$a2])",
	"&eor		('$xt3',@x[$d3],@x[$a3])",
	"&tbl		(@x[$d0],'{$xt0}','$ROT24')",
	"&tbl		(@x[$d1],'{$xt1}','$ROT24')",
	"&tbl		(@x[$d2],'{$xt2}','$ROT24')",
	"&tbl		(@x[$d3],'{$xt3}','$ROT24')",
	"&add		(@x[$c0],@x[$c0],@x[$d0])",
	"&add		(@x[$c1],@x[$c1],@x[$d1])",
	"&add		(@x[$c2],@x[$c2],@x[$d2])",
	"&add		(@x[$c3],@x[$c3],@x[$d3])",
	"&eor		('$xt0',@x[$b0],@x[$c0])",
	"&eor		('$xt1',@x[$b1],@x[$c1])",
	"&eor		('$xt2',@x[$b2],@x[$c2])",
	"&eor		('$xt3',@x[$b3],@x[$c3])",
	"&ushr		(@x[$b0],'$xt0',25)",
	"&ushr		(@x[$b1],'$xt1',25)",
	"&ushr		(@x[$b2],'$xt2',25)",
	"&ushr		(@x[$b3],'$xt3',25)",
	"&sli		(@x[$b0],'$xt0',7)",
	"&sli		(@x[$b1],'$xt1',7)",
	"&sli		(@x[$b2],'$xt2',7)",
	"&sli		(@x[$b3],'$xt3',7)"
	);
}
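# In this path each of the four vectors per role (a, b, c, d) holds the
# same state word of four consecutive blocks, so one vector instruction
# advances four blocks at once. Rotates are synthesized as follows:
# rev32 on 16-bit elements swaps half-words, i.e. rotates each 32-bit
# lane by 16; ushr by 32-n followed by sli (shift-left-and-insert) by n
# rotates left by n (used for 12 and 7); and tbl with the .Lrot24 mask
# performs the byte-granular rotate by 8.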
$code.=<<___;
#ifdef	__KERNEL__
.globl	ChaCha20_neon
#endif
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
.LChaCha20_neon:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	$len,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	stp	d8,d9,[sp]			// meet ABI requirements
	ld1	{$CTR,$ROT24},[@x[0]]
#ifdef	__AARCH64EB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer_neon:
	dup	$xa0,@{K[0]}[0]			// unpack key block
	mov.32	@x[0],@d[0]
	dup	$xa1,@{K[0]}[1]
	lsr	@x[1],@d[0],#32
	dup	$xa2,@{K[0]}[2]
	mov.32	@x[2],@d[1]
	dup	$xa3,@{K[0]}[3]
	lsr	@x[3],@d[1],#32
	dup	$xb0,@{K[1]}[0]
	mov.32	@x[4],@d[2]
	dup	$xb1,@{K[1]}[1]
	lsr	@x[5],@d[2],#32
	dup	$xb2,@{K[1]}[2]
	mov.32	@x[6],@d[3]
	dup	$xb3,@{K[1]}[3]
	lsr	@x[7],@d[3],#32
	dup	$xd0,@{K[3]}[0]
	mov.32	@x[8],@d[4]
	dup	$xd1,@{K[3]}[1]
	lsr	@x[9],@d[4],#32
	dup	$xd2,@{K[3]}[2]
	mov.32	@x[10],@d[5]
	dup	$xd3,@{K[3]}[3]
	lsr	@x[11],@d[5],#32
	add	$xd0,$xd0,$CTR
	mov.32	@x[12],@d[6]
	dup	$xc0,@{K[2]}[0]
	lsr	@x[13],@d[6],#32
	dup	$xc1,@{K[2]}[1]
	mov.32	@x[14],@d[7]
	dup	$xc2,@{K[2]}[2]
	lsr	@x[15],@d[7],#32
	dup	$xc3,@{K[2]}[3]

	mov	$ctr,#10
	subs	$len,$len,#320
.Loop_neon:
	sub	$ctr,$ctr,#1
___
	my @plus_one=&ROUND(0,4,8,12);
	foreach (&NEON_lane_ROUND(0,4,8,12)) { eval; eval(shift(@plus_one)); }

	@plus_one=&ROUND(0,5,10,15);
	foreach (&NEON_lane_ROUND(0,5,10,15)) { eval; eval(shift(@plus_one)); }
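# This is the "4+1" arrangement: every NEON instruction (advancing four
# blocks in vector lanes) is paired with one scalar instruction from
# &ROUND, so the integer pipeline computes a fifth block in parallel.
# Hence the loop consumes 5x64 = 320 bytes per outer iteration and the
# block counter is stepped by 5.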
$code.=<<___;
	cbnz	$ctr,.Loop_neon

	add	$xd0,$xd0,$CTR

	zip1	$xt0,$xa0,$xa1			// transpose data
	zip1	$xt1,$xa2,$xa3
	zip2	$xt2,$xa0,$xa1
	zip2	$xt3,$xa2,$xa3
	zip1.64	$xa0,$xt0,$xt1
	zip2.64	$xa1,$xt0,$xt1
	zip1.64	$xa2,$xt2,$xt3
	zip2.64	$xa3,$xt2,$xt3

	zip1	$xt0,$xb0,$xb1
	zip1	$xt1,$xb2,$xb3
	zip2	$xt2,$xb0,$xb1
	zip2	$xt3,$xb2,$xb3
	zip1.64	$xb0,$xt0,$xt1
	zip2.64	$xb1,$xt0,$xt1
	zip1.64	$xb2,$xt2,$xt3
	zip2.64	$xb3,$xt2,$xt3

	zip1	$xt0,$xc0,$xc1
	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	zip1	$xt1,$xc2,$xc3
	add	@x[1],@x[1],@d[0],lsr#32
	zip2	$xt2,$xc0,$xc1
	add.32	@x[2],@x[2],@d[1]
	zip2	$xt3,$xc2,$xc3
	add	@x[3],@x[3],@d[1],lsr#32
	zip1.64	$xc0,$xt0,$xt1
	add.32	@x[4],@x[4],@d[2]
	zip2.64	$xc1,$xt0,$xt1
	add	@x[5],@x[5],@d[2],lsr#32
	zip1.64	$xc2,$xt2,$xt3
	add.32	@x[6],@x[6],@d[3]
	zip2.64	$xc3,$xt2,$xt3
	add	@x[7],@x[7],@d[3],lsr#32

	zip1	$xt0,$xd0,$xd1
	add.32	@x[8],@x[8],@d[4]
	zip1	$xt1,$xd2,$xd3
	add	@x[9],@x[9],@d[4],lsr#32
	zip2	$xt2,$xd0,$xd1
	add.32	@x[10],@x[10],@d[5]
	zip2	$xt3,$xd2,$xd3
	add	@x[11],@x[11],@d[5],lsr#32
	zip1.64	$xd0,$xt0,$xt1
	add.32	@x[12],@x[12],@d[6]
	zip2.64	$xd1,$xt0,$xt1
	add	@x[13],@x[13],@d[6],lsr#32
	zip1.64	$xd2,$xt2,$xt3
	add.32	@x[14],@x[14],@d[7]
	zip2.64	$xd3,$xt2,$xt3
	add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail_neon

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	$xa0,$xa0,@K[0]			// accumulate key block
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	$xb0,$xb0,@K[1]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	$xc0,$xc0,@K[2]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$xd0,$xd0,@K[3]
	add	$inp,$inp,#64
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$xt0-$xt3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	add	$xa1,$xa1,@K[0]
	eor	@x[2],@x[2],@x[3]
	add	$xb1,$xb1,@K[1]
	eor	@x[4],@x[4],@x[5]
	add	$xc1,$xc1,@K[2]
	eor	@x[6],@x[6],@x[7]
	add	$xd1,$xd1,@K[3]
	eor	@x[8],@x[8],@x[9]
	eor	$xa0,$xa0,$xt0
	movi	$xt0,#5
	eor	@x[10],@x[10],@x[11]
	eor	$xb0,$xb0,$xt1
	eor	@x[12],@x[12],@x[13]
	eor	$xc0,$xc0,$xt2
	eor	@x[14],@x[14],@x[15]
	eor	$xd0,$xd0,$xt3
	add	$CTR,$CTR,$xt0			// += 5
	ld1.8	{$xt0-$xt3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#5			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	st1.8	{$xa0-$xd0},[$out],#64

	add	$xa2,$xa2,@K[0]
	add	$xb2,$xb2,@K[1]
	add	$xc2,$xc2,@K[2]
	add	$xd2,$xd2,@K[3]
	ld1.8	{$xa0-$xd0},[$inp],#64

	eor	$xa1,$xa1,$xt0
	eor	$xb1,$xb1,$xt1
	eor	$xc1,$xc1,$xt2
	eor	$xd1,$xd1,$xt3
	st1.8	{$xa1-$xd1},[$out],#64

	add	$xa3,$xa3,@K[0]
	add	$xb3,$xb3,@K[1]
	add	$xc3,$xc3,@K[2]
	add	$xd3,$xd3,@K[3]
	ld1.8	{$xa1-$xd1},[$inp],#64

	eor	$xa2,$xa2,$xa0
	eor	$xb2,$xb2,$xb0
	eor	$xc2,$xc2,$xc0
	eor	$xd2,$xd2,$xd0
	st1.8	{$xa2-$xd2},[$out],#64

	eor	$xa3,$xa3,$xa1
	eor	$xb3,$xb3,$xb1
	eor	$xc3,$xc3,$xc1
	eor	$xd3,$xd3,$xd1
	st1.8	{$xa3-$xd3},[$out],#64

	b.hi	.Loop_outer_neon

	ldp	d8,d9,[sp]			// meet ABI requirements

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret

.align	4
.Ltail_neon:
	add	$len,$len,#320
	ldp	d8,d9,[sp]			// meet ABI requirements
	cmp	$len,#64
	b.lo	.Less_than_64

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	$xa0,$xa0,@K[0]			// accumulate key block
	stp	@x[4],@x[6],[$out,#16]
	add	$xb0,$xb0,@K[1]
	stp	@x[8],@x[10],[$out,#32]
	add	$xc0,$xc0,@K[2]
	stp	@x[12],@x[14],[$out,#48]
	add	$xd0,$xd0,@K[3]
	add	$out,$out,#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Last_neon

	ld1.8	{$xt0-$xt3},[$inp],#64
	eor	$xa0,$xa0,$xt0
	eor	$xb0,$xb0,$xt1
	eor	$xc0,$xc0,$xt2
	eor	$xd0,$xd0,$xt3
	st1.8	{$xa0-$xd0},[$out],#64
	b.eq	.Ldone_neon

	add	$xa0,$xa1,@K[0]
	add	$xb0,$xb1,@K[1]
	sub	$len,$len,#64
	add	$xc0,$xc1,@K[2]
	cmp	$len,#64
	add	$xd0,$xd1,@K[3]
	b.lo	.Last_neon

	ld1.8	{$xt0-$xt3},[$inp],#64
	eor	$xa1,$xa0,$xt0
	eor	$xb1,$xb0,$xt1
	eor	$xc1,$xc0,$xt2
	eor	$xd1,$xd0,$xt3
	st1.8	{$xa1-$xd1},[$out],#64
	b.eq	.Ldone_neon

	add	$xa0,$xa2,@K[0]
	add	$xb0,$xb2,@K[1]
	sub	$len,$len,#64
	add	$xc0,$xc2,@K[2]
	cmp	$len,#64
	add	$xd0,$xd2,@K[3]
	b.lo	.Last_neon

	ld1.8	{$xt0-$xt3},[$inp],#64
	eor	$xa2,$xa0,$xt0
	eor	$xb2,$xb0,$xt1
	eor	$xc2,$xc0,$xt2
	eor	$xd2,$xd0,$xt3
	st1.8	{$xa2-$xd2},[$out],#64
	b.eq	.Ldone_neon

	add	$xa0,$xa3,@K[0]
	add	$xb0,$xb3,@K[1]
	add	$xc0,$xc3,@K[2]
	add	$xd0,$xd3,@K[3]
	sub	$len,$len,#64

.Last_neon:
	st1.8	{$xa0-$xd0},[sp]

	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

.Loop_tail_neon:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
___
{
my @K = map("v$_.4s",(0..6));
my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
    $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(8..31));
my $rot24 = @K[6];
my $ONE = "v7.4s";
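# Note the register aliasing in this path: the temporaries T0..T5 share
# vectors with K[0..5], and the rot24 table lives in the same vector as
# K[6]. This works because the key block is off-loaded to the stack
# (the invariant part once, the variable part K[3..6] at the top of each
# outer iteration) and reloaded from there when it is needed again,
# freeing v6 to hold the tbl mask during the rounds.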
sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&add		('$a','$a','$b')",
	"&eor		('$d','$d','$a')",
	"&rev32_16	('$d','$d')",		# vrot ($d,16)

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',20)",
	"&sli		('$b','$t',12)",

	"&add		('$a','$a','$b')",
	"&eor		('$d','$d','$a')",
	"&tbl		('$d','{$d}','$rot24')",

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',25)",
	"&sli		('$b','$t',7)",

	"&ext		('$c','$c','$c',8)",
	"&ext		('$d','$d','$d',$odd?4:12)",
	"&ext		('$b','$b','$b',$odd?12:4)"
	);
}
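# Here each vector holds one row of a single block, so after a column
# round the rows must be rotated ("diagonalized") before the diagonal
# round and rotated back afterwards; the trailing ext instructions do
# exactly that, with the direction selected by the $odd flag. The
# half-round body otherwise matches the scalar quarter-round, with the
# rotations built from rev32/ushr+sli/tbl as in the 4x path above.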
$code.=<<___;
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

.L512_or_more_neon:
	sub	sp,sp,#128+64

	eor	$ONE,$ONE,$ONE
	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE}[0],[@x[0]]
	add	$key,@x[0],#16			// .Lrot24
#ifdef	__AARCH64EB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	stp	@K[0],@K[1],[sp,#0]		// off-load key block, invariant part
	add	@K[3],@K[3],$ONE		// not typo
	str	@K[2],[sp,#32]
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	add	@K[6],@K[5],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4

	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
	stp	d10,d11,[sp,#128+16]
	stp	d12,d13,[sp,#128+32]
	stp	d14,d15,[sp,#128+48]

	sub	$len,$len,#512			// not typo

.Loop_outer_512_neon:
	mov	$A0,@K[0]
	mov	$A1,@K[0]
	mov	$A2,@K[0]
	mov	$A3,@K[0]
	mov	$A4,@K[0]
	mov	$A5,@K[0]
	mov	$B0,@K[1]
	mov.32	@x[0],@d[0]			// unpack key block
	mov	$B1,@K[1]
	lsr	@x[1],@d[0],#32
	mov	$B2,@K[1]
	mov.32	@x[2],@d[1]
	mov	$B3,@K[1]
	lsr	@x[3],@d[1],#32
	mov	$B4,@K[1]
	mov.32	@x[4],@d[2]
	mov	$B5,@K[1]
	lsr	@x[5],@d[2],#32
	mov	$D0,@K[3]
	mov.32	@x[6],@d[3]
	mov	$D1,@K[4]
	lsr	@x[7],@d[3],#32
	mov	$D2,@K[5]
	mov.32	@x[8],@d[4]
	mov	$D3,@K[6]
	lsr	@x[9],@d[4],#32
	mov	$C0,@K[2]
	mov.32	@x[10],@d[5]
	mov	$C1,@K[2]
	lsr	@x[11],@d[5],#32
	add	$D4,$D0,$ONE			// +4
	mov.32	@x[12],@d[6]
	add	$D5,$D1,$ONE			// +4
	lsr	@x[13],@d[6],#32
	mov	$C2,@K[2]
	mov.32	@x[14],@d[7]
	mov	$C3,@K[2]
	lsr	@x[15],@d[7],#32
	mov	$C4,@K[2]
	stp	@K[3],@K[4],[sp,#48]		// off-load key block, variable part
	mov	$C5,@K[2]
	stp	@K[5],@K[6],[sp,#80]

	mov	$ctr,#5
	ld1	{$rot24},[$key]
	subs	$len,$len,#512
.Loop_upper_neon:
	sub	$ctr,$ctr,#1
___
my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
my $diff = ($#thread0+1)*6 - $#thread67 - 1;
my $i = 0;

foreach (@thread0) {
	eval;			eval(shift(@thread67));
	eval(shift(@thread1));	eval(shift(@thread67));
	eval(shift(@thread2));	eval(shift(@thread67));
	eval(shift(@thread3));	eval(shift(@thread67));
	eval(shift(@thread4));	eval(shift(@thread67));
	eval(shift(@thread5));	eval(shift(@thread67));
}

@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

foreach (@thread0) {
	eval;			eval(shift(@thread67));
	eval(shift(@thread1));	eval(shift(@thread67));
	eval(shift(@thread2));	eval(shift(@thread67));
	eval(shift(@thread3));	eval(shift(@thread67));
	eval(shift(@thread4));	eval(shift(@thread67));
	eval(shift(@thread5));	eval(shift(@thread67));
}
$code.=<<___;
	cbnz	$ctr,.Loop_upper_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#1			// increment counter
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	stp	@x[4],@x[6],[$out,#16]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	stp	@x[8],@x[10],[$out,#32]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#5
.Loop_lower_neon:
	sub	$ctr,$ctr,#1
___
@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

foreach (@thread0) {
	eval;			eval(shift(@thread67));
	eval(shift(@thread1));	eval(shift(@thread67));
	eval(shift(@thread2));	eval(shift(@thread67));
	eval(shift(@thread3));	eval(shift(@thread67));
	eval(shift(@thread4));	eval(shift(@thread67));
	eval(shift(@thread5));	eval(shift(@thread67));
}

@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

foreach (@thread0) {
	eval;			eval(shift(@thread67));
	eval(shift(@thread1));	eval(shift(@thread67));
	eval(shift(@thread2));	eval(shift(@thread67));
	eval(shift(@thread3));	eval(shift(@thread67));
	eval(shift(@thread4));	eval(shift(@thread67));
	eval(shift(@thread5));	eval(shift(@thread67));
}
$code.=<<___;
	cbnz	$ctr,.Loop_lower_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	ldp	@K[0],@K[1],[sp,#0]
	add	@x[1],@x[1],@d[0],lsr#32
	ldp	@K[2],@K[3],[sp,#32]
	add.32	@x[2],@x[2],@d[1]
	ldp	@K[4],@K[5],[sp,#64]
	add	@x[3],@x[3],@d[1],lsr#32
	ldr	@K[6],[sp,#96]
	add	$A0,$A0,@K[0]
	add.32	@x[4],@x[4],@d[2]
	add	$A1,$A1,@K[0]
	add	@x[5],@x[5],@d[2],lsr#32
	add	$A2,$A2,@K[0]
	add.32	@x[6],@x[6],@d[3]
	add	$A3,$A3,@K[0]
	add	@x[7],@x[7],@d[3],lsr#32
	add	$A4,$A4,@K[0]
	add.32	@x[8],@x[8],@d[4]
	add	$A5,$A5,@K[0]
	add	@x[9],@x[9],@d[4],lsr#32
	add	$C0,$C0,@K[2]
	add.32	@x[10],@x[10],@d[5]
	add	$C1,$C1,@K[2]
	add	@x[11],@x[11],@d[5],lsr#32
	add	$C2,$C2,@K[2]
	add.32	@x[12],@x[12],@d[6]
	add	$C3,$C3,@K[2]
	add	@x[13],@x[13],@d[6],lsr#32
	add	$C4,$C4,@K[2]
	add.32	@x[14],@x[14],@d[7]
	add	$C5,$C5,@K[2]
	add	@x[15],@x[15],@d[7],lsr#32
	add	$D4,$D4,$ONE			// +4
	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	$D5,$D5,$ONE			// +4
	add	@x[2],@x[2],@x[3],lsl#32
	add	$D0,$D0,@K[3]
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	$D1,$D1,@K[4]
	add	@x[4],@x[4],@x[5],lsl#32
	add	$D2,$D2,@K[5]
	add	@x[6],@x[6],@x[7],lsl#32
	add	$D3,$D3,@K[6]
	ldp	@x[5],@x[7],[$inp,#16]
	add	$D4,$D4,@K[3]
	add	@x[8],@x[8],@x[9],lsl#32
	add	$D5,$D5,@K[4]
	add	@x[10],@x[10],@x[11],lsl#32
	add	$B0,$B0,@K[1]
	ldp	@x[9],@x[11],[$inp,#32]
	add	$B1,$B1,@K[1]
	add	@x[12],@x[12],@x[13],lsl#32
	add	$B2,$B2,@K[1]
	add	@x[14],@x[14],@x[15],lsl#32
	add	$B3,$B3,@K[1]
	ldp	@x[13],@x[15],[$inp,#48]
	add	$B4,$B4,@K[1]
	add	$inp,$inp,#64
	add	$B5,$B5,@K[1]
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	eor	$D0,$D0,$T3
	ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#7			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	st1.8	{$A0-$D0},[$out],#64

	ld1.8	{$A0-$D0},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	ld1.8	{$A1-$D1},[$inp],#64
	eor	$A2,$A2,$A0
	ldp	@K[0],@K[1],[sp,#0]
	eor	$B2,$B2,$B0
	ldp	@K[2],@K[3],[sp,#32]
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	ld1.8	{$A2-$D2},[$inp],#64
	eor	$A3,$A3,$A1
	eor	$B3,$B3,$B1
	eor	$C3,$C3,$C1
	eor	$D3,$D3,$D1
	st1.8	{$A3-$D3},[$out],#64

	ld1.8	{$A3-$D3},[$inp],#64
	eor	$A4,$A4,$A2
	eor	$B4,$B4,$B2
	eor	$C4,$C4,$C2
	eor	$D4,$D4,$D2
	st1.8	{$A4-$D4},[$out],#64

	shl	$A0,$ONE,#1			// 4 -> 8
	eor	$A5,$A5,$A3
	eor	$B5,$B5,$B3
	eor	$C5,$C5,$C3
	eor	$D5,$D5,$D3
	st1.8	{$A5-$D5},[$out],#64

	add	@K[3],@K[3],$A0			// += 8
	add	@K[4],@K[4],$A0
	add	@K[5],@K[5],$A0
	add	@K[6],@K[6],$A0

	b.hs	.Loop_outer_512_neon

	adds	$len,$len,#512
	ushr	$ONE,$ONE,#1			// 4 -> 2

	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
	ldp	d10,d11,[sp,#128+16]
	ldp	d12,d13,[sp,#128+32]
	ldp	d14,d15,[sp,#128+48]

	stp	@K[0],@K[0],[sp,#0]		// wipe off-load area
	stp	@K[0],@K[0],[sp,#32]
	stp	@K[0],@K[0],[sp,#64]

	b.eq	.Ldone_512_neon

	sub	$key,$key,#16			// .Lone
	cmp	$len,#192
	add	sp,sp,#128
	sub	@K[3],@K[3],$ONE		// -= 2
	ld1	{$CTR,$ROT24},[$key]
	b.hs	.Loop_outer_neon

	eor	@K[1],@K[1],@K[1]
	eor	@K[2],@K[2],@K[2]
	eor	@K[3],@K[3],@K[3]
	eor	@K[4],@K[4],@K[4]
	eor	@K[5],@K[5],@K[5]
	eor	@K[6],@K[6],@K[6]
	b	.Loop_outer

.Ldone_512_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#128+64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_512_neon,.-ChaCha20_512_neon
___
}
}}}
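# Post-processing: the pseudo-mnemonics used above are rewritten into
# real AArch64 syntax line by line. "op.32" scalar ops have their x
# registers demoted to w registers; bitwise vector ops (eor, ext, mov,
# tbl) and byte-wise ld1.8/st1.8 get a .16b arrangement; ldr/ldp/str/stp
# of vectors use the q form; dup/ld1 lane accesses keep a .s element;
# zip1/zip2 with a .64 suffix use .2d; and rev32.16 becomes rev32 on
# .8h (16-bit) elements.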
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	(s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1))	or
	(m/\b(eor|ext|mov|tbl)\b/ and (s/\.4s/\.16b/g or 1))	or
	(s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1))	or
	(m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1))	or
	(m/\b(dup|ld1)\b/ and (s/\.4(s}?\[[0-3]\])/.$1/g or 1))	or
	(s/\b(zip[12])\.64\b/$1/ and (s/\.4s/\.2d/g or 1))	or
	(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));

	#s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT;	# flush