#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2014
#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of large buffer.
#
#			IALU/gcc-4.4	1xNEON	3xNEON+1xIALU
#
# Cortex-A5		19.3(*)/+95%	21.8	14.1
# Cortex-A8		10.5(*)/+160%	13.9	6.35
# Cortex-A9		12.9(**)/+110%	14.3	6.50
# Cortex-A15		11.0/+40%	16.0	5.00
# Snapdragon S4		11.5/+125%	13.6	4.90
#
# (*)	most "favourable" result for aligned data on little-endian
#	processor, result for misaligned data is 10-15% lower;
# (**)	this result is a trade-off: it can be improved by 20%,
#	but then Snapdragon S4 and Cortex-A8 results get
#	20-25% worse;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
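# For example, "&add(@x[$a0],@x[$a0],@x[$b0])" emits "\tadd\tr0,r0,r4\n"
# (for $a0==0, $b0==4), and a numeric last argument is turned into an
# immediate, so "&vshr_u32($b,$t,20)" emits "vshr.u32 ...,#20"; the s/_/./
# substitution exists so that NEON mnemonics with dots can be spelled as
# valid Perl identifiers.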
my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
my @t=map("r$_",(8..11));

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my $odd = $d0&1;
my ($xc,$xc_) = (@t[0..1]);
my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
my @ret;

	# Consider the order in which the variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a' and 'b' are permanently allocated in registers, @x[0..7],
	# while the 'c's and a pair of 'd's are maintained in memory. If
	# you observe the 'c' column, you'll notice that the pair of 'c's
	# is invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see a
	# bunch of 'c' stores and loads in the middle, but none at the
	# beginning or end. If you observe the 'd' column, you'll notice
	# that 15 and 13 are reused in the next pair of rounds. This is
	# why these two are chosen for offloading to memory, to make the
	# loads count for more.
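
	# For reference, each column below is one ChaCha quarter-round
	# (see RFC 7539): a += b; d ^= a; d <<<= 16;  c += d; b ^= c;
	# b <<<= 12;  a += b; d ^= a; d <<<= 8;  c += d; b ^= c; b <<<= 7.
	# ARM has no rotate-left, so a left-rotate by n is expressed as
	# 'ror#(32-n)', hence the ror#16/20/24/25 constants; the rotation
	# is folded into the following eor/mov as a shifted second operand
	# wherever possible.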
	push @ret,(
	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#16')",
	"&add	(@x[$a1],@x[$a1],@x[$b1])",
	"&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a0],'ror#16')",
	"&eor	($xd_,$xd_,@x[$a1],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#20')",
	"&add	($xc_,$xc_,$xd_)",
	"&mov	(@x[$b1],@x[$b1],'ror#20')",
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#20')",
	"&eor	(@x[$b1],@x[$b1],$xc_,'ror#20')",

	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#24')",
	"&add	(@x[$a1],@x[$a1],@x[$b1])",
	"&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a0],'ror#24')",
	"&eor	($xd_,$xd_,@x[$a1],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#25')" );
	push @ret,(
	"&str	($xd,'[sp,#4*(16+$d0)]')",
	"&ldr	($xd,'[sp,#4*(16+$d2)]')" ) if ($odd);
	push @ret,(
	"&add	($xc_,$xc_,$xd_)",
	"&mov	(@x[$b1],@x[$b1],'ror#25')" );
	push @ret,(
	"&str	($xd_,'[sp,#4*(16+$d1)]')",
	"&ldr	($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd);
	push @ret,(
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#25')",
	"&eor	(@x[$b1],@x[$b1],$xc_,'ror#25')" );

	$xd=@x[$d2] if (!$odd);
	$xd_=@x[$d3] if ($odd);

	push @ret,(
	"&str	($xc,'[sp,#4*(16+$c0)]')",
	"&ldr	($xc,'[sp,#4*(16+$c2)]')",
	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#16')",
	"&str	($xc_,'[sp,#4*(16+$c1)]')",
	"&ldr	($xc_,'[sp,#4*(16+$c3)]')",
	"&add	(@x[$a3],@x[$a3],@x[$b3])",
	"&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a2],'ror#16')",
	"&eor	($xd_,$xd_,@x[$a3],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#20')",
	"&add	($xc_,$xc_,$xd_)",
	"&mov	(@x[$b3],@x[$b3],'ror#20')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#20')",
	"&eor	(@x[$b3],@x[$b3],$xc_,'ror#20')",

	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#24')",
	"&add	(@x[$a3],@x[$a3],@x[$b3])",
	"&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a2],'ror#24')",
	"&eor	($xd_,$xd_,@x[$a3],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#25')",
	"&add	($xc_,$xc_,$xd_)",
	"&mov	(@x[$b3],@x[$b3],'ror#25')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#25')",
	"&eor	(@x[$b3],@x[$b3],$xc_,'ror#25')" );

	@ret;
}

$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb	ldrbhs
#endif

.text
.align	5
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
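@ i.e. the ChaCha constant "expa" "nd 3" "2-by" "te k" read as
@ little-endian 32-bit words.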
.Lone:
.long	1,0,0,0
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.LChaCha20_ctr32
# endif
#else
.word	-1
#endif
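@ The .word above holds the offset from .LChaCha20_ctr32 to
@ OPENSSL_armcap_P (or its absolute address on WIN32), which lets the
@ code below locate the capability vector position-independently via
@ 'ldr r4,[r14,#-32]'.
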
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
.LChaCha20_ctr32:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r14,pc,#16		@ ChaCha20_ctr32
#else
	adr	r14,.LChaCha20_ctr32
#endif
	cmp	r2,#0			@ len==0?
#ifdef	__thumb2__
	itt	eq
#endif
	addeq	sp,sp,#4*3
	beq	.Lno_data
#if __ARM_MAX_ARCH__>=7
	cmp	r2,#192			@ test len
	bls	.Lshort
	ldr	r4,[r14,#-32]
# if !defined(_WIN32)
	ldr	r4,[r14,r4]
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r4,[r4]
# endif
	tst	r4,#ARMV7_NEON
	bne	.LChaCha20_neon
.Lshort:
#endif
	ldmia	r12,{r4-r7}		@ load counter and nonce
	sub	sp,sp,#4*(16)		@ off-load area
	sub	r14,r14,#64		@ .Lsigma
	stmdb	sp!,{r4-r7}		@ copy counter and nonce
	ldmia	r3,{r4-r11}		@ load key
	ldmia	r14,{r0-r3}		@ load sigma
	stmdb	sp!,{r4-r11}		@ copy key
	stmdb	sp!,{r0-r3}		@ copy sigma
	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
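
	@ Stack frame layout from here on, in 4-byte words:
	@ sp+4*(0..15)   working copy of sigma|key|counter|nonce,
	@ sp+4*(16..31)  off-load area for the 'c' and 'd' rows and the
	@                second half of each 64-byte result block,
	@ sp+4*(32..34)  out, inp and len saved by the stmdb above.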
	b	.Loop_outer_enter

.align	4
.Loop_outer:
	ldmia	sp,{r0-r9}		@ load key material
	str	@t[3],[sp,#4*(32+2)]	@ save len
	str	r12, [sp,#4*(32+1)]	@ save inp
	str	r14, [sp,#4*(32+0)]	@ save out
.Loop_outer_enter:
	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	b	.Loop

.align	4
.Loop:
	subs	@t[3],@t[3],#1
___
foreach (&ROUND(0, 4, 8,12)) { eval; }
foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	bne	.Loop

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	cmp	@t[3],#64		@ done yet?
#ifdef	__thumb2__
	itete	lo
#endif
	addlo	r12,sp,#4*(0)		@ shortcut or ...
	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
	addlo	r14,sp,#4*(0)		@ shortcut or ...
	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out

	ldr	@t[0],[sp,#4*(0)]	@ load key material
	ldr	@t[1],[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
	orr	@t[2],r12,r14
	tst	@t[2],#3		@ are input and output aligned?
	ldr	@t[2],[sp,#4*(2)]
	bne	.Lunaligned
	cmp	@t[3],#64		@ restore flags
# else
	ldr	@t[2],[sp,#4*(2)]
# endif
	ldr	@t[3],[sp,#4*(3)]

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]

	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]	@ xor with input
	eorhs	@x[1],@x[1],@t[1]
	add	@t[0],sp,#4*(4)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[1],[r14,#-12]
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
	add	@t[0],sp,#4*(8)
	str	@x[4],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]
	eorhs	@x[1],@x[1],@t[1]
	add	@t[0],sp,#4*(12)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	str	@x[1],[r14,#-12]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1		@ next counter value
	strhi	@t[0],[sp,#4*(12)]	@ save next counter value
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[4],[r14],#16		@ store output
	str	@x[5],[r14,#-12]
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64		@ len-=64
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_outer

	beq	.Ldone
# if __ARM_ARCH__<7
	b	.Ltail

.align	4
.Lunaligned:				@ unaligned endian-neutral path
	cmp	@t[3],#64		@ restore flags
# endif
#endif
#if __ARM_ARCH__<7
	ldr	@t[3],[sp,#4*(3)]
___
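# The unaligned path generated below goes byte by byte: for each state
# word the four input bytes are loaded with ldrb (ldrhsb), xored into the
# low byte, stored with strb, and the word is then shifted down ('lsr#8')
# to expose the next byte, which also makes the store order little-endian
# regardless of host endianness. On 'lo' (final partial block) the input
# loads are skipped and the @t registers are zeroed instead, so the raw
# key stream lands in the stack buffer for .Ltail to consume.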
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;

$code.=<<___	if ($i==4);
	add	@x[0],sp,#4*(16+8)
___
$code.=<<___	if ($i==8);
	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]"
	strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]"
___
$code.=<<___;
	add	@x[$j+0],@x[$j+0],@t[0]	@ accumulate key material
___
$code.=<<___	if ($i==12);
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1		@ next counter value
	strhi	@t[0],[sp,#4*(12)]	@ save next counter value
___
$code.=<<___;
	add	@x[$j+1],@x[$j+1],@t[1]
	add	@x[$j+2],@x[$j+2],@t[2]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[0],@t[0],@t[0]	@ zero or ...
	ldrhsb	@t[0],[r12],#16		@ ... load input
	eorlo	@t[1],@t[1],@t[1]
	ldrhsb	@t[1],[r12,#-12]

	add	@x[$j+3],@x[$j+3],@t[3]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[2],@t[2],@t[2]
	ldrhsb	@t[2],[r12,#-8]
	eorlo	@t[3],@t[3],@t[3]
	ldrhsb	@t[3],[r12,#-4]

	eor	@x[$j+0],@t[0],@x[$j+0]	@ xor with input (or zero)
	eor	@x[$j+1],@t[1],@x[$j+1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-15]	@ load more input
	ldrhsb	@t[1],[r12,#-11]
	eor	@x[$j+2],@t[2],@x[$j+2]
	strb	@x[$j+0],[r14],#16	@ store output
	eor	@x[$j+3],@t[3],@x[$j+3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-7]
	ldrhsb	@t[3],[r12,#-3]
	strb	@x[$j+1],[r14,#-12]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+2],[r14,#-8]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-14]	@ load more input
	ldrhsb	@t[1],[r12,#-10]
	strb	@x[$j+3],[r14,#-4]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+0],[r14,#-15]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-6]
	ldrhsb	@t[3],[r12,#-2]
	strb	@x[$j+1],[r14,#-11]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+2],[r14,#-7]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-13]	@ load more input
	ldrhsb	@t[1],[r12,#-9]
	strb	@x[$j+3],[r14,#-3]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+0],[r14,#-14]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-5]
	ldrhsb	@t[3],[r12,#-1]
	strb	@x[$j+1],[r14,#-10]
	strb	@x[$j+2],[r14,#-6]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+3],[r14,#-2]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
	strb	@x[$j+0],[r14,#-13]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+1],[r14,#-9]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
	strb	@x[$j+2],[r14,#-5]
	strb	@x[$j+3],[r14,#-1]
___
$code.=<<___	if ($i<12);
	add	@t[0],sp,#4*(4+$i)
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
___
}
$code.=<<___;
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64		@ len-=64
	bhi	.Loop_outer

	beq	.Ldone
#endif

.Ltail:
	ldr	r12,[sp,#4*(32+1)]	@ load inp
	add	@t[1],sp,#4*(0)
	ldr	r14,[sp,#4*(32+0)]	@ load out

.Loop_tail:
	ldrb	@t[2],[@t[1]],#1	@ read buffer on stack
	ldrb	@t[3],[r12],#1		@ read input
	subs	@t[0],@t[0],#1
	eor	@t[3],@t[3],@t[2]
	strb	@t[3],[r14],#1		@ store output
	bne	.Loop_tail

.Ldone:
	add	sp,sp,#4*(32+3)
.Lno_data:
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
    map("q$_",(0..15));

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&vadd_i32	($a,$a,$b)",
	"&veor		($d,$d,$a)",
	"&vrev32_16	($d,$d)",	# vrot ($d,16)

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,20)",
	"&vsli_32	($b,$t,12)",

	"&vadd_i32	($a,$a,$b)",
	"&veor		($t,$d,$a)",
	"&vshr_u32	($d,$t,24)",
	"&vsli_32	($d,$t,8)",

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,25)",
	"&vsli_32	($b,$t,7)",

	"&vext_8	($c,$c,$c,8)",
	"&vext_8	($b,$b,$b,$odd?12:4)",
	"&vext_8	($d,$d,$d,$odd?4:12)"
	);
}
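# NEON has no vector rotate, so each 'x <<<= n' above is a vshr.u32 by
# 32-n into the scratch register followed by vsli.32 by n
# (shift-left-and-insert); the rotate by 16 is cheaper as vrev32.16.
# The trailing vext.8 ops rotate the b, c and d rows across lanes,
# turning column rounds into diagonal rounds and ($odd) back again.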

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
.LChaCha20_neon:
	adr	r14,.Lsigma
	vstmdb	sp!,{d8-d15}		@ ABI spec says so
	stmdb	sp!,{r0-r3}

	vld1.32	{$b0-$c0},[r3]		@ load key
	ldmia	r3,{r4-r11}		@ load key

	sub	sp,sp,#4*(16+16)
	vld1.32	{$d0},[r12]		@ load counter and nonce
	add	r12,sp,#4*8
	ldmia	r14,{r0-r3}		@ load sigma
	vld1.32	{$a0},[r14]!		@ load sigma
	vld1.32	{$t0},[r14]		@ one
	vst1.32	{$c0-$d0},[r12]		@ copy 1/2key|counter|nonce
	vst1.32	{$a0-$b0},[sp]		@ copy sigma|1/2key

	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	vshl.i32	$t1#lo,$t0#lo,#1	@ two
	vstr	$t0#lo,[sp,#4*(16+0)]
	vshl.i32	$t2#lo,$t0#lo,#2	@ four
	vstr	$t1#lo,[sp,#4*(16+2)]

	vmov	$a1,$a0
	vstr	$t2#lo,[sp,#4*(16+4)]
	vmov	$a2,$a0
	vmov	$b1,$b0
	vmov	$b2,$b0

	b	.Loop_neon_enter

.align	4
.Loop_neon_outer:
	ldmia	sp,{r0-r9}		@ load key material
	cmp	@t[3],#64*2		@ if len<=64*2
	bls	.Lbreak_neon		@ switch to integer-only
	vmov	$a1,$a0
	str	@t[3],[sp,#4*(32+2)]	@ save len
	vmov	$a2,$a0
	str	r12, [sp,#4*(32+1)]	@ save inp
	vmov	$b1,$b0
	str	r14, [sp,#4*(32+0)]	@ save out
	vmov	$b2,$b0
.Loop_neon_enter:
	ldr	@t[3], [sp,#4*(15)]
	vadd.i32	$d1,$d0,$t0	@ counter+1
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	vmov	$c1,$c0
	ldr	@t[2], [sp,#4*(13)]
	vmov	$c2,$c0
	ldr	@x[14],[sp,#4*(14)]
	vadd.i32	$d2,$d1,$t0	@ counter+2
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	add	@x[12],@x[12],#3	@ counter+3
	b	.Loop_neon

.align	4
.Loop_neon:
	subs	@t[3],@t[3],#1
___
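# Three 64-byte blocks are processed in NEON registers while a fourth
# runs in the integer unit (the 3xNEON+1xIALU column in the table at the
# top). Each loop iteration below interleaves one instruction from each
# of the three NEON "threads" with three instructions of the integer
# ROUND, keeping both pipelines busy on in-order cores.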
my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
my @thread3=&ROUND(0,4,8,12);

foreach (@thread0) {
	eval;			eval(shift(@thread3));
	eval(shift(@thread1));	eval(shift(@thread3));
	eval(shift(@thread2));	eval(shift(@thread3));
}

@thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
@thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
@thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
@thread3=&ROUND(0,5,10,15);

foreach (@thread0) {
	eval;			eval(shift(@thread3));
	eval(shift(@thread1));	eval(shift(@thread3));
	eval(shift(@thread2));	eval(shift(@thread3));
}
$code.=<<___;
	bne	.Loop_neon

	add	@t[3],sp,#32
	vld1.32	{$t0-$t1},[sp]		@ load key material
	vld1.32	{$t2-$t3},[@t[3]]

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	ldr	r12,[sp,#4*(32+1)]	@ load inp
	ldr	r14,[sp,#4*(32+0)]	@ load out

	vadd.i32	$a0,$a0,$t0	@ accumulate key material
	vadd.i32	$a1,$a1,$t0
	vadd.i32	$a2,$a2,$t0
	vldr	$t0#lo,[sp,#4*(16+0)]	@ one

	vadd.i32	$b0,$b0,$t1
	vadd.i32	$b1,$b1,$t1
	vadd.i32	$b2,$b2,$t1
	vldr	$t1#lo,[sp,#4*(16+2)]	@ two

	vadd.i32	$c0,$c0,$t2
	vadd.i32	$c1,$c1,$t2
	vadd.i32	$c2,$c2,$t2
	vadd.i32	$d1#lo,$d1#lo,$t0#lo	@ counter+1
	vadd.i32	$d2#lo,$d2#lo,$t1#lo	@ counter+2

	vadd.i32	$d0,$d0,$t3
	vadd.i32	$d1,$d1,$t3
	vadd.i32	$d2,$d2,$t3
	cmp	@t[3],#64*4
	blo	.Ltail_neon

	vld1.8	{$t0-$t1},[r12]!	@ load input
	mov	@t[3],sp
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0		@ xor with input
	veor	$b0,$b0,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a1,$a1,$t0
	vst1.8	{$a0-$b0},[r14]!	@ store output
	veor	$b1,$b1,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c1,$c1,$t2
	vst1.8	{$c0-$d0},[r14]!
	veor	$d1,$d1,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a2,$a2,$t0
	vld1.32	{$a0-$b0},[@t[3]]!	@ load for next iteration
	veor	$t0#hi,$t0#hi,$t0#hi
	vldr	$t0#lo,[sp,#4*(16+4)]	@ four
	veor	$b2,$b2,$t1
	vld1.32	{$c0-$d0},[@t[3]]
	veor	$c2,$c2,$t2
	vst1.8	{$a1-$b1},[r14]!
	veor	$d2,$d2,$t3
	vst1.8	{$c1-$d1},[r14]!

	vadd.i32	$d0#lo,$d0#lo,$t0#lo	@ next counter value
	vldr	$t0#lo,[sp,#4*(16+0)]	@ one

	ldmia	sp,{@t[0]-@t[3]}	@ load key material
	add	@x[0],@x[0],@t[0]	@ accumulate key material
	ldr	@t[0],[r12],#16		@ load input
	vst1.8	{$a2-$b2},[r14]!
	add	@x[1],@x[1],@t[1]
	ldr	@t[1],[r12,#-12]
	vst1.8	{$c2-$d2},[r14]!
	add	@x[2],@x[2],@t[2]
	ldr	@t[2],[r12,#-8]
	add	@x[3],@x[3],@t[3]
	ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
	eor	@x[0],@x[0],@t[0]	@ xor with input
	add	@t[0],sp,#4*(4)
	eor	@x[1],@x[1],@t[1]
	str	@x[0],[r14],#16		@ store output
	eor	@x[2],@x[2],@t[2]
	str	@x[1],[r14,#-12]
	eor	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	ldr	@t[0],[r12],#16		@ load input
	add	@x[5],@x[5],@t[1]
	ldr	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	ldr	@t[2],[r12,#-8]
	add	@x[7],@x[7],@t[3]
	ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	eor	@x[4],@x[4],@t[0]
	add	@t[0],sp,#4*(8)
	eor	@x[5],@x[5],@t[1]
	str	@x[4],[r14],#16		@ store output
	eor	@x[6],@x[6],@t[2]
	str	@x[5],[r14,#-12]
	eor	@x[7],@x[7],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	ldr	@t[0],[r12],#16		@ load input
	add	@x[1],@x[1],@t[1]
	ldr	@t[1],[r12,#-12]
# ifdef	__thumb2__
	it	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	add	@x[2],@x[2],@t[2]
	ldr	@t[2],[r12,#-8]
# ifdef	__thumb2__
	it	hi
# endif
	strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[3],@x[3],@t[3]
	ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
	eor	@x[0],@x[0],@t[0]
	add	@t[0],sp,#4*(12)
	eor	@x[1],@x[1],@t[1]
	str	@x[0],[r14],#16		@ store output
	eor	@x[2],@x[2],@t[2]
	str	@x[1],[r14,#-12]
	eor	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@t[0],@t[0],#4		@ next counter value
	add	@x[5],@x[5],@t[1]
	str	@t[0],[sp,#4*(12)]	@ save next counter value
	ldr	@t[0],[r12],#16		@ load input
	add	@x[6],@x[6],@t[2]
	add	@x[4],@x[4],#3		@ counter+3
	ldr	@t[1],[r12,#-12]
	add	@x[7],@x[7],@t[3]
	ldr	@t[2],[r12,#-8]
	ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	eor	@x[4],@x[4],@t[0]
# ifdef	__thumb2__
	it	hi
# endif
	ldrhi	@t[0],[sp,#4*(32+2)]	@ re-load len
	eor	@x[5],@x[5],@t[1]
	eor	@x[6],@x[6],@t[2]
	str	@x[4],[r14],#16		@ store output
	eor	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	sub	@t[3],@t[0],#64*4	@ len-=64*4
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_neon_outer

	b	.Ldone_neon

.align	4
.Lbreak_neon:
	@ harmonize NEON and integer-only stack frames: load data
	@ from NEON frame, but save to integer-only one; distance
	@ between the two is 4*(32+4+16-32)=4*(20).

	str	@t[3], [sp,#4*(20+32+2)]	@ save len
	add	@t[3],sp,#4*(32+4)
	str	r12, [sp,#4*(20+32+1)]	@ save inp
	str	r14, [sp,#4*(20+32+0)]	@ save out

	ldr	@x[12],[sp,#4*(16+10)]
	ldr	@x[14],[sp,#4*(16+11)]
	vldmia	@t[3],{d8-d15}		@ fulfill ABI requirement
	str	@x[12],[sp,#4*(20+16+10)]	@ copy "@x[10]"
	str	@x[14],[sp,#4*(20+16+11)]	@ copy "@x[11]"

	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(20+16+15)]
	add	@t[3],sp,#4*(20)
	vst1.32	{$a0-$b0},[@t[3]]!	@ copy key
	add	sp,sp,#4*(20)		@ switch frame
	vst1.32	{$c0-$d0},[@t[3]]
	mov	@t[3],#10
	b	.Loop			@ go integer-only

.align	4
.Ltail_neon:
	cmp	@t[3],#64*3
	bhs	.L192_or_more_neon
	cmp	@t[3],#64*2
	bhs	.L128_or_more_neon
	cmp	@t[3],#64*1
	bhs	.L64_or_more_neon

	add	@t[0],sp,#4*(8)
	vst1.8	{$a0-$b0},[sp]
	add	@t[2],sp,#4*(0)
	vst1.8	{$c0-$d0},[@t[0]]
	b	.Loop_tail_neon

.align	4
.L64_or_more_neon:
	vld1.8	{$t0-$t1},[r12]!
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0
	veor	$b0,$b0,$t1
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vst1.8	{$a0-$b0},[r14]!
	vst1.8	{$c0-$d0},[r14]!
	beq	.Ldone_neon

	add	@t[0],sp,#4*(8)
	vst1.8	{$a1-$b1},[sp]
	add	@t[2],sp,#4*(0)
	vst1.8	{$c1-$d1},[@t[0]]
	sub	@t[3],@t[3],#64*1	@ len-=64*1
	b	.Loop_tail_neon

.align	4
.L128_or_more_neon:
	vld1.8	{$t0-$t1},[r12]!
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0
	veor	$b0,$b0,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a1,$a1,$t0
	veor	$b1,$b1,$t1
	vst1.8	{$a0-$b0},[r14]!
	veor	$c1,$c1,$t2
	vst1.8	{$c0-$d0},[r14]!
	veor	$d1,$d1,$t3
	vst1.8	{$a1-$b1},[r14]!
	vst1.8	{$c1-$d1},[r14]!
	beq	.Ldone_neon

	add	@t[0],sp,#4*(8)
	vst1.8	{$a2-$b2},[sp]
	add	@t[2],sp,#4*(0)
	vst1.8	{$c2-$d2},[@t[0]]
	sub	@t[3],@t[3],#64*2	@ len-=64*2
	b	.Loop_tail_neon

.align	4
.L192_or_more_neon:
	vld1.8	{$t0-$t1},[r12]!
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0
	veor	$b0,$b0,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a1,$a1,$t0
	veor	$b1,$b1,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c1,$c1,$t2
	vst1.8	{$a0-$b0},[r14]!
	veor	$d1,$d1,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a2,$a2,$t0
	vst1.8	{$c0-$d0},[r14]!
	veor	$b2,$b2,$t1
	vst1.8	{$a1-$b1},[r14]!
	veor	$c2,$c2,$t2
	vst1.8	{$c1-$d1},[r14]!
	veor	$d2,$d2,$t3
	vst1.8	{$a2-$b2},[r14]!
	vst1.8	{$c2-$d2},[r14]!
	beq	.Ldone_neon

	ldmia	sp,{@t[0]-@t[3]}	@ load key material
	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@t[0],sp,#4*(4)
	add	@x[1],@x[1],@t[1]
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@t[0],sp,#4*(8)
	add	@x[5],@x[5],@t[1]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	stmia	sp,{@x[0]-@x[7]}
	add	@x[0],sp,#4*(16+8)
	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@t[0],sp,#4*(12)
	add	@x[1],@x[1],@t[1]
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@t[0],sp,#4*(8)
	add	@x[5],@x[5],@t[1]
	add	@x[4],@x[4],#3		@ counter+3
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
	ldr	@t[3],[sp,#4*(32+2)]	@ re-load len
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	stmia	@t[0],{@x[0]-@x[7]}
	add	@t[2],sp,#4*(0)
	sub	@t[3],@t[3],#64*3	@ len-=64*3

.Loop_tail_neon:
	ldrb	@t[0],[@t[2]],#1	@ read buffer on stack
	ldrb	@t[1],[r12],#1		@ read input
	subs	@t[3],@t[3],#1
	eor	@t[0],@t[0],@t[1]
	strb	@t[0],[r14],#1		@ store output
	bne	.Loop_tail_neon

.Ldone_neon:
	add	sp,sp,#4*(32+4)
	vldmia	sp,{d8-d15}
	add	sp,sp,#4*(16+3)
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_neon,.-ChaCha20_neon
.comm	OPENSSL_armcap_P,4,4
#endif
___
}}}
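# Final pass over the generated code: evaluate `...` expressions, and
# resolve the qN#lo/qN#hi shorthand used above to the D registers that
# alias each half of a Q register (q<N>#lo -> d<2*N>, q<N>#hi -> d<2*N+1>).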
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;
	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";