#! /usr/bin/env perl
# Copyright 2016-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Performance in cycles per processed byte, with the percentage being
# improvement over compiler-generated code.
#
#			IALU(*)/gcc-4.4		NEON
#
# ARM11xx(ARMv6)	7.78/+100%		-
# Cortex-A5		6.35/+130%		3.00
# Cortex-A8		6.25/+115%		2.36
# Cortex-A9		5.10/+95%		2.55
# Cortex-A15		3.85/+85%		1.25(**)
# Snapdragon S4		5.70/+100%		1.48(**)
#
# (*)	this is for -march=armv6, i.e. with a bunch of ldrb loading data;
# (**)	these are trade-off results, they can be improved by ~8% but at
#	the cost of a 15/12% regression on Cortex-A5/A7; it's even possible
#	to improve the Cortex-A9 result, but then A5/A7 lose more than 20%;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
        die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}
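
# Typical invocation (an illustrative example, not part of the build system):
#
#   perl poly1305-armv4.pl linux32 poly1305-armv4.S
#
# where the first argument is a perlasm "flavour" understood by arm-xlate.pl
# (e.g. linux32, ios32, win32) and the second is the output assembly file;
# without a recognizable flavour/output the code is written to STDOUT.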
($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
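
# The four scalar arguments above correspond to the C prototypes this module
# is expected to back. A reference sketch of the glue interface, assuming the
# conventional OpenSSL poly1305.c declarations rather than anything defined
# in this file:
#
#   int  poly1305_init(void *ctx, const unsigned char key[16], void *func[2]);
#   void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len,
#                        unsigned int padbit);
#   void poly1305_emit(void *ctx, unsigned char mac[16],
#                      const unsigned int nonce[4]);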
$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.globl	poly1305_emit
.globl	poly1305_blocks
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
.Lpoly1305_init:
	stmdb	sp!,{r4-r11}

	eor	r3,r3,r3
	cmp	$inp,#0
	str	r3,[$ctx,#0]		@ zero hash value
	str	r3,[$ctx,#4]
	str	r3,[$ctx,#8]
	str	r3,[$ctx,#12]
	str	r3,[$ctx,#16]
	str	r3,[$ctx,#36]		@ is_base2_26
	add	$ctx,$ctx,#20

#ifdef	__thumb2__
	it	eq
#endif
	moveq	r0,#0
	beq	.Lno_key

#if	__ARM_MAX_ARCH__>=7
	adr	r11,.Lpoly1305_init
	ldr	r12,.LOPENSSL_armcap
#endif
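
	@ Load the 16-byte key and apply the Poly1305 "clamp": the low word
	@ of r keeps only its low 28 bits (mask 0x0fffffff in r10), and the
	@ other three words additionally clear their two least significant
	@ bits (mask 0x0ffffffc in r3), i.e.
	@ r &= 0x0ffffffc_0ffffffc_0ffffffc_0fffffff.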
	ldrb	r4,[$inp,#0]
	mov	r10,#0x0fffffff
	ldrb	r5,[$inp,#1]
	and	r3,r10,#-4		@ 0x0ffffffc
	ldrb	r6,[$inp,#2]
	ldrb	r7,[$inp,#3]
	orr	r4,r4,r5,lsl#8
	ldrb	r5,[$inp,#4]
	orr	r4,r4,r6,lsl#16
	ldrb	r6,[$inp,#5]
	orr	r4,r4,r7,lsl#24
	ldrb	r7,[$inp,#6]
	and	r4,r4,r10

#if	__ARM_MAX_ARCH__>=7
# if !defined(_WIN32)
	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
#endif
	ldrb	r8,[$inp,#7]
	orr	r5,r5,r6,lsl#8
	ldrb	r6,[$inp,#8]
	orr	r5,r5,r7,lsl#16
	ldrb	r7,[$inp,#9]
	orr	r5,r5,r8,lsl#24
	ldrb	r8,[$inp,#10]
	and	r5,r5,r3

#if	__ARM_MAX_ARCH__>=7
	tst	r12,#ARMV7_NEON		@ check for NEON
# ifdef	__thumb2__
	adr	r9,.Lpoly1305_blocks_neon
	adr	r11,.Lpoly1305_blocks
	adr	r12,.Lpoly1305_emit
	adr	r10,.Lpoly1305_emit_neon
	itt	ne
	movne	r11,r9
	movne	r12,r10
	orr	r11,r11,#1		@ thumb-ify address
	orr	r12,r12,#1
# else
	addeq	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
	addne	r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
	ldrb	r9,[$inp,#11]
	orr	r6,r6,r7,lsl#8
	ldrb	r7,[$inp,#12]
	orr	r6,r6,r8,lsl#16
	ldrb	r8,[$inp,#13]
	orr	r6,r6,r9,lsl#24
	ldrb	r9,[$inp,#14]
	and	r6,r6,r3

	ldrb	r10,[$inp,#15]
	orr	r7,r7,r8,lsl#8
	str	r4,[$ctx,#0]
	orr	r7,r7,r9,lsl#16
	str	r5,[$ctx,#4]
	orr	r7,r7,r10,lsl#24
	str	r6,[$ctx,#8]
	and	r7,r7,r3
	str	r7,[$ctx,#12]
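
	@ On ARMv7+ builds, report which poly1305_blocks/poly1305_emit flavour
	@ was selected (scalar or NEON, depending on OPENSSL_armcap_P) by
	@ writing the two function addresses into the caller-supplied table
	@ at r2 and returning 1; otherwise return 0 and let the caller keep
	@ the default entry points.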
#if	__ARM_MAX_ARCH__>=7
	stmia	r2,{r11,r12}		@ fill functions table
	mov	r0,#1
#else
	mov	r0,#0
#endif
.Lno_key:
	ldmia	sp!,{r4-r11}
#if	__ARM_ARCH__>=5
	ret				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_init,.-poly1305_init
___
{
my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
my ($s1,$s2,$s3)=($r1,$r2,$r3);

$code.=<<___;
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	stmdb	sp!,{r3-r11,lr}

	ands	$len,$len,#-16
	beq	.Lno_data

	cmp	$padbit,#0
	add	$len,$len,$inp		@ end pointer
	sub	sp,sp,#32

	ldmia	$ctx,{$h0-$r3}		@ load context
	str	$ctx,[sp,#12]		@ offload stuff
	mov	lr,$inp
	str	$len,[sp,#16]
	str	$r1,[sp,#20]
	str	$r2,[sp,#24]
	str	$r3,[sp,#28]
	b	.Loop

.Loop:
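	@ Scalar main loop: for each 16-byte block, h += m (little-endian,
	@ with the pad bit added as bit 128 via the fifth limb), then
	@ h = h * r modulo 2^130-5, carried out as a 32x32->64-bit schoolbook
	@ multiply (umull/umlal) with the limbs above 2^130 folded back in
	@ through the s-values set up below.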
#if __ARM_ARCH__<7
	ldrb	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	$h4,$h4,#1		@ 1<<128
	ldrb	r1,[lr,#-15]
	ldrb	r2,[lr,#-14]
	ldrb	r3,[lr,#-13]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-12]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-11]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-10]
	adds	$h0,$h0,r3		@ accumulate input

	ldrb	r3,[lr,#-9]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-8]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-7]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-6]
	adcs	$h1,$h1,r3

	ldrb	r3,[lr,#-5]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-4]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-3]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-2]
	adcs	$h2,$h2,r3

	ldrb	r3,[lr,#-1]
	orr	r1,r0,r1,lsl#8
	str	lr,[sp,#8]		@ offload input pointer
	orr	r2,r1,r2,lsl#16
	add	$s1,$r1,$r1,lsr#2
	orr	r3,r2,r3,lsl#24
#else
	ldr	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	$h4,$h4,#1		@ padbit
	ldr	r1,[lr,#-12]
	ldr	r2,[lr,#-8]
	ldr	r3,[lr,#-4]
# ifdef	__ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	adds	$h0,$h0,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	adcs	$h1,$h1,r1
	add	$s1,$r1,$r1,lsr#2
	adcs	$h2,$h2,r2
#endif
	add	$s2,$r2,$r2,lsr#2
	adcs	$h3,$h3,r3
	add	$s3,$r3,$r3,lsr#2
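
	@ s1-s3 above equal r[i]+(r[i]>>2) = 5*(r[i]/4); this is exact because
	@ the clamped r[1..3] have their two low bits clear. Partial products
	@ weighted 2^128 and above are folded back through them, since
	@ c*2^128 = c*5/4 (mod 2^130-5) whenever c is a multiple of 4.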
	umull	r2,r3,$h1,$r0
	adc	$h4,$h4,#0
	umull	r0,r1,$h0,$r0
	umlal	r2,r3,$h4,$s1
	umlal	r0,r1,$h3,$s1
	ldr	$r1,[sp,#20]		@ reload $r1
	umlal	r2,r3,$h2,$s3
	umlal	r0,r1,$h1,$s3
	umlal	r2,r3,$h3,$s2
	umlal	r0,r1,$h2,$s2
	umlal	r2,r3,$h0,$r1

	str	r0,[sp,#0]		@ future $h0
	mul	r0,$s2,$h4
	ldr	$r2,[sp,#24]		@ reload $r2
	adds	r2,r2,r1		@ d1+=d0>>32
	eor	r1,r1,r1
	adc	lr,r3,#0		@ future $h2
	str	r2,[sp,#4]		@ future $h1

	mul	r2,$s3,$h4
	eor	r3,r3,r3
	umlal	r0,r1,$h3,$s3
	ldr	$r3,[sp,#28]		@ reload $r3
	umlal	r2,r3,$h3,$r0
	umlal	r0,r1,$h2,$r0
	umlal	r2,r3,$h2,$r1
	umlal	r0,r1,$h1,$r1
	umlal	r2,r3,$h1,$r2
	umlal	r0,r1,$h0,$r2
	umlal	r2,r3,$h0,$r3

	ldr	$h0,[sp,#0]
	mul	$h4,$r0,$h4
	ldr	$h1,[sp,#4]

	adds	$h2,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adc	r1,r1,#0
	adds	$h3,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	adc	r3,r3,#0
	add	$h4,$h4,r3		@ h4+=d3>>32
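
	@ Partial reduction: everything in h4 above its low 2 bits represents
	@ multiples of 2^130 and is folded back as *5 (mask with #-4, then add
	@ the masked value plus its quarter), leaving h only slightly above
	@ 2^130. Full canonicalization is deferred to poly1305_emit.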
	and	r1,$h4,#-4
	and	$h4,$h4,#3
	add	r1,r1,r1,lsr#2		@ *=5
	adds	$h0,$h0,r1
	adcs	$h1,$h1,#0
	adcs	$h2,$h2,#0
	adcs	$h3,$h3,#0
	adc	$h4,$h4,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop

	ldr	$ctx,[sp,#12]
	add	sp,sp,#32
	stmia	$ctx,{$h0-$h4}		@ store the result

.Lno_data:
#if	__ARM_ARCH__>=5
	ldmia	sp!,{r3-r11,pc}
#else
	ldmia	sp!,{r3-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_blocks,.-poly1305_blocks
___
}

{
my ($ctx,$mac,$nonce)=map("r$_",(0..2));
my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
my $g4=$h4;

$code.=<<___;
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	stmdb	sp!,{r4-r11}
.Lpoly1305_emit_enter:

	ldmia	$ctx,{$h0-$h4}
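
	@ Final reduction: compute g = h + 5. If the sum reaches 2^130 (bit 2
	@ of the top limb, tested with tst ...,#4), then h >= 2^130-5 and the
	@ low 128 bits of g are the canonical residue, so select g; otherwise
	@ keep h. Selection uses conditional moves, not branches.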
	adds	$g0,$h0,#5		@ compare to modulus
	adcs	$g1,$h1,#0
	adcs	$g2,$h2,#0
	adcs	$g3,$h3,#0
	adc	$g4,$h4,#0
	tst	$g4,#4			@ did it carry/borrow?

#ifdef	__thumb2__
	it	ne
#endif
	movne	$h0,$g0
	ldr	$g0,[$nonce,#0]
#ifdef	__thumb2__
	it	ne
#endif
	movne	$h1,$g1
	ldr	$g1,[$nonce,#4]
#ifdef	__thumb2__
	it	ne
#endif
	movne	$h2,$g2
	ldr	$g2,[$nonce,#8]
#ifdef	__thumb2__
	it	ne
#endif
	movne	$h3,$g3
	ldr	$g3,[$nonce,#12]

	adds	$h0,$h0,$g0
	adcs	$h1,$h1,$g1
	adcs	$h2,$h2,$g2
	adc	$h3,$h3,$g3

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev	$h0,$h0
	rev	$h1,$h1
	rev	$h2,$h2
	rev	$h3,$h3
# endif
	str	$h0,[$mac,#0]
	str	$h1,[$mac,#4]
	str	$h2,[$mac,#8]
	str	$h3,[$mac,#12]
#else
	strb	$h0,[$mac,#0]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#4]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#8]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#12]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#1]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#5]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#9]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#13]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#2]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#6]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#10]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#14]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#3]
	strb	$h1,[$mac,#7]
	strb	$h2,[$mac,#11]
	strb	$h3,[$mac,#15]
#endif
	ldmia	sp!,{r4-r11}
#if	__ARM_ARCH__>=5
	ret				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_emit,.-poly1305_emit
___
{
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
my ($T0,$T1,$MASK) = map("q$_",(15,4,0));

my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));

$code.=<<___;
#if	__ARM_MAX_ARCH__>=7
.fpu	neon

.type	poly1305_init_neon,%function
.align	5
poly1305_init_neon:
	ldr	r4,[$ctx,#20]		@ load key base 2^32
	ldr	r5,[$ctx,#24]
	ldr	r6,[$ctx,#28]
	ldr	r7,[$ctx,#32]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	and	r3,r3,#0x03ffffff
	and	r4,r4,#0x03ffffff
	and	r5,r5,#0x03ffffff

	vdup.32	$R0,r2			@ r^1 in both lanes
	add	r2,r3,r3,lsl#2		@ *5
	vdup.32	$R1,r3
	add	r3,r4,r4,lsl#2
	vdup.32	$S1,r2
	vdup.32	$R2,r4
	add	r4,r5,r5,lsl#2
	vdup.32	$S2,r3
	vdup.32	$R3,r5
	add	r5,r6,r6,lsl#2
	vdup.32	$S3,r4
	vdup.32	$R4,r6
	vdup.32	$S4,r5

	mov	$zeros,#2		@ counter
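
	@ The loop below squares the duplicated r twice: the first pass yields
	@ r^2 (interleaved with r^1 via vtrn), the second pass multiplies the
	@ {r^1,r^2} pair by the r^2 lane to obtain r^3 and r^4. The powers,
	@ together with their 5* multiples, are laid out in interleaved tables
	@ starting at offset 48 of the context for the 2-way NEON loop.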
.Lsquare_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4

	vmull.u32	$D0,$R0,${R0}[1]
	vmull.u32	$D1,$R1,${R0}[1]
	vmull.u32	$D2,$R2,${R0}[1]
	vmull.u32	$D3,$R3,${R0}[1]
	vmull.u32	$D4,$R4,${R0}[1]

	vmlal.u32	$D0,$R4,${S1}[1]
	vmlal.u32	$D1,$R0,${R1}[1]
	vmlal.u32	$D2,$R1,${R1}[1]
	vmlal.u32	$D3,$R2,${R1}[1]
	vmlal.u32	$D4,$R3,${R1}[1]

	vmlal.u32	$D0,$R3,${S2}[1]
	vmlal.u32	$D1,$R4,${S2}[1]
	vmlal.u32	$D3,$R1,${R2}[1]
	vmlal.u32	$D2,$R0,${R2}[1]
	vmlal.u32	$D4,$R2,${R2}[1]

	vmlal.u32	$D0,$R2,${S3}[1]
	vmlal.u32	$D3,$R0,${R3}[1]
	vmlal.u32	$D1,$R3,${S3}[1]
	vmlal.u32	$D2,$R4,${S3}[1]
	vmlal.u32	$D4,$R1,${R3}[1]

	vmlal.u32	$D3,$R4,${S4}[1]
	vmlal.u32	$D0,$R1,${S4}[1]
	vmlal.u32	$D1,$R2,${S4}[1]
	vmlal.u32	$D2,$R3,${S4}[1]
	vmlal.u32	$D4,$R0,${R4}[1]
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ Result of multiplication of an n-bit number by an m-bit number is
	@ n+m bits wide. However! Even though 2^n is an (n+1)-bit number,
	@ an m-bit number multiplied by 2^n is still n+m bits wide.
	@
	@ Sum of two n-bit numbers is n+1 bits wide, sum of three is n+2,
	@ and so is sum of four. Sum of 2^m (n-m)-bit numbers and one n-bit
	@ number is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases when their width exceeds 26 bits
	@ they are limited by 2^26+2^6. This in turn means that the *sum*
	@ of the products with these values can still be viewed as a sum
	@ of 52-bit numbers as long as the number of addends is not a
	@ power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied
	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
	@ which is less than 32 * (2^52) or 2^57. And when processing
	@ data we are looking at three times as many addends...
	@
	@ In the key setup procedure the pre-reduced H0 is limited to
	@ 5*4+1 52-bit addends and 5*H4 to 5*5 such addends, or 57 bits.
	@ But when hashing input, H0 is limited to (5*4+1)*3 addends,
	@ or 58 bits, while 5*H4 to 5*5*3, or 59[!] bits. How is this
	@ relevant? The vmlal.u32 instruction accepts 2x32-bit input and
	@ writes a 2x64-bit result. This means that the result of the
	@ reduction has to be compressed upon loop wrap-around. This can
	@ be done in the process of reduction to minimize the number of
	@ instructions [as well as the number of 128-bit instructions,
	@ which benefits low-end processors], but one has to watch for H2
	@ (which is narrower than H0) and 5*H4 not being wider than 58
	@ bits, so that the result of the right shift by 26 bits fits in
	@ 32 bits. This is also useful on x86, because it allows the use
	@ of paddd in place of paddq, which benefits Atom, where paddq is
	@ ridiculously slow.
	vshr.u64	$T0,$D3,#26
	vmovn.i64	$D3#lo,$D3
	vshr.u64	$T1,$D0,#26
	vmovn.i64	$D0#lo,$D0
	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
	vbic.i32	$D3#lo,#0xfc000000	@ &=0x03ffffff
	vadd.i64	$D1,$D1,$T1		@ h0 -> h1
	vbic.i32	$D0#lo,#0xfc000000

	vshrn.u64	$T0#lo,$D4,#26
	vmovn.i64	$D4#lo,$D4
	vshr.u64	$T1,$D1,#26
	vmovn.i64	$D1#lo,$D1
	vadd.i64	$D2,$D2,$T1		@ h1 -> h2
	vbic.i32	$D4#lo,#0xfc000000
	vbic.i32	$D1#lo,#0xfc000000

	vadd.i32	$D0#lo,$D0#lo,$T0#lo
	vshl.u32	$T0#lo,$T0#lo,#2
	vshrn.u64	$T1#lo,$D2,#26
	vmovn.i64	$D2#lo,$D2
	vadd.i32	$D0#lo,$D0#lo,$T0#lo	@ h4 -> h0
	vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
	vbic.i32	$D2#lo,#0xfc000000

	vshr.u32	$T0#lo,$D0#lo,#26
	vbic.i32	$D0#lo,#0xfc000000
	vshr.u32	$T1#lo,$D3#lo,#26
	vbic.i32	$D3#lo,#0xfc000000
	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
	vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4

	subs	$zeros,$zeros,#1
	beq	.Lsquare_break_neon

	add	$tbl0,$ctx,#(48+0*9*4)
	add	$tbl1,$ctx,#(48+1*9*4)

	vtrn.32	$R0,$D0#lo		@ r^2:r^1
	vtrn.32	$R2,$D2#lo
	vtrn.32	$R3,$D3#lo
	vtrn.32	$R1,$D1#lo
	vtrn.32	$R4,$D4#lo

	vshl.u32	$S2,$R2,#2		@ *5
	vshl.u32	$S3,$R3,#2
	vshl.u32	$S1,$R1,#2
	vshl.u32	$S4,$R4,#2
	vadd.i32	$S2,$S2,$R2
	vadd.i32	$S1,$S1,$R1
	vadd.i32	$S3,$S3,$R3
	vadd.i32	$S4,$S4,$R4

	vst4.32	{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
	vst4.32	{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
	vst4.32	{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vst4.32	{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vst1.32	{${S4}[0]},[$tbl0,:32]
	vst1.32	{${S4}[1]},[$tbl1,:32]

	b	.Lsquare_neon

.align	4
.Lsquare_break_neon:
	add	$tbl0,$ctx,#(48+2*4*9)
	add	$tbl1,$ctx,#(48+3*4*9)

	vmov	$R0,$D0#lo		@ r^4:r^3
	vshl.u32	$S1,$D1#lo,#2		@ *5
	vmov	$R1,$D1#lo
	vshl.u32	$S2,$D2#lo,#2
	vmov	$R2,$D2#lo
	vshl.u32	$S3,$D3#lo,#2
	vmov	$R3,$D3#lo
	vshl.u32	$S4,$D4#lo,#2
	vmov	$R4,$D4#lo
	vadd.i32	$S1,$S1,$D1#lo
	vadd.i32	$S2,$S2,$D2#lo
	vadd.i32	$S3,$S3,$D3#lo
	vadd.i32	$S4,$S4,$D4#lo

	vst4.32	{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
	vst4.32	{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
	vst4.32	{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vst4.32	{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vst1.32	{${S4}[0]},[$tbl0]
	vst1.32	{${S4}[1]},[$tbl1]

	ret				@ bx	lr
.size	poly1305_init_neon,.-poly1305_init_neon

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	ip,[$ctx,#36]		@ is_base2_26

	ands	$len,$len,#-16
	beq	.Lno_data_neon

	cmp	$len,#64
	bhs	.Lenter_neon
	tst	ip,ip			@ is_base2_26?
	beq	.Lpoly1305_blocks
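
	@ Dispatch: inputs shorter than 64 bytes while the hash is still in
	@ base 2^32 are not worth converting, so they are handed to the scalar
	@ poly1305_blocks; otherwise proceed with the NEON path, converting
	@ the hash to base 2^26 on first use.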
.Lenter_neon:
	stmdb	sp!,{r4-r7}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so

	tst	ip,ip			@ is_base2_26?
	bne	.Lbase2_26_neon

	stmdb	sp!,{r1-r3,lr}
	bl	poly1305_init_neon

	ldr	r4,[$ctx,#0]		@ load hash value base 2^32
	ldr	r5,[$ctx,#4]
	ldr	r6,[$ctx,#8]
	ldr	r7,[$ctx,#12]
	ldr	ip,[$ctx,#16]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	veor	$D0#lo,$D0#lo,$D0#lo
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	veor	$D1#lo,$D1#lo,$D1#lo
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	veor	$D2#lo,$D2#lo,$D2#lo
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	veor	$D3#lo,$D3#lo,$D3#lo
	and	r3,r3,#0x03ffffff
	orr	r6,r6,ip,lsl#24
	veor	$D4#lo,$D4#lo,$D4#lo
	and	r4,r4,#0x03ffffff
	mov	r1,#1
	and	r5,r5,#0x03ffffff
	str	r1,[$ctx,#36]		@ is_base2_26

	vmov.32	$D0#lo[0],r2
	vmov.32	$D1#lo[0],r3
	vmov.32	$D2#lo[0],r4
	vmov.32	$D3#lo[0],r5
	vmov.32	$D4#lo[0],r6
	adr	$zeros,.Lzeros

	ldmia	sp!,{r1-r3,lr}
	b	.Lbase2_32_neon

.align	4
.Lbase2_26_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ load hash value

	veor	$D0#lo,$D0#lo,$D0#lo
	veor	$D1#lo,$D1#lo,$D1#lo
	veor	$D2#lo,$D2#lo,$D2#lo
	veor	$D3#lo,$D3#lo,$D3#lo
	veor	$D4#lo,$D4#lo,$D4#lo
	vld4.32	{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
	adr	$zeros,.Lzeros
	vld1.32	{$D4#lo[0]},[$ctx]
	sub	$ctx,$ctx,#16		@ rewind

.Lbase2_32_neon:
	add	$in2,$inp,#32
	mov	$padbit,$padbit,lsl#24
	tst	$len,#31
	beq	.Leven
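
	@ If the total length is not a multiple of 32 bytes, the odd 16-byte
	@ block is handled first (converted to base 2^26, combined with the
	@ hash and multiplied by r via the tail code) so that the main loop
	@ always processes two blocks per iteration.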
	vld4.32	{$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
	vmov.32	$H4#lo[0],$padbit
	sub	$len,$len,#16
	add	$in2,$inp,#32

# ifdef	__ARMEB__
	vrev32.8	$H0,$H0
	vrev32.8	$H3,$H3
	vrev32.8	$H1,$H1
	vrev32.8	$H2,$H2
# endif
	vsri.u32	$H4#lo,$H3#lo,#8	@ base 2^32 -> base 2^26
	vshl.u32	$H3#lo,$H3#lo,#18

	vsri.u32	$H3#lo,$H2#lo,#14
	vshl.u32	$H2#lo,$H2#lo,#12
	vadd.i32	$H4#hi,$H4#lo,$D4#lo	@ add hash value and move to #hi

	vbic.i32	$H3#lo,#0xfc000000
	vsri.u32	$H2#lo,$H1#lo,#20
	vshl.u32	$H1#lo,$H1#lo,#6

	vbic.i32	$H2#lo,#0xfc000000
	vsri.u32	$H1#lo,$H0#lo,#26
	vadd.i32	$H3#hi,$H3#lo,$D3#lo

	vbic.i32	$H0#lo,#0xfc000000
	vbic.i32	$H1#lo,#0xfc000000
	vadd.i32	$H2#hi,$H2#lo,$D2#lo

	vadd.i32	$H0#hi,$H0#lo,$D0#lo
	vadd.i32	$H1#hi,$H1#lo,$D1#lo

	mov	$tbl1,$zeros
	add	$tbl0,$ctx,#48

	cmp	$len,$len
	b	.Long_tail

.align	4
.Leven:
	subs	$len,$len,#64
	it	lo
	movlo	$in2,$zeros

	vmov.i32	$H4,#1<<24		@ padbit, yes, always
	vld4.32	{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
	add	$inp,$inp,#64
	vld4.32	{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
	add	$in2,$in2,#64
	itt	hi
	addhi	$tbl1,$ctx,#(48+1*9*4)
	addhi	$tbl0,$ctx,#(48+3*9*4)

# ifdef	__ARMEB__
	vrev32.8	$H0,$H0
	vrev32.8	$H3,$H3
	vrev32.8	$H1,$H1
	vrev32.8	$H2,$H2
# endif
	vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
	vshl.u32	$H3,$H3,#18

	vsri.u32	$H3,$H2,#14
	vshl.u32	$H2,$H2,#12

	vbic.i32	$H3,#0xfc000000
	vsri.u32	$H2,$H1,#20
	vshl.u32	$H1,$H1,#6

	vbic.i32	$H2,#0xfc000000
	vsri.u32	$H1,$H0,#26

	vbic.i32	$H0,#0xfc000000
	vbic.i32	$H1,#0xfc000000

	bls	.Lskip_loop

	vld4.32	{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^2
	vld4.32	{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4
	vld4.32	{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vld4.32	{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	b	.Loop_neon

.align	5
.Loop_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	@   \___________________/
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	@   \___________________/ \____________________/
	@
	@ Note that we start with inp[2:3]*r^2. This is because it
	@ doesn't depend on the reduction in the previous iteration.
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ inp[2:3]*r^2

	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ accumulate inp[0:1]
	vmull.u32	$D2,$H2#hi,${R0}[1]
	vadd.i32	$H0#lo,$H0#lo,$D0#lo
	vmull.u32	$D0,$H0#hi,${R0}[1]
	vadd.i32	$H3#lo,$H3#lo,$D3#lo
	vmull.u32	$D3,$H3#hi,${R0}[1]
	vmlal.u32	$D2,$H1#hi,${R1}[1]
	vadd.i32	$H1#lo,$H1#lo,$D1#lo
	vmull.u32	$D1,$H1#hi,${R0}[1]

	vadd.i32	$H4#lo,$H4#lo,$D4#lo
	vmull.u32	$D4,$H4#hi,${R0}[1]
	subs	$len,$len,#64
	vmlal.u32	$D0,$H4#hi,${S1}[1]
	it	lo
	movlo	$in2,$zeros
	vmlal.u32	$D3,$H2#hi,${R1}[1]
	vld1.32	${S4}[1],[$tbl1,:32]
	vmlal.u32	$D1,$H0#hi,${R1}[1]
	vmlal.u32	$D4,$H3#hi,${R1}[1]

	vmlal.u32	$D0,$H3#hi,${S2}[1]
	vmlal.u32	$D3,$H1#hi,${R2}[1]
	vmlal.u32	$D4,$H2#hi,${R2}[1]
	vmlal.u32	$D1,$H4#hi,${S2}[1]
	vmlal.u32	$D2,$H0#hi,${R2}[1]

	vmlal.u32	$D3,$H0#hi,${R3}[1]
	vmlal.u32	$D0,$H2#hi,${S3}[1]
	vmlal.u32	$D4,$H1#hi,${R3}[1]
	vmlal.u32	$D1,$H3#hi,${S3}[1]
	vmlal.u32	$D2,$H4#hi,${S3}[1]

	vmlal.u32	$D3,$H4#hi,${S4}[1]
	vmlal.u32	$D0,$H1#hi,${S4}[1]
	vmlal.u32	$D4,$H0#hi,${R4}[1]
	vmlal.u32	$D1,$H2#hi,${S4}[1]
	vmlal.u32	$D2,$H3#hi,${S4}[1]

	vld4.32	{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
	add	$in2,$in2,#64

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4 and accumulate

	vmlal.u32	$D3,$H3#lo,${R0}[0]
	vmlal.u32	$D0,$H0#lo,${R0}[0]
	vmlal.u32	$D4,$H4#lo,${R0}[0]
	vmlal.u32	$D1,$H1#lo,${R0}[0]
	vmlal.u32	$D2,$H2#lo,${R0}[0]
	vld1.32	${S4}[0],[$tbl0,:32]

	vmlal.u32	$D3,$H2#lo,${R1}[0]
	vmlal.u32	$D0,$H4#lo,${S1}[0]
	vmlal.u32	$D4,$H3#lo,${R1}[0]
	vmlal.u32	$D1,$H0#lo,${R1}[0]
	vmlal.u32	$D2,$H1#lo,${R1}[0]

	vmlal.u32	$D3,$H1#lo,${R2}[0]
	vmlal.u32	$D0,$H3#lo,${S2}[0]
	vmlal.u32	$D4,$H2#lo,${R2}[0]
	vmlal.u32	$D1,$H4#lo,${S2}[0]
	vmlal.u32	$D2,$H0#lo,${R2}[0]

	vmlal.u32	$D3,$H0#lo,${R3}[0]
	vmlal.u32	$D0,$H2#lo,${S3}[0]
	vmlal.u32	$D4,$H1#lo,${R3}[0]
	vmlal.u32	$D1,$H3#lo,${S3}[0]
	vmlal.u32	$D3,$H4#lo,${S4}[0]
	vmlal.u32	$D2,$H4#lo,${S3}[0]
	vmlal.u32	$D0,$H1#lo,${S4}[0]
	vmlal.u32	$D4,$H0#lo,${R4}[0]
	vmov.i32	$H4,#1<<24		@ padbit, yes, always
	vmlal.u32	$D1,$H2#lo,${S4}[0]
	vmlal.u32	$D2,$H3#lo,${S4}[0]

	vld4.32	{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
	add	$inp,$inp,#64
# ifdef	__ARMEB__
	vrev32.8	$H0,$H0
	vrev32.8	$H1,$H1
	vrev32.8	$H2,$H2
	vrev32.8	$H3,$H3
# endif

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
	@ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.

	vshr.u64	$T0,$D3,#26
	vmovn.i64	$D3#lo,$D3
	vshr.u64	$T1,$D0,#26
	vmovn.i64	$D0#lo,$D0
	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
	vbic.i32	$D3#lo,#0xfc000000
	vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
	vadd.i64	$D1,$D1,$T1		@ h0 -> h1
	vshl.u32	$H3,$H3,#18
	vbic.i32	$D0#lo,#0xfc000000

	vshrn.u64	$T0#lo,$D4,#26
	vmovn.i64	$D4#lo,$D4
	vshr.u64	$T1,$D1,#26
	vmovn.i64	$D1#lo,$D1
	vadd.i64	$D2,$D2,$T1		@ h1 -> h2
	vsri.u32	$H3,$H2,#14
	vbic.i32	$D4#lo,#0xfc000000
	vshl.u32	$H2,$H2,#12
	vbic.i32	$D1#lo,#0xfc000000

	vadd.i32	$D0#lo,$D0#lo,$T0#lo
	vshl.u32	$T0#lo,$T0#lo,#2
	vbic.i32	$H3,#0xfc000000
	vshrn.u64	$T1#lo,$D2,#26
	vmovn.i64	$D2#lo,$D2
	vaddl.u32	$D0,$D0#lo,$T0#lo	@ h4 -> h0 [widen for a sec]
	vsri.u32	$H2,$H1,#20
	vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
	vshl.u32	$H1,$H1,#6
	vbic.i32	$D2#lo,#0xfc000000
	vbic.i32	$H2,#0xfc000000

	vshrn.u64	$T0#lo,$D0,#26		@ re-narrow
	vmovn.i64	$D0#lo,$D0
	vsri.u32	$H1,$H0,#26
	vbic.i32	$H0,#0xfc000000
	vshr.u32	$T1#lo,$D3#lo,#26
	vbic.i32	$D3#lo,#0xfc000000
	vbic.i32	$D0#lo,#0xfc000000
	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
	vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4
	vbic.i32	$H1,#0xfc000000

	bhi	.Loop_neon

.Lskip_loop:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add	$tbl1,$ctx,#(48+0*9*4)
	add	$tbl0,$ctx,#(48+1*9*4)
	adds	$len,$len,#32
	it	ne
	movne	$len,#0
	bne	.Long_tail

	vadd.i32	$H2#hi,$H2#lo,$D2#lo	@ add hash value and move to #hi
	vadd.i32	$H0#hi,$H0#lo,$D0#lo
	vadd.i32	$H3#hi,$H3#lo,$D3#lo
	vadd.i32	$H1#hi,$H1#lo,$D1#lo
	vadd.i32	$H4#hi,$H4#lo,$D4#lo

.Long_tail:
	vld4.32	{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^1
	vld4.32	{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^2

	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ can be redundant
	vmull.u32	$D2,$H2#hi,$R0
	vadd.i32	$H0#lo,$H0#lo,$D0#lo
	vmull.u32	$D0,$H0#hi,$R0
	vadd.i32	$H3#lo,$H3#lo,$D3#lo
	vmull.u32	$D3,$H3#hi,$R0
	vadd.i32	$H1#lo,$H1#lo,$D1#lo
	vmull.u32	$D1,$H1#hi,$R0
	vadd.i32	$H4#lo,$H4#lo,$D4#lo
	vmull.u32	$D4,$H4#hi,$R0

	vmlal.u32	$D0,$H4#hi,$S1
	vld4.32	{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vmlal.u32	$D3,$H2#hi,$R1
	vld4.32	{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vmlal.u32	$D1,$H0#hi,$R1
	vmlal.u32	$D4,$H3#hi,$R1
	vmlal.u32	$D2,$H1#hi,$R1

	vmlal.u32	$D3,$H1#hi,$R2
	vld1.32	${S4}[1],[$tbl1,:32]
	vmlal.u32	$D0,$H3#hi,$S2
	vld1.32	${S4}[0],[$tbl0,:32]
	vmlal.u32	$D4,$H2#hi,$R2
	vmlal.u32	$D1,$H4#hi,$S2
	vmlal.u32	$D2,$H0#hi,$R2

	vmlal.u32	$D3,$H0#hi,$R3
	it	ne
	addne	$tbl1,$ctx,#(48+2*9*4)
	vmlal.u32	$D0,$H2#hi,$S3
	it	ne
	addne	$tbl0,$ctx,#(48+3*9*4)
	vmlal.u32	$D4,$H1#hi,$R3
	vmlal.u32	$D1,$H3#hi,$S3
	vmlal.u32	$D2,$H4#hi,$S3

	vmlal.u32	$D3,$H4#hi,$S4
	vorn	$MASK,$MASK,$MASK	@ all-ones, can be redundant
	vmlal.u32	$D0,$H1#hi,$S4
	vshr.u64	$MASK,$MASK,#38
	vmlal.u32	$D4,$H0#hi,$R4
	vmlal.u32	$D1,$H2#hi,$S4
	vmlal.u32	$D2,$H3#hi,$S4

	beq	.Lshort_tail

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32	{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^3
	vld4.32	{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4

	vmlal.u32	$D2,$H2#lo,$R0
	vmlal.u32	$D0,$H0#lo,$R0
	vmlal.u32	$D3,$H3#lo,$R0
	vmlal.u32	$D1,$H1#lo,$R0
	vmlal.u32	$D4,$H4#lo,$R0

	vmlal.u32	$D0,$H4#lo,$S1
	vld4.32	{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vmlal.u32	$D3,$H2#lo,$R1
	vld4.32	{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vmlal.u32	$D1,$H0#lo,$R1
	vmlal.u32	$D4,$H3#lo,$R1
	vmlal.u32	$D2,$H1#lo,$R1

	vmlal.u32	$D3,$H1#lo,$R2
	vld1.32	${S4}[1],[$tbl1,:32]
	vmlal.u32	$D0,$H3#lo,$S2
	vld1.32	${S4}[0],[$tbl0,:32]
	vmlal.u32	$D4,$H2#lo,$R2
	vmlal.u32	$D1,$H4#lo,$S2
	vmlal.u32	$D2,$H0#lo,$R2

	vmlal.u32	$D3,$H0#lo,$R3
	vmlal.u32	$D0,$H2#lo,$S3
	vmlal.u32	$D4,$H1#lo,$R3
	vmlal.u32	$D1,$H3#lo,$S3
	vmlal.u32	$D2,$H4#lo,$S3

	vmlal.u32	$D3,$H4#lo,$S4
	vorn	$MASK,$MASK,$MASK	@ all-ones
	vmlal.u32	$D0,$H1#lo,$S4
	vshr.u64	$MASK,$MASK,#38
	vmlal.u32	$D4,$H0#lo,$R4
	vmlal.u32	$D1,$H2#lo,$S4
	vmlal.u32	$D2,$H3#lo,$S4

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64	$D3#lo,$D3#lo,$D3#hi
	vadd.i64	$D0#lo,$D0#lo,$D0#hi
	vadd.i64	$D4#lo,$D4#lo,$D4#hi
	vadd.i64	$D1#lo,$D1#lo,$D1#hi
	vadd.i64	$D2#lo,$D2#lo,$D2#hi

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

	vshr.u64	$T0,$D3,#26
	vand.i64	$D3,$D3,$MASK
	vshr.u64	$T1,$D0,#26
	vand.i64	$D0,$D0,$MASK
	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
	vadd.i64	$D1,$D1,$T1		@ h0 -> h1

	vshr.u64	$T0,$D4,#26
	vand.i64	$D4,$D4,$MASK
	vshr.u64	$T1,$D1,#26
	vand.i64	$D1,$D1,$MASK
	vadd.i64	$D2,$D2,$T1		@ h1 -> h2

	vadd.i64	$D0,$D0,$T0
	vshl.u64	$T0,$T0,#2
	vshr.u64	$T1,$D2,#26
	vand.i64	$D2,$D2,$MASK
	vadd.i64	$D0,$D0,$T0		@ h4 -> h0
	vadd.i64	$D3,$D3,$T1		@ h2 -> h3

	vshr.u64	$T0,$D0,#26
	vand.i64	$D0,$D0,$MASK
	vshr.u64	$T1,$D3,#26
	vand.i64	$D3,$D3,$MASK
	vadd.i64	$D1,$D1,$T0		@ h0 -> h1
	vadd.i64	$D4,$D4,$T1		@ h3 -> h4

	cmp	$len,#0
	bne	.Leven

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ store hash value

	vst4.32	{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
	vst1.32	{$D4#lo[0]},[$ctx]

	vldmia	sp!,{d8-d15}		@ epilogue
	ldmia	sp!,{r4-r7}

.Lno_data_neon:
	ret				@ bx	lr
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	ldr	ip,[$ctx,#36]		@ is_base2_26

	stmdb	sp!,{r4-r11}

	tst	ip,ip
	beq	.Lpoly1305_emit_enter
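
	@ The hash is in base 2^26: convert the five 26-bit limbs back to
	@ four 32-bit words, folding anything above bit 130 back in as *5,
	@ before the final comparison with the modulus and the nonce addition
	@ below.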
	ldmia	$ctx,{$h0-$h4}
	eor	$g0,$g0,$g0

	adds	$h0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
	mov	$h1,$h1,lsr#6
	adcs	$h1,$h1,$h2,lsl#20
	mov	$h2,$h2,lsr#12
	adcs	$h2,$h2,$h3,lsl#14
	mov	$h3,$h3,lsr#18
	adcs	$h3,$h3,$h4,lsl#8
	adc	$h4,$g0,$h4,lsr#24	@ can be partially reduced ...

	and	$g0,$h4,#-4		@ ... so reduce
	and	$h4,$h4,#3
	add	$g0,$g0,$g0,lsr#2	@ *= 5
	adds	$h0,$h0,$g0
	adcs	$h1,$h1,#0
	adcs	$h2,$h2,#0
	adcs	$h3,$h3,#0
	adc	$h4,$h4,#0

	adds	$g0,$h0,#5		@ compare to modulus
	adcs	$g1,$h1,#0
	adcs	$g2,$h2,#0
	adcs	$g3,$h3,#0
	adc	$g4,$h4,#0
	tst	$g4,#4			@ did it carry/borrow?

	it	ne
	movne	$h0,$g0
	ldr	$g0,[$nonce,#0]
	it	ne
	movne	$h1,$g1
	ldr	$g1,[$nonce,#4]
	it	ne
	movne	$h2,$g2
	ldr	$g2,[$nonce,#8]
	it	ne
	movne	$h3,$g3
	ldr	$g3,[$nonce,#12]

	adds	$h0,$h0,$g0		@ accumulate nonce
	adcs	$h1,$h1,$g1
	adcs	$h2,$h2,$g2
	adc	$h3,$h3,$g3

# ifdef __ARMEB__
	rev	$h0,$h0
	rev	$h1,$h1
	rev	$h2,$h2
	rev	$h3,$h3
# endif
	str	$h0,[$mac,#0]		@ store the result
	str	$h1,[$mac,#4]
	str	$h2,[$mac,#8]
	str	$h3,[$mac,#12]

	ldmia	sp!,{r4-r11}
	ret				@ bx	lr
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lpoly1305_init
# endif
#endif
___
}	}

$code.=<<___;
.asciz	"Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if	__ARM_MAX_ARCH__>=7
.extern	OPENSSL_armcap_P
#endif
___
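
# Post-process the accumulated assembly: map the q<N>#lo/#hi pseudo-notation
# used above onto the underlying d registers, turn "ret" into "bx lr", and
# encode any literal "bx lr" as .word 0xe12fff1e so the result still
# assembles with -march=armv4.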
foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/geo;

    s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
    s/\bret\b/bx lr/go						or
    s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

    print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush