#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# sha1_block procedure for ARMv4.
#
# January 2007.
#
# Size/performance trade-off
# ====================================================================
# impl		size in bytes	comp cycles[*]	measured performance
# ====================================================================
# thumb		304		3212		4420
# armv4-small	392/+29%	1958/+64%	2250/+96%
# armv4-compact	740/+89%	1552/+26%	1840/+22%
# armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
# full unroll	~5100/+260%	~1260/+4%	~1300/+5%
# ====================================================================
# thumb		= same as 'small' but in Thumb instructions[**] and
#		  with recurring code in two private functions;
# small		= detached Xload/update, loops are folded;
# compact	= detached Xload/update, 5x unroll;
# large		= interleaved Xload/update, 5x unroll;
# full unroll	= interleaved Xload/update, full unroll, estimated[!];
#
# [*]	Manually counted instructions in "grand" loop body. Measured
#	performance is affected by prologue and epilogue overhead,
#	i-cache availability, branch penalties, etc.
# [**]	While each Thumb instruction is twice smaller, they are not as
#	diverse as ARM ones: e.g., there are only two arithmetic
#	instructions with 3 arguments, no [fixed] rotate, addressing
#	modes are limited. As result it takes more instructions to do
#	the same job in Thumb, therefore the code is never twice as
#	small and always slower.
# [***]	which is also ~35% better than compiler generated code. Dual-
#	issue Cortex A8 core was measured to process input block in
#	~990 cycles.

# August 2010.
#
# Rescheduling for dual-issue pipeline resulted in 13% improvement on
# Cortex A8 core and in absolute terms ~870 cycles per input block
# [or 13.6 cycles per byte].

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 10%
# improvement on Cortex A8 core and 12.2 cycles per byte.

# September 2013.
#
# Add NEON implementation (see sha1-586.pl for background info). On
# Cortex A8 it was measured to process one byte in 6.7 cycles or >80%
# faster than integer-only code. Because [fully unrolled] NEON code
# is ~2.5x larger and there are some redundant instructions executed
# when processing last block, improvement is not as big for smallest
# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
# byte, which is also >80% faster than integer-only code. Cortex-A15
# is even faster spending 5.6 cycles per byte outperforming integer-
# only code by factor of 2.

# May 2014.
#
# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
  69. $flavour = shift;
  70. if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
  71. else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
  72. if ($flavour && $flavour ne "void") {
  73. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  74. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  75. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  76. die "can't locate arm-xlate.pl";
  77. open STDOUT,"| \"$^X\" $xlate $flavour $output";
  78. } else {
  79. open STDOUT,">$output";
  80. }
  81. $ctx="r0";
  82. $inp="r1";
  83. $len="r2";
  84. $a="r3";
  85. $b="r4";
  86. $c="r5";
  87. $d="r6";
  88. $e="r7";
  89. $K="r8";
  90. $t0="r9";
  91. $t1="r10";
  92. $t2="r11";
  93. $t3="r12";
  94. $Xi="r14";
  95. @V=($a,$b,$c,$d,$e);
  96. sub Xupdate {
  97. my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
  98. $code.=<<___;
  99. ldr $t0,[$Xi,#15*4]
  100. ldr $t1,[$Xi,#13*4]
  101. ldr $t2,[$Xi,#7*4]
  102. add $e,$K,$e,ror#2 @ E+=K_xx_xx
  103. ldr $t3,[$Xi,#2*4]
  104. eor $t0,$t0,$t1
  105. eor $t2,$t2,$t3 @ 1 cycle stall
  106. eor $t1,$c,$d @ F_xx_xx
  107. mov $t0,$t0,ror#31
  108. add $e,$e,$a,ror#27 @ E+=ROR(A,27)
  109. eor $t0,$t0,$t2,ror#31
  110. str $t0,[$Xi,#-4]!
  111. $opt1 @ F_xx_xx
  112. $opt2 @ F_xx_xx
  113. add $e,$e,$t0 @ E+=X[i]
  114. ___
  115. }
  116. sub BODY_00_15 {
  117. my ($a,$b,$c,$d,$e)=@_;
  118. $code.=<<___;
  119. #if __ARM_ARCH__<7
  120. ldrb $t1,[$inp,#2]
  121. ldrb $t0,[$inp,#3]
  122. ldrb $t2,[$inp,#1]
  123. add $e,$K,$e,ror#2 @ E+=K_00_19
  124. ldrb $t3,[$inp],#4
  125. orr $t0,$t0,$t1,lsl#8
  126. eor $t1,$c,$d @ F_xx_xx
  127. orr $t0,$t0,$t2,lsl#16
  128. add $e,$e,$a,ror#27 @ E+=ROR(A,27)
  129. orr $t0,$t0,$t3,lsl#24
  130. #else
  131. ldr $t0,[$inp],#4 @ handles unaligned
  132. add $e,$K,$e,ror#2 @ E+=K_00_19
  133. eor $t1,$c,$d @ F_xx_xx
  134. add $e,$e,$a,ror#27 @ E+=ROR(A,27)
  135. #ifdef __ARMEL__
  136. rev $t0,$t0 @ byte swap
  137. #endif
  138. #endif
  139. and $t1,$b,$t1,ror#2
  140. add $e,$e,$t0 @ E+=X[i]
  141. eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
  142. str $t0,[$Xi,#-4]!
  143. add $e,$e,$t1 @ E+=F_00_19(B,C,D)
  144. ___
  145. }
  146. sub BODY_16_19 {
  147. my ($a,$b,$c,$d,$e)=@_;
  148. &Xupdate(@_,"and $t1,$b,$t1,ror#2");
  149. $code.=<<___;
  150. eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
  151. add $e,$e,$t1 @ E+=F_00_19(B,C,D)
  152. ___
  153. }
  154. sub BODY_20_39 {
  155. my ($a,$b,$c,$d,$e)=@_;
  156. &Xupdate(@_,"eor $t1,$b,$t1,ror#2");
  157. $code.=<<___;
  158. add $e,$e,$t1 @ E+=F_20_39(B,C,D)
  159. ___
  160. }
  161. sub BODY_40_59 {
  162. my ($a,$b,$c,$d,$e)=@_;
  163. &Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
  164. $code.=<<___;
  165. add $e,$e,$t1 @ E+=F_40_59(B,C,D)
  166. add $e,$e,$t2,ror#2
  167. ___
  168. }
  169. $code=<<___;
  170. #include "arm_arch.h"
  171. #if defined(__thumb2__)
  172. .syntax unified
  173. .thumb
  174. #else
  175. .code 32
  176. #endif
  177. .text
  178. .global sha1_block_data_order
  179. .type sha1_block_data_order,%function
  180. .align 5
  181. sha1_block_data_order:
  182. #if __ARM_MAX_ARCH__>=7
  183. .Lsha1_block:
  184. ldr r12,.LOPENSSL_armcap
  185. # if !defined(_WIN32)
  186. adr r3,.Lsha1_block
  187. ldr r12,[r3,r12] @ OPENSSL_armcap_P
  188. # endif
  189. # if defined(__APPLE__) || defined(_WIN32)
  190. ldr r12,[r12]
  191. # endif
  192. tst r12,#ARMV8_SHA1
  193. bne .LARMv8
  194. tst r12,#ARMV7_NEON
  195. bne .LNEON
  196. #endif
  197. stmdb sp!,{r4-r12,lr}
  198. add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
  199. ldmia $ctx,{$a,$b,$c,$d,$e}
  200. .Lloop:
  201. ldr $K,.LK_00_19
  202. mov $Xi,sp
  203. sub sp,sp,#15*4
  204. mov $c,$c,ror#30
  205. mov $d,$d,ror#30
  206. mov $e,$e,ror#30 @ [6]
  207. .L_00_15:
  208. ___
  209. for($i=0;$i<5;$i++) {
  210. &BODY_00_15(@V); unshift(@V,pop(@V));
  211. }
  212. $code.=<<___;
  213. #if defined(__thumb2__)
  214. mov $t3,sp
  215. teq $Xi,$t3
  216. #else
  217. teq $Xi,sp
  218. #endif
  219. bne .L_00_15 @ [((11+4)*5+2)*3]
  220. sub sp,sp,#25*4
  221. ___
  222. &BODY_00_15(@V); unshift(@V,pop(@V));
  223. &BODY_16_19(@V); unshift(@V,pop(@V));
  224. &BODY_16_19(@V); unshift(@V,pop(@V));
  225. &BODY_16_19(@V); unshift(@V,pop(@V));
  226. &BODY_16_19(@V); unshift(@V,pop(@V));
  227. $code.=<<___;
  228. ldr $K,.LK_20_39 @ [+15+16*4]
  229. cmn sp,#0 @ [+3], clear carry to denote 20_39
  230. .L_20_39_or_60_79:
  231. ___
  232. for($i=0;$i<5;$i++) {
  233. &BODY_20_39(@V); unshift(@V,pop(@V));
  234. }
  235. $code.=<<___;
  236. #if defined(__thumb2__)
  237. mov $t3,sp
  238. teq $Xi,$t3
  239. #else
  240. teq $Xi,sp @ preserve carry
  241. #endif
  242. bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
  243. bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
  244. ldr $K,.LK_40_59
  245. sub sp,sp,#20*4 @ [+2]
  246. .L_40_59:
  247. ___
  248. for($i=0;$i<5;$i++) {
  249. &BODY_40_59(@V); unshift(@V,pop(@V));
  250. }
  251. $code.=<<___;
  252. #if defined(__thumb2__)
  253. mov $t3,sp
  254. teq $Xi,$t3
  255. #else
  256. teq $Xi,sp
  257. #endif
  258. bne .L_40_59 @ [+((12+5)*5+2)*4]
  259. ldr $K,.LK_60_79
  260. sub sp,sp,#20*4
  261. cmp sp,#0 @ set carry to denote 60_79
  262. b .L_20_39_or_60_79 @ [+4], spare 300 bytes
  263. .L_done:
  264. add sp,sp,#80*4 @ "deallocate" stack frame
  265. ldmia $ctx,{$K,$t0,$t1,$t2,$t3}
  266. add $a,$K,$a
  267. add $b,$t0,$b
  268. add $c,$t1,$c,ror#2
  269. add $d,$t2,$d,ror#2
  270. add $e,$t3,$e,ror#2
  271. stmia $ctx,{$a,$b,$c,$d,$e}
  272. teq $inp,$len
  273. bne .Lloop @ [+18], total 1307
  274. #if __ARM_ARCH__>=5
  275. ldmia sp!,{r4-r12,pc}
  276. #else
  277. ldmia sp!,{r4-r12,lr}
  278. tst lr,#1
  279. moveq pc,lr @ be binary compatible with V4, yet
  280. bx lr @ interoperable with Thumb ISA:-)
  281. #endif
  282. .size sha1_block_data_order,.-sha1_block_data_order
  283. .align 5
  284. .LK_00_19: .word 0x5a827999
  285. .LK_20_39: .word 0x6ed9eba1
  286. .LK_40_59: .word 0x8f1bbcdc
  287. .LK_60_79: .word 0xca62c1d6
  288. #if __ARM_MAX_ARCH__>=7
  289. .LOPENSSL_armcap:
  290. # ifdef _WIN32
  291. .word OPENSSL_armcap_P
  292. # else
  293. .word OPENSSL_armcap_P-.Lsha1_block
  294. # endif
  295. #endif
  296. .asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
  297. .align 5
  298. ___
  299. #####################################################################
  300. # NEON stuff
  301. #
  302. {{{
  303. my @V=($a,$b,$c,$d,$e);
  304. my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14));
  305. my $Xi=4;
  306. my @X=map("q$_",(8..11,0..3));
  307. my @Tx=("q12","q13");
  308. my ($K,$zero)=("q14","q15");
  309. my $j=0;
  310. sub AUTOLOAD() # thunk [simplified] x86-style perlasm
  311. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  312. my $arg = pop;
  313. $arg = "#$arg" if ($arg*1 eq $arg);
  314. $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
  315. }
  316. sub body_00_19 () {
  317. (
  318. '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
  319. '&bic ($t0,$d,$b)',
  320. '&add ($e,$e,$Ki)', # e+=X[i]+K
  321. '&and ($t1,$c,$b)',
  322. '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
  323. '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
  324. '&eor ($t1,$t1,$t0)', # F_00_19
  325. '&mov ($b,$b,"ror#2")', # b=ROR(b,2)
  326. '&add ($e,$e,$t1);'. # e+=F_00_19
  327. '$j++; unshift(@V,pop(@V));'
  328. )
  329. }
  330. sub body_20_39 () {
  331. (
  332. '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
  333. '&eor ($t0,$b,$d)',
  334. '&add ($e,$e,$Ki)', # e+=X[i]+K
  335. '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)',
  336. '&eor ($t1,$t0,$c)', # F_20_39
  337. '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
  338. '&mov ($b,$b,"ror#2")', # b=ROR(b,2)
  339. '&add ($e,$e,$t1);'. # e+=F_20_39
  340. '$j++; unshift(@V,pop(@V));'
  341. )
  342. }
  343. sub body_40_59 () {
  344. (
  345. '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
  346. '&add ($e,$e,$Ki)', # e+=X[i]+K
  347. '&and ($t0,$c,$d)',
  348. '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
  349. '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
  350. '&eor ($t1,$c,$d)',
  351. '&add ($e,$e,$t0)',
  352. '&and ($t1,$t1,$b)',
  353. '&mov ($b,$b,"ror#2")', # b=ROR(b,2)
  354. '&add ($e,$e,$t1);'. # e+=F_40_59
  355. '$j++; unshift(@V,pop(@V));'
  356. )
  357. }
  358. sub Xupdate_16_31 ()
  359. { use integer;
  360. my $body = shift;
  361. my @insns = (&$body,&$body,&$body,&$body);
  362. my ($a,$b,$c,$d,$e);
  363. &vext_8 (@X[0],@X[-4&7],@X[-3&7],8); # compose "X[-14]" in "X[0]"
  364. eval(shift(@insns));
  365. eval(shift(@insns));
  366. eval(shift(@insns));
  367. &vadd_i32 (@Tx[1],@X[-1&7],$K);
  368. eval(shift(@insns));
  369. &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
  370. eval(shift(@insns));
  371. &vext_8 (@Tx[0],@X[-1&7],$zero,4); # "X[-3]", 3 words
  372. eval(shift(@insns));
  373. eval(shift(@insns));
  374. eval(shift(@insns));
  375. &veor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
  376. eval(shift(@insns));
  377. eval(shift(@insns));
  378. &veor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
  379. eval(shift(@insns));
  380. eval(shift(@insns));
  381. &veor (@Tx[0],@Tx[0],@X[0]); # "X[0]"^="X[-3]"^"X[-8]
  382. eval(shift(@insns));
  383. eval(shift(@insns));
  384. &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
  385. &sub ($Xfer,$Xfer,64) if ($Xi%4==0);
  386. eval(shift(@insns));
  387. eval(shift(@insns));
  388. &vext_8 (@Tx[1],$zero,@Tx[0],4); # "X[0]"<<96, extract one dword
  389. eval(shift(@insns));
  390. eval(shift(@insns));
  391. &vadd_i32 (@X[0],@Tx[0],@Tx[0]);
  392. eval(shift(@insns));
  393. eval(shift(@insns));
  394. &vsri_32 (@X[0],@Tx[0],31); # "X[0]"<<<=1
  395. eval(shift(@insns));
  396. eval(shift(@insns));
  397. eval(shift(@insns));
  398. &vshr_u32 (@Tx[0],@Tx[1],30);
  399. eval(shift(@insns));
  400. eval(shift(@insns));
  401. &vshl_u32 (@Tx[1],@Tx[1],2);
  402. eval(shift(@insns));
  403. eval(shift(@insns));
  404. &veor (@X[0],@X[0],@Tx[0]);
  405. eval(shift(@insns));
  406. eval(shift(@insns));
  407. &veor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
  408. foreach (@insns) { eval; } # remaining instructions [if any]
  409. $Xi++; push(@X,shift(@X)); # "rotate" X[]
  410. }
  411. sub Xupdate_32_79 ()
  412. { use integer;
  413. my $body = shift;
  414. my @insns = (&$body,&$body,&$body,&$body);
  415. my ($a,$b,$c,$d,$e);
  416. &vext_8 (@Tx[0],@X[-2&7],@X[-1&7],8); # compose "X[-6]"
  417. eval(shift(@insns));
  418. eval(shift(@insns));
  419. eval(shift(@insns));
  420. &veor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
  421. eval(shift(@insns));
  422. eval(shift(@insns));
  423. &veor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
  424. eval(shift(@insns));
  425. eval(shift(@insns));
  426. &vadd_i32 (@Tx[1],@X[-1&7],$K);
  427. eval(shift(@insns));
  428. &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
  429. eval(shift(@insns));
  430. &veor (@Tx[0],@Tx[0],@X[0]); # "X[-6]"^="X[0]"
  431. eval(shift(@insns));
  432. eval(shift(@insns));
  433. &vshr_u32 (@X[0],@Tx[0],30);
  434. eval(shift(@insns));
  435. eval(shift(@insns));
  436. &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
  437. &sub ($Xfer,$Xfer,64) if ($Xi%4==0);
  438. eval(shift(@insns));
  439. eval(shift(@insns));
  440. &vsli_32 (@X[0],@Tx[0],2); # "X[0]"="X[-6]"<<<2
  441. foreach (@insns) { eval; } # remaining instructions [if any]
  442. $Xi++; push(@X,shift(@X)); # "rotate" X[]
  443. }
  444. sub Xuplast_80 ()
  445. { use integer;
  446. my $body = shift;
  447. my @insns = (&$body,&$body,&$body,&$body);
  448. my ($a,$b,$c,$d,$e);
  449. &vadd_i32 (@Tx[1],@X[-1&7],$K);
  450. eval(shift(@insns));
  451. eval(shift(@insns));
  452. &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!");
  453. &sub ($Xfer,$Xfer,64);
  454. &teq ($inp,$len);
  455. &sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX
  456. &it ("eq");
  457. &subeq ($inp,$inp,64); # reload last block to avoid SEGV
  458. &vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!");
  459. eval(shift(@insns));
  460. eval(shift(@insns));
  461. &vld1_8 ("{@X[-2&7]-@X[-1&7]}","[$inp]!");
  462. eval(shift(@insns));
  463. eval(shift(@insns));
  464. &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!"); # load K_00_19
  465. eval(shift(@insns));
  466. eval(shift(@insns));
  467. &vrev32_8 (@X[-4&7],@X[-4&7]);
  468. foreach (@insns) { eval; } # remaining instructions
  469. $Xi=0;
  470. }
  471. sub Xloop()
  472. { use integer;
  473. my $body = shift;
  474. my @insns = (&$body,&$body,&$body,&$body);
  475. my ($a,$b,$c,$d,$e);
  476. &vrev32_8 (@X[($Xi-3)&7],@X[($Xi-3)&7]);
  477. eval(shift(@insns));
  478. eval(shift(@insns));
  479. &vadd_i32 (@X[$Xi&7],@X[($Xi-4)&7],$K);
  480. eval(shift(@insns));
  481. eval(shift(@insns));
  482. &vst1_32 ("{@X[$Xi&7]}","[$Xfer,:128]!");# X[]+K xfer to IALU
  483. foreach (@insns) { eval; }
  484. $Xi++;
  485. }
  486. $code.=<<___;
  487. #if __ARM_MAX_ARCH__>=7
  488. .arch armv7-a
  489. .fpu neon
  490. .type sha1_block_data_order_neon,%function
  491. .align 4
  492. sha1_block_data_order_neon:
  493. .LNEON:
  494. stmdb sp!,{r4-r12,lr}
  495. add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
  496. @ dmb @ errata #451034 on early Cortex A8
  497. @ vstmdb sp!,{d8-d15} @ ABI specification says so
  498. mov $saved_sp,sp
  499. sub $Xfer,sp,#64
  500. adr $K_XX_XX,.LK_00_19
  501. bic $Xfer,$Xfer,#15 @ align for 128-bit stores
  502. ldmia $ctx,{$a,$b,$c,$d,$e} @ load context
  503. mov sp,$Xfer @ alloca
  504. vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned
  505. veor $zero,$zero,$zero
  506. vld1.8 {@X[-2&7]-@X[-1&7]},[$inp]!
  507. vld1.32 {${K}\[]},[$K_XX_XX,:32]! @ load K_00_19
  508. vrev32.8 @X[-4&7],@X[-4&7] @ yes, even on
  509. vrev32.8 @X[-3&7],@X[-3&7] @ big-endian...
  510. vrev32.8 @X[-2&7],@X[-2&7]
  511. vadd.i32 @X[0],@X[-4&7],$K
  512. vrev32.8 @X[-1&7],@X[-1&7]
  513. vadd.i32 @X[1],@X[-3&7],$K
  514. vst1.32 {@X[0]},[$Xfer,:128]!
  515. vadd.i32 @X[2],@X[-2&7],$K
  516. vst1.32 {@X[1]},[$Xfer,:128]!
  517. vst1.32 {@X[2]},[$Xfer,:128]!
  518. ldr $Ki,[sp] @ big RAW stall
  519. .Loop_neon:
  520. ___
  521. &Xupdate_16_31(\&body_00_19);
  522. &Xupdate_16_31(\&body_00_19);
  523. &Xupdate_16_31(\&body_00_19);
  524. &Xupdate_16_31(\&body_00_19);
  525. &Xupdate_32_79(\&body_00_19);
  526. &Xupdate_32_79(\&body_20_39);
  527. &Xupdate_32_79(\&body_20_39);
  528. &Xupdate_32_79(\&body_20_39);
  529. &Xupdate_32_79(\&body_20_39);
  530. &Xupdate_32_79(\&body_20_39);
  531. &Xupdate_32_79(\&body_40_59);
  532. &Xupdate_32_79(\&body_40_59);
  533. &Xupdate_32_79(\&body_40_59);
  534. &Xupdate_32_79(\&body_40_59);
  535. &Xupdate_32_79(\&body_40_59);
  536. &Xupdate_32_79(\&body_20_39);
  537. &Xuplast_80(\&body_20_39);
  538. &Xloop(\&body_20_39);
  539. &Xloop(\&body_20_39);
  540. &Xloop(\&body_20_39);
  541. $code.=<<___;
  542. ldmia $ctx,{$Ki,$t0,$t1,$Xfer} @ accumulate context
  543. add $a,$a,$Ki
  544. ldr $Ki,[$ctx,#16]
  545. add $b,$b,$t0
  546. add $c,$c,$t1
  547. add $d,$d,$Xfer
  548. it eq
  549. moveq sp,$saved_sp
  550. add $e,$e,$Ki
  551. it ne
  552. ldrne $Ki,[sp]
  553. stmia $ctx,{$a,$b,$c,$d,$e}
  554. itt ne
  555. addne $Xfer,sp,#3*16
  556. bne .Loop_neon
  557. @ vldmia sp!,{d8-d15}
  558. ldmia sp!,{r4-r12,pc}
  559. .size sha1_block_data_order_neon,.-sha1_block_data_order_neon
  560. #endif
  561. ___
  562. }}}
  563. #####################################################################
  564. # ARMv8 stuff
  565. #
  566. {{{
  567. my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));
  568. my @MSG=map("q$_",(4..7));
  569. my @Kxx=map("q$_",(8..11));
  570. my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
  571. my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
  572. $code.=<<___;
  573. #if __ARM_MAX_ARCH__>=7
  574. # if defined(__thumb2__)
  575. # define INST(a,b,c,d) $_byte c,d|0xf,a,b
  576. # else
  577. # define INST(a,b,c,d) $_byte a,b,c,d|0x10
  578. # endif
  579. .type sha1_block_data_order_armv8,%function
  580. .align 5
  581. sha1_block_data_order_armv8:
  582. .LARMv8:
  583. vstmdb sp!,{d8-d15} @ ABI specification says so
  584. veor $E,$E,$E
  585. adr r3,.LK_00_19
  586. vld1.32 {$ABCD},[$ctx]!
  587. vld1.32 {$E\[0]},[$ctx]
  588. sub $ctx,$ctx,#16
  589. vld1.32 {@Kxx[0]\[]},[r3,:32]!
  590. vld1.32 {@Kxx[1]\[]},[r3,:32]!
  591. vld1.32 {@Kxx[2]\[]},[r3,:32]!
  592. vld1.32 {@Kxx[3]\[]},[r3,:32]
  593. .Loop_v8:
  594. vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
  595. vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
  596. vrev32.8 @MSG[0],@MSG[0]
  597. vrev32.8 @MSG[1],@MSG[1]
  598. vadd.i32 $W0,@Kxx[0],@MSG[0]
  599. vrev32.8 @MSG[2],@MSG[2]
  600. vmov $ABCD_SAVE,$ABCD @ offload
  601. subs $len,$len,#1
  602. vadd.i32 $W1,@Kxx[0],@MSG[1]
  603. vrev32.8 @MSG[3],@MSG[3]
  604. sha1h $E1,$ABCD @ 0
  605. sha1c $ABCD,$E,$W0
  606. vadd.i32 $W0,@Kxx[$j],@MSG[2]
  607. sha1su0 @MSG[0],@MSG[1],@MSG[2]
  608. ___
  609. for ($j=0,$i=1;$i<20-3;$i++) {
  610. my $f=("c","p","m","p")[$i/5];
  611. $code.=<<___;
  612. sha1h $E0,$ABCD @ $i
  613. sha1$f $ABCD,$E1,$W1
  614. vadd.i32 $W1,@Kxx[$j],@MSG[3]
  615. sha1su1 @MSG[0],@MSG[3]
  616. ___
  617. $code.=<<___ if ($i<20-4);
  618. sha1su0 @MSG[1],@MSG[2],@MSG[3]
  619. ___
  620. ($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
  621. push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
  622. }
  623. $code.=<<___;
  624. sha1h $E0,$ABCD @ $i
  625. sha1p $ABCD,$E1,$W1
  626. vadd.i32 $W1,@Kxx[$j],@MSG[3]
  627. sha1h $E1,$ABCD @ 18
  628. sha1p $ABCD,$E0,$W0
  629. sha1h $E0,$ABCD @ 19
  630. sha1p $ABCD,$E1,$W1
  631. vadd.i32 $E,$E,$E0
  632. vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
  633. bne .Loop_v8
  634. vst1.32 {$ABCD},[$ctx]!
  635. vst1.32 {$E\[0]},[$ctx]
  636. vldmia sp!,{d8-d15}
  637. ret @ bx lr
  638. .size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
  639. #endif
  640. ___
  641. }}}
  642. $code.=<<___;
  643. #if __ARM_MAX_ARCH__>=7
  644. .comm OPENSSL_armcap_P,4,4
  645. #endif
  646. ___
  647. { my %opcode = (
  648. "sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40,
  649. "sha1m" => 0xf2200c40, "sha1su0" => 0xf2300c40,
  650. "sha1h" => 0xf3b902c0, "sha1su1" => 0xf3ba0380 );
  651. sub unsha1 {
  652. my ($mnemonic,$arg)=@_;
  653. if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
  654. my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
  655. |(($2&7)<<17)|(($2&8)<<4)
  656. |(($3&7)<<1) |(($3&8)<<2);
  657. # since ARMv7 instructions are always encoded little-endian.
  658. # correct solution is to use .inst directive, but older
  659. # assemblers don't implement it:-(
  660. # this fix-up provides Thumb encoding in conjunction with INST
  661. $word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000);
  662. sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
  663. $word&0xff,($word>>8)&0xff,
  664. ($word>>16)&0xff,($word>>24)&0xff,
  665. $mnemonic,$arg;
  666. }
  667. }
  668. }
  669. foreach (split($/,$code)) {
  670. s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo or
  671. s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/eo;
  672. s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;
  673. s/\bret\b/bx lr/o or
  674. s/\bbx\s+lr\b/.word\t0xe12fff1e/o; # make it possible to compile with -march=armv4
  675. print $_,$/;
  676. }
  677. close STDOUT; # enforce flush