sha1-armv4-large.pl 19 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. # ====================================================================
  14. # sha1_block procedure for ARMv4.
  15. #
  16. # January 2007.
  17. # Size/performance trade-off
  18. # ====================================================================
  19. # impl size in bytes comp cycles[*] measured performance
  20. # ====================================================================
  21. # thumb 304 3212 4420
  22. # armv4-small 392/+29% 1958/+64% 2250/+96%
  23. # armv4-compact 740/+89% 1552/+26% 1840/+22%
  24. # armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
  25. # full unroll ~5100/+260% ~1260/+4% ~1300/+5%
  26. # ====================================================================
  27. # thumb = same as 'small' but in Thumb instructions[**] and
  28. # with recurring code in two private functions;
  29. # small = detached Xload/update, loops are folded;
  30. # compact = detached Xload/update, 5x unroll;
  31. # large = interleaved Xload/update, 5x unroll;
  32. # full unroll = interleaved Xload/update, full unroll, estimated[!];
  33. #
  34. # [*] Manually counted instructions in "grand" loop body. Measured
  35. # performance is affected by prologue and epilogue overhead,
  36. # i-cache availability, branch penalties, etc.
  37. # [**] While each Thumb instruction is half the size, they are not as
  38. # diverse as ARM ones: e.g., there are only two arithmetic
  39. # instructions with 3 arguments, no [fixed] rotate, addressing
  40. # modes are limited. As a result it takes more instructions to do
  41. # the same job in Thumb, therefore the code is never half the
  42. # size and is always slower.
  43. # [***] which is also ~35% better than compiler generated code. Dual-
  44. # issue Cortex A8 core was measured to process input block in
  45. # ~990 cycles.
  46. # August 2010.
  47. #
  48. # Rescheduling for dual-issue pipeline resulted in 13% improvement on
  49. # Cortex A8 core and in absolute terms ~870 cycles per input block
  50. # [or 13.6 cycles per byte].
  51. # February 2011.
  52. #
  53. # Profiler-assisted and platform-specific optimization resulted in 10%
  54. # improvement on Cortex A8 core and 12.2 cycles per byte.
  55. # September 2013.
  56. #
  57. # Add NEON implementation (see sha1-586.pl for background info). On
  58. # Cortex A8 it was measured to process one byte in 6.7 cycles or >80%
  59. # faster than integer-only code. Because [fully unrolled] NEON code
  60. # is ~2.5x larger and there are some redundant instructions executed
  61. # when processing last block, improvement is not as big for smallest
  62. # blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
  63. # byte, which is also >80% faster than integer-only code. Cortex-A15
  64. # is even faster spending 5.6 cycles per byte outperforming integer-
  65. # only code by factor of 2.
  66. # May 2014.
  67. #
  68. # Add ARMv8 code path performing at 2.35 cpb on Apple A7.
  69. # $output is the last argument if it looks like a file (it has an extension)
  70. # $flavour is the first argument if it doesn't look like a file
  71. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  72. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  73. if ($flavour && $flavour ne "void") {
  74. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  75. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  76. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  77. die "can't locate arm-xlate.pl";
  78. open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
  79. or die "can't call $xlate: $!";
  80. } else {
  81. $output and open STDOUT,">$output";
  82. }
  83. $ctx="r0";
  84. $inp="r1";
  85. $len="r2";
  86. $a="r3";
  87. $b="r4";
  88. $c="r5";
  89. $d="r6";
  90. $e="r7";
  91. $K="r8";
  92. $t0="r9";
  93. $t1="r10";
  94. $t2="r11";
  95. $t3="r12";
  96. $Xi="r14";
  97. @V=($a,$b,$c,$d,$e);
  98. sub Xupdate {
  99. my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
  100. $code.=<<___;
  101. ldr $t0,[$Xi,#15*4]
  102. ldr $t1,[$Xi,#13*4]
  103. ldr $t2,[$Xi,#7*4]
  104. add $e,$K,$e,ror#2 @ E+=K_xx_xx
  105. ldr $t3,[$Xi,#2*4]
  106. eor $t0,$t0,$t1
  107. eor $t2,$t2,$t3 @ 1 cycle stall
  108. eor $t1,$c,$d @ F_xx_xx
  109. mov $t0,$t0,ror#31
  110. add $e,$e,$a,ror#27 @ E+=ROR(A,27)
  111. eor $t0,$t0,$t2,ror#31
  112. str $t0,[$Xi,#-4]!
  113. $opt1 @ F_xx_xx
  114. $opt2 @ F_xx_xx
  115. add $e,$e,$t0 @ E+=X[i]
  116. ___
  117. }
  118. sub BODY_00_15 {
  119. my ($a,$b,$c,$d,$e)=@_;
  120. $code.=<<___;
  121. #if __ARM_ARCH__<7
  122. ldrb $t1,[$inp,#2]
  123. ldrb $t0,[$inp,#3]
  124. ldrb $t2,[$inp,#1]
  125. add $e,$K,$e,ror#2 @ E+=K_00_19
  126. ldrb $t3,[$inp],#4
  127. orr $t0,$t0,$t1,lsl#8
  128. eor $t1,$c,$d @ F_xx_xx
  129. orr $t0,$t0,$t2,lsl#16
  130. add $e,$e,$a,ror#27 @ E+=ROR(A,27)
  131. orr $t0,$t0,$t3,lsl#24
  132. #else
  133. ldr $t0,[$inp],#4 @ handles unaligned
  134. add $e,$K,$e,ror#2 @ E+=K_00_19
  135. eor $t1,$c,$d @ F_xx_xx
  136. add $e,$e,$a,ror#27 @ E+=ROR(A,27)
  137. #ifdef __ARMEL__
  138. rev $t0,$t0 @ byte swap
  139. #endif
  140. #endif
  141. and $t1,$b,$t1,ror#2
  142. add $e,$e,$t0 @ E+=X[i]
  143. eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
  144. str $t0,[$Xi,#-4]!
  145. add $e,$e,$t1 @ E+=F_00_19(B,C,D)
  146. ___
  147. }
  148. sub BODY_16_19 {
  149. my ($a,$b,$c,$d,$e)=@_;
  150. &Xupdate(@_,"and $t1,$b,$t1,ror#2");
  151. $code.=<<___;
  152. eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
  153. add $e,$e,$t1 @ E+=F_00_19(B,C,D)
  154. ___
  155. }
  156. sub BODY_20_39 {
  157. my ($a,$b,$c,$d,$e)=@_;
  158. &Xupdate(@_,"eor $t1,$b,$t1,ror#2");
  159. $code.=<<___;
  160. add $e,$e,$t1 @ E+=F_20_39(B,C,D)
  161. ___
  162. }
  163. sub BODY_40_59 {
  164. my ($a,$b,$c,$d,$e)=@_;
  165. &Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
  166. $code.=<<___;
  167. add $e,$e,$t1 @ E+=F_40_59(B,C,D)
  168. add $e,$e,$t2,ror#2
  169. ___
  170. }
  171. $code=<<___;
  172. #include "arm_arch.h"
  173. #if defined(__thumb2__)
  174. .syntax unified
  175. .thumb
  176. #else
  177. .code 32
  178. #endif
  179. .text
  180. .global sha1_block_data_order
  181. .type sha1_block_data_order,%function
  182. .align 5
  183. sha1_block_data_order:
  184. #if __ARM_MAX_ARCH__>=7
  185. .Lsha1_block:
  186. ldr r12,.LOPENSSL_armcap
  187. # if !defined(_WIN32)
  188. adr r3,.Lsha1_block
  189. ldr r12,[r3,r12] @ OPENSSL_armcap_P
  190. # endif
  191. # if defined(__APPLE__) || defined(_WIN32)
  192. ldr r12,[r12]
  193. # endif
  194. tst r12,#ARMV8_SHA1
  195. bne .LARMv8
  196. tst r12,#ARMV7_NEON
  197. bne .LNEON
  198. #endif
  199. stmdb sp!,{r4-r12,lr}
  200. add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
  201. ldmia $ctx,{$a,$b,$c,$d,$e}
  202. .Lloop:
  203. ldr $K,.LK_00_19
  204. mov $Xi,sp
  205. sub sp,sp,#15*4
  206. mov $c,$c,ror#30
  207. mov $d,$d,ror#30
  208. mov $e,$e,ror#30 @ [6]
  209. .L_00_15:
  210. ___
  211. for($i=0;$i<5;$i++) {
  212. &BODY_00_15(@V); unshift(@V,pop(@V));
  213. }
  214. $code.=<<___;
  215. #if defined(__thumb2__)
  216. mov $t3,sp
  217. teq $Xi,$t3
  218. #else
  219. teq $Xi,sp
  220. #endif
  221. bne .L_00_15 @ [((11+4)*5+2)*3]
  222. sub sp,sp,#25*4
  223. ___
  224. &BODY_00_15(@V); unshift(@V,pop(@V));
  225. &BODY_16_19(@V); unshift(@V,pop(@V));
  226. &BODY_16_19(@V); unshift(@V,pop(@V));
  227. &BODY_16_19(@V); unshift(@V,pop(@V));
  228. &BODY_16_19(@V); unshift(@V,pop(@V));
  229. $code.=<<___;
  230. ldr $K,.LK_20_39 @ [+15+16*4]
  231. cmn sp,#0 @ [+3], clear carry to denote 20_39
  232. .L_20_39_or_60_79:
  233. ___
  234. for($i=0;$i<5;$i++) {
  235. &BODY_20_39(@V); unshift(@V,pop(@V));
  236. }
  237. $code.=<<___;
  238. #if defined(__thumb2__)
  239. mov $t3,sp
  240. teq $Xi,$t3
  241. #else
  242. teq $Xi,sp @ preserve carry
  243. #endif
  244. bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
  245. bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
  246. ldr $K,.LK_40_59
  247. sub sp,sp,#20*4 @ [+2]
  248. .L_40_59:
  249. ___
  250. for($i=0;$i<5;$i++) {
  251. &BODY_40_59(@V); unshift(@V,pop(@V));
  252. }
  253. $code.=<<___;
  254. #if defined(__thumb2__)
  255. mov $t3,sp
  256. teq $Xi,$t3
  257. #else
  258. teq $Xi,sp
  259. #endif
  260. bne .L_40_59 @ [+((12+5)*5+2)*4]
  261. ldr $K,.LK_60_79
  262. sub sp,sp,#20*4
  263. cmp sp,#0 @ set carry to denote 60_79
  264. b .L_20_39_or_60_79 @ [+4], spare 300 bytes
  265. .L_done:
  266. add sp,sp,#80*4 @ "deallocate" stack frame
  267. ldmia $ctx,{$K,$t0,$t1,$t2,$t3}
  268. add $a,$K,$a
  269. add $b,$t0,$b
  270. add $c,$t1,$c,ror#2
  271. add $d,$t2,$d,ror#2
  272. add $e,$t3,$e,ror#2
  273. stmia $ctx,{$a,$b,$c,$d,$e}
  274. teq $inp,$len
  275. bne .Lloop @ [+18], total 1307
  276. #if __ARM_ARCH__>=5
  277. ldmia sp!,{r4-r12,pc}
  278. #else
  279. ldmia sp!,{r4-r12,lr}
  280. tst lr,#1
  281. moveq pc,lr @ be binary compatible with V4, yet
  282. bx lr @ interoperable with Thumb ISA:-)
  283. #endif
  284. .size sha1_block_data_order,.-sha1_block_data_order
  285. .align 5
  286. .LK_00_19: .word 0x5a827999
  287. .LK_20_39: .word 0x6ed9eba1
  288. .LK_40_59: .word 0x8f1bbcdc
  289. .LK_60_79: .word 0xca62c1d6
  290. #if __ARM_MAX_ARCH__>=7
  291. .LOPENSSL_armcap:
  292. # ifdef _WIN32
  293. .word OPENSSL_armcap_P
  294. # else
  295. .word OPENSSL_armcap_P-.Lsha1_block
  296. # endif
  297. #endif
  298. .asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
  299. .align 5
  300. ___
  301. #####################################################################
  302. # NEON stuff
  303. #
  304. {{{
  305. my @V=($a,$b,$c,$d,$e);
  306. my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14));
  307. my $Xi=4;
  308. my @X=map("q$_",(8..11,0..3));
  309. my @Tx=("q12","q13");
  310. my ($K,$zero)=("q14","q15");
  311. my $j=0;
  312. sub AUTOLOAD() # thunk [simplified] x86-style perlasm
  313. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  314. my $arg = pop;
  315. $arg = "#$arg" if ($arg*1 eq $arg);
  316. $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
  317. }
  318. sub body_00_19 () {
  319. (
  320. '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
  321. '&bic ($t0,$d,$b)',
  322. '&add ($e,$e,$Ki)', # e+=X[i]+K
  323. '&and ($t1,$c,$b)',
  324. '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
  325. '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
  326. '&eor ($t1,$t1,$t0)', # F_00_19
  327. '&mov ($b,$b,"ror#2")', # b=ROR(b,2)
  328. '&add ($e,$e,$t1);'. # e+=F_00_19
  329. '$j++; unshift(@V,pop(@V));'
  330. )
  331. }
  332. sub body_20_39 () {
  333. (
  334. '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
  335. '&eor ($t0,$b,$d)',
  336. '&add ($e,$e,$Ki)', # e+=X[i]+K
  337. '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)',
  338. '&eor ($t1,$t0,$c)', # F_20_39
  339. '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
  340. '&mov ($b,$b,"ror#2")', # b=ROR(b,2)
  341. '&add ($e,$e,$t1);'. # e+=F_20_39
  342. '$j++; unshift(@V,pop(@V));'
  343. )
  344. }
  345. sub body_40_59 () {
  346. (
  347. '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
  348. '&add ($e,$e,$Ki)', # e+=X[i]+K
  349. '&and ($t0,$c,$d)',
  350. '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
  351. '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
  352. '&eor ($t1,$c,$d)',
  353. '&add ($e,$e,$t0)',
  354. '&and ($t1,$t1,$b)',
  355. '&mov ($b,$b,"ror#2")', # b=ROR(b,2)
  356. '&add ($e,$e,$t1);'. # e+=F_40_59
  357. '$j++; unshift(@V,pop(@V));'
  358. )
  359. }
  360. sub Xupdate_16_31 ()
  361. { use integer;
  362. my $body = shift;
  363. my @insns = (&$body,&$body,&$body,&$body);
  364. my ($a,$b,$c,$d,$e);
  365. &vext_8 (@X[0],@X[-4&7],@X[-3&7],8); # compose "X[-14]" in "X[0]"
  366. eval(shift(@insns));
  367. eval(shift(@insns));
  368. eval(shift(@insns));
  369. &vadd_i32 (@Tx[1],@X[-1&7],$K);
  370. eval(shift(@insns));
  371. &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
  372. eval(shift(@insns));
  373. &vext_8 (@Tx[0],@X[-1&7],$zero,4); # "X[-3]", 3 words
  374. eval(shift(@insns));
  375. eval(shift(@insns));
  376. eval(shift(@insns));
  377. &veor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
  378. eval(shift(@insns));
  379. eval(shift(@insns));
  380. &veor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
  381. eval(shift(@insns));
  382. eval(shift(@insns));
  383. &veor (@Tx[0],@Tx[0],@X[0]); # "X[0]"^="X[-3]"^"X[-8]
  384. eval(shift(@insns));
  385. eval(shift(@insns));
  386. &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
  387. &sub ($Xfer,$Xfer,64) if ($Xi%4==0);
  388. eval(shift(@insns));
  389. eval(shift(@insns));
  390. &vext_8 (@Tx[1],$zero,@Tx[0],4); # "X[0]"<<96, extract one dword
  391. eval(shift(@insns));
  392. eval(shift(@insns));
  393. &vadd_i32 (@X[0],@Tx[0],@Tx[0]);
  394. eval(shift(@insns));
  395. eval(shift(@insns));
  396. &vsri_32 (@X[0],@Tx[0],31); # "X[0]"<<<=1
  397. eval(shift(@insns));
  398. eval(shift(@insns));
  399. eval(shift(@insns));
  400. &vshr_u32 (@Tx[0],@Tx[1],30);
  401. eval(shift(@insns));
  402. eval(shift(@insns));
  403. &vshl_u32 (@Tx[1],@Tx[1],2);
  404. eval(shift(@insns));
  405. eval(shift(@insns));
  406. &veor (@X[0],@X[0],@Tx[0]);
  407. eval(shift(@insns));
  408. eval(shift(@insns));
  409. &veor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
  410. foreach (@insns) { eval; } # remaining instructions [if any]
  411. $Xi++; push(@X,shift(@X)); # "rotate" X[]
  412. }
  413. sub Xupdate_32_79 ()
  414. { use integer;
  415. my $body = shift;
  416. my @insns = (&$body,&$body,&$body,&$body);
  417. my ($a,$b,$c,$d,$e);
  418. &vext_8 (@Tx[0],@X[-2&7],@X[-1&7],8); # compose "X[-6]"
  419. eval(shift(@insns));
  420. eval(shift(@insns));
  421. eval(shift(@insns));
  422. &veor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
  423. eval(shift(@insns));
  424. eval(shift(@insns));
  425. &veor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
  426. eval(shift(@insns));
  427. eval(shift(@insns));
  428. &vadd_i32 (@Tx[1],@X[-1&7],$K);
  429. eval(shift(@insns));
  430. &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
  431. eval(shift(@insns));
  432. &veor (@Tx[0],@Tx[0],@X[0]); # "X[-6]"^="X[0]"
  433. eval(shift(@insns));
  434. eval(shift(@insns));
  435. &vshr_u32 (@X[0],@Tx[0],30);
  436. eval(shift(@insns));
  437. eval(shift(@insns));
  438. &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
  439. &sub ($Xfer,$Xfer,64) if ($Xi%4==0);
  440. eval(shift(@insns));
  441. eval(shift(@insns));
  442. &vsli_32 (@X[0],@Tx[0],2); # "X[0]"="X[-6]"<<<2
  443. foreach (@insns) { eval; } # remaining instructions [if any]
  444. $Xi++; push(@X,shift(@X)); # "rotate" X[]
  445. }
  446. sub Xuplast_80 ()
  447. { use integer;
  448. my $body = shift;
  449. my @insns = (&$body,&$body,&$body,&$body);
  450. my ($a,$b,$c,$d,$e);
  451. &vadd_i32 (@Tx[1],@X[-1&7],$K);
  452. eval(shift(@insns));
  453. eval(shift(@insns));
  454. &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!");
  455. &sub ($Xfer,$Xfer,64);
  456. &teq ($inp,$len);
  457. &sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX
  458. &it ("eq");
  459. &subeq ($inp,$inp,64); # reload last block to avoid SEGV
  460. &vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!");
  461. eval(shift(@insns));
  462. eval(shift(@insns));
  463. &vld1_8 ("{@X[-2&7]-@X[-1&7]}","[$inp]!");
  464. eval(shift(@insns));
  465. eval(shift(@insns));
  466. &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!"); # load K_00_19
  467. eval(shift(@insns));
  468. eval(shift(@insns));
  469. &vrev32_8 (@X[-4&7],@X[-4&7]);
  470. foreach (@insns) { eval; } # remaining instructions
  471. $Xi=0;
  472. }
  473. sub Xloop()
  474. { use integer;
  475. my $body = shift;
  476. my @insns = (&$body,&$body,&$body,&$body);
  477. my ($a,$b,$c,$d,$e);
  478. &vrev32_8 (@X[($Xi-3)&7],@X[($Xi-3)&7]);
  479. eval(shift(@insns));
  480. eval(shift(@insns));
  481. &vadd_i32 (@X[$Xi&7],@X[($Xi-4)&7],$K);
  482. eval(shift(@insns));
  483. eval(shift(@insns));
  484. &vst1_32 ("{@X[$Xi&7]}","[$Xfer,:128]!");# X[]+K xfer to IALU
  485. foreach (@insns) { eval; }
  486. $Xi++;
  487. }
  488. $code.=<<___;
  489. #if __ARM_MAX_ARCH__>=7
  490. .arch armv7-a
  491. .fpu neon
  492. .type sha1_block_data_order_neon,%function
  493. .align 4
  494. sha1_block_data_order_neon:
  495. .LNEON:
  496. stmdb sp!,{r4-r12,lr}
  497. add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
  498. @ dmb @ errata #451034 on early Cortex A8
  499. @ vstmdb sp!,{d8-d15} @ ABI specification says so
  500. mov $saved_sp,sp
  501. sub $Xfer,sp,#64
  502. adr $K_XX_XX,.LK_00_19
  503. bic $Xfer,$Xfer,#15 @ align for 128-bit stores
  504. ldmia $ctx,{$a,$b,$c,$d,$e} @ load context
  505. mov sp,$Xfer @ alloca
  506. vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned
  507. veor $zero,$zero,$zero
  508. vld1.8 {@X[-2&7]-@X[-1&7]},[$inp]!
  509. vld1.32 {${K}\[]},[$K_XX_XX,:32]! @ load K_00_19
  510. vrev32.8 @X[-4&7],@X[-4&7] @ yes, even on
  511. vrev32.8 @X[-3&7],@X[-3&7] @ big-endian...
  512. vrev32.8 @X[-2&7],@X[-2&7]
  513. vadd.i32 @X[0],@X[-4&7],$K
  514. vrev32.8 @X[-1&7],@X[-1&7]
  515. vadd.i32 @X[1],@X[-3&7],$K
  516. vst1.32 {@X[0]},[$Xfer,:128]!
  517. vadd.i32 @X[2],@X[-2&7],$K
  518. vst1.32 {@X[1]},[$Xfer,:128]!
  519. vst1.32 {@X[2]},[$Xfer,:128]!
  520. ldr $Ki,[sp] @ big RAW stall
  521. .Loop_neon:
  522. ___
  523. &Xupdate_16_31(\&body_00_19);
  524. &Xupdate_16_31(\&body_00_19);
  525. &Xupdate_16_31(\&body_00_19);
  526. &Xupdate_16_31(\&body_00_19);
  527. &Xupdate_32_79(\&body_00_19);
  528. &Xupdate_32_79(\&body_20_39);
  529. &Xupdate_32_79(\&body_20_39);
  530. &Xupdate_32_79(\&body_20_39);
  531. &Xupdate_32_79(\&body_20_39);
  532. &Xupdate_32_79(\&body_20_39);
  533. &Xupdate_32_79(\&body_40_59);
  534. &Xupdate_32_79(\&body_40_59);
  535. &Xupdate_32_79(\&body_40_59);
  536. &Xupdate_32_79(\&body_40_59);
  537. &Xupdate_32_79(\&body_40_59);
  538. &Xupdate_32_79(\&body_20_39);
  539. &Xuplast_80(\&body_20_39);
  540. &Xloop(\&body_20_39);
  541. &Xloop(\&body_20_39);
  542. &Xloop(\&body_20_39);
  543. $code.=<<___;
  544. ldmia $ctx,{$Ki,$t0,$t1,$Xfer} @ accumulate context
  545. add $a,$a,$Ki
  546. ldr $Ki,[$ctx,#16]
  547. add $b,$b,$t0
  548. add $c,$c,$t1
  549. add $d,$d,$Xfer
  550. it eq
  551. moveq sp,$saved_sp
  552. add $e,$e,$Ki
  553. it ne
  554. ldrne $Ki,[sp]
  555. stmia $ctx,{$a,$b,$c,$d,$e}
  556. itt ne
  557. addne $Xfer,sp,#3*16
  558. bne .Loop_neon
  559. @ vldmia sp!,{d8-d15}
  560. ldmia sp!,{r4-r12,pc}
  561. .size sha1_block_data_order_neon,.-sha1_block_data_order_neon
  562. #endif
  563. ___
  564. }}}
  565. #####################################################################
  566. # ARMv8 stuff
  567. #
  568. {{{
  569. my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));
  570. my @MSG=map("q$_",(4..7));
  571. my @Kxx=map("q$_",(8..11));
  572. my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
  573. my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
  574. $code.=<<___;
  575. #if __ARM_MAX_ARCH__>=7
  576. # if defined(__thumb2__)
  577. # define INST(a,b,c,d) $_byte c,d|0xf,a,b
  578. # else
  579. # define INST(a,b,c,d) $_byte a,b,c,d|0x10
  580. # endif
  581. .type sha1_block_data_order_armv8,%function
  582. .align 5
  583. sha1_block_data_order_armv8:
  584. .LARMv8:
  585. vstmdb sp!,{d8-d15} @ ABI specification says so
  586. veor $E,$E,$E
  587. adr r3,.LK_00_19
  588. vld1.32 {$ABCD},[$ctx]!
  589. vld1.32 {$E\[0]},[$ctx]
  590. sub $ctx,$ctx,#16
  591. vld1.32 {@Kxx[0]\[]},[r3,:32]!
  592. vld1.32 {@Kxx[1]\[]},[r3,:32]!
  593. vld1.32 {@Kxx[2]\[]},[r3,:32]!
  594. vld1.32 {@Kxx[3]\[]},[r3,:32]
  595. .Loop_v8:
  596. vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
  597. vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
  598. vrev32.8 @MSG[0],@MSG[0]
  599. vrev32.8 @MSG[1],@MSG[1]
  600. vadd.i32 $W0,@Kxx[0],@MSG[0]
  601. vrev32.8 @MSG[2],@MSG[2]
  602. vmov $ABCD_SAVE,$ABCD @ offload
  603. subs $len,$len,#1
  604. vadd.i32 $W1,@Kxx[0],@MSG[1]
  605. vrev32.8 @MSG[3],@MSG[3]
  606. sha1h $E1,$ABCD @ 0
  607. sha1c $ABCD,$E,$W0
  608. vadd.i32 $W0,@Kxx[$j],@MSG[2]
  609. sha1su0 @MSG[0],@MSG[1],@MSG[2]
  610. ___
  611. for ($j=0,$i=1;$i<20-3;$i++) {
  612. my $f=("c","p","m","p")[$i/5];
  613. $code.=<<___;
  614. sha1h $E0,$ABCD @ $i
  615. sha1$f $ABCD,$E1,$W1
  616. vadd.i32 $W1,@Kxx[$j],@MSG[3]
  617. sha1su1 @MSG[0],@MSG[3]
  618. ___
  619. $code.=<<___ if ($i<20-4);
  620. sha1su0 @MSG[1],@MSG[2],@MSG[3]
  621. ___
  622. ($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
  623. push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
  624. }
  625. $code.=<<___;
  626. sha1h $E0,$ABCD @ $i
  627. sha1p $ABCD,$E1,$W1
  628. vadd.i32 $W1,@Kxx[$j],@MSG[3]
  629. sha1h $E1,$ABCD @ 18
  630. sha1p $ABCD,$E0,$W0
  631. sha1h $E0,$ABCD @ 19
  632. sha1p $ABCD,$E1,$W1
  633. vadd.i32 $E,$E,$E0
  634. vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
  635. bne .Loop_v8
  636. vst1.32 {$ABCD},[$ctx]!
  637. vst1.32 {$E\[0]},[$ctx]
  638. vldmia sp!,{d8-d15}
  639. ret @ bx lr
  640. .size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
  641. #endif
  642. ___
  643. }}}
  644. $code.=<<___;
  645. #if __ARM_MAX_ARCH__>=7
  646. .comm OPENSSL_armcap_P,4,4
  647. #endif
  648. ___
  649. { my %opcode = (
  650. "sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40,
  651. "sha1m" => 0xf2200c40, "sha1su0" => 0xf2300c40,
  652. "sha1h" => 0xf3b902c0, "sha1su1" => 0xf3ba0380 );
  653. sub unsha1 {
  654. my ($mnemonic,$arg)=@_;
  655. if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
  656. my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
  657. |(($2&7)<<17)|(($2&8)<<4)
  658. |(($3&7)<<1) |(($3&8)<<2);
  659. # since ARMv7 instructions are always encoded little-endian.
  660. # correct solution is to use .inst directive, but older
  661. # assemblers don't implement it:-(
  662. # this fix-up provides Thumb encoding in conjunction with INST
  663. $word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000);
  664. sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
  665. $word&0xff,($word>>8)&0xff,
  666. ($word>>16)&0xff,($word>>24)&0xff,
  667. $mnemonic,$arg;
  668. }
  669. }
  670. }
  671. foreach (split($/,$code)) {
  672. s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo or
  673. s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/eo;
  674. s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;
  675. s/\bret\b/bx lr/o or
  676. s/\bbx\s+lr\b/.word\t0xe12fff1e/o; # make it possible to compile with -march=armv4
  677. print $_,$/;
  678. }
  679. close STDOUT or die "error closing STDOUT: $!"; # enforce flush