#! /usr/bin/env perl
# Copyright 2007-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "absolute"
# terms is ~2250 cycles per 64-byte block or ~35 cycles per byte
# [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}
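# Illustrative invocation (added note, not from the original source): the
# script is normally driven by the OpenSSL build system, roughly as
#
#	perl sha256-armv4.pl linux32 sha256-armv4.S
#
# where the flavour ("linux32", "ios32", "win32", ...) selects the
# arm-xlate.pl output dialect and the trailing .S argument becomes $output.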
$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
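# A minimal reference sketch (added for illustration, never called by the
# generator): the rotation/shift amounts above are the FIPS 180-4 SHA-256
# functions Sigma0/Sigma1 (rotate-only) and sigma0/sigma1 (two rotates plus
# a logical shift). The *_ref helper names below are hypothetical additions.
sub ror32_ref	{ my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff; }
sub Sigma0_ref	{ my $x=shift; ror32_ref($x,2) ^ror32_ref($x,13)^ror32_ref($x,22); }
sub Sigma1_ref	{ my $x=shift; ror32_ref($x,6) ^ror32_ref($x,11)^ror32_ref($x,25); }
sub sigma0_ref	{ my $x=shift; ror32_ref($x,7) ^ror32_ref($x,18)^($x>>3); }
sub sigma1_ref	{ my $x=shift; ror32_ref($x,17)^ror32_ref($x,19)^($x>>10); }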
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}
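# For reference (added comment): each call above emits one round of
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K256[i] + X[i]
#	T2 = Sigma0(a) + Maj(a,b,c)
#	d += T1;  h = T1 + T2
# with one twist: Maj(a,b,c) is computed into a spare register but only
# added to h at the top of the *following* round ("h+=Maj(a,b,c) from the
# past"), which is why $t2 and $t3 are swapped at the end of the sub and
# why the final Maj addition is folded into the epilogue.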
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]
	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}
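# For reference (added comment): this implements the SHA-256 message
# schedule recurrence
#	X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
# over a 16-word circular window kept on the stack, which is why the loads
# use offsets ($i+14), ($i+9), ($i+1) and ($i+0) mod 16.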
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
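@ (Added note) K256 holds the 64 SHA-256 round constants; the extra .word 0
@ after the table is a terminator: the integer path stops once the low byte
@ of the constant just consumed is 0xf2 (the last real entry), while the
@ NEON path peeks ahead and exits when it reads this zero word.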
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lsha256_block_data_order
# endif
#endif

.align	5
.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
# if !defined(_WIN32)
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
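@ (Added note) Runtime dispatch: OPENSSL_armcap_P is read (PC-relative
@ except on Windows and Apple, where an extra dereference is needed) and
@ control branches to the ARMv8 Crypto Extensions path or the NEON path
@ when the corresponding capability bit is set; otherwise execution falls
@ through to the integer-only ARMv4 code below.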
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
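@ (Added note) Resulting stack frame, relative to the new sp:
@   sp+0 .. sp+63   sixteen 32-bit X[] words (circular message schedule)
@   sp+64           saved ctx pointer   (referenced as 16*4)
@   sp+68           saved inp pointer   (referenced as 17*4)
@   sp+72           end-of-input marker (referenced as 18*4)
@ followed by the callee-saved registers pushed by stmdb above.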
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#ifdef	__thumb2__
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;
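# (Added note) Strategy of the NEON path: the 16-entry message schedule is
# expanded four words at a time in the vector unit (Xupdate below), the
# pre-added X[i]+K256[i] values are staged through a 64-byte area at $Xfer,
# and the rounds themselves remain scalar integer code that is interleaved
# with the vector instructions to keep both pipelines busy.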
sub Dlo()	{ shift=~m|q([1]?[0-9])|?"d".($1*2):"";   }
sub Dhi()	{ shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
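# (Added example) AUTOLOAD turns any undefined &name call into one line of
# assembly: the underscore becomes a dot and a numeric last argument gets a
# '#' prefix, so e.g. &vshr_u32($T2,$T0,7) appends "\tvshr.u32\tq10,q8,#7"
# to $code.  Dlo/Dhi map a q register to its low/high d half (q8 -> d16/d17).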
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);	# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);	# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}
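# (Added note) NEON has no rotate instruction, so each rotate inside
# sigma0/sigma1 is synthesized as a shift-right (vshr) whose result is then
# merged with the left-shifted bits via shift-left-and-insert (vsli), e.g.
# ror(x,7) == vshr.u32 by 7 combined with vsli.32 by 32-7.  Each Xupdate
# call advances the schedule by four words and stores X[i]+K256[i] for the
# next four rounds at $Xfer.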
sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}
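# (Added note) body_00_15 returns a list of Perl snippet strings, one scalar
# round broken into individual instructions; Xupdate/Xpreload call it four
# times and eval the snippets two or three at a time between NEON
# instructions, interleaving the integer rounds with the vector schedule
# update.  The $j-dependent ldr fetches the next pre-added X+K word from
# the stack, peeks at the next K256 word at j==15 (terminator check), or
# reloads the ctx pointer saved at [sp,#64] at j==31.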
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	5
.skip	16
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!
	ldmia	$ctx,{$A-$H}
	sub	$Xfer,$Xfer,#64
	ldr	$t1,[sp,#0]
	eor	$t2,$t2,$t2
	eor	$t3,$B,$C
	b	.L_00_48

.align	4
.L_00_48:
___
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	$_byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	$_byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,$Ktbl,#256+32
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	b	.Loop_v8

.align	4
.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
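# (Added note) Each iteration below consumes one 4-word K256 quad and uses
# the ARMv8 Crypto Extensions: sha256su0/sha256su1 advance the message
# schedule by four words, while the sha256h/sha256h2 pair performs four
# rounds, updating the ABCD and EFGH halves of the state.  Twelve
# iterations cover rounds 0-47; the tail code after the loop runs the last
# sixteen rounds, which need no further schedule updates.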
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz	"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.extern	OPENSSL_armcap_P
#endif
___

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;
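# (Added note) The loop above copies this script's own leading comment block
# (skipping the shebang) into the generated assembly, rewriting the leading
# '#' of each line to the ARM assembler comment character '@', and stops at
# the first line that is neither a comment nor blank.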
{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}
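# (Added worked example, computed by hand from the formula above): for
# "sha256h q0,q1,q12" (d=0, n=1, m=12) unsha256 evaluates
#	0xf3000c40 | (1<<17) | (4<<1) | (8<<2) = 0xf3020c68
# and emits it byte-wise as INST(0x68,0x0c,0x02,0xf3), so the SHA-256
# instructions assemble even with toolchains that lack the mnemonics; the
# Thumb-2 variant of INST swaps the two halfwords.  The loop below also
# evaluates the `...` expressions and rewrites "ret"/"bx lr" so the output
# can still be assembled for plain ARMv4.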
foreach (split($/,$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";	# enforce flush