#! /usr/bin/env perl
# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================
# SHA256 block procedure for ARMv4. May 2007.
# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that latter performs sub-optimally, nothing was done
# about it).
# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
  38. $flavour = shift;
  39. if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
  40. else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
  41. if ($flavour && $flavour ne "void") {
  42. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  43. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  44. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  45. die "can't locate arm-xlate.pl";
  46. open STDOUT,"| \"$^X\" $xlate $flavour $output";
  47. } else {
  48. open STDOUT,">$output";
  49. }
# Register allocation.  The procedure arguments (ctx,inp,len) share
# registers with scratch temporaries $t0..$t4; each argument is spilled
# or consumed before the temporary aliasing its register is written.
$ctx="r0"; $t0="r0";
$inp="r1"; $t4="r1";
$len="r2"; $t1="r2";
$T1="r3"; $t3="r3";
# The eight SHA-256 working variables a..h live in r4-r11 across rounds.
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";
# Rotate (first two entries) and shift (last entry) amounts for the
# Sigma0/Sigma1/sigma0/sigma1 functions, per FIPS 180-4.
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
# Emit one SHA-256 round for the integer path.  For rounds 0..15 it also
# loads and byte-reverses the message word X[i].  Note the "h+=Maj(a,b,c)
# from the past" scheme: the Maj() addition of the *previous* round is
# deferred into this round (see the commented-out final add below), which
# lets it pair with this round's loads on dual-issue cores.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
# Rounds 0..15 only: fetch X[i], with a byte-at-a-time fallback for
# pre-ARMv7 cores that lack rev/unaligned loads.  At i==15 the input
# pointer is spilled to [sp,#17*4] to free $t4 ( =r1 ) for round work.
$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
@ ldr $t1,[$inp],#4 @ $i
# if $i==15
str $inp,[sp,#17*4] @ make room for $t4
# endif
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
# ifndef __ARMEB__
rev $t1,$t1
# endif
#else
@ ldrb $t1,[$inp,#3] @ $i
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
ldrb $t2,[$inp,#2]
ldrb $t0,[$inp,#1]
orr $t1,$t1,$t2,lsl#8
ldrb $t2,[$inp],#4
orr $t1,$t1,$t0,lsl#16
# if $i==15
str $inp,[sp,#17*4] @ make room for $t4
# endif
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
orr $t1,$t1,$t2,lsl#24
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
#endif
___
# Common round tail: h += X[i]+K[i]+Sigma1(e)+Ch(e,f,g); d += h; compute
# Sigma0(a) and the Maj() ingredients for the next round.  At i==31 the
# low byte of the just-loaded K word is compared against 0xf2 (last K256
# entry is 0xc67178f2) to detect the end of the 64-round schedule.
$code.=<<___;
ldr $t2,[$Ktbl],#4 @ *K256++
add $h,$h,$t1 @ h+=X[i]
str $t1,[sp,#`$i%16`*4]
eor $t1,$f,$g
add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
and $t1,$t1,$e
add $h,$h,$t2 @ h+=K256[i]
eor $t1,$t1,$g @ Ch(e,f,g)
eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
add $h,$h,$t1 @ h+=Ch(e,f,g)
#if $i==31
and $t2,$t2,#0xff
cmp $t2,#0xf2 @ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
ldr $t1,[$inp],#4 @ prefetch
# else
ldrb $t1,[$inp,#3]
# endif
eor $t2,$a,$b @ a^b, b^c in next round
#else
ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
eor $t2,$a,$b @ a^b, b^c in next round
ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
#endif
eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
and $t3,$t3,$t2 @ (b^c)&=(a^b)
add $d,$d,$h @ d+=h
eor $t3,$t3,$b @ Maj(a,b,c)
add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
@ add $h,$h,$t3 @ h+=Maj(a,b,c)
___
# $t2 now holds this round's a^b and $t3 the deferred Maj(); swap the
# Perl-level names so the next emitted round uses them in swapped roles.
($t2,$t3)=($t3,$t2);
}
# Emit the message-schedule expansion for round i>=16:
#   X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
# ($t1/$t4 were preloaded with X[i+1]/X[i+14] by the previous round's
# tail), then fall through to the shared round body in BODY_00_15.
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
@ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
@ ldr $t4,[sp,#`($i+14)%16`*4]
mov $t0,$t1,ror#$sigma0[0]
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
mov $t2,$t4,ror#$sigma1[0]
eor $t0,$t0,$t1,ror#$sigma0[1]
eor $t2,$t2,$t4,ror#$sigma1[1]
eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
ldr $t1,[sp,#`($i+0)%16`*4]
eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
ldr $t4,[sp,#`($i+9)%16`*4]
add $t2,$t2,$t0
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
add $t1,$t1,$t2
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
add $t1,$t1,$t4 @ X[i]
___
&BODY_00_15(@_);
}
# Assembly preamble for the integer-only (ARMv4+) path.  K256 is emitted
# just before the entry point so its address can be derived from the
# pc-relative address in r3 with a single "sub ...,#256+32" (256-byte
# table + terminator word + armcap word + alignment).  At run time the
# OPENSSL_armcap_P capability word (outside __KERNEL__ builds) steers
# dispatch to the NEON or ARMv8 code paths further down.
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
.text
.type K256,%object
.align 5
K256:
.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
# ifdef _WIN32
.word OPENSSL_armcap_P
# else
.word OPENSSL_armcap_P-.Lsha256_block_data_order
# endif
#endif
.align 5
.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
sub r3,pc,#8 @ sha256_block_data_order
#else
adr r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
# if !defined(_WIN32)
ldr r12,[r3,r12] @ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
ldr r12,[r12]
# endif
tst r12,#ARMV8_SHA256
bne .LARMv8
tst r12,#ARMV7_NEON
bne .LNEON
#endif
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
sub $Ktbl,r3,#256+32 @ K256
sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
ldr $t1,[$inp],#4
# else
ldrb $t1,[$inp,#3]
# endif
eor $t3,$B,$C @ magic
eor $t2,$t2,$t2
___
# Rounds 0..15 are fully unrolled; rounds 16..31 form the .Lrounds_16_xx
# loop body, re-executed until the K256 terminator test (the i==31 check
# inside BODY_00_15) sets the flags, covering all 64 rounds.
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# Epilogue: add the working variables back into the hash context, loop
# over remaining 64-byte blocks, then tear down the frame.  The pre-ARMv5
# tail returns via mov/bx depending on the Thumb bit in lr.
$code.=<<___;
#ifdef __thumb2__
ite eq @ Thumb2 thing, sanity check in ARM
#endif
ldreq $t3,[sp,#16*4] @ pull ctx
bne .Lrounds_16_xx
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
ldr $t0,[$t3,#0]
ldr $t1,[$t3,#4]
ldr $t2,[$t3,#8]
add $A,$A,$t0
ldr $t0,[$t3,#12]
add $B,$B,$t1
ldr $t1,[$t3,#16]
add $C,$C,$t2
ldr $t2,[$t3,#20]
add $D,$D,$t0
ldr $t0,[$t3,#24]
add $E,$E,$t1
ldr $t1,[$t3,#28]
add $F,$F,$t2
ldr $inp,[sp,#17*4] @ pull inp
ldr $t2,[sp,#18*4] @ pull inp+len
add $G,$G,$t0
add $H,$H,$t1
stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
cmp $inp,$t2
sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
bne .Loop
add sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
# NEON register map: @X holds four 128-bit message-schedule vectors
# (X[0..15] as 4x4 words); $T0..$T3 are q-sized scratch; $T4/$T5 are the
# d-halves of q12.  $Xfer (aliasing $t4) walks the X[]+K[] staging area
# on the stack; $j counts rounds for body_00_15.
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;
  288. sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
  289. sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
# Catch-all for undefined sub calls: "&vadd_i32(a,b,c)" appends the line
# "vadd.i32 a,b,c" to $code (underscores become dots).  If the last
# argument is purely numeric (the "$arg*1 eq $arg" round-trip test), it
# is prefixed with '#' to form an immediate operand.
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
# Emit one 4-word NEON message-schedule update:
#   X[0..3] += sigma0(X[1..4]) + X[9..12] + sigma1(two words at a time)
# Rotates are synthesized as vshr+vsli (shift-and-insert) since NEON has
# no vector rotate.  Four scalar rounds produced by $body are interleaved
# between the vector instructions (the eval(shift(@insns)) calls) so the
# integer and NEON pipes run concurrently.  Finishes by staging X+K into
# the [$Xfer] area and rotating @X.
sub Xupdate()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
&vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T2,$T0,$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T1,$T0,$sigma0[2]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T2,$T0,32-$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T3,$T0,$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T1,$T1,$T2);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T3,$T0,32-$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T1,$T1,$T3); # sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4); # sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4); # sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 ($T0,$T0,@X[0]);
while($#insns>=2) { eval(shift(@insns)); }
&vst1_32 ("{$T0}","[$Xfer,:128]!");
eval(shift(@insns));
eval(shift(@insns));
push(@X,shift(@X)); # "rotate" X[]
}
# Emit the rounds 48..63 variant: no more schedule expansion is needed,
# so just load the next K vector, byte-reverse the freshly loaded input
# block and stage X+K, again interleaving four scalar rounds from $body.
sub Xpreload()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vrev32_8 (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 ($T0,$T0,@X[0]);
foreach (@insns) { eval; } # remaining instructions
&vst1_32 ("{$T0}","[$Xfer,:128]!");
push(@X,shift(@X)); # "rotate" X[]
}
# Return one scalar SHA-256 round as a list of code strings (each string
# is eval'ed later by Xupdate/Xpreload, which interleave them with NEON
# instructions).  Uses the same deferred-Maj trick as the integer path;
# $j counts rounds so the X[]/K-terminator/ctx-pointer loads happen at
# the right round indices; the trailing string rotates @V and swaps the
# $t2/$t3 roles for the next round.
sub body_00_15 () {
(
'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
'&add ($h,$h,$t1)', # h+=X[i]+K[i]
'&eor ($t1,$f,$g)',
'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
'&and ($t1,$t1,$e)',
'&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
'&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
'&eor ($t1,$t1,$g)', # Ch(e,f,g)
'&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
'&eor ($t2,$a,$b)', # a^b, b^c in next round
'&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
'&ldr ($t1,"[sp,#64]") if ($j==31)',
'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
'&add ($d,$d,$h)', # d+=h
'&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
'&eor ($t3,$t3,$b)', # Maj(a,b,c)
'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
)
}
# NEON entry point: set up an aligned stack area for the X[]+K staging
# buffer (original sp saved at [sp,#76]; ctx/inp/inp+len at #64/#68/#72),
# load and byte-reverse the first 64-byte block, and pre-stage the first
# four X+K vectors.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 5
.skip 16
sha256_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
sub $H,sp,#16*4+16
adr $Ktbl,K256
bic $H,$H,#15 @ align for 128-bit stores
mov $t2,sp
mov sp,$H @ alloca
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
vld1.8 {@X[0]},[$inp]!
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
vld1.32 {$T0},[$Ktbl,:128]!
vld1.32 {$T1},[$Ktbl,:128]!
vld1.32 {$T2},[$Ktbl,:128]!
vld1.32 {$T3},[$Ktbl,:128]!
vrev32.8 @X[0],@X[0] @ yes, even on
str $ctx,[sp,#64]
vrev32.8 @X[1],@X[1] @ big-endian
str $inp,[sp,#68]
mov $Xfer,sp
vrev32.8 @X[2],@X[2]
str $len,[sp,#72]
vrev32.8 @X[3],@X[3]
str $t2,[sp,#76] @ save original sp
vadd.i32 $T0,$T0,@X[0]
vadd.i32 $T1,$T1,@X[1]
vst1.32 {$T0},[$Xfer,:128]!
vadd.i32 $T2,$T2,@X[2]
vst1.32 {$T1},[$Xfer,:128]!
vadd.i32 $T3,$T3,@X[3]
vst1.32 {$T2},[$Xfer,:128]!
vst1.32 {$T3},[$Xfer,:128]!
ldmia $ctx,{$A-$H}
sub $Xfer,$Xfer,#64
ldr $t1,[sp,#0]
eor $t2,$t2,$t2
eor $t3,$B,$C
b .L_00_48
.align 4
.L_00_48:
___
# Rounds 0..47: sixteen rounds (4 vectors) per .L_00_48 iteration, with
# schedule expansion; the loop exits when the K256 terminator word is
# read (teq $t1,#0 below).
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
$code.=<<___;
teq $t1,#0 @ check for K256 terminator
ldr $t1,[sp,#0]
sub $Xfer,$Xfer,#64
bne .L_00_48
ldr $inp,[sp,#68]
ldr $t0,[sp,#72]
sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
teq $inp,$t0
it eq
subeq $inp,$inp,#64 @ avoid SEGV
vld1.8 {@X[0]},[$inp]! @ load next input block
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
it ne
strne $inp,[sp,#68]
mov $Xfer,sp
___
# Rounds 48..63, overlapped with byte-reversing and pre-staging the next
# block's message words.
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
# Accumulate into the context; flags still hold the inp vs inp+len
# comparison: loop for another block or restore sp and return.
$code.=<<___;
ldr $t0,[$t1,#0]
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
ldr $t2,[$t1,#4]
ldr $t3,[$t1,#8]
ldr $t4,[$t1,#12]
add $A,$A,$t0 @ accumulate
ldr $t0,[$t1,#16]
add $B,$B,$t2
ldr $t2,[$t1,#20]
add $C,$C,$t3
ldr $t3,[$t1,#24]
add $D,$D,$t4
ldr $t4,[$t1,#28]
add $E,$E,$t0
str $A,[$t1],#4
add $F,$F,$t2
str $B,[$t1],#4
add $G,$G,$t3
str $C,[$t1],#4
add $H,$H,$t4
str $D,[$t1],#4
stmia $t1,{$E-$H}
ittte ne
movne $Xfer,sp
ldrne $t1,[sp,#0]
eorne $t2,$t2,$t2
ldreq sp,[sp,#76] @ restore original sp
itt ne
eorne $t3,$B,$C
bne .L_00_48
ldmia sp!,{r4-r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
# ARMv8 crypto-extension path: state in q0/q1, q2 scratch, message
# schedule in q8-q11, round constants/saved state in q12-q15.  The
# INST() macro (defined in the emitted assembly below) lets the SHA-256
# instructions be expressed as raw bytes for pre-ARMv8 assemblers; on
# Windows builds ".byte" is spelled "DCB".
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# if defined(__thumb2__)
# define INST(a,b,c,d) $_byte c,d|0xc,a,b
# else
# define INST(a,b,c,d) $_byte a,b,c,d
# endif
.type sha256_block_data_order_armv8,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
vld1.32 {$ABCD,$EFGH},[$ctx]
sub $Ktbl,$Ktbl,#256+32
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
b .Loop_v8
.align 4
.Loop_v8:
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
vld1.32 {$W0},[$Ktbl]!
vrev32.8 @MSG[0],@MSG[0]
vrev32.8 @MSG[1],@MSG[1]
vrev32.8 @MSG[2],@MSG[2]
vrev32.8 @MSG[3],@MSG[3]
vmov $ABCD_SAVE,$ABCD @ offload
vmov $EFGH_SAVE,$EFGH
teq $inp,$len
___
# Twelve 4-round groups that still need schedule expansion (sha256su0/
# su1); W0/W1 ping-pong between "in use" and "being loaded", @MSG rotates.
for($i=0;$i<12;$i++) {
$code.=<<___;
vld1.32 {$W1},[$Ktbl]!
vadd.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
# Final four 4-round groups (no more expansion needed), accumulate the
# saved state, and loop while inp != len (flags set by teq above).
$code.=<<___;
vld1.32 {$W1},[$Ktbl]!
vadd.i32 $W0,$W0,@MSG[0]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
vld1.32 {$W0},[$Ktbl]!
vadd.i32 $W1,$W1,@MSG[1]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
vld1.32 {$W1},[$Ktbl]
vadd.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#256-16 @ rewind
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
vadd.i32 $W1,$W1,@MSG[3]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
it ne
bne .Loop_v8
vst1.32 {$ABCD,$EFGH},[$ctx]
ret @ bx lr
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
# Common trailer: identification string plus, outside kernel builds, the
# OPENSSL_armcap_P capability word referenced by the dispatch code.
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
#endif
___
  644. open SELF,$0;
  645. while(<SELF>) {
  646. next if (/^#!/);
  647. last if (!s/^#/@/ and !/^$/);
  648. print;
  649. }
  650. close SELF;
  651. { my %opcode = (
  652. "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
  653. "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
  654. sub unsha256 {
  655. my ($mnemonic,$arg)=@_;
  656. if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
  657. my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
  658. |(($2&7)<<17)|(($2&8)<<4)
  659. |(($3&7)<<1) |(($3&8)<<2);
  660. # since ARMv7 instructions are always encoded little-endian.
  661. # correct solution is to use .inst directive, but older
  662. # assemblers don't implement it:-(
  663. sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
  664. $word&0xff,($word>>8)&0xff,
  665. ($word>>16)&0xff,($word>>24)&0xff,
  666. $mnemonic,$arg;
  667. }
  668. }
  669. }
  670. foreach (split($/,$code)) {
  671. s/\`([^\`]*)\`/eval $1/geo;
  672. s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
  673. s/\bret\b/bx lr/go or
  674. s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
  675. print $_,"\n";
  676. }
  677. close STDOUT; # enforce flush