#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~17 cycles per processed byte.
  20. $flavour = shift;
  21. if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
  22. else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
  23. if ($flavour && $flavour ne "void") {
  24. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  25. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  26. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  27. die "can't locate arm-xlate.pl";
  28. open STDOUT,"| \"$^X\" $xlate $flavour $output";
  29. } else {
  30. open STDOUT,">$output";
  31. }
  32. $ctx="r0"; $t0="r0";
  33. $inp="r1"; $t3="r1";
  34. $len="r2"; $t1="r2";
  35. $T1="r3";
  36. $A="r4";
  37. $B="r5";
  38. $C="r6";
  39. $D="r7";
  40. $E="r8";
  41. $F="r9";
  42. $G="r10";
  43. $H="r11";
  44. @V=($A,$B,$C,$D,$E,$F,$G,$H);
  45. $t2="r12";
  46. $Ktbl="r14";
  47. @Sigma0=( 2,13,22);
  48. @Sigma1=( 6,11,25);
  49. @sigma0=( 7,18, 3);
  50. @sigma1=(17,19,10);
  51. sub BODY_00_15 {
  52. my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  53. $code.=<<___ if ($i<16);
  54. #if __ARM_ARCH__>=7
  55. ldr $T1,[$inp],#4
  56. #else
  57. ldrb $T1,[$inp,#3] @ $i
  58. ldrb $t2,[$inp,#2]
  59. ldrb $t1,[$inp,#1]
  60. ldrb $t0,[$inp],#4
  61. orr $T1,$T1,$t2,lsl#8
  62. orr $T1,$T1,$t1,lsl#16
  63. orr $T1,$T1,$t0,lsl#24
  64. #endif
  65. ___
  66. $code.=<<___;
  67. mov $t0,$e,ror#$Sigma1[0]
  68. ldr $t2,[$Ktbl],#4 @ *K256++
  69. eor $t0,$t0,$e,ror#$Sigma1[1]
  70. eor $t1,$f,$g
  71. #if $i>=16
  72. add $T1,$T1,$t3 @ from BODY_16_xx
  73. #elif __ARM_ARCH__>=7 && defined(__ARMEL__)
  74. rev $T1,$T1
  75. #endif
  76. #if $i==15
  77. str $inp,[sp,#17*4] @ leave room for $t3
  78. #endif
  79. eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
  80. and $t1,$t1,$e
  81. str $T1,[sp,#`$i%16`*4]
  82. add $T1,$T1,$t0
  83. eor $t1,$t1,$g @ Ch(e,f,g)
  84. add $T1,$T1,$h
  85. mov $h,$a,ror#$Sigma0[0]
  86. add $T1,$T1,$t1
  87. eor $h,$h,$a,ror#$Sigma0[1]
  88. add $T1,$T1,$t2
  89. eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
  90. #if $i>=15
  91. ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx
  92. #endif
  93. orr $t0,$a,$b
  94. and $t1,$a,$b
  95. and $t0,$t0,$c
  96. add $h,$h,$T1
  97. orr $t0,$t0,$t1 @ Maj(a,b,c)
  98. add $d,$d,$T1
  99. add $h,$h,$t0
  100. ___
  101. }
  102. sub BODY_16_XX {
  103. my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  104. $code.=<<___;
  105. @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i
  106. ldr $t2,[sp,#`($i+14)%16`*4]
  107. mov $t0,$t3,ror#$sigma0[0]
  108. ldr $T1,[sp,#`($i+0)%16`*4]
  109. eor $t0,$t0,$t3,ror#$sigma0[1]
  110. ldr $t1,[sp,#`($i+9)%16`*4]
  111. eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1])
  112. mov $t3,$t2,ror#$sigma1[0]
  113. add $T1,$T1,$t0
  114. eor $t3,$t3,$t2,ror#$sigma1[1]
  115. add $T1,$T1,$t1
  116. eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
  117. @ add $T1,$T1,$t3
  118. ___
  119. &BODY_00_15(@_);
  120. }
  121. $code=<<___;
  122. #include "arm_arch.h"
  123. .text
  124. .code 32
  125. .type K256,%object
  126. .align 5
  127. K256:
  128. .word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
  129. .word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
  130. .word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
  131. .word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
  132. .word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
  133. .word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
  134. .word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
  135. .word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
  136. .word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
  137. .word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
  138. .word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
  139. .word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
  140. .word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
  141. .word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
  142. .word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  143. .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  144. .size K256,.-K256
  145. .global sha256_block_data_order
  146. .type sha256_block_data_order,%function
  147. sha256_block_data_order:
  148. sub r3,pc,#8 @ sha256_block_data_order
  149. add $len,$inp,$len,lsl#6 @ len to point at the end of inp
  150. stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
  151. ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
  152. sub $Ktbl,r3,#256 @ K256
  153. sub sp,sp,#16*4 @ alloca(X[16])
  154. .Loop:
  155. ___
  156. for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
  157. $code.=".Lrounds_16_xx:\n";
  158. for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
  159. $code.=<<___;
  160. and $t2,$t2,#0xff
  161. cmp $t2,#0xf2
  162. bne .Lrounds_16_xx
  163. ldr $T1,[sp,#16*4] @ pull ctx
  164. ldr $t0,[$T1,#0]
  165. ldr $t1,[$T1,#4]
  166. ldr $t2,[$T1,#8]
  167. add $A,$A,$t0
  168. ldr $t0,[$T1,#12]
  169. add $B,$B,$t1
  170. ldr $t1,[$T1,#16]
  171. add $C,$C,$t2
  172. ldr $t2,[$T1,#20]
  173. add $D,$D,$t0
  174. ldr $t0,[$T1,#24]
  175. add $E,$E,$t1
  176. ldr $t1,[$T1,#28]
  177. add $F,$F,$t2
  178. ldr $inp,[sp,#17*4] @ pull inp
  179. ldr $t2,[sp,#18*4] @ pull inp+len
  180. add $G,$G,$t0
  181. add $H,$H,$t1
  182. stmia $T1,{$A,$B,$C,$D,$E,$F,$G,$H}
  183. cmp $inp,$t2
  184. sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
  185. bne .Loop
  186. add sp,sp,#`16+3`*4 @ destroy frame
  187. #if __ARM_ARCH__>=5
  188. ldmia sp!,{r4-r11,pc}
  189. #else
  190. ldmia sp!,{r4-r11,lr}
  191. tst lr,#1
  192. moveq pc,lr @ be binary compatible with V4, yet
  193. bx lr @ interoperable with Thumb ISA:-)
  194. #endif
  195. .size sha256_block_data_order,.-sha256_block_data_order
  196. .asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
  197. .align 2
  198. ___
  199. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  200. $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
  201. print $code;
  202. close STDOUT; # enforce flush