sha1-armv4-large.pl 6.6 KB


  1. #!/usr/bin/env perl
  2. # ====================================================================
  3. # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  4. # project. The module is, however, dual licensed under OpenSSL and
  5. # CRYPTOGAMS licenses depending on where you obtain it. For further
  6. # details see http://www.openssl.org/~appro/cryptogams/.
  7. # ====================================================================
  8. # sha1_block procedure for ARMv4.
  9. #
  10. # January 2007.
  11. # Size/performance trade-off
  12. # ====================================================================
  13. # impl size in bytes comp cycles[*] measured performance
  14. # ====================================================================
  15. # thumb 304 3212 4420
  16. # armv4-small 392/+29% 1958/+64% 2250/+96%
  17. # armv4-compact 740/+89% 1552/+26% 1840/+22%
  18. # armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
  19. # full unroll ~5100/+260% ~1260/+4% ~1300/+5%
  20. # ====================================================================
  21. # thumb = same as 'small' but in Thumb instructions[**] and
  22. # with recurring code in two private functions;
  23. # small = detached Xload/update, loops are folded;
  24. # compact = detached Xload/update, 5x unroll;
  25. # large = interleaved Xload/update, 5x unroll;
  26. # full unroll = interleaved Xload/update, full unroll, estimated[!];
  27. #
  28. # [*] Manually counted instructions in "grand" loop body. Measured
  29. # performance is affected by prologue and epilogue overhead,
  30. # i-cache availability, branch penalties, etc.
  31. # [**] While each Thumb instruction is half the size, they are not as
  32. # diverse as ARM ones: e.g., there are only two arithmetic
  33. # instructions with 3 arguments, no [fixed] rotate, and addressing
  34. # modes are limited. As a result it takes more instructions to do
  35. # the same job in Thumb, therefore the code is never half the
  36. # size and is always slower.
  37. # [***] which is also ~35% better than compiler generated code. Dual-
  38. # issue Cortex A8 core was measured to process input block in
  39. # ~990 cycles.
  40. # August 2010.
  41. #
  42. # Rescheduling for dual-issue pipeline resulted in 13% improvement on
  43. # Cortex A8 core and in absolute terms ~870 cycles per input block
  44. # [or 13.6 cycles per byte].
  45. # February 2011.
  46. #
  47. # Profiler-assisted and platform-specific optimization resulted in 10%
  48. # improvement on Cortex A8 core and 12.2 cycles per byte.
  49. $flavour = shift;
  50. if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
  51. else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
  52. if ($flavour && $flavour ne "void") {
  53. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  54. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  55. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  56. die "can't locate arm-xlate.pl";
  57. open STDOUT,"| \"$^X\" $xlate $flavour $output";
  58. } else {
  59. open STDOUT,">$output";
  60. }
  61. $ctx="r0";
  62. $inp="r1";
  63. $len="r2";
  64. $a="r3";
  65. $b="r4";
  66. $c="r5";
  67. $d="r6";
  68. $e="r7";
  69. $K="r8";
  70. $t0="r9";
  71. $t1="r10";
  72. $t2="r11";
  73. $t3="r12";
  74. $Xi="r14";
  75. @V=($a,$b,$c,$d,$e);
  76. sub Xupdate {
  77. my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
  78. $code.=<<___;
  79. ldr $t0,[$Xi,#15*4]
  80. ldr $t1,[$Xi,#13*4]
  81. ldr $t2,[$Xi,#7*4]
  82. add $e,$K,$e,ror#2 @ E+=K_xx_xx
  83. ldr $t3,[$Xi,#2*4]
  84. eor $t0,$t0,$t1
  85. eor $t2,$t2,$t3 @ 1 cycle stall
  86. eor $t1,$c,$d @ F_xx_xx
  87. mov $t0,$t0,ror#31
  88. add $e,$e,$a,ror#27 @ E+=ROR(A,27)
  89. eor $t0,$t0,$t2,ror#31
  90. str $t0,[$Xi,#-4]!
  91. $opt1 @ F_xx_xx
  92. $opt2 @ F_xx_xx
  93. add $e,$e,$t0 @ E+=X[i]
  94. ___
  95. }
  96. sub BODY_00_15 {
  97. my ($a,$b,$c,$d,$e)=@_;
  98. $code.=<<___;
  99. #if __ARM_ARCH__<7
  100. ldrb $t1,[$inp,#2]
  101. ldrb $t0,[$inp,#3]
  102. ldrb $t2,[$inp,#1]
  103. add $e,$K,$e,ror#2 @ E+=K_00_19
  104. ldrb $t3,[$inp],#4
  105. orr $t0,$t0,$t1,lsl#8
  106. eor $t1,$c,$d @ F_xx_xx
  107. orr $t0,$t0,$t2,lsl#16
  108. add $e,$e,$a,ror#27 @ E+=ROR(A,27)
  109. orr $t0,$t0,$t3,lsl#24
  110. #else
  111. ldr $t0,[$inp],#4 @ handles unaligned
  112. add $e,$K,$e,ror#2 @ E+=K_00_19
  113. eor $t1,$c,$d @ F_xx_xx
  114. add $e,$e,$a,ror#27 @ E+=ROR(A,27)
  115. #ifdef __ARMEL__
  116. rev $t0,$t0 @ byte swap
  117. #endif
  118. #endif
  119. and $t1,$b,$t1,ror#2
  120. add $e,$e,$t0 @ E+=X[i]
  121. eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
  122. str $t0,[$Xi,#-4]!
  123. add $e,$e,$t1 @ E+=F_00_19(B,C,D)
  124. ___
  125. }
  126. sub BODY_16_19 {
  127. my ($a,$b,$c,$d,$e)=@_;
  128. &Xupdate(@_,"and $t1,$b,$t1,ror#2");
  129. $code.=<<___;
  130. eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
  131. add $e,$e,$t1 @ E+=F_00_19(B,C,D)
  132. ___
  133. }
  134. sub BODY_20_39 {
  135. my ($a,$b,$c,$d,$e)=@_;
  136. &Xupdate(@_,"eor $t1,$b,$t1,ror#2");
  137. $code.=<<___;
  138. add $e,$e,$t1 @ E+=F_20_39(B,C,D)
  139. ___
  140. }
  141. sub BODY_40_59 {
  142. my ($a,$b,$c,$d,$e)=@_;
  143. &Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
  144. $code.=<<___;
  145. add $e,$e,$t1 @ E+=F_40_59(B,C,D)
  146. add $e,$e,$t2,ror#2
  147. ___
  148. }
  149. $code=<<___;
  150. #include "arm_arch.h"
  151. .text
  152. .global sha1_block_data_order
  153. .type sha1_block_data_order,%function
  154. .align 2
  155. sha1_block_data_order:
  156. stmdb sp!,{r4-r12,lr}
  157. add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
  158. ldmia $ctx,{$a,$b,$c,$d,$e}
  159. .Lloop:
  160. ldr $K,.LK_00_19
  161. mov $Xi,sp
  162. sub sp,sp,#15*4
  163. mov $c,$c,ror#30
  164. mov $d,$d,ror#30
  165. mov $e,$e,ror#30 @ [6]
  166. .L_00_15:
  167. ___
  168. for($i=0;$i<5;$i++) {
  169. &BODY_00_15(@V); unshift(@V,pop(@V));
  170. }
  171. $code.=<<___;
  172. teq $Xi,sp
  173. bne .L_00_15 @ [((11+4)*5+2)*3]
  174. sub sp,sp,#25*4
  175. ___
  176. &BODY_00_15(@V); unshift(@V,pop(@V));
  177. &BODY_16_19(@V); unshift(@V,pop(@V));
  178. &BODY_16_19(@V); unshift(@V,pop(@V));
  179. &BODY_16_19(@V); unshift(@V,pop(@V));
  180. &BODY_16_19(@V); unshift(@V,pop(@V));
  181. $code.=<<___;
  182. ldr $K,.LK_20_39 @ [+15+16*4]
  183. cmn sp,#0 @ [+3], clear carry to denote 20_39
  184. .L_20_39_or_60_79:
  185. ___
  186. for($i=0;$i<5;$i++) {
  187. &BODY_20_39(@V); unshift(@V,pop(@V));
  188. }
  189. $code.=<<___;
  190. teq $Xi,sp @ preserve carry
  191. bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
  192. bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
  193. ldr $K,.LK_40_59
  194. sub sp,sp,#20*4 @ [+2]
  195. .L_40_59:
  196. ___
  197. for($i=0;$i<5;$i++) {
  198. &BODY_40_59(@V); unshift(@V,pop(@V));
  199. }
  200. $code.=<<___;
  201. teq $Xi,sp
  202. bne .L_40_59 @ [+((12+5)*5+2)*4]
  203. ldr $K,.LK_60_79
  204. sub sp,sp,#20*4
  205. cmp sp,#0 @ set carry to denote 60_79
  206. b .L_20_39_or_60_79 @ [+4], spare 300 bytes
  207. .L_done:
  208. add sp,sp,#80*4 @ "deallocate" stack frame
  209. ldmia $ctx,{$K,$t0,$t1,$t2,$t3}
  210. add $a,$K,$a
  211. add $b,$t0,$b
  212. add $c,$t1,$c,ror#2
  213. add $d,$t2,$d,ror#2
  214. add $e,$t3,$e,ror#2
  215. stmia $ctx,{$a,$b,$c,$d,$e}
  216. teq $inp,$len
  217. bne .Lloop @ [+18], total 1307
  218. #if __ARM_ARCH__>=5
  219. ldmia sp!,{r4-r12,pc}
  220. #else
  221. ldmia sp!,{r4-r12,lr}
  222. tst lr,#1
  223. moveq pc,lr @ be binary compatible with V4, yet
  224. bx lr @ interoperable with Thumb ISA:-)
  225. #endif
  226. .align 2
  227. .LK_00_19: .word 0x5a827999
  228. .LK_20_39: .word 0x6ed9eba1
  229. .LK_40_59: .word 0x8f1bbcdc
  230. .LK_60_79: .word 0xca62c1d6
  231. .size sha1_block_data_order,.-sha1_block_data_order
  232. .asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
  233. .align 2
  234. ___
  235. $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
  236. print $code;
  237. close STDOUT; # enforce flush