#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256 for C64x+.
#
# January 2012
#
# Performance is just below 10 cycles per processed byte, which is
# almost 40% faster than compiler-generated code. Unroll is unlikely
# to give more than ~8% improvement...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and for own well-being
# zero it upon entry.
  21. while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
  22. open STDOUT,">$output";
  23. ($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments
  24. $K256="A3";
  25. ($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
  26. =map("A$_",(16..31));
  27. ($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
  28. =map("B$_",(16..31));
  29. ($Xia,$Xib)=("A5","B5"); # circular/ring buffer
  30. $CTXB=$t2e;
  31. ($Xn,$X0,$K)=("B7","B8","B9");
  32. ($Maj,$Ch)=($T2,"B6");
  33. $code.=<<___;
  34. .text
  35. .asg B3,RA
  36. .asg A15,FP
  37. .asg B15,SP
  38. .if .BIG_ENDIAN
  39. .asg SWAP2,MV
  40. .asg SWAP4,MV
  41. .endif
  42. .global _sha256_block_data_order
  43. _sha256_block_data_order:
  44. .asmfunc stack_usage(64)
  45. MV $NUM,A0 ; reassign $NUM
  46. || MVK -64,B0
  47. [!A0] BNOP RA ; if ($NUM==0) return;
  48. || [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64)
  49. || [A0] MV SP,FP
  50. [A0] ADDKPC _sha256_block_data_order,B2
  51. || [A0] AND B0,SP,SP ; align stack at 64 bytes
  52. [A0] MVK 0x00404,B1
  53. || [A0] MVKL (K256-_sha256_block_data_order),$K256
  54. [A0] MVKH 0x50000,B1
  55. || [A0] MVKH (K256-_sha256_block_data_order),$K256
  56. [A0] MVC B1,AMR ; setup circular addressing
  57. || [A0] MV SP,$Xia
  58. [A0] MV SP,$Xib
  59. || [A0] ADD B2,$K256,$K256
  60. || [A0] MV $CTXA,$CTXB
  61. || [A0] SUBAW SP,2,SP ; reserve two words above buffer
  62. LDW *${CTXA}[0],$A ; load ctx
  63. || LDW *${CTXB}[4],$E
  64. LDW *${CTXA}[1],$B
  65. || LDW *${CTXB}[5],$F
  66. LDW *${CTXA}[2],$C
  67. || LDW *${CTXB}[6],$G
  68. LDW *${CTXA}[3],$D
  69. || LDW *${CTXB}[7],$H
  70. LDNW *$INP++,$Xn ; pre-fetch input
  71. LDW *$K256++,$K ; pre-fetch K256[0]
  72. MVK 14,B0 ; loop counters
  73. MVK 47,B1
  74. || ADDAW $Xia,9,$Xia
  75. outerloop?:
  76. SUB A0,1,A0
  77. || MV $A,$Actx
  78. || MV $E,$Ectx
  79. || MVD $B,$Bctx
  80. || MVD $F,$Fctx
  81. MV $C,$Cctx
  82. || MV $G,$Gctx
  83. || MVD $D,$Dctx
  84. || MVD $H,$Hctx
  85. || SWAP4 $Xn,$X0
  86. SPLOOPD 8 ; BODY_00_14
  87. || MVC B0,ILC
  88. || SWAP2 $X0,$X0
  89. LDNW *$INP++,$Xn
  90. || ROTL $A,30,$S0
  91. || OR $A,$B,$Maj
  92. || AND $A,$B,$t2a
  93. || ROTL $E,26,$S1
  94. || AND $F,$E,$Ch
  95. || ANDN $G,$E,$t2e
  96. ROTL $A,19,$t0a
  97. || AND $C,$Maj,$Maj
  98. || ROTL $E,21,$t0e
  99. || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
  100. ROTL $A,10,$t1a
  101. || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
  102. || ROTL $E,7,$t1e
  103. || ADD $K,$H,$T1 ; T1 = h + K256[i]
  104. ADD $X0,$T1,$T1 ; T1 += X[i];
  105. || STW $X0,*$Xib++
  106. || XOR $t0a,$S0,$S0
  107. || XOR $t0e,$S1,$S1
  108. XOR $t1a,$S0,$S0 ; Sigma0(a)
  109. || XOR $t1e,$S1,$S1 ; Sigma1(e)
  110. || LDW *$K256++,$K ; pre-fetch K256[i+1]
  111. || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
  112. ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
  113. || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
  114. || ROTL $G,0,$H ; h = g
  115. || MV $F,$G ; g = f
  116. || MV $X0,$X14
  117. || SWAP4 $Xn,$X0
  118. SWAP2 $X0,$X0
  119. || MV $E,$F ; f = e
  120. || ADD $D,$T1,$E ; e = d + T1
  121. || MV $C,$D ; d = c
  122. MV $B,$C ; c = b
  123. || MV $A,$B ; b = a
  124. || ADD $T1,$T2,$A ; a = T1 + T2
  125. SPKERNEL
  126. ROTL $A,30,$S0 ; BODY_15
  127. || OR $A,$B,$Maj
  128. || AND $A,$B,$t2a
  129. || ROTL $E,26,$S1
  130. || AND $F,$E,$Ch
  131. || ANDN $G,$E,$t2e
  132. || LDW *${Xib}[1],$Xn ; modulo-scheduled
  133. ROTL $A,19,$t0a
  134. || AND $C,$Maj,$Maj
  135. || ROTL $E,21,$t0e
  136. || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
  137. || LDW *${Xib}[2],$X1 ; modulo-scheduled
  138. ROTL $A,10,$t1a
  139. || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
  140. || ROTL $E,7,$t1e
  141. || ADD $K,$H,$T1 ; T1 = h + K256[i]
  142. ADD $X0,$T1,$T1 ; T1 += X[i];
  143. || STW $X0,*$Xib++
  144. || XOR $t0a,$S0,$S0
  145. || XOR $t0e,$S1,$S1
  146. XOR $t1a,$S0,$S0 ; Sigma0(a)
  147. || XOR $t1e,$S1,$S1 ; Sigma1(e)
  148. || LDW *$K256++,$K ; pre-fetch K256[i+1]
  149. || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
  150. ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
  151. || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
  152. || ROTL $G,0,$H ; h = g
  153. || MV $F,$G ; g = f
  154. || MV $X0,$X15
  155. MV $E,$F ; f = e
  156. || ADD $D,$T1,$E ; e = d + T1
  157. || MV $C,$D ; d = c
  158. || MV $Xn,$X0 ; modulo-scheduled
  159. || LDW *$Xia,$X9 ; modulo-scheduled
  160. || ROTL $X1,25,$t0e ; modulo-scheduled
  161. || ROTL $X14,15,$t0a ; modulo-scheduled
  162. SHRU $X1,3,$s0 ; modulo-scheduled
  163. || SHRU $X14,10,$s1 ; modulo-scheduled
  164. || ROTL $B,0,$C ; c = b
  165. || MV $A,$B ; b = a
  166. || ADD $T1,$T2,$A ; a = T1 + T2
  167. SPLOOPD 10 ; BODY_16_63
  168. || MVC B1,ILC
  169. || ROTL $X1,14,$t1e ; modulo-scheduled
  170. || ROTL $X14,13,$t1a ; modulo-scheduled
  171. XOR $t0e,$s0,$s0
  172. || XOR $t0a,$s1,$s1
  173. || MV $X15,$X14
  174. || MV $X1,$Xn
  175. XOR $t1e,$s0,$s0 ; sigma0(X[i+1])
  176. || XOR $t1a,$s1,$s1 ; sigma1(X[i+14])
  177. || LDW *${Xib}[2],$X1 ; module-scheduled
  178. ROTL $A,30,$S0
  179. || OR $A,$B,$Maj
  180. || AND $A,$B,$t2a
  181. || ROTL $E,26,$S1
  182. || AND $F,$E,$Ch
  183. || ANDN $G,$E,$t2e
  184. || ADD $X9,$X0,$X0 ; X[i] += X[i+9]
  185. ROTL $A,19,$t0a
  186. || AND $C,$Maj,$Maj
  187. || ROTL $E,21,$t0e
  188. || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
  189. || ADD $s0,$X0,$X0 ; X[i] += sigma1(X[i+1])
  190. ROTL $A,10,$t1a
  191. || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
  192. || ROTL $E,7,$t1e
  193. || ADD $H,$K,$T1 ; T1 = h + K256[i]
  194. || ADD $s1,$X0,$X0 ; X[i] += sigma1(X[i+14])
  195. XOR $t0a,$S0,$S0
  196. || XOR $t0e,$S1,$S1
  197. || ADD $X0,$T1,$T1 ; T1 += X[i]
  198. || STW $X0,*$Xib++
  199. XOR $t1a,$S0,$S0 ; Sigma0(a)
  200. || XOR $t1e,$S1,$S1 ; Sigma1(e)
  201. || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
  202. || MV $X0,$X15
  203. || ROTL $G,0,$H ; h = g
  204. || LDW *$K256++,$K ; pre-fetch K256[i+1]
  205. ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
  206. || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
  207. || MV $F,$G ; g = f
  208. || MV $Xn,$X0 ; modulo-scheduled
  209. || LDW *++$Xia,$X9 ; modulo-scheduled
  210. || ROTL $X1,25,$t0e ; module-scheduled
  211. || ROTL $X14,15,$t0a ; modulo-scheduled
  212. ROTL $X1,14,$t1e ; modulo-scheduled
  213. || ROTL $X14,13,$t1a ; modulo-scheduled
  214. || MV $E,$F ; f = e
  215. || ADD $D,$T1,$E ; e = d + T1
  216. || MV $C,$D ; d = c
  217. || MV $B,$C ; c = b
  218. MV $A,$B ; b = a
  219. || ADD $T1,$T2,$A ; a = T1 + T2
  220. || SHRU $X1,3,$s0 ; modulo-scheduled
  221. || SHRU $X14,10,$s1 ; modulo-scheduled
  222. SPKERNEL
  223. [A0] B outerloop?
  224. || [A0] LDNW *$INP++,$Xn ; pre-fetch input
  225. || [A0] ADDK -260,$K256 ; rewind K256
  226. || ADD $Actx,$A,$A ; accumulate ctx
  227. || ADD $Ectx,$E,$E
  228. || ADD $Bctx,$B,$B
  229. ADD $Fctx,$F,$F
  230. || ADD $Cctx,$C,$C
  231. || ADD $Gctx,$G,$G
  232. || ADD $Dctx,$D,$D
  233. || ADD $Hctx,$H,$H
  234. || [A0] LDW *$K256++,$K ; pre-fetch K256[0]
  235. [!A0] BNOP RA
  236. ||[!A0] MV $CTXA,$CTXB
  237. [!A0] MV FP,SP ; restore stack pointer
  238. ||[!A0] LDW *FP[0],FP ; restore frame pointer
  239. [!A0] STW $A,*${CTXA}[0] ; save ctx
  240. ||[!A0] STW $E,*${CTXB}[4]
  241. ||[!A0] MVK 0,B0
  242. [!A0] STW $B,*${CTXA}[1]
  243. ||[!A0] STW $F,*${CTXB}[5]
  244. ||[!A0] MVC B0,AMR ; clear AMR
  245. STW $C,*${CTXA}[2]
  246. || STW $G,*${CTXB}[6]
  247. STW $D,*${CTXA}[3]
  248. || STW $H,*${CTXB}[7]
  249. .endasmfunc
  250. .sect ".const:sha_asm"
  251. .align 128
  252. K256:
  253. .uword 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
  254. .uword 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
  255. .uword 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
  256. .uword 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
  257. .uword 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
  258. .uword 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
  259. .uword 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
  260. .uword 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
  261. .uword 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
  262. .uword 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
  263. .uword 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
  264. .uword 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
  265. .uword 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
  266. .uword 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
  267. .uword 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
  268. .uword 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
  269. .cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
  270. .align 4
  271. ___
  272. print $code;