sha256-c64x.pl 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313
  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  5. # project. The module is, however, dual licensed under OpenSSL and
  6. # CRYPTOGAMS licenses depending on where you obtain it. For further
  7. # details see http://www.openssl.org/~appro/cryptogams/.
  8. # ====================================================================
  9. #
  10. # SHA256 for C64x.
  11. #
  12. # November 2016
  13. #
  14. # Performance is just below 10 cycles per processed byte, which is
  15. # almost 40% faster than compiler-generated code. Unroll is unlikely
  16. # to give more than ~8% improvement...
  17. #
  18. # !!! Note that this module uses AMR, which means that all interrupt
  19. # service routines are expected to preserve it and, for their own
  20. # well-being, zero it upon entry.
  21. while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
  22. open STDOUT,">$output";
  23. ($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments
  24. $K256="A3";
  25. ($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
  26. =map("A$_",(16..31));
  27. ($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
  28. =map("B$_",(16..31));
  29. ($Xia,$Xib)=("A5","B5"); # circular/ring buffer
  30. $CTXB=$t2e;
  31. ($Xn,$X0,$K)=("B7","B8","B9");
  32. ($Maj,$Ch)=($T2,"B6");
  33. $code.=<<___;
  ; Assembler preamble: code section, ABI-dependent symbol naming and the
  ; register aliases (RA, FP, SP) used by the routine that follows.
  34. .text
  ; Assemblers reporting a version below 7000000 get __TI_EABI__ set to 0
  ; here, so the .if __TI_EABI__ tests in this file can be evaluated.
  35. .if .ASSEMBLER_VERSION<7000000
  36. .asg 0,__TI_EABI__
  37. .endif
  ; EABI build: issue .nocmp and substitute the underscore-prefixed entry
  ; name used below with the unprefixed sha256_block_data_order.
  38. .if __TI_EABI__
  39. .nocmp
  40. .asg sha256_block_data_order,_sha256_block_data_order
  41. .endif
  ; Symbolic names for the return-address, frame-pointer and stack-pointer
  ; registers.
  42. .asg B3,RA
  43. .asg A15,FP
  44. .asg B15,SP
  ; Big-endian build: SWAP2 and SWAP4 are replaced by MV, so the swaps
  ; applied to input words in the routine become plain register moves.
  45. .if .BIG_ENDIAN
  46. .asg SWAP2,MV
  47. .asg SWAP4,MV
  48. .endif
  49. .global _sha256_block_data_order
  ; ---------------------------------------------------------------------
  ; _sha256_block_data_order: SHA-256 compression over consecutive 64-byte
  ; input blocks.
  ;   $CTXA  pointer to the eight 32-bit state words (loaded, then updated)
  ;   $INP   input pointer, read one word at a time with LDNW
  ;   $NUM   number of blocks; copied to A0, zero means return at once
  ; A 16-word buffer for the message schedule is carved out of the stack
  ; and addressed through $Xia/$Xib after AMR has been programmed for
  ; circular addressing; AMR is cleared again on the way out.
  ; Many instructions are hoisted ahead of the round that consumes their
  ; result (marked "modulo-scheduled"), so statement order is significant.
  ; ---------------------------------------------------------------------
  50. _sha256_block_data_order:
  51. __sha256_block:
  52. .asmfunc stack_usage(64)
  ; Prologue. From the BNOP on, every instruction up to the context loads
  ; is predicated on A0, so none of the setup executes for a zero count.
  53. MV $NUM,A0 ; reassign $NUM
  54. || MVK -64,B0
  55. [!A0] BNOP RA ; if ($NUM==0) return;
  56. || [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64)
  57. || [A0] MV SP,FP
  58. [A0] ADDKPC _sha256_block_data_order,B2
  59. || [A0] AND B0,SP,SP ; align stack at 64 bytes
  ; Both assembler variants build the same constant 0x00050404 in B1 (low
  ; half via MVK, high half via MVKH) and the offset of K256 from
  ; __sha256_block in $K256; only the way the offset is spelled differs.
  ; NOTE(review): 0x00050404 presumably selects circular mode for $Xia and
  ; $Xib with a 64-byte block - confirm against the AMR field layout.
  60. .if __TI_EABI__
  61. [A0] MVK 0x00404,B1
  62. || [A0] MVKL \$PCR_OFFSET(K256,__sha256_block),$K256
  63. [A0] MVKH 0x50000,B1
  64. || [A0] MVKH \$PCR_OFFSET(K256,__sha256_block),$K256
  65. .else
  66. [A0] MVK 0x00404,B1
  67. || [A0] MVKL (K256-__sha256_block),$K256
  68. [A0] MVKH 0x50000,B1
  69. || [A0] MVKH (K256-__sha256_block),$K256
  70. .endif
  ; $Xia and $Xib both start at the aligned buffer base; $K256 becomes the
  ; table address (base from ADDKPC in B2 plus the offset); $CTXB is a
  ; B-side copy of the context pointer.
  71. [A0] MVC B1,AMR ; setup circular addressing
  72. || [A0] MV SP,$Xia
  73. [A0] MV SP,$Xib
  74. || [A0] ADD B2,$K256,$K256
  75. || [A0] MV $CTXA,$CTXB
  76. || [A0] SUBAW SP,2,SP ; reserve two words above buffer
  ; State words 0-3 are loaded through $CTXA, words 4-7 through $CTXB.
  77. LDW *${CTXA}[0],$A ; load ctx
  78. || LDW *${CTXB}[4],$E
  79. LDW *${CTXA}[1],$B
  80. || LDW *${CTXB}[5],$F
  81. LDW *${CTXA}[2],$C
  82. || LDW *${CTXB}[6],$G
  83. LDW *${CTXA}[3],$D
  84. || LDW *${CTXB}[7],$H
  85. LDNW *$INP++,$Xn ; pre-fetch input
  86. LDW *$K256++,$K ; pre-fetch K256[0]
  87. NOP
  ; $Xia is advanced to buffer slot 9; it later supplies X[i+9].
  88. ADDAW $Xia,9,$Xia
  ; Per block: count the block down, keep a copy of the incoming state in
  ; the *ctx registers for the final accumulation, and convert the first
  ; input word with SWAP4 followed by SWAP2.
  ; NOTE(review): SWAP4+SWAP2 is presumably a full byte reversal of the
  ; word on little-endian builds - confirm against the instruction set.
  89. outerloop?:
  90. SUB A0,1,A0
  91. || MV $A,$Actx
  92. || MV $E,$Ectx
  93. || MVD $B,$Bctx
  94. || MVD $F,$Fctx
  95. MV $C,$Cctx
  96. || MV $G,$Gctx
  97. || MVD $D,$Dctx
  98. || MVD $H,$Hctx
  99. || SWAP4 $Xn,$X0
  100. MVK 14,B0 ; loop counter
  101. || SWAP2 $X0,$X0
  ; Rounds 0..14. Each pass uses the already converted word in $X0, stores
  ; it to the buffer through $Xib, and fetches and converts the next input
  ; word. $X14 trails $X0, so it holds X[14] once the loop is left.
  102. loop_00_14?: ; BODY_00_14
  103. LDNW *$INP++,$Xn
  104. || ROTL $A,30,$S0
  105. || OR $A,$B,$Maj
  106. || AND $A,$B,$t2a
  107. || ROTL $E,26,$S1
  108. || AND $F,$E,$Ch
  109. || ANDN $G,$E,$t2e
  110. ROTL $A,19,$t0a
  111. || AND $C,$Maj,$Maj
  112. || ROTL $E,21,$t0e
  113. || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
  114. ROTL $A,10,$t1a
  115. || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
  116. || ROTL $E,7,$t1e
  117. || ADD $K,$H,$T1 ; T1 = h + K256[i]
  118. || [B0] BDEC loop_00_14?,B0
  119. ADD $X0,$T1,$T1 ; T1 += X[i];
  120. || STW $X0,*$Xib++
  121. || XOR $t0a,$S0,$S0
  122. || XOR $t0e,$S1,$S1
  123. XOR $t1a,$S0,$S0 ; Sigma0(a)
  124. || XOR $t1e,$S1,$S1 ; Sigma1(e)
  125. || LDW *$K256++,$K ; pre-fetch K256[i+1]
  126. || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
  127. ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
  128. || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
  129. || ROTL $G,0,$H ; h = g
  130. || MV $F,$G ; g = f
  131. || MV $X0,$X14
  132. || SWAP4 $Xn,$X0
  133. SWAP2 $X0,$X0
  134. || MV $E,$F ; f = e
  135. || ADD $D,$T1,$E ; e = d + T1
  136. || MV $C,$D ; d = c
  137. MV $B,$C ; c = b
  138. || MV $A,$B ; b = a
  139. || ADD $T1,$T2,$A ; a = T1 + T2
  140. ;;===== branch to loop_00_14? is taken here
  ; Round 15, which also starts the message-schedule pipeline: the words
  ; at offsets 1 and 2 from $Xib are read back, X[9] is loaded through
  ; $Xia, and the sigma inputs for round 16 are prepared. $X15 keeps X[15].
  ; NOTE(review): $Xib has been advanced 15 times here, so offsets 1 and 2
  ; reach X[0] and X[1] only if the buffer wraps after 16 words - confirm.
  141. ROTL $A,30,$S0 ; BODY_15
  142. || OR $A,$B,$Maj
  143. || AND $A,$B,$t2a
  144. || ROTL $E,26,$S1
  145. || AND $F,$E,$Ch
  146. || ANDN $G,$E,$t2e
  147. || LDW *${Xib}[1],$Xn ; modulo-scheduled
  148. ROTL $A,19,$t0a
  149. || AND $C,$Maj,$Maj
  150. || ROTL $E,21,$t0e
  151. || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
  152. || LDW *${Xib}[2],$X1 ; modulo-scheduled
  153. ROTL $A,10,$t1a
  154. || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
  155. || ROTL $E,7,$t1e
  156. || ADD $K,$H,$T1 ; T1 = h + K256[i]
  157. ADD $X0,$T1,$T1 ; T1 += X[i];
  158. || STW $X0,*$Xib++
  159. || XOR $t0a,$S0,$S0
  160. || XOR $t0e,$S1,$S1
  161. XOR $t1a,$S0,$S0 ; Sigma0(a)
  162. || XOR $t1e,$S1,$S1 ; Sigma1(e)
  163. || LDW *$K256++,$K ; pre-fetch K256[i+1]
  164. || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
  165. ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
  166. || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
  167. || ROTL $G,0,$H ; h = g
  168. || MV $F,$G ; g = f
  169. || MV $X0,$X15
  170. MV $E,$F ; f = e
  171. || ADD $D,$T1,$E ; e = d + T1
  172. || MV $C,$D ; d = c
  173. || MV $Xn,$X0 ; modulo-scheduled
  174. || LDW *$Xia,$X9 ; modulo-scheduled
  175. || ROTL $X1,25,$t0e ; modulo-scheduled
  176. || ROTL $X14,15,$t0a ; modulo-scheduled
  177. SHRU $X1,3,$s0 ; modulo-scheduled
  178. || SHRU $X14,10,$s1 ; modulo-scheduled
  179. || ROTL $B,0,$C ; c = b
  180. || MV $A,$B ; b = a
  181. || ADD $T1,$T2,$A ; a = T1 + T2
  182. MVK 47,B1 ; loop counter
  183. || ROTL $X1,14,$t1e ; modulo-scheduled
  184. || ROTL $X14,13,$t1a ; modulo-scheduled
  ; Rounds 16..63. The schedule word is updated in place as
  ;   X[i] += X[i+9] + sigma0(X[i+1]) + sigma1(X[i+14])
  ; and written back through $Xib. The rotates and shifts at the bottom of
  ; each pass already work on the sigma inputs of the following round.
  185. loop_16_63?: ; BODY_16_63
  186. XOR $t0e,$s0,$s0
  187. || XOR $t0a,$s1,$s1
  188. || MV $X15,$X14
  189. || MV $X1,$Xn
  190. XOR $t1e,$s0,$s0 ; sigma0(X[i+1])
  191. || XOR $t1a,$s1,$s1 ; sigma1(X[i+14])
  192. || LDW *${Xib}[2],$X1 ; modulo-scheduled
  193. ROTL $A,30,$S0
  194. || OR $A,$B,$Maj
  195. || AND $A,$B,$t2a
  196. || ROTL $E,26,$S1
  197. || AND $F,$E,$Ch
  198. || ANDN $G,$E,$t2e
  199. || ADD $X9,$X0,$X0 ; X[i] += X[i+9]
  200. ROTL $A,19,$t0a
  201. || AND $C,$Maj,$Maj
  202. || ROTL $E,21,$t0e
  203. || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
  204. || ADD $s0,$X0,$X0 ; X[i] += sigma0(X[i+1])
  205. ROTL $A,10,$t1a
  206. || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
  207. || ROTL $E,7,$t1e
  208. || ADD $H,$K,$T1 ; T1 = h + K256[i]
  209. || ADD $s1,$X0,$X0 ; X[i] += sigma1(X[i+14])
  210. || [B1] BDEC loop_16_63?,B1
  211. XOR $t0a,$S0,$S0
  212. || XOR $t0e,$S1,$S1
  213. || ADD $X0,$T1,$T1 ; T1 += X[i]
  214. || STW $X0,*$Xib++
  215. XOR $t1a,$S0,$S0 ; Sigma0(a)
  216. || XOR $t1e,$S1,$S1 ; Sigma1(e)
  217. || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
  218. || MV $X0,$X15
  219. || ROTL $G,0,$H ; h = g
  220. || LDW *$K256++,$K ; pre-fetch K256[i+1]
  221. ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
  222. || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
  223. || MV $F,$G ; g = f
  224. || MV $Xn,$X0 ; modulo-scheduled
  225. || LDW *++$Xia,$X9 ; modulo-scheduled
  226. || ROTL $X1,25,$t0e ; modulo-scheduled
  227. || ROTL $X14,15,$t0a ; modulo-scheduled
  228. ROTL $X1,14,$t1e ; modulo-scheduled
  229. || ROTL $X14,13,$t1a ; modulo-scheduled
  230. || MV $E,$F ; f = e
  231. || ADD $D,$T1,$E ; e = d + T1
  232. || MV $C,$D ; d = c
  233. || MV $B,$C ; c = b
  234. MV $A,$B ; b = a
  235. || ADD $T1,$T2,$A ; a = T1 + T2
  236. || SHRU $X1,3,$s0 ; modulo-scheduled
  237. || SHRU $X14,10,$s1 ; modulo-scheduled
  238. ;;===== branch to loop_16_63? is taken here
  ; Block done: add the saved state back in. While blocks remain (A0 set)
  ; branch back to outerloop, pre-fetch the next input word and rewind
  ; $K256 by 260 bytes = 65 words: the one pre-fetch ahead of the rounds
  ; plus one pre-fetch in each of the 64 rounds.
  239. [A0] B outerloop?
  240. || [A0] LDNW *$INP++,$Xn ; pre-fetch input
  241. || [A0] ADDK -260,$K256 ; rewind K256
  242. || ADD $Actx,$A,$A ; accumulate ctx
  243. || ADD $Ectx,$E,$E
  244. || ADD $Bctx,$B,$B
  245. ADD $Fctx,$F,$F
  246. || ADD $Cctx,$C,$C
  247. || ADD $Gctx,$G,$G
  248. || ADD $Dctx,$D,$D
  249. || ADD $Hctx,$H,$H
  250. || [A0] LDW *$K256++,$K ; pre-fetch K256[0]
  ; Return path (A0 clear). $CTXB is copied from $CTXA again because its
  ; register is also used as a temporary inside the rounds. The stack and
  ; frame pointers are restored, the state is stored, AMR is zeroed.
  ; NOTE(review): the last two store packets carry no predicate; they are
  ; presumably reached only when the outerloop branch is not taken, which
  ; depends on branch delay-slot timing - confirm before reordering.
  251. [!A0] BNOP RA
  252. ||[!A0] MV $CTXA,$CTXB
  253. [!A0] MV FP,SP ; restore stack pointer
  254. ||[!A0] LDW *FP[0],FP ; restore frame pointer
  255. [!A0] STW $A,*${CTXA}[0] ; save ctx
  256. ||[!A0] STW $E,*${CTXB}[4]
  257. ||[!A0] MVK 0,B0
  258. [!A0] STW $B,*${CTXA}[1]
  259. ||[!A0] STW $F,*${CTXB}[5]
  260. ||[!A0] MVC B0,AMR ; clear AMR
  261. STW $C,*${CTXA}[2]
  262. || STW $G,*${CTXB}[6]
  263. STW $D,*${CTXA}[3]
  264. || STW $H,*${CTXB}[7]
  265. .endasmfunc
  ; K256: the 64 32-bit SHA-256 round constants, read in order through
  ; $K256 with post-increment, one per round. The routine also pre-fetches
  ; one word beyond the 64th constant without using its value, so the data
  ; that follows the table (the identification string) must stay readable.
  ; The section name depends on the ABI: a .text subsection for EABI,
  ; a .const subsection otherwise.
  266. .if __TI_EABI__
  267. .sect ".text:sha_asm.const"
  268. .else
  269. .sect ".const:sha_asm"
  270. .endif
  271. .align 128
  272. K256:
  273. .uword 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
  274. .uword 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
  275. .uword 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
  276. .uword 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
  277. .uword 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
  278. .uword 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
  279. .uword 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
  280. .uword 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
  281. .uword 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
  282. .uword 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
  283. .uword 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
  284. .uword 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
  285. .uword 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
  286. .uword 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
  287. .uword 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
  288. .uword 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
  ; Identification string placed right after the table.
  289. .cstring "SHA256 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
  290. .align 4
  291. ___
  292. print $code;