sha256-c64xplus.pl

#! /usr/bin/env perl
# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256 for C64x+.
#
# January 2012
#
# Performance is just below 10 cycles per processed byte, which is
# almost 40% faster than compiler-generated code. Unrolling is unlikely
# to give more than ~8% improvement...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, zero it upon entry.
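#
# For reference (an assumption based on OpenSSL's usual block-transform
# interface, not stated in this file): the routine emitted below is
# called as
#
#	void sha256_block_data_order(SHA256_CTX *ctx, const void *inp,
#	                             size_t num);
#
# with ctx/inp/num arriving in A4/B4/A6 per the C6000 calling
# convention (see the argument assignment below).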
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

($CTXA,$INP,$NUM) = ("A4","B4","A6");	# arguments
 $K256="A3";

($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
	=map("A$_",(16..31));
($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
	=map("B$_",(16..31));

($Xia,$Xib)=("A5","B5");		# circular/ring buffer
 $CTXB=$t2e;

($Xn,$X0,$K)=("B7","B8","B9");
($Maj,$Ch)=($T2,"B6");
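# Note the deliberate split: a/b/c/d and their per-block copies live in
# the A register file, e/f/g/h in the B file, so the two C64x+ datapaths
# each update half of the hash state in parallel. $CTXB aliases a B-side
# temporary so the context can be loaded/stored through both sides.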
$code.=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.nocmp
	.asg	sha256_block_data_order,_sha256_block_data_order
	.endif

	.asg	B3,RA
	.asg	A15,FP
	.asg	B15,SP

	.if	.BIG_ENDIAN
	.asg	SWAP2,MV
	.asg	SWAP4,MV
	.endif

	.global	_sha256_block_data_order
_sha256_block_data_order:
__sha256_block:
	.asmfunc stack_usage(64)
        MV      $NUM,A0              ; reassign $NUM
||      MVK     -64,B0
  [!A0] BNOP    RA                   ; if ($NUM==0) return;
|| [A0] STW     FP,*SP--[16]         ; save frame pointer and alloca(64)
|| [A0] MV      SP,FP
   [A0] ADDKPC  __sha256_block,B2
|| [A0] AND     B0,SP,SP             ; align stack at 64 bytes
	.if	__TI_EABI__
   [A0] MVK     0x00404,B1
|| [A0] MVKL    \$PCR_OFFSET(K256,__sha256_block),$K256
   [A0] MVKH    0x50000,B1
|| [A0] MVKH    \$PCR_OFFSET(K256,__sha256_block),$K256
	.else
   [A0] MVK     0x00404,B1
|| [A0] MVKL    (K256-__sha256_block),$K256
   [A0] MVKH    0x50000,B1
|| [A0] MVKH    (K256-__sha256_block),$K256
	.endif
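; B1 assembles to 0x00050404: the A5 and B5 addressing-mode fields are
; set to 01 (circular, using BK0) and BK0 is set to 5, i.e. a
; 2^(5+1) = 64-byte (16-word) circular block -- the X[] ring buffer.
; This is why the stack is aligned at 64 bytes above.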
   [A0] MVC     B1,AMR               ; setup circular addressing
|| [A0] MV      SP,$Xia
   [A0] MV      SP,$Xib
|| [A0] ADD     B2,$K256,$K256
|| [A0] MV      $CTXA,$CTXB
|| [A0] SUBAW   SP,2,SP              ; reserve two words above buffer
        LDW     *${CTXA}[0],$A       ; load ctx
||      LDW     *${CTXB}[4],$E
        LDW     *${CTXA}[1],$B
||      LDW     *${CTXB}[5],$F
        LDW     *${CTXA}[2],$C
||      LDW     *${CTXB}[6],$G
        LDW     *${CTXA}[3],$D
||      LDW     *${CTXB}[7],$H

        LDNW    *$INP++,$Xn          ; pre-fetch input
        LDW     *$K256++,$K          ; pre-fetch K256[0]
        MVK     14,B0                ; loop counters
        MVK     47,B1
||      ADDAW   $Xia,9,$Xia
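; $Xib stores X[i] as the rounds produce it; $Xia is advanced 9 words
; into the 16-word ring, so *$Xia reads back X[i+9] -- the W[i-7] term
; of the SHA-256 message-schedule recurrence.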
outerloop?:
        SUB     A0,1,A0
||      MV      $A,$Actx
||      MV      $E,$Ectx
||      MVD     $B,$Bctx
||      MVD     $F,$Fctx
        MV      $C,$Cctx
||      MV      $G,$Gctx
||      MVD     $D,$Dctx
||      MVD     $H,$Hctx
||      SWAP4   $Xn,$X0
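; BODY_00_14: a software-pipelined SPLOOP, 8 cycles per round, with
; ILC loaded from B0=14 to cover rounds 0 through 14. Round 15 is
; peeled out below.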
        SPLOOPD 8                    ; BODY_00_14
||      MVC     B0,ILC
||      SWAP2   $X0,$X0

        LDNW    *$INP++,$Xn
||      ROTL    $A,30,$S0
||      OR      $A,$B,$Maj
||      AND     $A,$B,$t2a
||      ROTL    $E,26,$S1
||      AND     $F,$E,$Ch
||      ANDN    $G,$E,$t2e
        ROTL    $A,19,$t0a
||      AND     $C,$Maj,$Maj
||      ROTL    $E,21,$t0e
||      XOR     $t2e,$Ch,$Ch         ; Ch(e,f,g) = (e&f)^(~e&g)
        ROTL    $A,10,$t1a
||      OR      $t2a,$Maj,$Maj       ; Maj(a,b,c) = ((a|b)&c)|(a&b)
||      ROTL    $E,7,$t1e
||      ADD     $K,$H,$T1            ; T1 = h + K256[i]
        ADD     $X0,$T1,$T1          ; T1 += X[i];
||      STW     $X0,*$Xib++
||      XOR     $t0a,$S0,$S0
||      XOR     $t0e,$S1,$S1
        XOR     $t1a,$S0,$S0         ; Sigma0(a)
||      XOR     $t1e,$S1,$S1         ; Sigma1(e)
||      LDW     *$K256++,$K          ; pre-fetch K256[i+1]
||      ADD     $Ch,$T1,$T1          ; T1 += Ch(e,f,g)
        ADD     $S1,$T1,$T1          ; T1 += Sigma1(e)
||      ADD     $S0,$Maj,$T2         ; T2 = Sigma0(a) + Maj(a,b,c)
||      ROTL    $G,0,$H              ; h = g
||      MV      $F,$G                ; g = f
||      MV      $X0,$X14
||      SWAP4   $Xn,$X0
        SWAP2   $X0,$X0
||      MV      $E,$F                ; f = e
||      ADD     $D,$T1,$E            ; e = d + T1
||      MV      $C,$D                ; d = c
        MV      $B,$C                ; c = b
||      MV      $A,$B                ; b = a
||      ADD     $T1,$T2,$A           ; a = T1 + T2
        SPKERNEL
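; BODY_15: the same round, peeled out of the loop so that the first
; modulo-scheduled operations (ring-buffer loads of X[i+1] and X[i+9],
; sigma0/sigma1 rotates and shifts) can start ahead of BODY_16_63.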
        ROTL    $A,30,$S0            ; BODY_15
||      OR      $A,$B,$Maj
||      AND     $A,$B,$t2a
||      ROTL    $E,26,$S1
||      AND     $F,$E,$Ch
||      ANDN    $G,$E,$t2e
||      LDW     *${Xib}[1],$Xn       ; modulo-scheduled
        ROTL    $A,19,$t0a
||      AND     $C,$Maj,$Maj
||      ROTL    $E,21,$t0e
||      XOR     $t2e,$Ch,$Ch         ; Ch(e,f,g) = (e&f)^(~e&g)
||      LDW     *${Xib}[2],$X1       ; modulo-scheduled
        ROTL    $A,10,$t1a
||      OR      $t2a,$Maj,$Maj       ; Maj(a,b,c) = ((a|b)&c)|(a&b)
||      ROTL    $E,7,$t1e
||      ADD     $K,$H,$T1            ; T1 = h + K256[i]
        ADD     $X0,$T1,$T1          ; T1 += X[i];
||      STW     $X0,*$Xib++
||      XOR     $t0a,$S0,$S0
||      XOR     $t0e,$S1,$S1
        XOR     $t1a,$S0,$S0         ; Sigma0(a)
||      XOR     $t1e,$S1,$S1         ; Sigma1(e)
||      LDW     *$K256++,$K          ; pre-fetch K256[i+1]
||      ADD     $Ch,$T1,$T1          ; T1 += Ch(e,f,g)
        ADD     $S1,$T1,$T1          ; T1 += Sigma1(e)
||      ADD     $S0,$Maj,$T2         ; T2 = Sigma0(a) + Maj(a,b,c)
||      ROTL    $G,0,$H              ; h = g
||      MV      $F,$G                ; g = f
||      MV      $X0,$X15
        MV      $E,$F                ; f = e
||      ADD     $D,$T1,$E            ; e = d + T1
||      MV      $C,$D                ; d = c
||      MV      $Xn,$X0              ; modulo-scheduled
||      LDW     *$Xia,$X9            ; modulo-scheduled
||      ROTL    $X1,25,$t0e          ; modulo-scheduled
||      ROTL    $X14,15,$t0a         ; modulo-scheduled
        SHRU    $X1,3,$s0            ; modulo-scheduled
||      SHRU    $X14,10,$s1          ; modulo-scheduled
||      ROTL    $B,0,$C              ; c = b
||      MV      $A,$B                ; b = a
||      ADD     $T1,$T2,$A           ; a = T1 + T2
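; BODY_16_63: 48 software-pipelined rounds (ILC = B1 = 47), 10 cycles
; each. The message schedule is computed in place in the ring buffer:
; X[i] += sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14]), i.e. the standard
; W[i] = W[i-16] + sigma0(W[i-15]) + W[i-7] + sigma1(W[i-2]) expressed
; relative to the 16-word ring.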
        SPLOOPD 10                   ; BODY_16_63
||      MVC     B1,ILC
||      ROTL    $X1,14,$t1e          ; modulo-scheduled
||      ROTL    $X14,13,$t1a         ; modulo-scheduled

        XOR     $t0e,$s0,$s0
||      XOR     $t0a,$s1,$s1
||      MV      $X15,$X14
||      MV      $X1,$Xn
        XOR     $t1e,$s0,$s0         ; sigma0(X[i+1])
||      XOR     $t1a,$s1,$s1         ; sigma1(X[i+14])
||      LDW     *${Xib}[2],$X1       ; modulo-scheduled
        ROTL    $A,30,$S0
||      OR      $A,$B,$Maj
||      AND     $A,$B,$t2a
||      ROTL    $E,26,$S1
||      AND     $F,$E,$Ch
||      ANDN    $G,$E,$t2e
||      ADD     $X9,$X0,$X0          ; X[i] += X[i+9]
        ROTL    $A,19,$t0a
||      AND     $C,$Maj,$Maj
||      ROTL    $E,21,$t0e
||      XOR     $t2e,$Ch,$Ch         ; Ch(e,f,g) = (e&f)^(~e&g)
||      ADD     $s0,$X0,$X0          ; X[i] += sigma0(X[i+1])
        ROTL    $A,10,$t1a
||      OR      $t2a,$Maj,$Maj       ; Maj(a,b,c) = ((a|b)&c)|(a&b)
||      ROTL    $E,7,$t1e
||      ADD     $H,$K,$T1            ; T1 = h + K256[i]
||      ADD     $s1,$X0,$X0          ; X[i] += sigma1(X[i+14])
        XOR     $t0a,$S0,$S0
||      XOR     $t0e,$S1,$S1
||      ADD     $X0,$T1,$T1          ; T1 += X[i]
||      STW     $X0,*$Xib++
        XOR     $t1a,$S0,$S0         ; Sigma0(a)
||      XOR     $t1e,$S1,$S1         ; Sigma1(e)
||      ADD     $Ch,$T1,$T1          ; T1 += Ch(e,f,g)
||      MV      $X0,$X15
||      ROTL    $G,0,$H              ; h = g
||      LDW     *$K256++,$K          ; pre-fetch K256[i+1]
        ADD     $S1,$T1,$T1          ; T1 += Sigma1(e)
||      ADD     $S0,$Maj,$T2         ; T2 = Sigma0(a) + Maj(a,b,c)
||      MV      $F,$G                ; g = f
||      MV      $Xn,$X0              ; modulo-scheduled
||      LDW     *++$Xia,$X9          ; modulo-scheduled
||      ROTL    $X1,25,$t0e          ; modulo-scheduled
||      ROTL    $X14,15,$t0a         ; modulo-scheduled
        ROTL    $X1,14,$t1e          ; modulo-scheduled
||      ROTL    $X14,13,$t1a         ; modulo-scheduled
||      MV      $E,$F                ; f = e
||      ADD     $D,$T1,$E            ; e = d + T1
||      MV      $C,$D                ; d = c
||      MV      $B,$C                ; c = b
        MV      $A,$B                ; b = a
||      ADD     $T1,$T2,$A           ; a = T1 + T2
||      SHRU    $X1,3,$s0            ; modulo-scheduled
||      SHRU    $X14,10,$s1          ; modulo-scheduled
        SPKERNEL
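; Outer-loop epilogue: accumulate the working variables into the
; context and rewind $K256 by 260 bytes = 65 words, i.e. the initial
; K256[0] pre-fetch plus 64 per-round pre-fetches (the last of which
; reads one word past the table and is discarded).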
   [A0] B       outerloop?
|| [A0] LDNW    *$INP++,$Xn          ; pre-fetch input
|| [A0] ADDK    -260,$K256           ; rewind K256
||      ADD     $Actx,$A,$A          ; accumulate ctx
||      ADD     $Ectx,$E,$E
||      ADD     $Bctx,$B,$B
        ADD     $Fctx,$F,$F
||      ADD     $Cctx,$C,$C
||      ADD     $Gctx,$G,$G
||      ADD     $Dctx,$D,$D
||      ADD     $Hctx,$H,$H
|| [A0] LDW     *$K256++,$K          ; pre-fetch K256[0]

  [!A0] BNOP    RA
||[!A0] MV      $CTXA,$CTXB
  [!A0] MV      FP,SP                ; restore stack pointer
||[!A0] LDW     *FP[0],FP            ; restore frame pointer
  [!A0] STW     $A,*${CTXA}[0]       ; save ctx
||[!A0] STW     $E,*${CTXB}[4]
||[!A0] MVK     0,B0
  [!A0] STW     $B,*${CTXA}[1]
||[!A0] STW     $F,*${CTXB}[5]
||[!A0] MVC     B0,AMR               ; clear AMR
        STW     $C,*${CTXA}[2]
||      STW     $G,*${CTXB}[6]
        STW     $D,*${CTXA}[3]
||      STW     $H,*${CTXB}[7]
	.endasmfunc

	.if	__TI_EABI__
	.sect	".text:sha_asm.const"
	.else
	.sect	".const:sha_asm"
	.endif
	.align	128
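; SHA-256 round constants K[0..63]: the first 32 bits of the fractional
; parts of the cube roots of the first 64 primes (FIPS 180-4).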
K256:
	.uword	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.uword	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.uword	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.uword	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.uword	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.uword	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.uword	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.uword	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.uword	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.uword	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.uword	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.uword	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.uword	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.uword	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.uword	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.uword	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
	.cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4
___

print $code;
close STDOUT;