#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256 for C64x+.
#
# January 2012
#
# Performance is just below 10 cycles per processed byte, which is
# almost 40% faster than compiler-generated code. Unroll is unlikely
# to give more than ~8% improvement...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and for own well-being
# zero it upon entry.
  27. $output = pop and open STDOUT,">$output";
  28. ($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments
  29. $K256="A3";
  30. ($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
  31. =map("A$_",(16..31));
  32. ($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
  33. =map("B$_",(16..31));
  34. ($Xia,$Xib)=("A5","B5"); # circular/ring buffer
  35. $CTXB=$t2e;
  36. ($Xn,$X0,$K)=("B7","B8","B9");
  37. ($Maj,$Ch)=($T2,"B6");
  38. $code.=<<___;
  39. .text
  40. .if .ASSEMBLER_VERSION<7000000
  41. .asg 0,__TI_EABI__
  42. .endif
  43. .if __TI_EABI__
  44. .nocmp
  45. .asg sha256_block_data_order,_sha256_block_data_order
  46. .endif
  47. .asg B3,RA
  48. .asg A15,FP
  49. .asg B15,SP
  50. .if .BIG_ENDIAN
  51. .asg SWAP2,MV
  52. .asg SWAP4,MV
  53. .endif
  54. .global _sha256_block_data_order
  55. _sha256_block_data_order:
  56. __sha256_block:
  57. .asmfunc stack_usage(64)
  58. MV $NUM,A0 ; reassign $NUM
  59. || MVK -64,B0
  60. [!A0] BNOP RA ; if ($NUM==0) return;
  61. || [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64)
  62. || [A0] MV SP,FP
  63. [A0] ADDKPC __sha256_block,B2
  64. || [A0] AND B0,SP,SP ; align stack at 64 bytes
  65. .if __TI_EABI__
  66. [A0] MVK 0x00404,B1
  67. || [A0] MVKL \$PCR_OFFSET(K256,__sha256_block),$K256
  68. [A0] MVKH 0x50000,B1
  69. || [A0] MVKH \$PCR_OFFSET(K256,__sha256_block),$K256
  70. .else
  71. [A0] MVK 0x00404,B1
  72. || [A0] MVKL (K256-__sha256_block),$K256
  73. [A0] MVKH 0x50000,B1
  74. || [A0] MVKH (K256-__sha256_block),$K256
  75. .endif
  76. [A0] MVC B1,AMR ; setup circular addressing
  77. || [A0] MV SP,$Xia
  78. [A0] MV SP,$Xib
  79. || [A0] ADD B2,$K256,$K256
  80. || [A0] MV $CTXA,$CTXB
  81. || [A0] SUBAW SP,2,SP ; reserve two words above buffer
  82. LDW *${CTXA}[0],$A ; load ctx
  83. || LDW *${CTXB}[4],$E
  84. LDW *${CTXA}[1],$B
  85. || LDW *${CTXB}[5],$F
  86. LDW *${CTXA}[2],$C
  87. || LDW *${CTXB}[6],$G
  88. LDW *${CTXA}[3],$D
  89. || LDW *${CTXB}[7],$H
  90. LDNW *$INP++,$Xn ; pre-fetch input
  91. LDW *$K256++,$K ; pre-fetch K256[0]
  92. MVK 14,B0 ; loop counters
  93. MVK 47,B1
  94. || ADDAW $Xia,9,$Xia
  95. outerloop?:
  96. SUB A0,1,A0
  97. || MV $A,$Actx
  98. || MV $E,$Ectx
  99. || MVD $B,$Bctx
  100. || MVD $F,$Fctx
  101. MV $C,$Cctx
  102. || MV $G,$Gctx
  103. || MVD $D,$Dctx
  104. || MVD $H,$Hctx
  105. || SWAP4 $Xn,$X0
  106. SPLOOPD 8 ; BODY_00_14
  107. || MVC B0,ILC
  108. || SWAP2 $X0,$X0
  109. LDNW *$INP++,$Xn
  110. || ROTL $A,30,$S0
  111. || OR $A,$B,$Maj
  112. || AND $A,$B,$t2a
  113. || ROTL $E,26,$S1
  114. || AND $F,$E,$Ch
  115. || ANDN $G,$E,$t2e
  116. ROTL $A,19,$t0a
  117. || AND $C,$Maj,$Maj
  118. || ROTL $E,21,$t0e
  119. || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
  120. ROTL $A,10,$t1a
  121. || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
  122. || ROTL $E,7,$t1e
  123. || ADD $K,$H,$T1 ; T1 = h + K256[i]
  124. ADD $X0,$T1,$T1 ; T1 += X[i];
  125. || STW $X0,*$Xib++
  126. || XOR $t0a,$S0,$S0
  127. || XOR $t0e,$S1,$S1
  128. XOR $t1a,$S0,$S0 ; Sigma0(a)
  129. || XOR $t1e,$S1,$S1 ; Sigma1(e)
  130. || LDW *$K256++,$K ; pre-fetch K256[i+1]
  131. || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
  132. ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
  133. || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
  134. || ROTL $G,0,$H ; h = g
  135. || MV $F,$G ; g = f
  136. || MV $X0,$X14
  137. || SWAP4 $Xn,$X0
  138. SWAP2 $X0,$X0
  139. || MV $E,$F ; f = e
  140. || ADD $D,$T1,$E ; e = d + T1
  141. || MV $C,$D ; d = c
  142. MV $B,$C ; c = b
  143. || MV $A,$B ; b = a
  144. || ADD $T1,$T2,$A ; a = T1 + T2
  145. SPKERNEL
  146. ROTL $A,30,$S0 ; BODY_15
  147. || OR $A,$B,$Maj
  148. || AND $A,$B,$t2a
  149. || ROTL $E,26,$S1
  150. || AND $F,$E,$Ch
  151. || ANDN $G,$E,$t2e
  152. || LDW *${Xib}[1],$Xn ; modulo-scheduled
  153. ROTL $A,19,$t0a
  154. || AND $C,$Maj,$Maj
  155. || ROTL $E,21,$t0e
  156. || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
  157. || LDW *${Xib}[2],$X1 ; modulo-scheduled
  158. ROTL $A,10,$t1a
  159. || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
  160. || ROTL $E,7,$t1e
  161. || ADD $K,$H,$T1 ; T1 = h + K256[i]
  162. ADD $X0,$T1,$T1 ; T1 += X[i];
  163. || STW $X0,*$Xib++
  164. || XOR $t0a,$S0,$S0
  165. || XOR $t0e,$S1,$S1
  166. XOR $t1a,$S0,$S0 ; Sigma0(a)
  167. || XOR $t1e,$S1,$S1 ; Sigma1(e)
  168. || LDW *$K256++,$K ; pre-fetch K256[i+1]
  169. || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
  170. ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
  171. || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
  172. || ROTL $G,0,$H ; h = g
  173. || MV $F,$G ; g = f
  174. || MV $X0,$X15
  175. MV $E,$F ; f = e
  176. || ADD $D,$T1,$E ; e = d + T1
  177. || MV $C,$D ; d = c
  178. || MV $Xn,$X0 ; modulo-scheduled
  179. || LDW *$Xia,$X9 ; modulo-scheduled
  180. || ROTL $X1,25,$t0e ; modulo-scheduled
  181. || ROTL $X14,15,$t0a ; modulo-scheduled
  182. SHRU $X1,3,$s0 ; modulo-scheduled
  183. || SHRU $X14,10,$s1 ; modulo-scheduled
  184. || ROTL $B,0,$C ; c = b
  185. || MV $A,$B ; b = a
  186. || ADD $T1,$T2,$A ; a = T1 + T2
  187. SPLOOPD 10 ; BODY_16_63
  188. || MVC B1,ILC
  189. || ROTL $X1,14,$t1e ; modulo-scheduled
  190. || ROTL $X14,13,$t1a ; modulo-scheduled
  191. XOR $t0e,$s0,$s0
  192. || XOR $t0a,$s1,$s1
  193. || MV $X15,$X14
  194. || MV $X1,$Xn
  195. XOR $t1e,$s0,$s0 ; sigma0(X[i+1])
  196. || XOR $t1a,$s1,$s1 ; sigma1(X[i+14])
  197. || LDW *${Xib}[2],$X1 ; module-scheduled
  198. ROTL $A,30,$S0
  199. || OR $A,$B,$Maj
  200. || AND $A,$B,$t2a
  201. || ROTL $E,26,$S1
  202. || AND $F,$E,$Ch
  203. || ANDN $G,$E,$t2e
  204. || ADD $X9,$X0,$X0 ; X[i] += X[i+9]
  205. ROTL $A,19,$t0a
  206. || AND $C,$Maj,$Maj
  207. || ROTL $E,21,$t0e
  208. || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
  209. || ADD $s0,$X0,$X0 ; X[i] += sigma1(X[i+1])
  210. ROTL $A,10,$t1a
  211. || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
  212. || ROTL $E,7,$t1e
  213. || ADD $H,$K,$T1 ; T1 = h + K256[i]
  214. || ADD $s1,$X0,$X0 ; X[i] += sigma1(X[i+14])
  215. XOR $t0a,$S0,$S0
  216. || XOR $t0e,$S1,$S1
  217. || ADD $X0,$T1,$T1 ; T1 += X[i]
  218. || STW $X0,*$Xib++
  219. XOR $t1a,$S0,$S0 ; Sigma0(a)
  220. || XOR $t1e,$S1,$S1 ; Sigma1(e)
  221. || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
  222. || MV $X0,$X15
  223. || ROTL $G,0,$H ; h = g
  224. || LDW *$K256++,$K ; pre-fetch K256[i+1]
  225. ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
  226. || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
  227. || MV $F,$G ; g = f
  228. || MV $Xn,$X0 ; modulo-scheduled
  229. || LDW *++$Xia,$X9 ; modulo-scheduled
  230. || ROTL $X1,25,$t0e ; module-scheduled
  231. || ROTL $X14,15,$t0a ; modulo-scheduled
  232. ROTL $X1,14,$t1e ; modulo-scheduled
  233. || ROTL $X14,13,$t1a ; modulo-scheduled
  234. || MV $E,$F ; f = e
  235. || ADD $D,$T1,$E ; e = d + T1
  236. || MV $C,$D ; d = c
  237. || MV $B,$C ; c = b
  238. MV $A,$B ; b = a
  239. || ADD $T1,$T2,$A ; a = T1 + T2
  240. || SHRU $X1,3,$s0 ; modulo-scheduled
  241. || SHRU $X14,10,$s1 ; modulo-scheduled
  242. SPKERNEL
  243. [A0] B outerloop?
  244. || [A0] LDNW *$INP++,$Xn ; pre-fetch input
  245. || [A0] ADDK -260,$K256 ; rewind K256
  246. || ADD $Actx,$A,$A ; accumulate ctx
  247. || ADD $Ectx,$E,$E
  248. || ADD $Bctx,$B,$B
  249. ADD $Fctx,$F,$F
  250. || ADD $Cctx,$C,$C
  251. || ADD $Gctx,$G,$G
  252. || ADD $Dctx,$D,$D
  253. || ADD $Hctx,$H,$H
  254. || [A0] LDW *$K256++,$K ; pre-fetch K256[0]
  255. [!A0] BNOP RA
  256. ||[!A0] MV $CTXA,$CTXB
  257. [!A0] MV FP,SP ; restore stack pointer
  258. ||[!A0] LDW *FP[0],FP ; restore frame pointer
  259. [!A0] STW $A,*${CTXA}[0] ; save ctx
  260. ||[!A0] STW $E,*${CTXB}[4]
  261. ||[!A0] MVK 0,B0
  262. [!A0] STW $B,*${CTXA}[1]
  263. ||[!A0] STW $F,*${CTXB}[5]
  264. ||[!A0] MVC B0,AMR ; clear AMR
  265. STW $C,*${CTXA}[2]
  266. || STW $G,*${CTXB}[6]
  267. STW $D,*${CTXA}[3]
  268. || STW $H,*${CTXB}[7]
  269. .endasmfunc
  270. .if __TI_EABI__
  271. .sect ".text:sha_asm.const"
  272. .else
  273. .sect ".const:sha_asm"
  274. .endif
  275. .align 128
  276. K256:
  277. .uword 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
  278. .uword 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
  279. .uword 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
  280. .uword 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
  281. .uword 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
  282. .uword 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
  283. .uword 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
  284. .uword 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
  285. .uword 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
  286. .uword 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
  287. .uword 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
  288. .uword 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
  289. .uword 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
  290. .uword 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
  291. .uword 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
  292. .uword 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
  293. .cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
  294. .align 4
  295. ___
  296. print $code;
  297. close STDOUT or die "error closing STDOUT: $!";