#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA1 for C64x.
#
# November 2016
#
# If compared to compiler-generated code with similar characteristics,
# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
# this implementation is 25% smaller and >2x faster. In absolute terms
# performance is (quite impressive) ~6.5 cycles per processed byte.
# Compared to its predecessor, the sha1-c64xplus module, this module
# has worse interrupt agility. While the original added up to 5 cycles
# of delay to the interrupt response, this module adds up to 100. A
# fully unrolled implementation doesn't add any delay and is even 25%
# faster, but is almost 5x larger...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, zero it upon entry.
  # Command line handling: consume arguments until one looks like an
  # output file name ("name.ext") or the argument list is exhausted.
  while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

  # Redirect STDOUT to the output file when one was given; with no file
  # name the generated assembly simply goes to the inherited STDOUT.
  # A file that cannot be created is a hard error rather than a silent
  # fall-through to the terminal.
  if ($output) {
      open(STDOUT, ">", $output) or die "can't open $output: $!";
  }

  # Register map. All names below are interpolated into the assembly
  # templates that follow.
  ($CTX,$INP,$NUM) = ("A4","B4","A6");    # arguments
  # Working variables A..E (A16-A20) and per-round temporaries (A21-A25).
  ($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
  # Words fetched from the X ring buffer for the Xupdate step.
  ($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
  # Temporaries for input byte swapping and Xupdate XORs.
  ($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
  ($XPA,$XPB) = ("A5","B5");              # X circular buffer
  # Copies of A..E taken at the top of each block; $Bctx is A6, the
  # register $NUM arrived in, hence "zaps $NUM".
  ($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9));  # zaps $NUM
  # First chunk of generated assembly: assembler-compatibility shims,
  # function entry/prologue and rounds 0..19 of the per-block loop.
  #
  # What the emitted code does, in order:
  #   - prologue: returns at once when $NUM is zero; otherwise saves FP,
  #     takes 64 bytes off the stack, aligns SP to 64 bytes, reserves two
  #     more words, loads A..E from the context and writes 0x050404 to
  #     AMR to set up circular addressing for $XPA/$XPB;
  #   - loop?: decrements the block counter in A0, loads K_00_19 and
  #     copies A..E into $Actx..$Ectx;
  #   - body_00_13?: rounds that load input words, byte-swap them
  #     (SWAP2/SWAP4 become plain MV on big-endian) and store them
  #     through $XPB;
  #   - BODY_14, BODY_15: same round function, additionally starting the
  #     X ring-buffer fetches (2 iterations ahead) and Xupdate XORs
  #     (1 iteration ahead);
  #   - body_16_19?: remaining F_00_19 rounds consuming Xupdate output;
  #   - finally K_20_39 and the next loop counter are loaded.
  #
  # Layout matters: the TI assembler takes anything starting in column 1
  # as a label, so only labels live there; instructions are indented and
  # "||" marks an instruction issued in parallel with the preceding one.
  # NOTE(review): the instruction order looks hand-scheduled around load
  # and branch delay slots - do not reorder without re-verifying timing.
$code=<<___;
        .text
        .if     .ASSEMBLER_VERSION<7000000
        .asg    0,__TI_EABI__
        .endif
        .if     __TI_EABI__
        .asg    sha1_block_data_order,_sha1_block_data_order
        .endif
        .asg    B3,RA
        .asg    A15,FP
        .asg    B15,SP
        .if     .BIG_ENDIAN
        .asg    MV,SWAP2
        .asg    MV,SWAP4
        .endif
        .global _sha1_block_data_order
_sha1_block_data_order:
        .asmfunc stack_usage(64)
        MV      $NUM,A0                 ; reassign $NUM
||      MVK     -64,B0
  [!A0] BNOP    RA                      ; if ($NUM==0) return;
|| [A0] STW     FP,*SP--[16]            ; save frame pointer and alloca(64)
|| [A0] MV      SP,FP
   [A0] LDW     *${CTX}[0],$A           ; load A-E...
|| [A0] AND     B0,SP,SP                ; align stack at 64 bytes
   [A0] LDW     *${CTX}[1],$B
|| [A0] SUBAW   SP,2,SP                 ; reserve two words above buffer
   [A0] LDW     *${CTX}[2],$C
|| [A0] MVK     0x00404,B0
   [A0] LDW     *${CTX}[3],$D
|| [A0] MVKH    0x50000,B0              ; 0x050404, 64 bytes for $XPA/$XPB
   [A0] LDW     *${CTX}[4],$E
|| [A0] MVC     B0,AMR                  ; setup circular addressing
        LDNW    *${INP}++,$TX1          ; pre-fetch input
        NOP     1
loop?:
        MVKL    0x5a827999,$K
||      ADDAW   SP,2,$XPB
||      SUB     A0,1,A0
        MVKH    0x5a827999,$K           ; K_00_19
||      MV      $A,$Actx
||      MV      $B,$Bctx
;;==================================================
        B       body_00_13?             ; BODY_00_13
||      MVK     11,B0
||      MV      $XPB,$XPA
||      MV      $C,$Cctx
||      MV      $D,$Dctx
||      MVD     $E,$Ectx
body_00_13?:
        ROTL    $A,5,$Arot
||      AND     $C,$B,$F
||      ANDN    $D,$B,$F0
||      ADD     $K,$E,$T                ; T=E+K
        XOR     $F0,$F,$F               ; F_00_19(B,C,D)
||      MV      $D,$E                   ; E=D
||      MV      $C,$D                   ; D=C
||      SWAP2   $TX1,$TX2
||      LDNW    *${INP}++,$TX1
        ADD     $F,$T,$T                ; T+=F_00_19(B,C,D)
||      ROTL    $B,30,$C                ; C=ROL(B,30)
||      SWAP4   $TX2,$TX3               ; byte swap
        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      MV      $A,$B                   ; B=A
        ADD     $TX3,$T,$A              ; A=T+Xi
||      STW     $TX3,*${XPB}++
||      BDEC    body_00_13?,B0
;;==================================================
        ROTL    $A,5,$Arot              ; BODY_14
||      AND     $C,$B,$F
||      ANDN    $D,$B,$F0
||      ADD     $K,$E,$T                ; T=E+K
        XOR     $F0,$F,$F               ; F_00_19(B,C,D)
||      MV      $D,$E                   ; E=D
||      MV      $C,$D                   ; D=C
||      SWAP2   $TX1,$TX2
||      LDNW    *${INP}++,$TX1
        ADD     $F,$T,$T                ; T+=F_00_19(B,C,D)
||      ROTL    $B,30,$C                ; C=ROL(B,30)
||      SWAP4   $TX2,$TX2               ; byte swap
||      LDW     *${XPA}++,$X0           ; fetches from X ring buffer are
||      LDW     *${XPB}[4],$X2          ; 2 iterations ahead
        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      MV      $A,$B                   ; B=A
||      LDW     *${XPA}[7],$X8
||      MV      $TX3,$X13               ; || LDW *${XPB}[15],$X13
||      MV      $TX2,$TX3
        ADD     $TX2,$T,$A              ; A=T+Xi
||      STW     $TX2,*${XPB}++
;;==================================================
        ROTL    $A,5,$Arot              ; BODY_15
||      AND     $C,$B,$F
||      ANDN    $D,$B,$F0
||      ADD     $K,$E,$T                ; T=E+K
        XOR     $F0,$F,$F               ; F_00_19(B,C,D)
||      MV      $D,$E                   ; E=D
||      MV      $C,$D                   ; D=C
||      SWAP2   $TX1,$TX2
        ADD     $F,$T,$T                ; T+=F_00_19(B,C,D)
||      ROTL    $B,30,$C                ; C=ROL(B,30)
||      SWAP4   $TX2,$TX2               ; byte swap
||      XOR     $X0,$X2,$TX0            ; Xupdate XORs are 1 iteration ahead
||      LDW     *${XPA}++,$X0
||      LDW     *${XPB}[4],$X2
        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      MV      $A,$B                   ; B=A
||      XOR     $X8,$X13,$TX1
||      LDW     *${XPA}[7],$X8
||      MV      $TX3,$X13               ; || LDW *${XPB}[15],$X13
||      MV      $TX2,$TX3
        ADD     $TX2,$T,$A              ; A=T+Xi
||      STW     $TX2,*${XPB}++
||      XOR     $TX0,$TX1,$TX1
;;==================================================
||      B       body_16_19?             ; BODY_16_19
||      MVK     1,B0
body_16_19?:
        ROTL    $A,5,$Arot
||      AND     $C,$B,$F
||      ANDN    $D,$B,$F0
||      ADD     $K,$E,$T                ; T=E+K
||      ROTL    $TX1,1,$TX2             ; Xupdate output
        XOR     $F0,$F,$F               ; F_00_19(B,C,D)
||      MV      $D,$E                   ; E=D
||      MV      $C,$D                   ; D=C
        ADD     $F,$T,$T                ; T+=F_00_19(B,C,D)
||      ROTL    $B,30,$C                ; C=ROL(B,30)
||      XOR     $X0,$X2,$TX0
||      LDW     *${XPA}++,$X0
||      LDW     *${XPB}[4],$X2
        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      MV      $A,$B                   ; B=A
||      XOR     $X8,$X13,$TX1
||      LDW     *${XPA}[7],$X8
||      MV      $TX3,$X13               ; || LDW *${XPB}[15],$X13
||      MV      $TX2,$TX3
        ADD     $TX2,$T,$A              ; A=T+Xi
||      STW     $TX2,*${XPB}++
||      XOR     $TX0,$TX1,$TX1
||      BDEC    body_16_19?,B0
        MVKL    0x6ed9eba1,$K
||      MVK     17,B0
        MVKH    0x6ed9eba1,$K           ; K_20_39
___
  # BODY_20_39($label)
  #
  # Appends to $code one software-pipelined loop implementing the
  # XOR-based round function F_20_39(B,C,D) = B^C^D, together with the
  # X ring-buffer fetches and Xupdate XORs for following iterations.
  #
  #   $label - assembler label placed at the top of the loop; it is also
  #            the target of the initial B and of the closing BDEC.
  #
  # The template begins with a "||" line, i.e. its first instruction is
  # issued in parallel with whatever instruction was emitted last, and
  # the loop counter B0 and constant $K are expected to have been set up
  # by the code emitted just before the call. The same template is used
  # for rounds 20..39 and, with a different label, K and count, for
  # rounds 60..78.
  #
  # Returns nothing useful; the result is the side effect on $code.
  sub BODY_20_39 {
  my $label = shift;

$code.=<<___;
;;==================================================
||      B       $label                  ; BODY_20_39
${label}:
        ROTL    $A,5,$Arot
||      XOR     $B,$C,$F
||      ADD     $K,$E,$T                ; T=E+K
||      ROTL    $TX1,1,$TX2             ; Xupdate output
        XOR     $D,$F,$F                ; F_20_39(B,C,D)
||      MV      $D,$E                   ; E=D
||      MV      $C,$D                   ; D=C
        ADD     $F,$T,$T                ; T+=F_20_39(B,C,D)
||      ROTL    $B,30,$C                ; C=ROL(B,30)
||      XOR     $X0,$X2,$TX0
||      LDW     *${XPA}++,$X0
||      LDW     *${XPB}[4],$X2
        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      MV      $A,$B                   ; B=A
||      XOR     $X8,$X13,$TX1
||      LDW     *${XPA}[7],$X8
||      MV      $TX3,$X13               ; || LDW *${XPB}[15],$X13
||      MV      $TX2,$TX3
        ADD     $TX2,$T,$A              ; A=T+Xi
||      STW     $TX2,*${XPB}++          ; last one is redundant
||      XOR     $TX0,$TX1,$TX1
||      BDEC    $label,B0
___
  }
  # Rounds 20..39.
  &BODY_20_39("body_20_39?");
  # Rounds 40..59: loads K_40_59 and the loop counter, then emits the
  # body_40_59? loop. F_40_59(B,C,D) is built from three AND terms
  # (B&C, B&D, C&D) combined with XOR; the first two terms for the next
  # iteration are computed in the last execute packet of the current one
  # (and once before entering the loop). X ring-buffer fetches and
  # Xupdate XORs follow the same pipelining as in the other round loops.
  # The trailing instructions load K_60_79 and the counter for the
  # rounds emitted next.
$code.=<<___;
;;==================================================
        MVKL    0x8f1bbcdc,$K
||      MVK     17,B0
        MVKH    0x8f1bbcdc,$K           ; K_40_59
||      B       body_40_59?             ; BODY_40_59
||      AND     $B,$C,$F
||      AND     $B,$D,$F0
body_40_59?:
        ROTL    $A,5,$Arot
||      XOR     $F0,$F,$F
||      AND     $C,$D,$F0
||      ADD     $K,$E,$T                ; T=E+K
||      ROTL    $TX1,1,$TX2             ; Xupdate output
        XOR     $F0,$F,$F               ; F_40_59(B,C,D)
||      MV      $D,$E                   ; E=D
||      MV      $C,$D                   ; D=C
        ADD     $F,$T,$T                ; T+=F_40_59(B,C,D)
||      ROTL    $B,30,$C                ; C=ROL(B,30)
||      XOR     $X0,$X2,$TX0
||      LDW     *${XPA}++,$X0
||      LDW     *${XPB}[4],$X2
        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      MV      $A,$B                   ; B=A
||      XOR     $X8,$X13,$TX1
||      LDW     *${XPA}[7],$X8
||      MV      $TX3,$X13               ; || LDW *${XPB}[15],$X13
||      MV      $TX2,$TX3
        ADD     $TX2,$T,$A              ; A=T+Xi
||      STW     $TX2,*${XPB}++
||      XOR     $TX0,$TX1,$TX1
||      AND     $B,$C,$F
||      AND     $B,$D,$F0
||      BDEC    body_40_59?,B0
        MVKL    0xca62c1d6,$K
||      MVK     16,B0
        MVKH    0xca62c1d6,$K           ; K_60_79
___
  # Rounds 60..78 reuse the F_20_39 template under a different label.
  &BODY_20_39("body_60_78?");             # BODY_60_78

  # Final chunk: round 79 merged with the block epilogue, then function
  # exit. Round 79 adds the $Actx..$Ectx copies taken at the top of the
  # block back into A..E; while A0 (blocks remaining) is non-zero it also
  # branches back to loop? and pre-fetches the next input word. On exit
  # SP and FP are restored, AMR is cleared and A..E are stored back to
  # the context; those stores are placed after the BNOP RA.
  # NOTE(review): presumably they execute in the branch delay slots -
  # confirm against the C64x pipeline documentation before touching.
$code.=<<___;
;;==================================================
   [A0] B       loop?
||      ROTL    $A,5,$Arot              ; BODY_79
||      XOR     $B,$C,$F
||      ROTL    $TX1,1,$TX2             ; Xupdate output
   [A0] LDNW    *${INP}++,$TX1          ; pre-fetch input
||      ADD     $K,$E,$T                ; T=E+K
||      XOR     $D,$F,$F                ; F_20_39(B,C,D)
        ADD     $F,$T,$T                ; T+=F_20_39(B,C,D)
||      ADD     $Ectx,$D,$E             ; E=D,E+=Ectx
||      ADD     $Dctx,$C,$D             ; D=C,D+=Dctx
||      ROTL    $B,30,$C                ; C=ROL(B,30)
        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      ADD     $Bctx,$A,$B             ; B=A,B+=Bctx
        ADD     $TX2,$T,$A              ; A=T+Xi
        ADD     $Actx,$A,$A             ; A+=Actx
||      ADD     $Cctx,$C,$C             ; C+=Cctx
;; end of loop?
        BNOP    RA                      ; return
||      MV      FP,SP                   ; restore stack pointer
||      LDW     *FP[0],FP               ; restore frame pointer
        STW     $A,*${CTX}[0]           ; emit A-E...
||      MVK     0,B0
        STW     $B,*${CTX}[1]
||      MVC     B0,AMR                  ; clear AMR
        STW     $C,*${CTX}[2]
        STW     $D,*${CTX}[3]
        STW     $E,*${CTX}[4]
        .endasmfunc
        .sect   .const
        .cstring "SHA1 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
        .align  4
___

  # Emit the accumulated assembly. Closing STDOUT explicitly and checking
  # the result surfaces write errors (e.g. a full disk) that a buffered
  # print would otherwise hide, leaving a truncated output file.
  print $code;
  close STDOUT or die "error closing STDOUT: $!";