#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA1 for C64x+.
#
# November 2011
#
# If compared to compiler-generated code with similar characteristics,
# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
# this implementation is 25% smaller and >2x faster. In absolute terms
# performance is ~6.5 cycles per processed byte. Fully unrolled
# assembler would be ~5x larger and is likely to be ~15% faster. It
# would be free from references to the intermediate ring buffer, but
# would put more pressure on L1P [both because the code would be
# larger and because it wouldn't be using the SPLOOP buffer]. There
# are no plans to realize a fully unrolled variant though...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, zero it upon entry.
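#
# For reference, each of the 80 rounds scheduled below computes, in
# C-like pseudocode (a sketch of the standard SHA-1 round, nothing
# beyond what the code implements):
#
#	T = ROL(A,5) + F(B,C,D) + E + K + X[i];
#	E = D; D = C; C = ROL(B,30); B = A; A = T;
#
# where F(B,C,D) is (B&C)|(~B&D) for rounds 0-19, B^C^D for rounds
# 20-39 and 60-79, and Maj(B,C,D) for rounds 40-59; for i>15,
# X[i] = ROL(X[i-3]^X[i-8]^X[i-14]^X[i-16],1), maintained below in a
# 64-byte circular buffer on the stack.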

while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

($CTX,$INP,$NUM) = ("A4","B4","A6");		# arguments

($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
($XPA,$XPB) = ("A5","B5");			# X circular buffer
($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9));	# zaps $NUM
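# A4, B4 and A6 are the first three argument registers in the C6000
# calling convention, hence the $CTX, $INP, $NUM assignment above.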

$code=<<___;
	.text

	.asg	B3,RA
	.asg	A15,FP
	.asg	B15,SP

	.if	.BIG_ENDIAN
	.asg	MV,SWAP2
	.asg	MV,SWAP4
	.endif
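
;; SWAP2 swaps the two half-words of a register and SWAP4 swaps the
;; bytes within each half-word, so applied in sequence they
;; byte-reverse each input word on little-endian parts; big-endian
;; parts need no swap, hence the aliases to plain MV above.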

	.global	_sha1_block_data_order
_sha1_block_data_order:
	.asmfunc stack_usage(64)
	MV	$NUM,A0		; reassign $NUM
||	MVK	-64,B0
  [!A0]	BNOP	RA		; if ($NUM==0) return;
|| [A0]	STW	FP,*SP--[16]	; save frame pointer and alloca(64)
|| [A0]	MV	SP,FP
   [A0]	LDW	*${CTX}[0],$A	; load A-E...
|| [A0]	AND	B0,SP,SP	; align stack at 64 bytes
   [A0]	LDW	*${CTX}[1],$B
|| [A0]	SUBAW	SP,2,SP		; reserve two words above buffer
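;; SP now sits two words below a 64-byte boundary, so the X[] ring
;; placed two words above SP starts exactly on a 64-byte boundary
;; and the circular addressing set up below wraps over exactly
;; those 16 words.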
   [A0]	LDW	*${CTX}[2],$C
|| [A0]	MVK	0x00404,B0
   [A0]	LDW	*${CTX}[3],$D
|| [A0]	MVKH	0x50000,B0	; 0x050404, 64 bytes for $XP[AB]
   [A0]	LDW	*${CTX}[4],$E
|| [A0]	MVC	B0,AMR		; setup circular addressing
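;; AMR=0x050404: BK0=5 gives a 2^(5+1)=64-byte block size, and the
;; mode bits mark A5/B5, the X ring pointers, as circular over that
;; block, so their auto-increments wrap within the 16-word X[]
;; window (field layout per the C64x+ CPU reference guide).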

	LDNW	*${INP}++,$TX1	; pre-fetch input
	NOP	1
loop?:
	MVK	0x00007999,$K
||	ADDAW	SP,2,$XPA
||	SUB	A0,1,A0
||	MVK	13,B0
	MVKH	0x5a820000,$K	; K_00_19
||	ADDAW	SP,2,$XPB
||	MV	$A,$Actx
||	MV	$B,$Bctx
;;==================================================
	SPLOOPD	5		; BODY_00_13
||	MV	$C,$Cctx
||	MV	$D,$Dctx
||	MV	$E,$Ectx
||	MVC	B0,ILC

	ROTL	$A,5,$Arot
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T	; T=E+K

	XOR	$F0,$F,$F	; F_00_19(B,C,D)
||	MV	$D,$E		; E=D
||	MV	$C,$D		; D=C
||	SWAP2	$TX1,$TX2
||	LDNW	*${INP}++,$TX1

	ADD	$F,$T,$T	; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C	; C=ROL(B,30)
||	SWAP4	$TX2,$TX3	; byte swap

	ADD	$Arot,$T,$T	; T+=ROL(A,5)
||	MV	$A,$B		; B=A

	ADD	$TX3,$T,$A	; A=T+Xi
||	STW	$TX3,*${XPB}++
	SPKERNEL
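;; SPLOOPD 5 above issues the loop body from the SPLOOP buffer with
;; an initiation interval of 5 cycles; ILC=13 yields the 14
;; iterations covering rounds 0-13, while rounds 14 and 15 are
;; peeled off below to prime the X[] fetches and Xupdate XORs.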
;;==================================================
	ROTL	$A,5,$Arot	; BODY_14
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T	; T=E+K

	XOR	$F0,$F,$F	; F_00_19(B,C,D)
||	MV	$D,$E		; E=D
||	MV	$C,$D		; D=C
||	SWAP2	$TX1,$TX2
||	LDNW	*${INP}++,$TX1

	ADD	$F,$T,$T	; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C	; C=ROL(B,30)
||	SWAP4	$TX2,$TX2	; byte swap
||	LDW	*${XPA}++,$X0	; fetches from X ring buffer are
||	LDW	*${XPB}[4],$X2	; 2 iterations ahead

	ADD	$Arot,$T,$T	; T+=ROL(A,5)
||	MV	$A,$B		; B=A
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13	; || LDW *${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A	; A=T+Xi
||	STW	$TX2,*${XPB}++
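;; the word needed as X[i-3] is still in flight in $TX3 here, its
;; STW to the ring buffer not yet landed, so it is forwarded with MV
;; instead of re-loaded; the commented-out LDW above shows the slot
;; it replaces.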
;;==================================================
	ROTL	$A,5,$Arot	; BODY_15
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T	; T=E+K

	XOR	$F0,$F,$F	; F_00_19(B,C,D)
||	MV	$D,$E		; E=D
||	MV	$C,$D		; D=C
||	SWAP2	$TX1,$TX2

	ADD	$F,$T,$T	; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C	; C=ROL(B,30)
||	SWAP4	$TX2,$TX2	; byte swap
||	XOR	$X0,$X2,$TX0	; Xupdate XORs are 1 iteration ahead
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T	; T+=ROL(A,5)
||	MV	$A,$B		; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13	; || LDW *${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A	; A=T+Xi
||	STW	$TX2,*${XPB}++
||	XOR	$TX0,$TX1,$TX1
||	MVK	3,B0
;;==================================================
	SPLOOPD	5		; BODY_16_19
||	MVC	B0,ILC

	ROTL	$A,5,$Arot
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T	; T=E+K
||	ROTL	$TX1,1,$TX2	; Xupdate output

	XOR	$F0,$F,$F	; F_00_19(B,C,D)
||	MV	$D,$E		; E=D
||	MV	$C,$D		; D=C

	ADD	$F,$T,$T	; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C	; C=ROL(B,30)
||	XOR	$X0,$X2,$TX0
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T	; T+=ROL(A,5)
||	MV	$A,$B		; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13	; || LDW *${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A	; A=T+Xi
||	STW	$TX2,*${XPB}++
||	XOR	$TX0,$TX1,$TX1
	SPKERNEL

	MVK	0xffffeba1,$K
||	MVK	19,B0
	MVKH	0x6ed90000,$K	; K_20_39
___
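
# F_20_39 and F_60_79 are both B^C^D, so BODY_20_39 below is emitted
# twice: once for rounds 20-39 and once more, via BODY_20_39(-1), for
# rounds 60-78 (round 79 is peeled off to fold in the context
# update); the argument suppresses the K_40_59 setup on the second
# call.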
sub BODY_20_39 {
$code.=<<___;
;;==================================================
	SPLOOPD	5		; BODY_20_39
||	MVC	B0,ILC

	ROTL	$A,5,$Arot
||	XOR	$B,$C,$F
||	ADD	$K,$E,$T	; T=E+K
||	ROTL	$TX1,1,$TX2	; Xupdate output

	XOR	$D,$F,$F	; F_20_39(B,C,D)
||	MV	$D,$E		; E=D
||	MV	$C,$D		; D=C

	ADD	$F,$T,$T	; T+=F_20_39(B,C,D)
||	ROTL	$B,30,$C	; C=ROL(B,30)
||	XOR	$X0,$X2,$TX0
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T	; T+=ROL(A,5)
||	MV	$A,$B		; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13	; || LDW *${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A	; A=T+Xi
||	STW	$TX2,*${XPB}++	; last one is redundant
||	XOR	$TX0,$TX1,$TX1
	SPKERNEL
___
$code.=<<___ if (!shift);
	MVK	0xffffbcdc,$K
	MVKH	0x8f1b0000,$K	; K_40_59
___
} &BODY_20_39();
$code.=<<___;
;;==================================================
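;; F_40_59 is Maj(B,C,D) in its XOR form, (B&C)^(B&D)^(C&D), which
;; is equivalent to the canonical (B&C)|(B&D)|(C&D) and lets the
;; three ANDs and two XORs be spread over the loop's spare issue
;; slots; the trailing AND pair feeds the next iteration.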
	SPLOOPD	5		; BODY_40_59
||	MVC	B0,ILC
||	AND	$B,$C,$F
||	AND	$B,$D,$F0

	ROTL	$A,5,$Arot
||	XOR	$F0,$F,$F
||	AND	$C,$D,$F0
||	ADD	$K,$E,$T	; T=E+K
||	ROTL	$TX1,1,$TX2	; Xupdate output

	XOR	$F0,$F,$F	; F_40_59(B,C,D)
||	MV	$D,$E		; E=D
||	MV	$C,$D		; D=C

	ADD	$F,$T,$T	; T+=F_40_59(B,C,D)
||	ROTL	$B,30,$C	; C=ROL(B,30)
||	XOR	$X0,$X2,$TX0
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T	; T+=ROL(A,5)
||	MV	$A,$B		; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13	; || LDW *${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A	; A=T+Xi
||	STW	$TX2,*${XPB}++
||	XOR	$TX0,$TX1,$TX1
||	AND	$B,$C,$F
||	AND	$B,$D,$F0
	SPKERNEL

	MVK	0xffffc1d6,$K
||	MVK	18,B0
	MVKH	0xca620000,$K	; K_60_79
___
&BODY_20_39(-1);		# BODY_60_78
$code.=<<___;
;;==================================================
  [A0]	B	loop?
||	ROTL	$A,5,$Arot	; BODY_79
||	XOR	$B,$C,$F
||	ROTL	$TX1,1,$TX2	; Xupdate output

  [A0]	LDNW	*${INP}++,$TX1	; pre-fetch input
||	ADD	$K,$E,$T	; T=E+K
||	XOR	$D,$F,$F	; F_20_39(B,C,D)

	ADD	$F,$T,$T	; T+=F_20_39(B,C,D)
||	ADD	$Ectx,$D,$E	; E=D,E+=Ectx
||	ADD	$Dctx,$C,$D	; D=C,D+=Dctx
||	ROTL	$B,30,$C	; C=ROL(B,30)

	ADD	$Arot,$T,$T	; T+=ROL(A,5)
||	ADD	$Bctx,$A,$B	; B=A,B+=Bctx

	ADD	$TX2,$T,$A	; A=T+Xi

	ADD	$Actx,$A,$A	; A+=Actx
||	ADD	$Cctx,$C,$C	; C+=Cctx
;; end of loop?

	BNOP	RA		; return
||	MV	FP,SP		; restore stack pointer
||	LDW	*FP[0],FP	; restore frame pointer
	STW	$A,*${CTX}[0]	; emit A-E...
||	MVK	0,B0
	STW	$B,*${CTX}[1]
||	MVC	B0,AMR		; clear AMR
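;; AMR is zeroed on the way out so the caller is not left with
;; circular addressing still enabled on A5/B5.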
	STW	$C,*${CTX}[2]
	STW	$D,*${CTX}[3]
	STW	$E,*${CTX}[4]
	.endasmfunc

	.sect	.const
	.cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4
___

print $code;
close STDOUT;