#! /usr/bin/env perl
# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA1 for C64x+.
#
# November 2011
#
# Compared to compiler-generated code with similar characteristics,
# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
# this implementation is 25% smaller and >2x faster. In absolute terms
# performance is (quite impressive) ~6.5 cycles per processed byte.
# A fully unrolled assembler version would be ~5x larger and is likely
# to be ~15% faster. It would be free from references to the
# intermediate ring buffer, but would put more pressure on L1P [both
# because the code would be larger and because it would not use the
# SPLOOP buffer]. There are no plans to implement a fully unrolled
# variant, though...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, zero it upon entry.
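#
# As an illustrative sketch (not part of the module), the AMR hygiene
# an ISR would need mirrors the cleanup this routine itself performs
# before returning:
#
#       MVK     0,B0
#       MVC     B0,AMR          ; zero AMR, i.e. force linear addressing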
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

($CTX,$INP,$NUM) = ("A4","B4","A6");            # arguments
($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
($XPA,$XPB) = ("A5","B5");                      # X circular buffer
($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # zaps $NUM
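# Assuming OpenSSL's usual internal prototype, the routine implements
#
#   void sha1_block_data_order(SHA_CTX *ctx, const void *inp, size_t num);
#
# with ctx in A4, inp in B4 and num in A6 per the C6000 calling
# convention. The message schedule lives in a 64-byte (16-word) ring
# buffer on the stack, walked by $XPA/$XPB (A5/B5) under the circular
# addressing configured via AMR below.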
$code=<<___;
        .text

        .if     .ASSEMBLER_VERSION<7000000
        .asg    0,__TI_EABI__
        .endif
        .if     __TI_EABI__
        .asg    sha1_block_data_order,_sha1_block_data_order
        .endif

        .asg    B3,RA
        .asg    A15,FP
        .asg    B15,SP

        .if     .BIG_ENDIAN
        .asg    MV,SWAP2
        .asg    MV,SWAP4
        .endif

        .global _sha1_block_data_order
_sha1_block_data_order:
        .asmfunc stack_usage(64)
        MV      $NUM,A0                 ; reassign $NUM
||      MVK     -64,B0
  [!A0] BNOP    RA                      ; if ($NUM==0) return;
|| [A0] STW     FP,*SP--[16]            ; save frame pointer and alloca(64)
|| [A0] MV      SP,FP
  [A0]  LDW     *${CTX}[0],$A           ; load A-E...
|| [A0] AND     B0,SP,SP                ; align stack at 64 bytes
  [A0]  LDW     *${CTX}[1],$B
|| [A0] SUBAW   SP,2,SP                 ; reserve two words above buffer
  [A0]  LDW     *${CTX}[2],$C
|| [A0] MVK     0x00404,B0
  [A0]  LDW     *${CTX}[3],$D
|| [A0] MVKH    0x50000,B0              ; 0x050404, 64 bytes for $XP[AB]
  [A0]  LDW     *${CTX}[4],$E
|| [A0] MVC     B0,AMR                  ; setup circular addressing
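;; AMR=0x050404 selects mode 01 (circular, BK0-sized) for A5 and B5
;; and sets BK0=5, i.e. a 2^(5+1)=64-byte circular block for X[].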
        LDNW    *${INP}++,$TX1          ; pre-fetch input
        NOP     1
loop?:
        MVK     0x00007999,$K
||      ADDAW   SP,2,$XPA
||      SUB     A0,1,A0
||      MVK     13,B0
        MVKH    0x5a820000,$K           ; K_00_19
||      ADDAW   SP,2,$XPB
||      MV      $A,$Actx
||      MV      $B,$Bctx
;;==================================================
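;; SPLOOPD 5 opens a software-pipelined loop with a 5-cycle initiation
;; interval and SPKERNEL closes its kernel; ILC (loaded from B0=13)
;; drives the trip count, covering rounds 0-13 before rounds 14 and 15
;; are handled out of line to prime the ring-buffer reads.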
        SPLOOPD 5                       ; BODY_00_13
||      MV      $C,$Cctx
||      MV      $D,$Dctx
||      MV      $E,$Ectx
||      MVC     B0,ILC
        ROTL    $A,5,$Arot
||      AND     $C,$B,$F
||      ANDN    $D,$B,$F0
||      ADD     $K,$E,$T                ; T=E+K
        XOR     $F0,$F,$F               ; F_00_19(B,C,D)
||      MV      $D,$E                   ; E=D
||      MV      $C,$D                   ; D=C
||      SWAP2   $TX1,$TX2
||      LDNW    *${INP}++,$TX1
        ADD     $F,$T,$T                ; T+=F_00_19(B,C,D)
||      ROTL    $B,30,$C                ; C=ROL(B,30)
||      SWAP4   $TX2,$TX3               ; byte swap
        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      MV      $A,$B                   ; B=A
        ADD     $TX3,$T,$A              ; A=T+Xi
||      STW     $TX3,*${XPB}++
        SPKERNEL
;;==================================================
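;; Message schedule: W[t] = ROL(W[t-3]^W[t-8]^W[t-14]^W[t-16],1); with
;; the fetch pointers running 2 iterations ahead, $X0/$X2/$X8/$X13
;; hold the ring-buffer words at offsets 0/2/8/13 that feed each XOR.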
        ROTL    $A,5,$Arot              ; BODY_14
||      AND     $C,$B,$F
||      ANDN    $D,$B,$F0
||      ADD     $K,$E,$T                ; T=E+K
        XOR     $F0,$F,$F               ; F_00_19(B,C,D)
||      MV      $D,$E                   ; E=D
||      MV      $C,$D                   ; D=C
||      SWAP2   $TX1,$TX2
||      LDNW    *${INP}++,$TX1
        ADD     $F,$T,$T                ; T+=F_00_19(B,C,D)
||      ROTL    $B,30,$C                ; C=ROL(B,30)
||      SWAP4   $TX2,$TX2               ; byte swap
||      LDW     *${XPA}++,$X0           ; fetches from X ring buffer are
||      LDW     *${XPB}[4],$X2          ; 2 iterations ahead
        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      MV      $A,$B                   ; B=A
||      LDW     *${XPA}[7],$X8
||      MV      $TX3,$X13               ; || LDW *${XPB}[15],$X13
||      MV      $TX2,$TX3
        ADD     $TX2,$T,$A              ; A=T+Xi
||      STW     $TX2,*${XPB}++
;;==================================================
        ROTL    $A,5,$Arot              ; BODY_15
||      AND     $C,$B,$F
||      ANDN    $D,$B,$F0
||      ADD     $K,$E,$T                ; T=E+K
        XOR     $F0,$F,$F               ; F_00_19(B,C,D)
||      MV      $D,$E                   ; E=D
||      MV      $C,$D                   ; D=C
||      SWAP2   $TX1,$TX2
        ADD     $F,$T,$T                ; T+=F_00_19(B,C,D)
||      ROTL    $B,30,$C                ; C=ROL(B,30)
||      SWAP4   $TX2,$TX2               ; byte swap
||      XOR     $X0,$X2,$TX0            ; Xupdate XORs are 1 iteration ahead
||      LDW     *${XPA}++,$X0
||      LDW     *${XPB}[4],$X2
        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      MV      $A,$B                   ; B=A
||      XOR     $X8,$X13,$TX1
||      LDW     *${XPA}[7],$X8
||      MV      $TX3,$X13               ; || LDW *${XPB}[15],$X13
||      MV      $TX2,$TX3
        ADD     $TX2,$T,$A              ; A=T+Xi
||      STW     $TX2,*${XPB}++
||      XOR     $TX0,$TX1,$TX1
||      MVK     3,B0
;;==================================================
        SPLOOPD 5                       ; BODY_16_19
||      MVC     B0,ILC
        ROTL    $A,5,$Arot
||      AND     $C,$B,$F
||      ANDN    $D,$B,$F0
||      ADD     $K,$E,$T                ; T=E+K
||      ROTL    $TX1,1,$TX2             ; Xupdate output
        XOR     $F0,$F,$F               ; F_00_19(B,C,D)
||      MV      $D,$E                   ; E=D
||      MV      $C,$D                   ; D=C
        ADD     $F,$T,$T                ; T+=F_00_19(B,C,D)
||      ROTL    $B,30,$C                ; C=ROL(B,30)
||      XOR     $X0,$X2,$TX0
||      LDW     *${XPA}++,$X0
||      LDW     *${XPB}[4],$X2
        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      MV      $A,$B                   ; B=A
||      XOR     $X8,$X13,$TX1
||      LDW     *${XPA}[7],$X8
||      MV      $TX3,$X13               ; || LDW *${XPB}[15],$X13
||      MV      $TX2,$TX3
        ADD     $TX2,$T,$A              ; A=T+Xi
||      STW     $TX2,*${XPB}++
||      XOR     $TX0,$TX1,$TX1
        SPKERNEL

        MVK     0xffffeba1,$K
||      MVK     19,B0
        MVKH    0x6ed90000,$K           ; K_20_39
___
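# The round functions are the standard SHA-1 ones in branch-free form,
# matching the AND/ANDN/XOR sequences above and below:
#
#   F_00_19(B,C,D) = (B&C) ^ (~B&D)           ; Ch, via AND+ANDN+XOR
#   F_20_39(B,C,D) = B ^ C ^ D                ; Parity
#   F_40_59(B,C,D) = (B&C) ^ (B&D) ^ (C&D)    ; Maj, whose XOR form
#                                             ; equals the usual OR form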
sub BODY_20_39 {
$code.=<<___;
;;==================================================
        SPLOOPD 5                       ; BODY_20_39
||      MVC     B0,ILC
        ROTL    $A,5,$Arot
||      XOR     $B,$C,$F
||      ADD     $K,$E,$T                ; T=E+K
||      ROTL    $TX1,1,$TX2             ; Xupdate output
        XOR     $D,$F,$F                ; F_20_39(B,C,D)
||      MV      $D,$E                   ; E=D
||      MV      $C,$D                   ; D=C
        ADD     $F,$T,$T                ; T+=F_20_39(B,C,D)
||      ROTL    $B,30,$C                ; C=ROL(B,30)
||      XOR     $X0,$X2,$TX0
||      LDW     *${XPA}++,$X0
||      LDW     *${XPB}[4],$X2
        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      MV      $A,$B                   ; B=A
||      XOR     $X8,$X13,$TX1
||      LDW     *${XPA}[7],$X8
||      MV      $TX3,$X13               ; || LDW *${XPB}[15],$X13
||      MV      $TX2,$TX3
        ADD     $TX2,$T,$A              ; A=T+Xi
||      STW     $TX2,*${XPB}++          ; last one is redundant
||      XOR     $TX0,$TX1,$TX1
        SPKERNEL
___
$code.=<<___ if (!shift);
        MVK     0xffffbcdc,$K
        MVKH    0x8f1b0000,$K           ; K_40_59
___
}

&BODY_20_39();

$code.=<<___;
;;==================================================
        SPLOOPD 5                       ; BODY_40_59
||      MVC     B0,ILC
||      AND     $B,$C,$F
||      AND     $B,$D,$F0
        ROTL    $A,5,$Arot
||      XOR     $F0,$F,$F
||      AND     $C,$D,$F0
||      ADD     $K,$E,$T                ; T=E+K
||      ROTL    $TX1,1,$TX2             ; Xupdate output
        XOR     $F0,$F,$F               ; F_40_59(B,C,D)
||      MV      $D,$E                   ; E=D
||      MV      $C,$D                   ; D=C
        ADD     $F,$T,$T                ; T+=F_40_59(B,C,D)
||      ROTL    $B,30,$C                ; C=ROL(B,30)
||      XOR     $X0,$X2,$TX0
||      LDW     *${XPA}++,$X0
||      LDW     *${XPB}[4],$X2
        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      MV      $A,$B                   ; B=A
||      XOR     $X8,$X13,$TX1
||      LDW     *${XPA}[7],$X8
||      MV      $TX3,$X13               ; || LDW *${XPB}[15],$X13
||      MV      $TX2,$TX3
        ADD     $TX2,$T,$A              ; A=T+Xi
||      STW     $TX2,*${XPB}++
||      XOR     $TX0,$TX1,$TX1
||      AND     $B,$C,$F
||      AND     $B,$D,$F0
        SPKERNEL

        MVK     0xffffc1d6,$K
||      MVK     18,B0
        MVKH    0xca620000,$K           ; K_60_79
___
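# Rounds 60-78 reuse the BODY_20_39 kernel (same parity function, new
# constant); the -1 argument suppresses the K_40_59 reload. Round 79
# is peeled off below so it can be merged with the context update and
# the loop back-branch.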
&BODY_20_39(-1);                        # BODY_60_78

$code.=<<___;
;;==================================================
  [A0]  B       loop?
||      ROTL    $A,5,$Arot              ; BODY_79
||      XOR     $B,$C,$F
||      ROTL    $TX1,1,$TX2             ; Xupdate output
  [A0]  LDNW    *${INP}++,$TX1          ; pre-fetch input
||      ADD     $K,$E,$T                ; T=E+K
||      XOR     $D,$F,$F                ; F_20_39(B,C,D)
        ADD     $F,$T,$T                ; T+=F_20_39(B,C,D)
||      ADD     $Ectx,$D,$E             ; E=D,E+=Ectx
||      ADD     $Dctx,$C,$D             ; D=C,D+=Dctx
||      ROTL    $B,30,$C                ; C=ROL(B,30)
        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      ADD     $Bctx,$A,$B             ; B=A,B+=Bctx
        ADD     $TX2,$T,$A              ; A=T+Xi
        ADD     $Actx,$A,$A             ; A+=Actx
||      ADD     $Cctx,$C,$C             ; C+=Cctx
;; end of loop?

        BNOP    RA                      ; return
||      MV      FP,SP                   ; restore stack pointer
||      LDW     *FP[0],FP               ; restore frame pointer
        STW     $A,*${CTX}[0]           ; emit A-E...
||      MVK     0,B0
        STW     $B,*${CTX}[1]
||      MVC     B0,AMR                  ; clear AMR
        STW     $C,*${CTX}[2]
        STW     $D,*${CTX}[3]
        STW     $E,*${CTX}[4]
        .endasmfunc

        .sect   .const
        .cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
        .align  4
___

print $code;
close STDOUT;