keccak1600-c64x.pl 22 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884
  1. #!/usr/bin/env perl
  2. # Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # [ABI- and endian-neutral] Keccak-1600 for C64x.
  17. #
  18. # June 2017.
  19. #
  20. # This is straightforward KECCAK_1X_ALT variant (see sha/keccak1600.c)
  21. # with bit interleaving. 64-bit values are simply split between A- and
  22. # B-files, with A-file holding least significant halves. This works
  23. # out perfectly, because all operations including cross-communications
  24. # [in rotate operations] are always complementary. Performance is
  25. # [incredible for a 32-bit processor] 10.9 cycles per processed byte
  26. # for r=1088, which corresponds to SHA3-256. This is >15x faster than
  27. # compiler-generated KECCAK_1X_ALT code, and >10x than other variants.
  28. # On average processor ends up issuing ~4.5 instructions per cycle...
  # Register allocation for the 5x5 Keccak state.  $A[$y][$x] is a register
  # number; the generated code uses it in both register files, as A<n> for
  # one half of the lane and B<n> for the other.  Rows start at registers
  # 5, 10, 16, 21 and 26.
  my @A = map { [ $_ .. $_ + 4 ] } (5, 10, 16, 21, 26);

  # Row 1 would end on register 14, but B14 is reserved and A14 is used as
  # iota[], so that lane is kept in register 31 instead.
  $A[1][4] = 31;

  # Exchange the registers of lanes [3][0] and [4][1].  Afterwards the pair
  # A[3][0]:A[4][0] is 27:26, which the round code stores and reloads as a
  # unit with STDW/LDDW.
  ($A[4][1], $A[3][0]) = ($A[3][0], $A[4][1]);

  # Column temporaries.  C[0..4] are registers 0..4; C[5] and C[6] share
  # the registers of A[3][0] and A[4][0].
  my @C = (0, 1, 2, 3, 4, $A[3][0], $A[4][0]);

  # Register holding the pointer into the iotas table.
  my $iotas = 'A14';

  # Rotation count for each lane, indexed the same way as @A.
  my @rhotates = (
      [  0,  1, 62, 28, 27 ],
      [ 36, 44,  6, 55, 20 ],
      [  3, 10, 43, 25, 39 ],
      [ 41, 45, 15, 21,  8 ],
      [ 18,  2, 61, 56, 14 ],
  );
  # ROL64(src, rot, dst [, prefix])
  #
  # Appends to the global $code two ROTL instructions that together rotate
  # the 64-bit lane held in registers A<src>/B<src> by <rot> bits into
  # A<dst>/B<dst>.
  #
  #   src, dst  register numbers (used with both the A and the B prefix)
  #   rot       rotation count
  #   prefix    text placed in front of the first instruction; callers pass
  #             "||" to attach it to the preceding instruction, or omit it
  #
  # The second instruction is always emitted with "||".  For an even count
  # each half is rotated in place by rot/2.  For an odd count the halves
  # swap files: the B half, rotated by rot/2+1, becomes the new A half and
  # the A half, rotated by rot/2, becomes the new B half.
  sub ROL64 {
      my ($src, $rot, $dst, $p) = @_;

      # The division is not performed here; the expression text itself is
      # emitted as the ROTL operand.
      my $half = "$rot/2";

      my @packet = ($rot & 1)
          ? ("$p ROTL B$src,$half+1,A$dst",
             "|| ROTL A$src,$half, B$dst")
          : ("$p ROTL A$src,$half,A$dst",
             "|| ROTL B$src,$half,B$dst");

      $code .= join("\n", @packet) . "\n";
  }
  53. ########################################################################
  # Stack frame layout
  #
  # SP--->+------+------+
  #       |      |      |
  # +1--->+------+------+<- -9   below 4 slots are used by KeccakF1600_int
  #       |      |      |
  # +2--->+------+------+<- -8
  #       |      |      |
  # +3--->+------+------+<- -7
  #       |  A2  |  A3  |        A3:A2 are preserved by KeccakF1600_int
  # +4--->+------+------+<- -6
  #       |  B2  |  B3  |        B3:B2 are preserved by KeccakF1600_int
  # +5--->+------+------+<- -5   below is ABI-compliant layout
  #       | A10  | A11  |
  # +6--->+------+------+<- -4
  #       | A12  | A13  |
  # +7--->+------+------+<- -3
  #       | A14  |  B3  |
  # +8--->+------+------+<- -2
  #       | B10  | B11  |
  # +9--->+------+------+<- -1
  #       | B12  | B13  |
  #       +------+------+<---FP
  #       |     A15     |
  #       +------+--
  # Assembler preamble, the entry of _KeccakF1600_int and the first part of
  # one round.  This heredoc emits, in order:
  #   - .text, and (for TI EABI) aliases mapping the public names onto the
  #     underscore-prefixed labels used below;
  #   - register aliases RA=B3, FP=A15, SP=B15;
  #   - _KeccakF1600_int, which first stores A3:A2 and B3:B2 into the frame
  #     slots *FP[-7] and *SP[4];
  #   - _KeccakF1600_cheat, a second entry label placed after those stores,
  #     where the address of the iotas table is formed in $iotas from
  #     ADDKPC plus the iotas-_KeccakF1600_int offset;
  #   - loop?:, the top of the round, starting with Theta.  A[3][0] and
  #     A[4][0] are stored to *SP[1]/*SP[2] ("offload some data") because
  #     their registers double as C[5]/C[6]; they are reloaded further down
  #     (*FP[-9] addresses the same slot as *SP[1] in the stack frame
  #     layout documented in this file).
  # The heredoc stops in the middle of the round, at the "early start" of
  # Rho+Pi; the statements that follow continue the same instruction stream.
  # Lines beginning with "||" are TI assembler syntax for an instruction
  # issued in parallel with the one before it.
  79. $code.=<<___;
  80. .text
  81. .if .ASSEMBLER_VERSION<7000000
  82. .asg 0,__TI_EABI__
  83. .endif
  84. .if __TI_EABI__
  85. .nocmp
  86. .asg KeccakF1600,_KeccakF1600
  87. .asg SHA3_absorb,_SHA3_absorb
  88. .asg SHA3_squeeze,_SHA3_squeeze
  89. .endif
  90. .asg B3,RA
  91. .asg A15,FP
  92. .asg B15,SP
  93. .align 32
  94. _KeccakF1600_int:
  95. .asmfunc
  96. STDW A3:A2,*FP[-7]
  97. || STDW B3:B2,*SP[4]
  98. _KeccakF1600_cheat:
  99. .if __TI_EABI__
  100. ADDKPC _KeccakF1600_int,B0
  101. || MVKL \$PCR_OFFSET(iotas,_KeccakF1600_int),$iotas
  102. MVKH \$PCR_OFFSET(iotas,_KeccakF1600_int),$iotas
  103. .else
  104. ADDKPC _KeccakF1600_int,B0
  105. || MVKL (iotas-_KeccakF1600_int),$iotas
  106. MVKH (iotas-_KeccakF1600_int),$iotas
  107. .endif
  108. ADD B0,$iotas,$iotas
  109. loop?:
  110. XOR A$A[0][2],A$A[1][2],A$C[2] ; Theta
  111. || XOR B$A[0][2],B$A[1][2],B$C[2]
  112. || XOR A$A[0][3],A$A[1][3],A$C[3]
  113. || XOR B$A[0][3],B$A[1][3],B$C[3]
  114. || XOR A$A[0][0],A$A[1][0],A$C[0]
  115. || XOR B$A[0][0],B$A[1][0],B$C[0]
  116. XOR A$A[2][2],A$C[2],A$C[2]
  117. || XOR B$A[2][2],B$C[2],B$C[2]
  118. || XOR A$A[2][3],A$C[3],A$C[3]
  119. || XOR B$A[2][3],B$C[3],B$C[3]
  120. || XOR A$A[2][0],A$C[0],A$C[0]
  121. || XOR B$A[2][0],B$C[0],B$C[0]
  122. XOR A$A[3][2],A$C[2],A$C[2]
  123. || XOR B$A[3][2],B$C[2],B$C[2]
  124. || XOR A$A[3][3],A$C[3],A$C[3]
  125. || XOR B$A[3][3],B$C[3],B$C[3]
  126. || XOR A$A[3][0],A$C[0],A$C[0]
  127. || XOR B$A[3][0],B$C[0],B$C[0]
  128. XOR A$A[4][2],A$C[2],A$C[2]
  129. || XOR B$A[4][2],B$C[2],B$C[2]
  130. || XOR A$A[4][3],A$C[3],A$C[3]
  131. || XOR B$A[4][3],B$C[3],B$C[3]
  132. || XOR A$A[4][0],A$C[0],A$C[0]
  133. || XOR B$A[4][0],B$C[0],B$C[0]
  134. XOR A$A[0][4],A$A[1][4],A$C[4]
  135. || XOR B$A[0][4],B$A[1][4],B$C[4]
  136. || XOR A$A[0][1],A$A[1][1],A$C[1]
  137. || XOR B$A[0][1],B$A[1][1],B$C[1]
  138. || STDW A$A[3][0]:A$A[4][0],*SP[1] ; offload some data
  139. STDW B$A[3][0]:B$A[4][0],*SP[2]
  140. || XOR A$A[2][4],A$C[4],A$C[4]
  141. || XOR B$A[2][4],B$C[4],B$C[4]
  142. || XOR A$A[2][1],A$C[1],A$C[1]
  143. || XOR B$A[2][1],B$C[1],B$C[1]
  144. || ROTL B$C[2],1,A$C[5] ; ROL64(C[2],1)
  145. || ROTL A$C[2],0,B$C[5]
  146. XOR A$A[3][4],A$C[4],A$C[4]
  147. || XOR B$A[3][4],B$C[4],B$C[4]
  148. || XOR A$A[3][1],A$C[1],A$C[1]
  149. || XOR B$A[3][1],B$C[1],B$C[1]
  150. || ROTL B$C[3],1,A$C[6] ; ROL64(C[3],1)
  151. || ROTL A$C[3],0,B$C[6]
  152. XOR A$A[4][4],A$C[4],A$C[4]
  153. || XOR B$A[4][4],B$C[4],B$C[4]
  154. || XOR A$A[4][1],A$C[1],A$C[1]
  155. || XOR B$A[4][1],B$C[1],B$C[1]
  156. || XOR A$C[0],A$C[5],A$C[5] ; C[0] ^ ROL64(C[2],1)
  157. || XOR B$C[0],B$C[5],B$C[5]
  158. XOR A$C[5],A$A[0][1],A$A[0][1]
  159. || XOR B$C[5],B$A[0][1],B$A[0][1]
  160. || XOR A$C[5],A$A[1][1],A$A[1][1]
  161. || XOR B$C[5],B$A[1][1],B$A[1][1]
  162. || XOR A$C[5],A$A[2][1],A$A[2][1]
  163. || XOR B$C[5],B$A[2][1],B$A[2][1]
  164. XOR A$C[5],A$A[3][1],A$A[3][1]
  165. || XOR B$C[5],B$A[3][1],B$A[3][1]
  166. || XOR A$C[5],A$A[4][1],A$A[4][1]
  167. || XOR B$C[5],B$A[4][1],B$A[4][1]
  168. || ROTL B$C[4],1,A$C[5] ; ROL64(C[4],1)
  169. || ROTL A$C[4],0,B$C[5]
  170. || XOR A$C[1],A$C[6],A$C[6] ; C[1] ^ ROL64(C[3],1)
  171. || XOR B$C[1],B$C[6],B$C[6]
  172. XOR A$C[6],A$A[0][2],A$A[0][2]
  173. || XOR B$C[6],B$A[0][2],B$A[0][2]
  174. || XOR A$C[6],A$A[1][2],A$A[1][2]
  175. || XOR B$C[6],B$A[1][2],B$A[1][2]
  176. || XOR A$C[6],A$A[2][2],A$A[2][2]
  177. || XOR B$C[6],B$A[2][2],B$A[2][2]
  178. || ROTL B$C[1],1,A$C[1] ; ROL64(C[1],1)
  179. || ROTL A$C[1],0,B$C[1]
  180. XOR A$C[6],A$A[3][2],A$A[3][2]
  181. || XOR B$C[6],B$A[3][2],B$A[3][2]
  182. || XOR A$C[6],A$A[4][2],A$A[4][2]
  183. || XOR B$C[6],B$A[4][2],B$A[4][2]
  184. || ROTL B$C[0],1,A$C[6] ; ROL64(C[0],1)
  185. || ROTL A$C[0],0,B$C[6]
  186. || XOR A$C[5],A$C[2],A$C[2] ; C[2] ^= ROL64(C[4],1)
  187. || XOR B$C[5],B$C[2],B$C[2]
  188. XOR A$C[2],A$A[0][3],A$A[0][3]
  189. || XOR B$C[2],B$A[0][3],B$A[0][3]
  190. || XOR A$C[2],A$A[1][3],A$A[1][3]
  191. || XOR B$C[2],B$A[1][3],B$A[1][3]
  192. || XOR A$C[2],A$A[2][3],A$A[2][3]
  193. || XOR B$C[2],B$A[2][3],B$A[2][3]
  194. XOR A$C[6],A$C[3],A$C[3] ; C[3] ^= ROL64(C[0],1)
  195. || XOR B$C[6],B$C[3],B$C[3]
  196. || LDDW *FP[-9],A$A[3][0]:A$A[4][0] ; restore offloaded data
  197. || LDDW *SP[2],B$A[3][0]:B$A[4][0]
  198. || XOR A$C[2],A$A[3][3],A$A[3][3]
  199. || XOR B$C[2],B$A[3][3],B$A[3][3]
  200. XOR A$C[2],A$A[4][3],A$A[4][3]
  201. || XOR B$C[2],B$A[4][3],B$A[4][3]
  202. || XOR A$C[3],A$A[0][4],A$A[0][4]
  203. || XOR B$C[3],B$A[0][4],B$A[0][4]
  204. || XOR A$C[3],A$A[1][4],A$A[1][4]
  205. || XOR B$C[3],B$A[1][4],B$A[1][4]
  206. XOR A$C[3],A$A[2][4],A$A[2][4]
  207. || XOR B$C[3],B$A[2][4],B$A[2][4]
  208. || XOR A$C[3],A$A[3][4],A$A[3][4]
  209. || XOR B$C[3],B$A[3][4],B$A[3][4]
  210. || XOR A$C[3],A$A[4][4],A$A[4][4]
  211. || XOR B$C[3],B$A[4][4],B$A[4][4]
  212. XOR A$C[1],A$C[4],A$C[4] ; C[4] ^= ROL64(C[1],1)
  213. || XOR B$C[1],B$C[4],B$C[4]
  214. || MV A$A[0][1],A$C[1] ; Rho+Pi, "early start"
  215. || MV B$A[0][1],B$C[1]
  216. ___
  # Rho+Pi, interleaved with the last Theta XORs (column 0) and the first
  # Chi instructions.  Each ROL64 call rotates the lane named by its first
  # argument by the matching @rhotates entry into the lane named by its
  # third argument.  A call that passes "||" attaches its first ROTL to the
  # instructions emitted just before it; a call without a prefix starts a
  # new group.  The order of these statements is the instruction schedule,
  # so they must not be reordered.
  217. &ROL64 ($A[1][1],$rhotates[1][1],$A[0][1],"||");
  # Theta for column 0 (rows 0 and 1), plus saving A[0][3] in C[3] before it
  # is overwritten by the next rotation.
  218. $code.=<<___;
  219. XOR A$C[4],A$A[0][0],A$A[0][0]
  220. || XOR B$C[4],B$A[0][0],B$A[0][0]
  221. || XOR A$C[4],A$A[1][0],A$A[1][0]
  222. || XOR B$C[4],B$A[1][0],B$A[1][0]
  223. || MV A$A[0][3],A$C[3]
  224. || MV B$A[0][3],B$C[3]
  225. ___
  226. &ROL64 ($A[3][3],$rhotates[3][3],$A[0][3],"||");
  # Theta for column 0 (rows 2 and 3), plus saving A[0][2] in C[2].
  227. $code.=<<___;
  228. XOR A$C[4],A$A[2][0],A$A[2][0]
  229. || XOR B$C[4],B$A[2][0],B$A[2][0]
  230. || XOR A$C[4],A$A[3][0],A$A[3][0]
  231. || XOR B$C[4],B$A[3][0],B$A[3][0]
  232. || MV A$A[0][2],A$C[2]
  233. || MV B$A[0][2],B$C[2]
  234. ___
  235. &ROL64 ($A[2][2],$rhotates[2][2],$A[0][2],"||");
  # Theta for column 0 (row 4), plus saving A[0][4] in C[4].
  236. $code.=<<___;
  237. XOR A$C[4],A$A[4][0],A$A[4][0]
  238. || XOR B$C[4],B$A[4][0],B$A[4][0]
  239. || MV A$A[0][4],A$C[4]
  240. || MV B$A[0][4],B$C[4]
  241. ___
  242. &ROL64 ($A[4][4],$rhotates[4][4],$A[0][4],"||");
  243. &ROL64 ($A[1][4],$rhotates[1][4],$A[1][1]);
  # Load the first word of this round's iotas entry into A<C[0]> and advance
  # the table pointer by two words.
  244. $code.=<<___;
  245. || LDW *${iotas}++[2],A$C[0]
  246. ___
  247. &ROL64 ($A[2][3],$rhotates[2][3],$A[2][2]);
  # Load the second word of the same iotas entry (now at index -1) into
  # B<C[0]>.
  248. $code.=<<___;
  249. || LDW *${iotas}[-1],B$C[0]
  250. ___
  251. &ROL64 ($A[3][2],$rhotates[3][2],$A[3][3]);
  252. &ROL64 ($A[4][1],$rhotates[4][1],$A[4][4]);
  253. &ROL64 ($A[4][2],$rhotates[4][2],$A[1][4]);
  254. &ROL64 ($A[3][4],$rhotates[3][4],$A[2][3]);
  255. &ROL64 ($A[2][1],$rhotates[2][1],$A[3][2]);
  256. &ROL64 ($A[1][3],$rhotates[1][3],$A[4][1]);
  257. &ROL64 ($A[2][4],$rhotates[2][4],$A[4][2]);
  258. &ROL64 ($A[4][3],$rhotates[4][3],$A[3][4]);
  259. &ROL64 ($A[1][2],$rhotates[1][2],$A[2][1]);
  260. &ROL64 ($A[3][1],$rhotates[3][1],$A[1][3]);
  261. &ROL64 ($A[4][0],$rhotates[4][0],$A[2][4]);
  262. &ROL64 ($A[3][0],$rhotates[3][0],$A[4][3]);
  263. &ROL64 ($A[2][0],$rhotates[2][0],$A[1][2]);
  264. &ROL64 ($A[1][0],$rhotates[1][0],$A[3][1]);
  # The lanes saved in C[1..4] earlier are rotated last.  The C[3] rotation
  # is issued after the first Chi instructions (see the call further down).
  265. #&ROL64 ($C[3], $rhotates[0][3],$A[1][0]); # moved below
  266. &ROL64 ($C[1], $rhotates[0][1],$A[2][0]);
  267. &ROL64 ($C[4], $rhotates[0][4],$A[3][0]);
  268. &ROL64 ($C[2], $rhotates[0][2],$A[4][0]);
  # First Chi instructions for row 0, attached to the rotation just emitted.
  269. $code.=<<___;
  270. || ANDN A$A[0][2],A$A[0][1],A$C[4] ; Chi+Iota
  271. || ANDN B$A[0][2],B$A[0][1],B$C[4]
  272. || ANDN A$A[0][3],A$A[0][2],A$C[1]
  273. || ANDN B$A[0][3],B$A[0][2],B$C[1]
  274. || ANDN A$A[0][4],A$A[0][3],A$C[2]
  275. || ANDN B$A[0][4],B$A[0][3],B$C[2]
  276. ___
  # The deferred rotation of the lane saved in C[3]; the heredoc that follows
  # attaches further Chi instructions to it.
  277. &ROL64 ($C[3], $rhotates[0][3],$A[1][0]);
  # Rest of the round and three function bodies.  This heredoc emits:
  #   - the remainder of Chi for rows 0..4, with the iotas words loaded
  #     earlier XOR-ed into A[0][0] (Iota);
  #   - the loop test: A0 is set by "EXTU $iotas,24,24" and the branch back
  #     to loop? is predicated on A0.  The iotas table is emitted with
  #     .align 256, 16 padding words and 24 two-word entries (64+192 = 256
  #     bytes), so presumably the extracted field is the low byte of the
  #     table pointer and becomes zero once all 24 entries are consumed.
  #     NOTE(review): confirm against the EXTU definition.
  #   - on exit, the reload of A3:A2 and RA:B2 from the frame and the return
  #     through "BNOP RA,5", which ends _KeccakF1600_int;
  #   - _KeccakF1600: saves the callee-saved registers and RA in an 80-byte
  #     frame, loads the 25 lanes through A2 (= A4) and B2 (= A4+4) so that
  #     alternate 32-bit words go to the A and B files, branches to
  #     _KeccakF1600_int with RA pointing at ret?, stores the lanes back the
  #     same way, restores registers and returns with A4 unchanged;
  #   - the start of _SHA3_absorb: the same prologue, A4 saved at *SP[1],
  #     arguments moved (B4 -> INP, A6 -> LEN, B6 -> BSZ), the state loaded
  #     through A4/B4, RA:BSZ stored at *SP[4], and the head of the block
  #     loop: if LEN < BSZ branch to ret?, otherwise start loading input with
  #     LDNDW.
  # The per-lane absorb code is appended by the Perl loop that follows.
  278. $code.=<<___;
  279. || ANDN A$A[0][0],A$A[0][4],A$C[3]
  280. || ANDN B$A[0][0],B$A[0][4],B$C[3]
  281. || XOR A$C[4],A$A[0][0],A$A[0][0]
  282. || XOR B$C[4],B$A[0][0],B$A[0][0]
  283. || ANDN A$A[0][1],A$A[0][0],A$C[4]
  284. || ANDN B$A[0][1],B$A[0][0],B$C[4]
  285. XOR A$C[1],A$A[0][1],A$A[0][1]
  286. || XOR B$C[1],B$A[0][1],B$A[0][1]
  287. || XOR A$C[2],A$A[0][2],A$A[0][2]
  288. || XOR B$C[2],B$A[0][2],B$A[0][2]
  289. || XOR A$C[3],A$A[0][3],A$A[0][3]
  290. || XOR B$C[3],B$A[0][3],B$A[0][3]
  291. XOR A$C[4],A$A[0][4],A$A[0][4]
  292. || XOR B$C[4],B$A[0][4],B$A[0][4]
  293. || XOR A$C[0],A$A[0][0],A$A[0][0] ; A[0][0] ^= iotas[i++];
  294. || XOR B$C[0],B$A[0][0],B$A[0][0]
  295. || EXTU $iotas,24,24,A0 ; A0 is A$C[0], as we done?
  296. ANDN A$A[1][2],A$A[1][1],A$C[4]
  297. || ANDN B$A[1][2],B$A[1][1],B$C[4]
  298. || ANDN A$A[1][3],A$A[1][2],A$C[1]
  299. || ANDN B$A[1][3],B$A[1][2],B$C[1]
  300. || ANDN A$A[1][4],A$A[1][3],A$C[2]
  301. || ANDN B$A[1][4],B$A[1][3],B$C[2]
  302. ANDN A$A[1][0],A$A[1][4],A$C[3]
  303. || ANDN B$A[1][0],B$A[1][4],B$C[3]
  304. || XOR A$C[4],A$A[1][0],A$A[1][0]
  305. || XOR B$C[4],B$A[1][0],B$A[1][0]
  306. || ANDN A$A[1][1],A$A[1][0],A$C[4]
  307. || ANDN B$A[1][1],B$A[1][0],B$C[4]
  308. XOR A$C[1],A$A[1][1],A$A[1][1]
  309. || XOR B$C[1],B$A[1][1],B$A[1][1]
  310. || XOR A$C[2],A$A[1][2],A$A[1][2]
  311. || XOR B$C[2],B$A[1][2],B$A[1][2]
  312. || XOR A$C[3],A$A[1][3],A$A[1][3]
  313. || XOR B$C[3],B$A[1][3],B$A[1][3]
  314. XOR A$C[4],A$A[1][4],A$A[1][4]
  315. || XOR B$C[4],B$A[1][4],B$A[1][4]
  316. || ANDN A$A[2][2],A$A[2][1],A$C[4]
  317. || ANDN B$A[2][2],B$A[2][1],B$C[4]
  318. || ANDN A$A[2][3],A$A[2][2],A$C[1]
  319. || ANDN B$A[2][3],B$A[2][2],B$C[1]
  320. ANDN A$A[2][4],A$A[2][3],A$C[2]
  321. || ANDN B$A[2][4],B$A[2][3],B$C[2]
  322. || ANDN A$A[2][0],A$A[2][4],A$C[3]
  323. || ANDN B$A[2][0],B$A[2][4],B$C[3]
  324. || XOR A$C[4],A$A[2][0],A$A[2][0]
  325. || XOR B$C[4],B$A[2][0],B$A[2][0]
  326. ANDN A$A[2][1],A$A[2][0],A$C[4]
  327. || ANDN B$A[2][1],B$A[2][0],B$C[4]
  328. || XOR A$C[1],A$A[2][1],A$A[2][1]
  329. || XOR B$C[1],B$A[2][1],B$A[2][1]
  330. || XOR A$C[2],A$A[2][2],A$A[2][2]
  331. || XOR B$C[2],B$A[2][2],B$A[2][2]
  332. XOR A$C[3],A$A[2][3],A$A[2][3]
  333. || XOR B$C[3],B$A[2][3],B$A[2][3]
  334. || XOR A$C[4],A$A[2][4],A$A[2][4]
  335. || XOR B$C[4],B$A[2][4],B$A[2][4]
  336. ANDN A$A[3][2],A$A[3][1],A$C[4]
  337. || ANDN B$A[3][2],B$A[3][1],B$C[4]
  338. || ANDN A$A[3][3],A$A[3][2],A$C[1]
  339. || ANDN B$A[3][3],B$A[3][2],B$C[1]
  340. || ANDN A$A[3][4],A$A[3][3],A$C[2]
  341. || ANDN B$A[3][4],B$A[3][3],B$C[2]
  342. ANDN A$A[3][0],A$A[3][4],A$C[3]
  343. || ANDN B$A[3][0],B$A[3][4],B$C[3]
  344. || XOR A$C[4],A$A[3][0],A$A[3][0]
  345. || XOR B$C[4],B$A[3][0],B$A[3][0]
  346. || ANDN A$A[3][1],A$A[3][0],A$C[4]
  347. || ANDN B$A[3][1],B$A[3][0],B$C[4]
  348. XOR A$C[1],A$A[3][1],A$A[3][1]
  349. || XOR B$C[1],B$A[3][1],B$A[3][1]
  350. || XOR A$C[2],A$A[3][2],A$A[3][2]
  351. || XOR B$C[2],B$A[3][2],B$A[3][2]
  352. || XOR A$C[3],A$A[3][3],A$A[3][3]
  353. ||[A0] BNOP loop?
  354. XOR B$C[3],B$A[3][3],B$A[3][3]
  355. || XOR A$C[4],A$A[3][4],A$A[3][4]
  356. || XOR B$C[4],B$A[3][4],B$A[3][4]
  357. ||[!A0] LDDW *FP[-7],A3:A2
  358. ||[!A0] LDDW *SP[4], RA:B2
  359. ANDN A$A[4][2],A$A[4][1],A$C[4]
  360. || ANDN B$A[4][2],B$A[4][1],B$C[4]
  361. || ANDN A$A[4][3],A$A[4][2],A$C[1]
  362. || ANDN B$A[4][3],B$A[4][2],B$C[1]
  363. || ANDN A$A[4][4],A$A[4][3],A$C[2]
  364. || ANDN B$A[4][4],B$A[4][3],B$C[2]
  365. ANDN A$A[4][0],A$A[4][4],A$C[3]
  366. || ANDN B$A[4][0],B$A[4][4],B$C[3]
  367. || XOR A$C[4],A$A[4][0],A$A[4][0]
  368. || XOR B$C[4],B$A[4][0],B$A[4][0]
  369. || ANDN A$A[4][1],A$A[4][0],A$C[4]
  370. || ANDN B$A[4][1],B$A[4][0],B$C[4]
  371. XOR A$C[1],A$A[4][1],A$A[4][1]
  372. || XOR B$C[1],B$A[4][1],B$A[4][1]
  373. || XOR A$C[2],A$A[4][2],A$A[4][2]
  374. || XOR B$C[2],B$A[4][2],B$A[4][2]
  375. || XOR A$C[3],A$A[4][3],A$A[4][3]
  376. || XOR B$C[3],B$A[4][3],B$A[4][3]
  377. XOR A$C[4],A$A[4][4],A$A[4][4]
  378. || XOR B$C[4],B$A[4][4],B$A[4][4]
  379. ;;===== branch to loop? is taken here
  380. BNOP RA,5
  381. .endasmfunc
  382. .newblock
  383. .global _KeccakF1600
  384. .align 32
  385. _KeccakF1600:
  386. .asmfunc stack_usage(80)
  387. STW FP,*SP--(80) ; save frame pointer
  388. || MV SP,FP
  389. STDW B13:B12,*SP[9]
  390. || STDW A13:A12,*FP[-4]
  391. STDW B11:B10,*SP[8]
  392. || STDW A11:A10,*FP[-5]
  393. STW RA, *SP[15]
  394. || STW A14,*FP[-6]
  395. || MV A4,A2
  396. || ADD 4,A4,B2
  397. LDW *A2++[2],A$A[0][0] ; load A[5][5]
  398. || LDW *B2++[2],B$A[0][0]
  399. LDW *A2++[2],A$A[0][1]
  400. || LDW *B2++[2],B$A[0][1]
  401. LDW *A2++[2],A$A[0][2]
  402. || LDW *B2++[2],B$A[0][2]
  403. LDW *A2++[2],A$A[0][3]
  404. || LDW *B2++[2],B$A[0][3]
  405. LDW *A2++[2],A$A[0][4]
  406. || LDW *B2++[2],B$A[0][4]
  407. LDW *A2++[2],A$A[1][0]
  408. || LDW *B2++[2],B$A[1][0]
  409. LDW *A2++[2],A$A[1][1]
  410. || LDW *B2++[2],B$A[1][1]
  411. LDW *A2++[2],A$A[1][2]
  412. || LDW *B2++[2],B$A[1][2]
  413. LDW *A2++[2],A$A[1][3]
  414. || LDW *B2++[2],B$A[1][3]
  415. LDW *A2++[2],A$A[1][4]
  416. || LDW *B2++[2],B$A[1][4]
  417. LDW *A2++[2],A$A[2][0]
  418. || LDW *B2++[2],B$A[2][0]
  419. LDW *A2++[2],A$A[2][1]
  420. || LDW *B2++[2],B$A[2][1]
  421. LDW *A2++[2],A$A[2][2]
  422. || LDW *B2++[2],B$A[2][2]
  423. LDW *A2++[2],A$A[2][3]
  424. || LDW *B2++[2],B$A[2][3]
  425. LDW *A2++[2],A$A[2][4]
  426. || LDW *B2++[2],B$A[2][4]
  427. LDW *A2++[2],A$A[3][0]
  428. || LDW *B2++[2],B$A[3][0]
  429. LDW *A2++[2],A$A[3][1]
  430. || LDW *B2++[2],B$A[3][1]
  431. LDW *A2++[2],A$A[3][2]
  432. || LDW *B2++[2],B$A[3][2]
  433. LDW *A2++[2],A$A[3][3]
  434. || LDW *B2++[2],B$A[3][3]
  435. LDW *A2++[2],A$A[3][4]
  436. || LDW *B2++[2],B$A[3][4]
  437. || BNOP _KeccakF1600_int
  438. ADDKPC ret?,RA
  439. || LDW *A2++[2],A$A[4][0]
  440. || LDW *B2++[2],B$A[4][0]
  441. LDW *A2++[2],A$A[4][1]
  442. || LDW *B2++[2],B$A[4][1]
  443. LDW *A2++[2],A$A[4][2]
  444. || LDW *B2++[2],B$A[4][2]
  445. LDW *A2++[2],A$A[4][3]
  446. || LDW *B2++[2],B$A[4][3]
  447. LDW *A2,A$A[4][4]
  448. || LDW *B2,B$A[4][4]
  449. || ADDK -192,A2 ; rewind
  450. || ADDK -192,B2
  451. .align 16
  452. ret?:
  453. STW A$A[0][0],*A2++[2] ; store A[5][5]
  454. || STW B$A[0][0],*B2++[2]
  455. STW A$A[0][1],*A2++[2]
  456. || STW B$A[0][1],*B2++[2]
  457. STW A$A[0][2],*A2++[2]
  458. || STW B$A[0][2],*B2++[2]
  459. STW A$A[0][3],*A2++[2]
  460. || STW B$A[0][3],*B2++[2]
  461. STW A$A[0][4],*A2++[2]
  462. || STW B$A[0][4],*B2++[2]
  463. STW A$A[1][0],*A2++[2]
  464. || STW B$A[1][0],*B2++[2]
  465. STW A$A[1][1],*A2++[2]
  466. || STW B$A[1][1],*B2++[2]
  467. STW A$A[1][2],*A2++[2]
  468. || STW B$A[1][2],*B2++[2]
  469. STW A$A[1][3],*A2++[2]
  470. || STW B$A[1][3],*B2++[2]
  471. STW A$A[1][4],*A2++[2]
  472. || STW B$A[1][4],*B2++[2]
  473. STW A$A[2][0],*A2++[2]
  474. || STW B$A[2][0],*B2++[2]
  475. STW A$A[2][1],*A2++[2]
  476. || STW B$A[2][1],*B2++[2]
  477. STW A$A[2][2],*A2++[2]
  478. || STW B$A[2][2],*B2++[2]
  479. STW A$A[2][3],*A2++[2]
  480. || STW B$A[2][3],*B2++[2]
  481. STW A$A[2][4],*A2++[2]
  482. || STW B$A[2][4],*B2++[2]
  483. STW A$A[3][0],*A2++[2]
  484. || STW B$A[3][0],*B2++[2]
  485. STW A$A[3][1],*A2++[2]
  486. || STW B$A[3][1],*B2++[2]
  487. STW A$A[3][2],*A2++[2]
  488. || STW B$A[3][2],*B2++[2]
  489. STW A$A[3][3],*A2++[2]
  490. || STW B$A[3][3],*B2++[2]
  491. STW A$A[3][4],*A2++[2]
  492. || STW B$A[3][4],*B2++[2]
  493. LDW *SP[15],RA
  494. || LDW *FP[-6],A14
  495. STW A$A[4][0],*A2++[2]
  496. || STW B$A[4][0],*B2++[2]
  497. STW A$A[4][1],*A2++[2]
  498. || STW B$A[4][1],*B2++[2]
  499. STW A$A[4][2],*A2++[2]
  500. || STW B$A[4][2],*B2++[2]
  501. STW A$A[4][3],*A2++[2]
  502. || STW B$A[4][3],*B2++[2]
  503. STW A$A[4][4],*A2
  504. || STW B$A[4][4],*B2
  505. || ADDK -192,A2 ; rewind
  506. MV A2,A4 ; return original A4
  507. || LDDW *SP[8], B11:B10
  508. || LDDW *FP[-5],A11:A10
  509. LDDW *SP[9], B13:B12
  510. || LDDW *FP[-4],A13:A12
  511. || BNOP RA
  512. LDW *++SP(80),FP ; restore frame pointer
  513. NOP 4 ; wait till FP is committed
  514. .endasmfunc
  515. .newblock
  516. .asg B2,BSZ
  517. .asg A2,INP
  518. .asg A3,LEN
  519. .global _SHA3_absorb
  520. .align 32
  521. _SHA3_absorb:
  522. .asmfunc stack_usage(80)
  523. STW FP,*SP--(80) ; save frame pointer
  524. || MV SP,FP
  525. STDW B13:B12,*SP[9]
  526. || STDW A13:A12,*FP[-4]
  527. STDW B11:B10,*SP[8]
  528. || STDW A11:A10,*FP[-5]
  529. STW RA, *SP[15]
  530. || STW A14,*FP[-6]
  531. STW A4,*SP[1] ; save A[][]
  532. || MV B4,INP ; reassign arguments
  533. || MV A6,LEN
  534. || MV B6,BSZ
  535. || ADD 4,A4,B4
  536. LDW *A4++[2],A$A[0][0] ; load A[5][5]
  537. || LDW *B4++[2],B$A[0][0]
  538. LDW *A4++[2],A$A[0][1]
  539. || LDW *B4++[2],B$A[0][1]
  540. LDW *A4++[2],A$A[0][2]
  541. || LDW *B4++[2],B$A[0][2]
  542. LDW *A4++[2],A$A[0][3]
  543. || LDW *B4++[2],B$A[0][3]
  544. LDW *A4++[2],A$A[0][4]
  545. || LDW *B4++[2],B$A[0][4]
  546. LDW *A4++[2],A$A[1][0]
  547. || LDW *B4++[2],B$A[1][0]
  548. LDW *A4++[2],A$A[1][1]
  549. || LDW *B4++[2],B$A[1][1]
  550. LDW *A4++[2],A$A[1][2]
  551. || LDW *B4++[2],B$A[1][2]
  552. LDW *A4++[2],A$A[1][3]
  553. || LDW *B4++[2],B$A[1][3]
  554. LDW *A4++[2],A$A[1][4]
  555. || LDW *B4++[2],B$A[1][4]
  556. LDW *A4++[2],A$A[2][0]
  557. || LDW *B4++[2],B$A[2][0]
  558. LDW *A4++[2],A$A[2][1]
  559. || LDW *B4++[2],B$A[2][1]
  560. LDW *A4++[2],A$A[2][2]
  561. || LDW *B4++[2],B$A[2][2]
  562. LDW *A4++[2],A$A[2][3]
  563. || LDW *B4++[2],B$A[2][3]
  564. LDW *A4++[2],A$A[2][4]
  565. || LDW *B4++[2],B$A[2][4]
  566. LDW *A4++[2],A$A[3][0]
  567. || LDW *B4++[2],B$A[3][0]
  568. LDW *A4++[2],A$A[3][1]
  569. || LDW *B4++[2],B$A[3][1]
  570. LDW *A4++[2],A$A[3][2]
  571. || LDW *B4++[2],B$A[3][2]
  572. LDW *A4++[2],A$A[3][3]
  573. || LDW *B4++[2],B$A[3][3]
  574. LDW *A4++[2],A$A[3][4]
  575. || LDW *B4++[2],B$A[3][4]
  576. LDW *A4++[2],A$A[4][0]
  577. || LDW *B4++[2],B$A[4][0]
  578. LDW *A4++[2],A$A[4][1]
  579. || LDW *B4++[2],B$A[4][1]
  580. LDW *A4++[2],A$A[4][2]
  581. || LDW *B4++[2],B$A[4][2]
  582. LDW *A4++[2],A$A[4][3]
  583. || LDW *B4++[2],B$A[4][3]
  584. LDW *A4,A$A[4][4]
  585. || LDW *B4,B$A[4][4]
  586. || ADDKPC loop?,RA
  587. STDW RA:BSZ,*SP[4]
  588. loop?:
  589. CMPLTU LEN,BSZ,A0 ; len < bsz?
  590. || SHRU BSZ,3,BSZ
  591. [A0] BNOP ret?
  592. ||[A0] ZERO BSZ
  593. ||[A0] LDW *SP[1],A2 ; pull A[][]
  594. [BSZ] LDNDW *INP++,A1:A0
  595. ||[BSZ] SUB LEN,8,LEN
  596. ||[BSZ] SUB BSZ,1,BSZ
  597. NOP 4
  598. ___
  # Unrolled absorb step of _SHA3_absorb: one copy of the heredoc below is
  # appended for every lane A[y][x] except the last one, A[4][4] (x stops at
  # 3 when y is 4); that lane is handled by the heredoc after this loop.
  # $y and $x are package globals here (the file does not declare them).
  599. for ($y = 0; $y < 5; $y++) {
  600. for ($x = 0; $x < ($y<4 ? 5 : 4); $x++) {
  # Per-lane code.  The 64-bit input word in A1:A0 is byte-swapped on
  # big-endian targets, passed through DEAL/PACK2/PACKH2 (presumably the
  # conversion to the bit-interleaved form mentioned in the file header)
  # and XOR-ed into the lane's A and B registers.  While BSZ is non-zero the
  # next 8 input bytes are loaded and LEN/BSZ are decremented; once BSZ is
  # zero, LEN:INP is saved at *SP[3] and control branches to
  # _KeccakF1600_cheat.
  601. $code.=<<___;
  602. .if .BIG_ENDIAN
  603. SWAP2 A0,A1
  604. || SWAP2 A1,A0
  605. SWAP4 A0,A0
  606. SWAP4 A1,A1
  607. ||[!BSZ]BNOP _KeccakF1600_cheat
  608. ||[!BSZ]STDW LEN:INP,*SP[3]
  609. || DEAL A0,A0
  610. .else
  611. [!BSZ]BNOP _KeccakF1600_cheat
  612. ||[!BSZ]STDW LEN:INP,*SP[3]
  613. || DEAL A0,A0
  614. .endif
  615. [BSZ] LDNDW *INP++,A1:A0
  616. || DEAL A1,A1
  617. [BSZ] SUB LEN,8,LEN
  618. ||[BSZ] SUB BSZ,1,BSZ
  619. PACK2 A1,A0,A0
  620. || PACKH2 A1,A0,A1
  621. XOR A0,A$A[$y][$x],A$A[$y][$x]
  622. XOR A1,B$A[$y][$x],B$A[$y][$x]
  623. ___
  624. }
  625. }
  # Tail of the generated file.  This heredoc emits:
  #   - the absorb step for the last lane, A[4][4], with an unconditional
  #     branch to _KeccakF1600_cheat and LEN:INP saved at *SP[3];
  #   - ret? of _SHA3_absorb: LEN is returned in A4, the state is stored
  #     back through A2 (the pointer reloaded from *SP[1] on the way here)
  #     and B2 = A2+4, callee-saved registers are restored and the function
  #     returns through RA;
  #   - _SHA3_squeeze, using a 24-byte frame with OUT=A12, LEN=A13, BSZ=A14:
  #     it copies state words to OUT 8 bytes at a time (SHFL/PACK2/PACKH2,
  #     with byte swaps on big-endian targets), branches to _KeccakF1600
  #     with RA set to loop? when a block is used up, and finishes a
  #     trailing partial word byte by byte at tail?;
  #   - the constant section: 16 zero words of padding after .align 256,
  #     then the iotas table of 24 two-word entries, then an identification
  #     string.
  626. $code.=<<___;
  627. .if .BIG_ENDIAN
  628. SWAP2 A0,A1
  629. || SWAP2 A1,A0
  630. SWAP4 A0,A0
  631. SWAP4 A1,A1
  632. .endif
  633. BNOP _KeccakF1600_cheat
  634. || STDW LEN:INP,*SP[3]
  635. || DEAL A0,A0
  636. DEAL A1,A1
  637. NOP
  638. PACK2 A1,A0,A0
  639. || PACKH2 A1,A0,A1
  640. XOR A0,A$A[4][4],A$A[4][4]
  641. XOR A1,B$A[4][4],B$A[4][4]
  642. .align 16
  643. ret?:
  644. MV LEN,A4 ; return value
  645. || ADD 4,A2,B2
  646. STW A$A[0][0],*A2++[2] ; store A[5][5]
  647. || STW B$A[0][0],*B2++[2]
  648. STW A$A[0][1],*A2++[2]
  649. || STW B$A[0][1],*B2++[2]
  650. STW A$A[0][2],*A2++[2]
  651. || STW B$A[0][2],*B2++[2]
  652. STW A$A[0][3],*A2++[2]
  653. || STW B$A[0][3],*B2++[2]
  654. STW A$A[0][4],*A2++[2]
  655. || STW B$A[0][4],*B2++[2]
  656. STW A$A[1][0],*A2++[2]
  657. || STW B$A[1][0],*B2++[2]
  658. STW A$A[1][1],*A2++[2]
  659. || STW B$A[1][1],*B2++[2]
  660. STW A$A[1][2],*A2++[2]
  661. || STW B$A[1][2],*B2++[2]
  662. STW A$A[1][3],*A2++[2]
  663. || STW B$A[1][3],*B2++[2]
  664. STW A$A[1][4],*A2++[2]
  665. || STW B$A[1][4],*B2++[2]
  666. STW A$A[2][0],*A2++[2]
  667. || STW B$A[2][0],*B2++[2]
  668. STW A$A[2][1],*A2++[2]
  669. || STW B$A[2][1],*B2++[2]
  670. STW A$A[2][2],*A2++[2]
  671. || STW B$A[2][2],*B2++[2]
  672. STW A$A[2][3],*A2++[2]
  673. || STW B$A[2][3],*B2++[2]
  674. STW A$A[2][4],*A2++[2]
  675. || STW B$A[2][4],*B2++[2]
  676. LDW *SP[15],RA
  677. || LDW *FP[-6],A14
  678. STW A$A[3][0],*A2++[2]
  679. || STW B$A[3][0],*B2++[2]
  680. STW A$A[3][1],*A2++[2]
  681. || STW B$A[3][1],*B2++[2]
  682. STW A$A[3][2],*A2++[2]
  683. || STW B$A[3][2],*B2++[2]
  684. STW A$A[3][3],*A2++[2]
  685. || STW B$A[3][3],*B2++[2]
  686. STW A$A[3][4],*A2++[2]
  687. || STW B$A[3][4],*B2++[2]
  688. LDDW *SP[8], B11:B10
  689. || LDDW *FP[-5],A11:A10
  690. LDDW *SP[9], B13:B12
  691. || LDDW *FP[-4],A13:A12
  692. BNOP RA
  693. || LDW *++SP(80),FP ; restore frame pointer
  694. STW A$A[4][0],*A2++[2]
  695. || STW B$A[4][0],*B2++[2]
  696. STW A$A[4][1],*A2++[2]
  697. || STW B$A[4][1],*B2++[2]
  698. STW A$A[4][2],*A2++[2]
  699. || STW B$A[4][2],*B2++[2]
  700. STW A$A[4][3],*A2++[2]
  701. || STW B$A[4][3],*B2++[2]
  702. STW A$A[4][4],*A2++[2]
  703. || STW B$A[4][4],*B2++[2]
  704. .endasmfunc
  705. .newblock
  706. .global _SHA3_squeeze
  707. .asg A12,OUT
  708. .asg A13,LEN
  709. .asg A14,BSZ
  710. .align 32
  711. _SHA3_squeeze:
  712. .asmfunc stack_usage(24)
  713. STW FP,*SP--(24) ; save frame pointer
  714. || MV SP,FP
  715. STW RA, *SP[5]
  716. || STW A14,*FP[-2]
  717. STDW A13:A12,*FP[-2]
  718. || MV B4,OUT ; reassign arguments
  719. MV A6,LEN
  720. || MV B6,BSZ
  721. loop?:
  722. LDW *SP[5],RA ; reload RA
  723. || SHRU BSZ,3,A1
  724. || MV A4,A8
  725. || ADD 4,A4,B8
  726. block?:
  727. CMPLTU LEN,8,A0 ; len < 8?
  728. [A0] BNOP tail?
  729. LDW *A8++[2],A9
  730. || LDW *B8++[2],B9
  731. || SUB LEN,8,LEN ; len -= 8
  732. MV LEN,A0
  733. || SUB A1,1,A1 ; bsz--
  734. || NOP 4
  735. .if .BIG_ENDIAN
  736. SWAP4 A9,A9
  737. || SWAP4 B9,B9
  738. SWAP2 A9,A9
  739. || SWAP2 B9,B9
  740. .endif
  741. [!A0] BNOP ret?
  742. ||[!A0] ZERO A1
  743. PACK2 B9,A9,B7
  744. ||[A1] BNOP block?
  745. PACKH2 B9,A9,B9
  746. || SHFL B7,B7
  747. SHFL B9,B9
  748. STNW B7,*OUT++
  749. STNW B9,*OUT++
  750. NOP
  751. BNOP _KeccakF1600,4
  752. ADDKPC loop?,RA
  753. .align 16
  754. tail?:
  755. .if .BIG_ENDIAN
  756. SWAP4 A9,A9
  757. || SWAP4 B9,B9
  758. SWAP2 A9,A9
  759. || SWAP2 B9,B9
  760. .endif
  761. PACK2 B9,A9,B7
  762. PACKH2 B9,A9,B9
  763. || SHFL B7,B7
  764. SHFL B9,B9
  765. STB B7,*OUT++
  766. || SHRU B7,8,B7
  767. || ADD LEN,7,A0
  768. [A0] STB B7,*OUT++
  769. ||[A0] SHRU B7,8,B7
  770. ||[A0] SUB A0,1,A0
  771. [A0] STB B7,*OUT++
  772. ||[A0] SHRU B7,8,B7
  773. ||[A0] SUB A0,1,A0
  774. [A0] STB B7,*OUT++
  775. ||[A0] SUB A0,1,A0
  776. [A0] STB B9,*OUT++
  777. ||[A0] SHRU B9,8,B9
  778. ||[A0] SUB A0,1,A0
  779. [A0] STB B9,*OUT++
  780. ||[A0] SHRU B9,8,B9
  781. ||[A0] SUB A0,1,A0
  782. [A0] STB B9,*OUT++
  783. ret?:
  784. LDDW *FP[-2],A13:A12
  785. BNOP RA
  786. || LDW *FP[-2],A14
  787. LDW *++SP(24),FP ; restore frame pointer
  788. NOP 4 ; wait till FP is committed
  789. .endasmfunc
  790. .if __TI_EABI__
  791. .sect ".text:sha_asm.const"
  792. .else
  793. .sect ".const:sha_asm"
  794. .endif
  795. .align 256
  796. .uword 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
  797. iotas:
  798. .uword 0x00000001, 0x00000000
  799. .uword 0x00000000, 0x00000089
  800. .uword 0x00000000, 0x8000008b
  801. .uword 0x00000000, 0x80008080
  802. .uword 0x00000001, 0x0000008b
  803. .uword 0x00000001, 0x00008000
  804. .uword 0x00000001, 0x80008088
  805. .uword 0x00000001, 0x80000082
  806. .uword 0x00000000, 0x0000000b
  807. .uword 0x00000000, 0x0000000a
  808. .uword 0x00000001, 0x00008082
  809. .uword 0x00000000, 0x00008003
  810. .uword 0x00000001, 0x0000808b
  811. .uword 0x00000001, 0x8000000b
  812. .uword 0x00000001, 0x8000008a
  813. .uword 0x00000001, 0x80000081
  814. .uword 0x00000000, 0x80000081
  815. .uword 0x00000000, 0x80000008
  816. .uword 0x00000000, 0x00000083
  817. .uword 0x00000000, 0x80008003
  818. .uword 0x00000001, 0x80008088
  819. .uword 0x00000000, 0x80000088
  820. .uword 0x00000001, 0x00008000
  821. .uword 0x00000000, 0x80008082
  822. .cstring "Keccak-1600 absorb and squeeze for C64x, CRYPTOGAMS by <appro\@openssl.org>"
  823. .align 4
  824. ___
  # Write out the generated assembly.  The last command-line argument, when
  # present and true, names the output file and STDOUT is redirected to it;
  # otherwise the text goes to whatever STDOUT already is.
  $output = pop @ARGV;
  if ($output) {
      # Three-argument open keeps the file name out of the mode string, and
      # a failed redirect now aborts instead of silently writing the
      # assembly to the original STDOUT.
      open STDOUT, '>', $output
          or die "can't open $output for writing: $!";
  }
  print $code;
  # Buffered write errors only surface when the handle is closed.
  close STDOUT or die "error closing STDOUT: $!";