keccak1600-armv4.pl 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606
  1. #!/usr/bin/env perl
  2. # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # Keccak-1600 for ARMv4.
  17. #
  18. # June 2017.
  19. #
  20. # Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit
  21. # interleaving. How does it compare to Keccak Code Package? It's as
  22. # fast, but several times smaller, and is endian- and ISA-neutral. ISA
  23. # neutrality means that minimum ISA requirement is ARMv4, yet it can
  24. # be assembled even as Thumb-2. NEON code path is KECCAK_1X_ALT with
  25. # register layout taken from Keccak Code Package. It's also as fast,
  26. # in fact faster by 10-15% on some processors, and endian-neutral.
  27. #
  28. # August 2017.
  29. #
  30. # Switch to KECCAK_2X variant for non-NEON code and merge almost 1/2
  31. # of rotate instructions with logical ones. This resulted in ~10%
  32. # improvement on most processors. Switch to KECCAK_2X effectively
  33. # minimizes re-loads from temporary storage, and merged rotates just
  34. # eliminate corresponding instructions. As for the latter: when examining
  35. # the code you'll notice commented-out ror instructions. These are the
  36. # eliminated ones, and you should trace the destination register below to
  37. # see what's going on. Why aren't all rotates eliminated? The trouble
  38. # is that some operations require both inputs to be rotated,
  39. # e.g. 'eor a,b>>>x,c>>>y'. This conundrum is resolved by using
  40. # 'eor a,b,c>>>(x-y)' and then merge-rotating 'a' in next operation
  41. # that takes 'a' as input. And thing is that this next operation can
  42. # be in next round. It's totally possible to "carry" rotate "factors"
  43. # to the next round, but it makes code more complex. And the last word
  44. # is the keyword, i.e. "almost 1/2" is kind of complexity cap [for the
  45. # time being]...
  46. #
  47. # Reduce per-round instruction count in Thumb-2 case by 16%. This is
  48. # achieved by folding ldr/str pairs to their double-word counterparts.
  49. # Theoretically this should have improved performance on single-issue
  50. # cores, such as Cortex-A5/A7, by 19%. Reality is a bit different, as
  51. # usual...
  52. #
  53. ########################################################################
  54. # Numbers are cycles per processed byte. Non-NEON results account even
  55. # for input bit interleaving.
  56. #
  57. # r=1088(*) Thumb-2(**) NEON
  58. #
  59. # ARM11xx 82/+150%
  60. # Cortex-A5 88/+160%, 86, 36
  61. # Cortex-A7 78/+160%, 68, 34
  62. # Cortex-A8 51/+230%, 57, 30
  63. # Cortex-A9 53/+210%, 51, 26
  64. # Cortex-A15 42/+160%, 38, 18
  65. # Snapdragon S4 43/+210%, 38, 24
  66. #
  67. # (*) Corresponds to SHA3-256. Percentage after slash is improvement
  68. # over compiler-generated KECCAK_2X reference code.
  69. # (**) Thumb-2 results for Cortex-A5/A7 are likely to apply even to
  70. # Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable
  71. # processors are presented mostly for reference purposes.
  # Command-line handling and output redirection.
  #
  # Usage: keccak1600-armv4.pl [flavour] [...] [output-file]
  #
  # The first argument is taken as the output file if it looks like a file
  # name (has an extension); otherwise it is the flavour and the following
  # arguments are scanned until one that looks like a file name is found.
  $flavour = shift;
  if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
  else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

  if ($flavour && $flavour ne "void") {
      # Locate arm-xlate.pl next to this script or in ../../perlasm/.
      # If $0 carries no directory part, fall back to the current directory
      # instead of relying on a stale/undefined $1.
      $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
      ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
      ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
      die "can't locate arm-xlate.pl";

      # Pipe everything printed to STDOUT through the translator.  Paths are
      # quoted so that directories/files containing spaces survive the shell,
      # and a failure to start the translator is fatal rather than silent.
      my $out_arg = defined($output) ? $output : "";
      open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$out_arg\""
          or die "can't call $xlate: $!";
  } elsif (defined($output) && length($output)) {
      # No translation requested: write straight to the output file.
      open STDOUT,">",$output
          or die "can't open $output: $!";
  }
  # With neither a flavour nor an output file, STDOUT is left untouched.
  # Register name tables used when interpolating the assembly templates:
  # @C is r0..r9, @E is r10, r11, r12 and r14 (r13 is not in the list).
  my @C = map { 'r' . $_ } (0 .. 9);
  my @E = map { 'r' . $_ } (10 .. 12, 14);
  86. ########################################################################
  87. # Stack layout
  88. # ----->+-----------------------+
  89. # | uint64_t A[5][5] |
  90. # | ... |
  91. # +200->+-----------------------+
  92. # | uint64_t D[5] |
  93. # | ... |
  94. # +240->+-----------------------+
  95. # | uint64_t T[5][5] |
  96. # | ... |
  97. # +440->+-----------------------+
  98. # | saved lr |
  99. # +444->+-----------------------+
  100. # | loop counter |
  101. # +448->+-----------------------+
  102. # | ...
  # Byte offsets (from sp) of the 64-bit lanes described in the stack layout
  # above: $A[i][j] and $T[i][j] are 5x5 tables of lane offsets, $D[i] is a
  # flat table of five lane offsets.  Lane k lives at offset 8*k.
  my @A = map { my $first = $_; [ map { 8*($first+$_) } (0 .. 4) ] } (0, 5, 10, 15, 20);
  my @D = map { 8*$_ } (25 .. 29);
  my @T = map { my $first = $_; [ map { 8*($first+$_) } (0 .. 4) ] } (30, 35, 40, 45, 50);
  # Emit the file preamble, the iotas32 table and the entry of the internal
  # permutation routine:
  #  - selects Thumb-2 (unified syntax) or ARM (.code 32) encoding;
  #  - iotas32: 24 entries of two 32-bit words each (8 bytes per entry); the
  #    round code below indexes it with the byte counter kept at [sp,#444];
  #  - KeccakF1600_int: points @E[2] at A[0][0] and @E[0] at A[1][0], and
  #    loads A[4][2..4] into @C[4]-@C[9];
  #  - KeccakF1600_enter: alternative entry for callers that have already
  #    set up those registers; saves lr at [sp,#440], zeroes the counter at
  #    [sp,#444] and branches to the .Lround2x loop head.
  106. $code.=<<___;
  107. #include "arm_arch.h"
  108. .text
  109. #if defined(__thumb2__)
  110. .syntax unified
  111. .thumb
  112. #else
  113. .code 32
  114. #endif
  115. .type iotas32, %object
  116. .align 5
  117. iotas32:
  118. .long 0x00000001, 0x00000000
  119. .long 0x00000000, 0x00000089
  120. .long 0x00000000, 0x8000008b
  121. .long 0x00000000, 0x80008080
  122. .long 0x00000001, 0x0000008b
  123. .long 0x00000001, 0x00008000
  124. .long 0x00000001, 0x80008088
  125. .long 0x00000001, 0x80000082
  126. .long 0x00000000, 0x0000000b
  127. .long 0x00000000, 0x0000000a
  128. .long 0x00000001, 0x00008082
  129. .long 0x00000000, 0x00008003
  130. .long 0x00000001, 0x0000808b
  131. .long 0x00000001, 0x8000000b
  132. .long 0x00000001, 0x8000008a
  133. .long 0x00000001, 0x80000081
  134. .long 0x00000000, 0x80000081
  135. .long 0x00000000, 0x80000008
  136. .long 0x00000000, 0x00000083
  137. .long 0x00000000, 0x80008003
  138. .long 0x00000001, 0x80008088
  139. .long 0x00000000, 0x80000088
  140. .long 0x00000001, 0x00008000
  141. .long 0x00000000, 0x80008082
  142. .size iotas32,.-iotas32
  143. .type KeccakF1600_int, %function
  144. .align 5
  145. KeccakF1600_int:
  146. add @C[9],sp,#$A[4][2]
  147. add @E[2],sp,#$A[0][0]
  148. add @E[0],sp,#$A[1][0]
  149. ldmia @C[9],{@C[4]-@C[9]} @ A[4][2..4]
  150. KeccakF1600_enter:
  151. str lr,[sp,#440]
  152. eor @E[1],@E[1],@E[1]
  153. str @E[1],[sp,#444]
  154. b .Lround2x
  155. .align 4
  156. .Lround2x:
  157. ___
  # Round(@in_rows, @out_rows)
  #
  # Appends the assembly for one Keccak round to $code.  The first five
  # arguments are the rows (array refs of stack offsets) of the state that is
  # read; the remaining arguments are the rows of the state that is written.
  # It is invoked as Round(@A,@T) and then Round(@T,@A), so the two stack
  # buffers alternate as source and destination.
  #
  # Register contract visible in the emitted code: on entry @E[2] points at
  # in[0][0], @E[0] points at in[1][0] and @C[4]-@C[9] hold in[4][2..4]; on
  # exit @E[2]/@E[0] point at out[0][0]/out[1][0] and @C[4]-@C[9] hold
  # out[4][2..4], i.e. the state expected by the next round.
  158. sub Round {
  # NOTE: this lexical @A shadows the file-level @A inside the sub; @T and @D
  # referenced below are still the file-level tables.
  159. my (@A,@R); (@A[0..4],@R) = @_;
  # Part 1: XOR the five input rows into @C[0]-@C[9] (column sums), derive
  # D[0..4] and spill them to the D[] slots on the stack, then start on output
  # row 0, which uses lanes A[0][0], A[1][1], A[2][2], A[3][3] and A[4][4].
  # The Thumb-2 path uses ldrd, the ARM path uses ldmia/ldr for the same loads.
  160. $code.=<<___;
  161. ldmia @E[2],{@C[0]-@C[3]} @ A[0][0..1]
  162. ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][0..1]
  163. #ifdef __thumb2__
  164. eor @C[0],@C[0],@E[0]
  165. eor @C[1],@C[1],@E[1]
  166. eor @C[2],@C[2],@E[2]
  167. ldrd @E[0],@E[1],[sp,#$A[1][2]]
  168. eor @C[3],@C[3],@E[3]
  169. ldrd @E[2],@E[3],[sp,#$A[1][3]]
  170. eor @C[4],@C[4],@E[0]
  171. eor @C[5],@C[5],@E[1]
  172. eor @C[6],@C[6],@E[2]
  173. ldrd @E[0],@E[1],[sp,#$A[1][4]]
  174. eor @C[7],@C[7],@E[3]
  175. ldrd @E[2],@E[3],[sp,#$A[2][0]]
  176. eor @C[8],@C[8],@E[0]
  177. eor @C[9],@C[9],@E[1]
  178. eor @C[0],@C[0],@E[2]
  179. ldrd @E[0],@E[1],[sp,#$A[2][1]]
  180. eor @C[1],@C[1],@E[3]
  181. ldrd @E[2],@E[3],[sp,#$A[2][2]]
  182. eor @C[2],@C[2],@E[0]
  183. eor @C[3],@C[3],@E[1]
  184. eor @C[4],@C[4],@E[2]
  185. ldrd @E[0],@E[1],[sp,#$A[2][3]]
  186. eor @C[5],@C[5],@E[3]
  187. ldrd @E[2],@E[3],[sp,#$A[2][4]]
  188. eor @C[6],@C[6],@E[0]
  189. eor @C[7],@C[7],@E[1]
  190. eor @C[8],@C[8],@E[2]
  191. ldrd @E[0],@E[1],[sp,#$A[3][0]]
  192. eor @C[9],@C[9],@E[3]
  193. ldrd @E[2],@E[3],[sp,#$A[3][1]]
  194. eor @C[0],@C[0],@E[0]
  195. eor @C[1],@C[1],@E[1]
  196. eor @C[2],@C[2],@E[2]
  197. ldrd @E[0],@E[1],[sp,#$A[3][2]]
  198. eor @C[3],@C[3],@E[3]
  199. ldrd @E[2],@E[3],[sp,#$A[3][3]]
  200. eor @C[4],@C[4],@E[0]
  201. eor @C[5],@C[5],@E[1]
  202. eor @C[6],@C[6],@E[2]
  203. ldrd @E[0],@E[1],[sp,#$A[3][4]]
  204. eor @C[7],@C[7],@E[3]
  205. ldrd @E[2],@E[3],[sp,#$A[4][0]]
  206. eor @C[8],@C[8],@E[0]
  207. eor @C[9],@C[9],@E[1]
  208. eor @C[0],@C[0],@E[2]
  209. ldrd @E[0],@E[1],[sp,#$A[4][1]]
  210. eor @C[1],@C[1],@E[3]
  211. ldrd @E[2],@E[3],[sp,#$A[0][2]]
  212. eor @C[2],@C[2],@E[0]
  213. eor @C[3],@C[3],@E[1]
  214. eor @C[4],@C[4],@E[2]
  215. ldrd @E[0],@E[1],[sp,#$A[0][3]]
  216. eor @C[5],@C[5],@E[3]
  217. ldrd @E[2],@E[3],[sp,#$A[0][4]]
  218. #else
  219. eor @C[0],@C[0],@E[0]
  220. add @E[0],sp,#$A[1][2]
  221. eor @C[1],@C[1],@E[1]
  222. eor @C[2],@C[2],@E[2]
  223. eor @C[3],@C[3],@E[3]
  224. ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][2..3]
  225. eor @C[4],@C[4],@E[0]
  226. add @E[0],sp,#$A[1][4]
  227. eor @C[5],@C[5],@E[1]
  228. eor @C[6],@C[6],@E[2]
  229. eor @C[7],@C[7],@E[3]
  230. ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][4]..A[2][0]
  231. eor @C[8],@C[8],@E[0]
  232. add @E[0],sp,#$A[2][1]
  233. eor @C[9],@C[9],@E[1]
  234. eor @C[0],@C[0],@E[2]
  235. eor @C[1],@C[1],@E[3]
  236. ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][1..2]
  237. eor @C[2],@C[2],@E[0]
  238. add @E[0],sp,#$A[2][3]
  239. eor @C[3],@C[3],@E[1]
  240. eor @C[4],@C[4],@E[2]
  241. eor @C[5],@C[5],@E[3]
  242. ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][3..4]
  243. eor @C[6],@C[6],@E[0]
  244. add @E[0],sp,#$A[3][0]
  245. eor @C[7],@C[7],@E[1]
  246. eor @C[8],@C[8],@E[2]
  247. eor @C[9],@C[9],@E[3]
  248. ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][0..1]
  249. eor @C[0],@C[0],@E[0]
  250. add @E[0],sp,#$A[3][2]
  251. eor @C[1],@C[1],@E[1]
  252. eor @C[2],@C[2],@E[2]
  253. eor @C[3],@C[3],@E[3]
  254. ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][2..3]
  255. eor @C[4],@C[4],@E[0]
  256. add @E[0],sp,#$A[3][4]
  257. eor @C[5],@C[5],@E[1]
  258. eor @C[6],@C[6],@E[2]
  259. eor @C[7],@C[7],@E[3]
  260. ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][4]..A[4][0]
  261. eor @C[8],@C[8],@E[0]
  262. ldr @E[0],[sp,#$A[4][1]] @ A[4][1]
  263. eor @C[9],@C[9],@E[1]
  264. ldr @E[1],[sp,#$A[4][1]+4]
  265. eor @C[0],@C[0],@E[2]
  266. ldr @E[2],[sp,#$A[0][2]] @ A[0][2]
  267. eor @C[1],@C[1],@E[3]
  268. ldr @E[3],[sp,#$A[0][2]+4]
  269. eor @C[2],@C[2],@E[0]
  270. add @E[0],sp,#$A[0][3]
  271. eor @C[3],@C[3],@E[1]
  272. eor @C[4],@C[4],@E[2]
  273. eor @C[5],@C[5],@E[3]
  274. ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[0][3..4]
  275. #endif
  276. eor @C[6],@C[6],@E[0]
  277. eor @C[7],@C[7],@E[1]
  278. eor @C[8],@C[8],@E[2]
  279. eor @C[9],@C[9],@E[3]
  280. eor @E[0],@C[0],@C[5],ror#32-1 @ E[0] = ROL64(C[2], 1) ^ C[0];
  281. str.l @E[0],[sp,#$D[1]] @ D[1] = E[0]
  282. eor @E[1],@C[1],@C[4]
  283. str.h @E[1],[sp,#$D[1]+4]
  284. eor @E[2],@C[6],@C[1],ror#32-1 @ E[1] = ROL64(C[0], 1) ^ C[3];
  285. eor @E[3],@C[7],@C[0]
  286. str.l @E[2],[sp,#$D[4]] @ D[4] = E[1]
  287. eor @C[0],@C[8],@C[3],ror#32-1 @ C[0] = ROL64(C[1], 1) ^ C[4];
  288. str.h @E[3],[sp,#$D[4]+4]
  289. eor @C[1],@C[9],@C[2]
  290. str.l @C[0],[sp,#$D[0]] @ D[0] = C[0]
  291. eor @C[2],@C[2],@C[7],ror#32-1 @ C[1] = ROL64(C[3], 1) ^ C[1];
  292. ldr.l @C[7],[sp,#$A[3][3]]
  293. eor @C[3],@C[3],@C[6]
  294. str.h @C[1],[sp,#$D[0]+4]
  295. ldr.h @C[6],[sp,#$A[3][3]+4]
  296. str.l @C[2],[sp,#$D[2]] @ D[2] = C[1]
  297. eor @C[4],@C[4],@C[9],ror#32-1 @ C[2] = ROL64(C[4], 1) ^ C[2];
  298. str.h @C[3],[sp,#$D[2]+4]
  299. eor @C[5],@C[5],@C[8]
  300. ldr.l @C[8],[sp,#$A[4][4]]
  301. ldr.h @C[9],[sp,#$A[4][4]+4]
  302. str.l @C[4],[sp,#$D[3]] @ D[3] = C[2]
  303. eor @C[7],@C[7],@C[4]
  304. str.h @C[5],[sp,#$D[3]+4]
  305. eor @C[6],@C[6],@C[5]
  306. ldr.l @C[4],[sp,#$A[0][0]]
  307. @ ror @C[7],@C[7],#32-10 @ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
  308. @ ror @C[6],@C[6],#32-11
  309. ldr.h @C[5],[sp,#$A[0][0]+4]
  310. eor @C[8],@C[8],@E[2]
  311. eor @C[9],@C[9],@E[3]
  312. ldr.l @E[2],[sp,#$A[2][2]]
  313. eor @C[0],@C[0],@C[4]
  314. ldr.h @E[3],[sp,#$A[2][2]+4]
  315. @ ror @C[8],@C[8],#32-7 @ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
  316. @ ror @C[9],@C[9],#32-7
  317. eor @C[1],@C[1],@C[5] @ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
  318. eor @E[2],@E[2],@C[2]
  319. ldr.l @C[2],[sp,#$A[1][1]]
  320. eor @E[3],@E[3],@C[3]
  321. ldr.h @C[3],[sp,#$A[1][1]+4]
  322. ror @C[5],@E[2],#32-21 @ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
  323. ldr @E[2],[sp,#444] @ load counter
  324. eor @C[2],@C[2],@E[0]
  325. adr @E[0],iotas32
  326. ror @C[4],@E[3],#32-22
  327. add @E[3],@E[0],@E[2]
  328. eor @C[3],@C[3],@E[1]
  329. ___
  # Part 2: fetch the round constant.  The test compares the input buffer of
  # this call (the local $A[0][0]) with the file-level $T[0][0].
  # Input is not T (first round of each pair): load the two words of
  # iotas32 at the current counter offset; the counter is left unchanged.
  330. $code.=<<___ if ($A[0][0] != $T[0][0]);
  331. ldmia @E[3],{@E[0],@E[1]} @ iotas[i]
  332. ___
  # Input is T (second round of each pair): load the next table entry
  # (offsets +8/+12), advance the counter by 16 bytes (two entries) and compare
  # it with 192 (24 entries * 8 bytes).  NOTE(review): the flags set by this
  # cmp appear to be the ones consumed by the "blo .Lround2x" emitted after the
  # second Round() call; none of the instructions emitted in between is a
  # flag-setting form -- keep it that way when editing.
  333. $code.=<<___ if ($A[0][0] == $T[0][0]);
  334. ldr.l @E[0],[@E[3],#8] @ iotas[i].lo
  335. add @E[2],@E[2],#16
  336. ldr.h @E[1],[@E[3],#12] @ iotas[i].hi
  337. cmp @E[2],#192
  338. str @E[2],[sp,#444] @ store counter
  339. ___
  # Part 3: produce output rows R[0]..R[4].  Each lane is computed as
  # R[i][j] = C[j] ^ (~C[j+1] & C[j+2]) (plus the round constant for R[0][0]),
  # where the C values are input lanes XORed with D[] and rotated.  Lines that
  # are commented out with "@ ror" are rotations that were folded into the
  # shifted operands of later bic/eor instructions.  The final instructions
  # leave @E[2]/@E[0] pointing at R[0][0]/R[1][0] and R[4][2..4] in
  # @C[4]-@C[9] for the following round.
  340. $code.=<<___;
  341. bic @E[2],@C[4],@C[2],ror#32-22
  342. bic @E[3],@C[5],@C[3],ror#32-22
  343. ror @C[2],@C[2],#32-22 @ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
  344. ror @C[3],@C[3],#32-22
  345. eor @E[2],@E[2],@C[0]
  346. eor @E[3],@E[3],@C[1]
  347. eor @E[0],@E[0],@E[2]
  348. eor @E[1],@E[1],@E[3]
  349. str.l @E[0],[sp,#$R[0][0]] @ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
  350. bic @E[2],@C[6],@C[4],ror#11
  351. str.h @E[1],[sp,#$R[0][0]+4]
  352. bic @E[3],@C[7],@C[5],ror#10
  353. bic @E[0],@C[8],@C[6],ror#32-(11-7)
  354. bic @E[1],@C[9],@C[7],ror#32-(10-7)
  355. eor @E[2],@C[2],@E[2],ror#32-11
  356. str.l @E[2],[sp,#$R[0][1]] @ R[0][1] = C[1] ^ (~C[2] & C[3]);
  357. eor @E[3],@C[3],@E[3],ror#32-10
  358. str.h @E[3],[sp,#$R[0][1]+4]
  359. eor @E[0],@C[4],@E[0],ror#32-7
  360. eor @E[1],@C[5],@E[1],ror#32-7
  361. str.l @E[0],[sp,#$R[0][2]] @ R[0][2] = C[2] ^ (~C[3] & C[4]);
  362. bic @E[2],@C[0],@C[8],ror#32-7
  363. str.h @E[1],[sp,#$R[0][2]+4]
  364. bic @E[3],@C[1],@C[9],ror#32-7
  365. eor @E[2],@E[2],@C[6],ror#32-11
  366. str.l @E[2],[sp,#$R[0][3]] @ R[0][3] = C[3] ^ (~C[4] & C[0]);
  367. eor @E[3],@E[3],@C[7],ror#32-10
  368. str.h @E[3],[sp,#$R[0][3]+4]
  369. bic @E[0],@C[2],@C[0]
  370. add @E[3],sp,#$D[3]
  371. ldr.l @C[0],[sp,#$A[0][3]] @ A[0][3]
  372. bic @E[1],@C[3],@C[1]
  373. ldr.h @C[1],[sp,#$A[0][3]+4]
  374. eor @E[0],@E[0],@C[8],ror#32-7
  375. eor @E[1],@E[1],@C[9],ror#32-7
  376. str.l @E[0],[sp,#$R[0][4]] @ R[0][4] = C[4] ^ (~C[0] & C[1]);
  377. add @C[9],sp,#$D[0]
  378. str.h @E[1],[sp,#$R[0][4]+4]
  379. ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[3..4]
  380. ldmia @C[9],{@C[6]-@C[9]} @ D[0..1]
  381. ldr.l @C[2],[sp,#$A[1][4]] @ A[1][4]
  382. eor @C[0],@C[0],@E[0]
  383. ldr.h @C[3],[sp,#$A[1][4]+4]
  384. eor @C[1],@C[1],@E[1]
  385. @ ror @C[0],@C[0],#32-14 @ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
  386. ldr.l @E[0],[sp,#$A[3][1]] @ A[3][1]
  387. @ ror @C[1],@C[1],#32-14
  388. ldr.h @E[1],[sp,#$A[3][1]+4]
  389. eor @C[2],@C[2],@E[2]
  390. ldr.l @C[4],[sp,#$A[2][0]] @ A[2][0]
  391. eor @C[3],@C[3],@E[3]
  392. ldr.h @C[5],[sp,#$A[2][0]+4]
  393. @ ror @C[2],@C[2],#32-10 @ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
  394. @ ror @C[3],@C[3],#32-10
  395. eor @C[6],@C[6],@C[4]
  396. ldr.l @E[2],[sp,#$D[2]] @ D[2]
  397. eor @C[7],@C[7],@C[5]
  398. ldr.h @E[3],[sp,#$D[2]+4]
  399. ror @C[5],@C[6],#32-1 @ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
  400. ror @C[4],@C[7],#32-2
  401. eor @E[0],@E[0],@C[8]
  402. ldr.l @C[8],[sp,#$A[4][2]] @ A[4][2]
  403. eor @E[1],@E[1],@C[9]
  404. ldr.h @C[9],[sp,#$A[4][2]+4]
  405. ror @C[7],@E[0],#32-22 @ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
  406. ror @C[6],@E[1],#32-23
  407. bic @E[0],@C[4],@C[2],ror#32-10
  408. bic @E[1],@C[5],@C[3],ror#32-10
  409. eor @E[2],@E[2],@C[8]
  410. eor @E[3],@E[3],@C[9]
  411. ror @C[9],@E[2],#32-30 @ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
  412. ror @C[8],@E[3],#32-31
  413. eor @E[0],@E[0],@C[0],ror#32-14
  414. eor @E[1],@E[1],@C[1],ror#32-14
  415. str.l @E[0],[sp,#$R[1][0]] @ R[1][0] = C[0] ^ (~C[1] & C[2])
  416. bic @E[2],@C[6],@C[4]
  417. str.h @E[1],[sp,#$R[1][0]+4]
  418. bic @E[3],@C[7],@C[5]
  419. eor @E[2],@E[2],@C[2],ror#32-10
  420. str.l @E[2],[sp,#$R[1][1]] @ R[1][1] = C[1] ^ (~C[2] & C[3]);
  421. eor @E[3],@E[3],@C[3],ror#32-10
  422. str.h @E[3],[sp,#$R[1][1]+4]
  423. bic @E[0],@C[8],@C[6]
  424. bic @E[1],@C[9],@C[7]
  425. bic @E[2],@C[0],@C[8],ror#14
  426. bic @E[3],@C[1],@C[9],ror#14
  427. eor @E[0],@E[0],@C[4]
  428. eor @E[1],@E[1],@C[5]
  429. str.l @E[0],[sp,#$R[1][2]] @ R[1][2] = C[2] ^ (~C[3] & C[4]);
  430. bic @C[2],@C[2],@C[0],ror#32-(14-10)
  431. str.h @E[1],[sp,#$R[1][2]+4]
  432. eor @E[2],@C[6],@E[2],ror#32-14
  433. bic @E[1],@C[3],@C[1],ror#32-(14-10)
  434. str.l @E[2],[sp,#$R[1][3]] @ R[1][3] = C[3] ^ (~C[4] & C[0]);
  435. eor @E[3],@C[7],@E[3],ror#32-14
  436. str.h @E[3],[sp,#$R[1][3]+4]
  437. add @E[2],sp,#$D[1]
  438. ldr.l @C[1],[sp,#$A[0][1]] @ A[0][1]
  439. eor @E[0],@C[8],@C[2],ror#32-10
  440. ldr.h @C[0],[sp,#$A[0][1]+4]
  441. eor @E[1],@C[9],@E[1],ror#32-10
  442. str.l @E[0],[sp,#$R[1][4]] @ R[1][4] = C[4] ^ (~C[0] & C[1]);
  443. str.h @E[1],[sp,#$R[1][4]+4]
  444. add @C[9],sp,#$D[3]
  445. ldmia @E[2],{@E[0]-@E[2],@E[3]} @ D[1..2]
  446. ldr.l @C[2],[sp,#$A[1][2]] @ A[1][2]
  447. ldr.h @C[3],[sp,#$A[1][2]+4]
  448. ldmia @C[9],{@C[6]-@C[9]} @ D[3..4]
  449. eor @C[1],@C[1],@E[0]
  450. ldr.l @C[4],[sp,#$A[2][3]] @ A[2][3]
  451. eor @C[0],@C[0],@E[1]
  452. ldr.h @C[5],[sp,#$A[2][3]+4]
  453. ror @C[0],@C[0],#32-1 @ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
  454. eor @C[2],@C[2],@E[2]
  455. ldr.l @E[0],[sp,#$A[3][4]] @ A[3][4]
  456. eor @C[3],@C[3],@E[3]
  457. ldr.h @E[1],[sp,#$A[3][4]+4]
  458. @ ror @C[2],@C[2],#32-3 @ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
  459. ldr.l @E[2],[sp,#$D[0]] @ D[0]
  460. @ ror @C[3],@C[3],#32-3
  461. ldr.h @E[3],[sp,#$D[0]+4]
  462. eor @C[4],@C[4],@C[6]
  463. eor @C[5],@C[5],@C[7]
  464. @ ror @C[5],@C[6],#32-12 @ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
  465. @ ror @C[4],@C[7],#32-13 @ [track reverse order below]
  466. eor @E[0],@E[0],@C[8]
  467. ldr.l @C[8],[sp,#$A[4][0]] @ A[4][0]
  468. eor @E[1],@E[1],@C[9]
  469. ldr.h @C[9],[sp,#$A[4][0]+4]
  470. ror @C[6],@E[0],#32-4 @ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
  471. ror @C[7],@E[1],#32-4
  472. eor @E[2],@E[2],@C[8]
  473. eor @E[3],@E[3],@C[9]
  474. ror @C[8],@E[2],#32-9 @ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
  475. ror @C[9],@E[3],#32-9
  476. bic @E[0],@C[5],@C[2],ror#13-3
  477. bic @E[1],@C[4],@C[3],ror#12-3
  478. bic @E[2],@C[6],@C[5],ror#32-13
  479. bic @E[3],@C[7],@C[4],ror#32-12
  480. eor @E[0],@C[0],@E[0],ror#32-13
  481. eor @E[1],@C[1],@E[1],ror#32-12
  482. str.l @E[0],[sp,#$R[2][0]] @ R[2][0] = C[0] ^ (~C[1] & C[2])
  483. eor @E[2],@E[2],@C[2],ror#32-3
  484. str.h @E[1],[sp,#$R[2][0]+4]
  485. eor @E[3],@E[3],@C[3],ror#32-3
  486. str.l @E[2],[sp,#$R[2][1]] @ R[2][1] = C[1] ^ (~C[2] & C[3]);
  487. bic @E[0],@C[8],@C[6]
  488. bic @E[1],@C[9],@C[7]
  489. str.h @E[3],[sp,#$R[2][1]+4]
  490. eor @E[0],@E[0],@C[5],ror#32-13
  491. eor @E[1],@E[1],@C[4],ror#32-12
  492. str.l @E[0],[sp,#$R[2][2]] @ R[2][2] = C[2] ^ (~C[3] & C[4]);
  493. bic @E[2],@C[0],@C[8]
  494. str.h @E[1],[sp,#$R[2][2]+4]
  495. bic @E[3],@C[1],@C[9]
  496. eor @E[2],@E[2],@C[6]
  497. eor @E[3],@E[3],@C[7]
  498. str.l @E[2],[sp,#$R[2][3]] @ R[2][3] = C[3] ^ (~C[4] & C[0]);
  499. bic @E[0],@C[2],@C[0],ror#3
  500. str.h @E[3],[sp,#$R[2][3]+4]
  501. bic @E[1],@C[3],@C[1],ror#3
  502. ldr.l @C[1],[sp,#$A[0][4]] @ A[0][4] [in reverse order]
  503. eor @E[0],@C[8],@E[0],ror#32-3
  504. ldr.h @C[0],[sp,#$A[0][4]+4]
  505. eor @E[1],@C[9],@E[1],ror#32-3
  506. str.l @E[0],[sp,#$R[2][4]] @ R[2][4] = C[4] ^ (~C[0] & C[1]);
  507. add @C[9],sp,#$D[1]
  508. str.h @E[1],[sp,#$R[2][4]+4]
  509. ldr.l @E[0],[sp,#$D[4]] @ D[4]
  510. ldr.h @E[1],[sp,#$D[4]+4]
  511. ldr.l @E[2],[sp,#$D[0]] @ D[0]
  512. ldr.h @E[3],[sp,#$D[0]+4]
  513. ldmia @C[9],{@C[6]-@C[9]} @ D[1..2]
  514. eor @C[1],@C[1],@E[0]
  515. ldr.l @C[2],[sp,#$A[1][0]] @ A[1][0]
  516. eor @C[0],@C[0],@E[1]
  517. ldr.h @C[3],[sp,#$A[1][0]+4]
  518. @ ror @C[1],@E[0],#32-13 @ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
  519. ldr.l @C[4],[sp,#$A[2][1]] @ A[2][1]
  520. @ ror @C[0],@E[1],#32-14 @ [was loaded in reverse order]
  521. ldr.h @C[5],[sp,#$A[2][1]+4]
  522. eor @C[2],@C[2],@E[2]
  523. ldr.l @E[0],[sp,#$A[3][2]] @ A[3][2]
  524. eor @C[3],@C[3],@E[3]
  525. ldr.h @E[1],[sp,#$A[3][2]+4]
  526. @ ror @C[2],@C[2],#32-18 @ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
  527. ldr.l @E[2],[sp,#$D[3]] @ D[3]
  528. @ ror @C[3],@C[3],#32-18
  529. ldr.h @E[3],[sp,#$D[3]+4]
  530. eor @C[6],@C[6],@C[4]
  531. eor @C[7],@C[7],@C[5]
  532. ror @C[4],@C[6],#32-5 @ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
  533. ror @C[5],@C[7],#32-5
  534. eor @E[0],@E[0],@C[8]
  535. ldr.l @C[8],[sp,#$A[4][3]] @ A[4][3]
  536. eor @E[1],@E[1],@C[9]
  537. ldr.h @C[9],[sp,#$A[4][3]+4]
  538. ror @C[7],@E[0],#32-7 @ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
  539. ror @C[6],@E[1],#32-8
  540. eor @E[2],@E[2],@C[8]
  541. eor @E[3],@E[3],@C[9]
  542. ror @C[8],@E[2],#32-28 @ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
  543. ror @C[9],@E[3],#32-28
  544. bic @E[0],@C[4],@C[2],ror#32-18
  545. bic @E[1],@C[5],@C[3],ror#32-18
  546. eor @E[0],@E[0],@C[0],ror#32-14
  547. eor @E[1],@E[1],@C[1],ror#32-13
  548. str.l @E[0],[sp,#$R[3][0]] @ R[3][0] = C[0] ^ (~C[1] & C[2])
  549. bic @E[2],@C[6],@C[4]
  550. str.h @E[1],[sp,#$R[3][0]+4]
  551. bic @E[3],@C[7],@C[5]
  552. eor @E[2],@E[2],@C[2],ror#32-18
  553. str.l @E[2],[sp,#$R[3][1]] @ R[3][1] = C[1] ^ (~C[2] & C[3]);
  554. eor @E[3],@E[3],@C[3],ror#32-18
  555. str.h @E[3],[sp,#$R[3][1]+4]
  556. bic @E[0],@C[8],@C[6]
  557. bic @E[1],@C[9],@C[7]
  558. bic @E[2],@C[0],@C[8],ror#14
  559. bic @E[3],@C[1],@C[9],ror#13
  560. eor @E[0],@E[0],@C[4]
  561. eor @E[1],@E[1],@C[5]
  562. str.l @E[0],[sp,#$R[3][2]] @ R[3][2] = C[2] ^ (~C[3] & C[4]);
  563. bic @C[2],@C[2],@C[0],ror#18-14
  564. str.h @E[1],[sp,#$R[3][2]+4]
  565. eor @E[2],@C[6],@E[2],ror#32-14
  566. bic @E[1],@C[3],@C[1],ror#18-13
  567. eor @E[3],@C[7],@E[3],ror#32-13
  568. str.l @E[2],[sp,#$R[3][3]] @ R[3][3] = C[3] ^ (~C[4] & C[0]);
  569. str.h @E[3],[sp,#$R[3][3]+4]
  570. add @E[3],sp,#$D[2]
  571. ldr.l @C[0],[sp,#$A[0][2]] @ A[0][2]
  572. eor @E[0],@C[8],@C[2],ror#32-18
  573. ldr.h @C[1],[sp,#$A[0][2]+4]
  574. eor @E[1],@C[9],@E[1],ror#32-18
  575. str.l @E[0],[sp,#$R[3][4]] @ R[3][4] = C[4] ^ (~C[0] & C[1]);
  576. str.h @E[1],[sp,#$R[3][4]+4]
  577. ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[2..3]
  578. ldr.l @C[2],[sp,#$A[1][3]] @ A[1][3]
  579. ldr.h @C[3],[sp,#$A[1][3]+4]
  580. ldr.l @C[6],[sp,#$D[4]] @ D[4]
  581. ldr.h @C[7],[sp,#$D[4]+4]
  582. eor @C[0],@C[0],@E[0]
  583. ldr.l @C[4],[sp,#$A[2][4]] @ A[2][4]
  584. eor @C[1],@C[1],@E[1]
  585. ldr.h @C[5],[sp,#$A[2][4]+4]
  586. @ ror @C[0],@C[0],#32-31 @ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
  587. ldr.l @C[8],[sp,#$D[0]] @ D[0]
  588. @ ror @C[1],@C[1],#32-31
  589. ldr.h @C[9],[sp,#$D[0]+4]
  590. eor @E[2],@E[2],@C[2]
  591. ldr.l @E[0],[sp,#$A[3][0]] @ A[3][0]
  592. eor @E[3],@E[3],@C[3]
  593. ldr.h @E[1],[sp,#$A[3][0]+4]
  594. ror @C[3],@E[2],#32-27 @ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
  595. ldr.l @E[2],[sp,#$D[1]] @ D[1]
  596. ror @C[2],@E[3],#32-28
  597. ldr.h @E[3],[sp,#$D[1]+4]
  598. eor @C[6],@C[6],@C[4]
  599. eor @C[7],@C[7],@C[5]
  600. ror @C[5],@C[6],#32-19 @ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
  601. ror @C[4],@C[7],#32-20
  602. eor @E[0],@E[0],@C[8]
  603. ldr.l @C[8],[sp,#$A[4][1]] @ A[4][1]
  604. eor @E[1],@E[1],@C[9]
  605. ldr.h @C[9],[sp,#$A[4][1]+4]
  606. ror @C[7],@E[0],#32-20 @ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
  607. ror @C[6],@E[1],#32-21
  608. eor @C[8],@C[8],@E[2]
  609. eor @C[9],@C[9],@E[3]
  610. @ ror @C[8],@C[2],#32-1 @ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
  611. @ ror @C[9],@C[3],#32-1
  612. bic @E[0],@C[4],@C[2]
  613. bic @E[1],@C[5],@C[3]
  614. eor @E[0],@E[0],@C[0],ror#32-31
  615. str.l @E[0],[sp,#$R[4][0]] @ R[4][0] = C[0] ^ (~C[1] & C[2])
  616. eor @E[1],@E[1],@C[1],ror#32-31
  617. str.h @E[1],[sp,#$R[4][0]+4]
  618. bic @E[2],@C[6],@C[4]
  619. bic @E[3],@C[7],@C[5]
  620. eor @E[2],@E[2],@C[2]
  621. eor @E[3],@E[3],@C[3]
  622. str.l @E[2],[sp,#$R[4][1]] @ R[4][1] = C[1] ^ (~C[2] & C[3]);
  623. bic @E[0],@C[8],@C[6],ror#1
  624. str.h @E[3],[sp,#$R[4][1]+4]
  625. bic @E[1],@C[9],@C[7],ror#1
  626. bic @E[2],@C[0],@C[8],ror#31-1
  627. bic @E[3],@C[1],@C[9],ror#31-1
  628. eor @C[4],@C[4],@E[0],ror#32-1
  629. str.l @C[4],[sp,#$R[4][2]] @ R[4][2] = C[2] ^= (~C[3] & C[4]);
  630. eor @C[5],@C[5],@E[1],ror#32-1
  631. str.h @C[5],[sp,#$R[4][2]+4]
  632. eor @C[6],@C[6],@E[2],ror#32-31
  633. eor @C[7],@C[7],@E[3],ror#32-31
  634. str.l @C[6],[sp,#$R[4][3]] @ R[4][3] = C[3] ^= (~C[4] & C[0]);
  635. bic @E[0],@C[2],@C[0],ror#32-31
  636. str.h @C[7],[sp,#$R[4][3]+4]
  637. bic @E[1],@C[3],@C[1],ror#32-31
  638. add @E[2],sp,#$R[0][0]
  639. eor @C[8],@E[0],@C[8],ror#32-1
  640. add @E[0],sp,#$R[1][0]
  641. eor @C[9],@E[1],@C[9],ror#32-1
  642. str.l @C[8],[sp,#$R[4][4]] @ R[4][4] = C[4] ^= (~C[0] & C[1]);
  643. str.h @C[9],[sp,#$R[4][4]+4]
  644. ___
  645. }
  # Body of the .Lround2x loop: two rounds per iteration.  The first round
  # reads the A buffer and writes T, the second reads T and writes back to A.
  646. Round(@A,@T);
  647. Round(@T,@A);
  # Loop tail and the KeccakF1600 wrapper.
  #  - "blo .Lround2x" repeats the double round; "ldr pc,[sp,#440]" returns
  #    through the lr value saved by KeccakF1600_enter.
  #  - KeccakF1600 pushes r0, r4-r11 and lr, reserves 440+16 bytes of stack,
  #    and copies the state from [r0] to the stack in five 10-word
  #    (one row) ldmia/stmia steps.  The last ldmia leaves row 4 in
  #    @C[0]-@C[9], so @C[4]-@C[9] already hold A[4][2..4] and the code can
  #    call KeccakF1600_enter instead of KeccakF1600_int.
  #  - Afterwards the saved r0 (at [sp,#440+16]) is reloaded and the state is
  #    copied back; @E[0] is used as the source pointer for rows 1..4 exactly
  #    as the last round left it (sp + offset of row 1).
  #  - "add sp,sp,#440+20" drops the frame plus the saved r0 slot before
  #    popping r4-r11 and pc.
  648. $code.=<<___;
  649. blo .Lround2x
  650. ldr pc,[sp,#440]
  651. .size KeccakF1600_int,.-KeccakF1600_int
  652. .type KeccakF1600, %function
  653. .align 5
  654. KeccakF1600:
  655. stmdb sp!,{r0,r4-r11,lr}
  656. sub sp,sp,#440+16 @ space for A[5][5],D[5],T[5][5],...
  657. add @E[0],r0,#$A[1][0]
  658. add @E[1],sp,#$A[1][0]
  659. ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
  660. stmia sp, {@C[0]-@C[9]}
  661. ldmia @E[0]!,{@C[0]-@C[9]}
  662. stmia @E[1]!,{@C[0]-@C[9]}
  663. ldmia @E[0]!,{@C[0]-@C[9]}
  664. stmia @E[1]!,{@C[0]-@C[9]}
  665. ldmia @E[0]!,{@C[0]-@C[9]}
  666. stmia @E[1]!,{@C[0]-@C[9]}
  667. ldmia @E[0], {@C[0]-@C[9]}
  668. add @E[2],sp,#$A[0][0]
  669. add @E[0],sp,#$A[1][0]
  670. stmia @E[1], {@C[0]-@C[9]}
  671. bl KeccakF1600_enter
  672. ldr @E[1], [sp,#440+16] @ restore pointer to A
  673. ldmia sp, {@C[0]-@C[9]}
  674. stmia @E[1]!,{@C[0]-@C[9]} @ return A[5][5]
  675. ldmia @E[0]!,{@C[0]-@C[9]}
  676. stmia @E[1]!,{@C[0]-@C[9]}
  677. ldmia @E[0]!,{@C[0]-@C[9]}
  678. stmia @E[1]!,{@C[0]-@C[9]}
  679. ldmia @E[0]!,{@C[0]-@C[9]}
  680. stmia @E[1]!,{@C[0]-@C[9]}
  681. ldmia @E[0], {@C[0]-@C[9]}
  682. stmia @E[1], {@C[0]-@C[9]}
  683. add sp,sp,#440+20
  684. ldmia sp!,{r4-r11,pc}
  685. .size KeccakF1600,.-KeccakF1600
  686. ___
  # SHA3_absorb code generator.  The four Perl scalars declared on the next
  # line are register aliases interpolated into the assembly text below:
  # $A_flat = r10, $inp = r11, $len = r12, $bsz = r14.
  687. { my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));
  688. ########################################################################
  689. # Stack layout
  690. # ----->+-----------------------+
  691. # | uint64_t A[5][5] |
  692. # | ... |
  693. # | ... |
  694. # +456->+-----------------------+
  695. # | 0x55555555 |
  696. # +460->+-----------------------+
  697. # | 0x33333333 |
  698. # +464->+-----------------------+
  699. # | 0x0f0f0f0f |
  700. # +468->+-----------------------+
  701. # | 0x00ff00ff |
  702. # +472->+-----------------------+
  703. # | uint64_t *A |
  704. # +476->+-----------------------+
  705. # | const void *inp |
  706. # +480->+-----------------------+
  707. # | size_t len |
  708. # +484->+-----------------------+
  709. # | size_t bs |
  710. # +488->+-----------------------+
  711. # | ....
  # How the layout above comes about in the emitted code:
  #   - "stmdb sp!,{r0-r12,lr}" followed by "sub sp,sp,#456+16" leaves the
  #     saved r0..r3 at sp+472, +476, +480 and +484 respectively;
  #   - the four bit-interleaving masks in r6..r9 are stored at
  #     sp+456..sp+468 by the "str" lines after the constants are composed;
  #   - inside the loop the +480 slot is overwritten with len - bsz and the
  #     +476 slot with the advanced input pointer before KeccakF1600_int is
  #     called;
  #   - after that call, "ldmia r14,{r6-r12,r14}" with r14 = sp+456 reloads
  #     all eight words at once: masks into r6-r9, then r10 ($A_flat),
  #     r11 ($inp), r12 ($len) and r14 ($bsz).
  # The epilogue adds 456+32 to sp (frame plus the saved r0-r3), moves $len
  # into r0 as the return value and pops r4-r12 and pc.  When len < bsz on
  # entry the code branches straight to that epilogue (.Labsorb_abort).
  712. $code.=<<___;
  713. .global SHA3_absorb
  714. .type SHA3_absorb,%function
  715. .align 5
  716. SHA3_absorb:
  717. stmdb sp!,{r0-r12,lr}
  718. sub sp,sp,#456+16
  719. add $A_flat,r0,#$A[1][0]
  720. @ mov $inp,r1
  721. mov $len,r2
  722. mov $bsz,r3
  723. cmp r2,r3
  724. blo .Labsorb_abort
  725. add $inp,sp,#0
  726. ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
  727. stmia $inp!, {@C[0]-@C[9]}
  728. ldmia $A_flat!,{@C[0]-@C[9]}
  729. stmia $inp!, {@C[0]-@C[9]}
  730. ldmia $A_flat!,{@C[0]-@C[9]}
  731. stmia $inp!, {@C[0]-@C[9]}
  732. ldmia $A_flat!,{@C[0]-@C[9]}
  733. stmia $inp!, {@C[0]-@C[9]}
  734. ldmia $A_flat!,{@C[0]-@C[9]}
  735. stmia $inp, {@C[0]-@C[9]}
  736. ldr $inp,[sp,#476] @ restore $inp
  737. #ifdef __thumb2__
  738. mov r9,#0x00ff00ff
  739. mov r8,#0x0f0f0f0f
  740. mov r7,#0x33333333
  741. mov r6,#0x55555555
  742. #else
  743. mov r6,#0x11 @ compose constants
  744. mov r8,#0x0f
  745. mov r9,#0xff
  746. orr r6,r6,r6,lsl#8
  747. orr r8,r8,r8,lsl#8
  748. orr r6,r6,r6,lsl#16 @ 0x11111111
  749. orr r9,r9,r9,lsl#16 @ 0x00ff00ff
  750. orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
  751. orr r7,r6,r6,lsl#1 @ 0x33333333
  752. orr r6,r6,r6,lsl#2 @ 0x55555555
  753. #endif
  754. str r9,[sp,#468]
  755. str r8,[sp,#464]
  756. str r7,[sp,#460]
  757. str r6,[sp,#456]
  758. b .Loop_absorb
  759. .align 4
  760. .Loop_absorb:
  761. subs r0,$len,$bsz
  762. blo .Labsorbed
  763. add $A_flat,sp,#0
  764. str r0,[sp,#480] @ save len - bsz
  765. .align 4
  766. .Loop_block:
  767. ldrb r0,[$inp],#1
  768. ldrb r1,[$inp],#1
  769. ldrb r2,[$inp],#1
  770. ldrb r3,[$inp],#1
  771. ldrb r4,[$inp],#1
  772. orr r0,r0,r1,lsl#8
  773. ldrb r1,[$inp],#1
  774. orr r0,r0,r2,lsl#16
  775. ldrb r2,[$inp],#1
  776. orr r0,r0,r3,lsl#24 @ lo
  777. ldrb r3,[$inp],#1
  778. orr r1,r4,r1,lsl#8
  779. orr r1,r1,r2,lsl#16
  780. orr r1,r1,r3,lsl#24 @ hi
  781. and r2,r0,r6 @ &=0x55555555
  782. and r0,r0,r6,lsl#1 @ &=0xaaaaaaaa
  783. and r3,r1,r6 @ &=0x55555555
  784. and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
  785. orr r2,r2,r2,lsr#1
  786. orr r0,r0,r0,lsl#1
  787. orr r3,r3,r3,lsr#1
  788. orr r1,r1,r1,lsl#1
  789. and r2,r2,r7 @ &=0x33333333
  790. and r0,r0,r7,lsl#2 @ &=0xcccccccc
  791. and r3,r3,r7 @ &=0x33333333
  792. and r1,r1,r7,lsl#2 @ &=0xcccccccc
  793. orr r2,r2,r2,lsr#2
  794. orr r0,r0,r0,lsl#2
  795. orr r3,r3,r3,lsr#2
  796. orr r1,r1,r1,lsl#2
  797. and r2,r2,r8 @ &=0x0f0f0f0f
  798. and r0,r0,r8,lsl#4 @ &=0xf0f0f0f0
  799. and r3,r3,r8 @ &=0x0f0f0f0f
  800. and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
  801. ldmia $A_flat,{r4-r5} @ A_flat[i]
  802. orr r2,r2,r2,lsr#4
  803. orr r0,r0,r0,lsl#4
  804. orr r3,r3,r3,lsr#4
  805. orr r1,r1,r1,lsl#4
  806. and r2,r2,r9 @ &=0x00ff00ff
  807. and r0,r0,r9,lsl#8 @ &=0xff00ff00
  808. and r3,r3,r9 @ &=0x00ff00ff
  809. and r1,r1,r9,lsl#8 @ &=0xff00ff00
  810. orr r2,r2,r2,lsr#8
  811. orr r0,r0,r0,lsl#8
  812. orr r3,r3,r3,lsr#8
  813. orr r1,r1,r1,lsl#8
  814. lsl r2,r2,#16
  815. lsr r1,r1,#16
  816. eor r4,r4,r3,lsl#16
  817. eor r5,r5,r0,lsr#16
  818. eor r4,r4,r2,lsr#16
  819. eor r5,r5,r1,lsl#16
  820. stmia $A_flat!,{r4-r5} @ A_flat[i++] ^= BitInterleave(inp[0..7])
  821. subs $bsz,$bsz,#8
  822. bhi .Loop_block
  823. str $inp,[sp,#476]
  824. bl KeccakF1600_int
  825. add r14,sp,#456
  826. ldmia r14,{r6-r12,r14} @ restore constants and variables
  827. b .Loop_absorb
  828. .align 4
  829. .Labsorbed:
  830. add $inp,sp,#$A[1][0]
  831. ldmia sp, {@C[0]-@C[9]}
  832. stmia $A_flat!,{@C[0]-@C[9]} @ return A[5][5]
  833. ldmia $inp!, {@C[0]-@C[9]}
  834. stmia $A_flat!,{@C[0]-@C[9]}
  835. ldmia $inp!, {@C[0]-@C[9]}
  836. stmia $A_flat!,{@C[0]-@C[9]}
  837. ldmia $inp!, {@C[0]-@C[9]}
  838. stmia $A_flat!,{@C[0]-@C[9]}
  839. ldmia $inp, {@C[0]-@C[9]}
  840. stmia $A_flat, {@C[0]-@C[9]}
  841. .Labsorb_abort:
  842. add sp,sp,#456+32
  843. mov r0,$len @ return value
  844. ldmia sp!,{r4-r12,pc}
  845. .size SHA3_absorb,.-SHA3_absorb
  846. ___
  847. }
  # SHA3_squeeze code generator.  Register aliases interpolated into the
  # assembly text below: $out = r4, $len = r5, $A_flat = r10, $bsz = r12.
  #
  # Stack bookkeeping in the emitted code: the prologue pushes
  # {r0,r3-r10,lr} and then the four mask registers {r6-r9}, so relative to
  # sp the masks sit at +0..+12, the saved r0 at +16 and the saved r3 at +20.
  # After each call to KeccakF1600, "ldmia sp,{r6-r10,r12}" therefore
  # restores the masks, $A_flat (from the saved r0) and $bsz (from the saved
  # r3) in one instruction.  The epilogue's "add sp,sp,#24" drops those six
  # words before popping r4-r10 and pc.
  #
  # r14 keeps a copy of the state pointer so it can be handed to KeccakF1600
  # in r0 once $bsz bytes of the current block have been written out.
  848. { my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12));
  849. $code.=<<___;
  850. .global SHA3_squeeze
  851. .type SHA3_squeeze,%function
  852. .align 5
  853. SHA3_squeeze:
  854. stmdb sp!,{r0,r3-r10,lr}
  855. mov $A_flat,r0
  856. mov $out,r1
  857. mov $len,r2
  858. mov $bsz,r3
  859. #ifdef __thumb2__
  860. mov r9,#0x00ff00ff
  861. mov r8,#0x0f0f0f0f
  862. mov r7,#0x33333333
  863. mov r6,#0x55555555
  864. #else
  865. mov r6,#0x11 @ compose constants
  866. mov r8,#0x0f
  867. mov r9,#0xff
  868. orr r6,r6,r6,lsl#8
  869. orr r8,r8,r8,lsl#8
  870. orr r6,r6,r6,lsl#16 @ 0x11111111
  871. orr r9,r9,r9,lsl#16 @ 0x00ff00ff
  872. orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
  873. orr r7,r6,r6,lsl#1 @ 0x33333333
  874. orr r6,r6,r6,lsl#2 @ 0x55555555
  875. #endif
  876. stmdb sp!,{r6-r9}
  877. mov r14,$A_flat
  878. b .Loop_squeeze
  879. .align 4
  880. .Loop_squeeze:
  881. ldmia $A_flat!,{r0,r1} @ A_flat[i++]
  882. lsl r2,r0,#16
  883. lsl r3,r1,#16 @ r3 = r1 << 16
  884. lsr r2,r2,#16 @ r2 = r0 & 0x0000ffff
  885. lsr r1,r1,#16
  886. lsr r0,r0,#16 @ r0 = r0 >> 16
  887. lsl r1,r1,#16 @ r1 = r1 & 0xffff0000
  888. orr r2,r2,r2,lsl#8
  889. orr r3,r3,r3,lsr#8
  890. orr r0,r0,r0,lsl#8
  891. orr r1,r1,r1,lsr#8
  892. and r2,r2,r9 @ &=0x00ff00ff
  893. and r3,r3,r9,lsl#8 @ &=0xff00ff00
  894. and r0,r0,r9 @ &=0x00ff00ff
  895. and r1,r1,r9,lsl#8 @ &=0xff00ff00
  896. orr r2,r2,r2,lsl#4
  897. orr r3,r3,r3,lsr#4
  898. orr r0,r0,r0,lsl#4
  899. orr r1,r1,r1,lsr#4
  900. and r2,r2,r8 @ &=0x0f0f0f0f
  901. and r3,r3,r8,lsl#4 @ &=0xf0f0f0f0
  902. and r0,r0,r8 @ &=0x0f0f0f0f
  903. and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
  904. orr r2,r2,r2,lsl#2
  905. orr r3,r3,r3,lsr#2
  906. orr r0,r0,r0,lsl#2
  907. orr r1,r1,r1,lsr#2
  908. and r2,r2,r7 @ &=0x33333333
  909. and r3,r3,r7,lsl#2 @ &=0xcccccccc
  910. and r0,r0,r7 @ &=0x33333333
  911. and r1,r1,r7,lsl#2 @ &=0xcccccccc
  912. orr r2,r2,r2,lsl#1
  913. orr r3,r3,r3,lsr#1
  914. orr r0,r0,r0,lsl#1
  915. orr r1,r1,r1,lsr#1
  916. and r2,r2,r6 @ &=0x55555555
  917. and r3,r3,r6,lsl#1 @ &=0xaaaaaaaa
  918. and r0,r0,r6 @ &=0x55555555
  919. and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
  920. orr r2,r2,r3
  921. orr r0,r0,r1
  922. cmp $len,#8
  923. blo .Lsqueeze_tail
  924. lsr r1,r2,#8
  925. strb r2,[$out],#1
  926. lsr r3,r2,#16
  927. strb r1,[$out],#1
  928. lsr r2,r2,#24
  929. strb r3,[$out],#1
  930. strb r2,[$out],#1
  931. lsr r1,r0,#8
  932. strb r0,[$out],#1
  933. lsr r3,r0,#16
  934. strb r1,[$out],#1
  935. lsr r0,r0,#24
  936. strb r3,[$out],#1
  937. strb r0,[$out],#1
  938. subs $len,$len,#8
  939. beq .Lsqueeze_done
  940. subs $bsz,$bsz,#8 @ bsz -= 8
  941. bhi .Loop_squeeze
  942. mov r0,r14 @ original $A_flat
  943. bl KeccakF1600
  944. ldmia sp,{r6-r10,r12} @ restore constants and variables
  945. mov r14,$A_flat
  946. b .Loop_squeeze
  947. .align 4
  948. .Lsqueeze_tail:
  949. strb r2,[$out],#1
  950. lsr r2,r2,#8
  951. subs $len,$len,#1
  952. beq .Lsqueeze_done
  953. strb r2,[$out],#1
  954. lsr r2,r2,#8
  955. subs $len,$len,#1
  956. beq .Lsqueeze_done
  957. strb r2,[$out],#1
  958. lsr r2,r2,#8
  959. subs $len,$len,#1
  960. beq .Lsqueeze_done
  961. strb r2,[$out],#1
  962. subs $len,$len,#1
  963. beq .Lsqueeze_done
  964. strb r0,[$out],#1
  965. lsr r0,r0,#8
  966. subs $len,$len,#1
  967. beq .Lsqueeze_done
  968. strb r0,[$out],#1
  969. lsr r0,r0,#8
  970. subs $len,$len,#1
  971. beq .Lsqueeze_done
  972. strb r0,[$out]
  973. b .Lsqueeze_done
  974. .align 4
  975. .Lsqueeze_done:
  976. add sp,sp,#24
  977. ldmia sp!,{r4-r10,pc}
  978. .size SHA3_squeeze,.-SHA3_squeeze
  979. ___
  980. }
  # NEON code path.  Everything between the "#if __ARM_MAX_ARCH__>=7" and
  # "#endif" lines of the text below is conditional on that preprocessor
  # test.  The text defines, in order:
  #   iotas64           - table of 24 64-bit constants, read one per
  #                       iteration by the "Iota[i++]" load;
  #   KeccakF1600_neon  - permutation loop; r3 counts 24 iterations, r2 walks
  #                       iotas64, and r0 / r1 = r0+16 address memory used to
  #                       spill and reload vector registers (the "offload" /
  #                       "restore" lines);
  #   SHA3_absorb_neon  - saves r4-r6, lr and d8-d15, loads the state from
  #                       r0 into d0-d24, then while len >= bsz XORs input
  #                       into the state 8 bytes at a time and calls
  #                       KeccakF1600_neon; stores the state back and returns
  #                       the remaining length in r0;
  #   SHA3_squeeze_neon - copies the state to the output 8 bytes at a time,
  #                       running KeccakF1600_neon (with a full state
  #                       load/store around it) each time bsz bytes have been
  #                       produced, and finishes a sub-8-byte tail bytewise.
  # The identification string and a final ".align 2" follow the #endif.
  981. $code.=<<___;
  982. #if __ARM_MAX_ARCH__>=7
  983. .fpu neon
  984. .type iotas64, %object
  985. .align 5
  986. iotas64:
  987. .quad 0x0000000000000001
  988. .quad 0x0000000000008082
  989. .quad 0x800000000000808a
  990. .quad 0x8000000080008000
  991. .quad 0x000000000000808b
  992. .quad 0x0000000080000001
  993. .quad 0x8000000080008081
  994. .quad 0x8000000000008009
  995. .quad 0x000000000000008a
  996. .quad 0x0000000000000088
  997. .quad 0x0000000080008009
  998. .quad 0x000000008000000a
  999. .quad 0x000000008000808b
  1000. .quad 0x800000000000008b
  1001. .quad 0x8000000000008089
  1002. .quad 0x8000000000008003
  1003. .quad 0x8000000000008002
  1004. .quad 0x8000000000000080
  1005. .quad 0x000000000000800a
  1006. .quad 0x800000008000000a
  1007. .quad 0x8000000080008081
  1008. .quad 0x8000000000008080
  1009. .quad 0x0000000080000001
  1010. .quad 0x8000000080008008
  1011. .size iotas64,.-iotas64
  1012. .type KeccakF1600_neon, %function
  1013. .align 5
  1014. KeccakF1600_neon:
  1015. add r1, r0, #16
  1016. adr r2, iotas64
  1017. mov r3, #24 @ loop counter
  1018. b .Loop_neon
  1019. .align 4
  1020. .Loop_neon:
  1021. @ Theta
  1022. vst1.64 {q4}, [r0:64] @ offload A[0..1][4]
  1023. veor q13, q0, q5 @ A[0..1][0]^A[2..3][0]
  1024. vst1.64 {d18}, [r1:64] @ offload A[2][4]
  1025. veor q14, q1, q6 @ A[0..1][1]^A[2..3][1]
  1026. veor q15, q2, q7 @ A[0..1][2]^A[2..3][2]
  1027. veor d26, d26, d27 @ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
  1028. veor d27, d28, d29 @ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
  1029. veor q14, q3, q8 @ A[0..1][3]^A[2..3][3]
  1030. veor q4, q4, q9 @ A[0..1][4]^A[2..3][4]
  1031. veor d30, d30, d31 @ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
  1032. veor d31, d28, d29 @ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
  1033. veor d25, d8, d9 @ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
  1034. veor q13, q13, q10 @ C[0..1]^=A[4][0..1]
  1035. veor q14, q15, q11 @ C[2..3]^=A[4][2..3]
  1036. veor d25, d25, d24 @ C[4]^=A[4][4]
  1037. vadd.u64 q4, q13, q13 @ C[0..1]<<1
  1038. vadd.u64 q15, q14, q14 @ C[2..3]<<1
  1039. vadd.u64 d18, d25, d25 @ C[4]<<1
  1040. vsri.u64 q4, q13, #63 @ ROL64(C[0..1],1)
  1041. vsri.u64 q15, q14, #63 @ ROL64(C[2..3],1)
  1042. vsri.u64 d18, d25, #63 @ ROL64(C[4],1)
  1043. veor d25, d25, d9 @ D[0] = C[4] ^= ROL64(C[1],1)
  1044. veor q13, q13, q15 @ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
  1045. veor d28, d28, d18 @ D[3] = C[2] ^= ROL64(C[4],1)
  1046. veor d29, d29, d8 @ D[4] = C[3] ^= ROL64(C[0],1)
  1047. veor d0, d0, d25 @ A[0][0] ^= C[4]
  1048. veor d1, d1, d25 @ A[1][0] ^= C[4]
  1049. veor d10, d10, d25 @ A[2][0] ^= C[4]
  1050. veor d11, d11, d25 @ A[3][0] ^= C[4]
  1051. veor d20, d20, d25 @ A[4][0] ^= C[4]
  1052. veor d2, d2, d26 @ A[0][1] ^= D[1]
  1053. veor d3, d3, d26 @ A[1][1] ^= D[1]
  1054. veor d12, d12, d26 @ A[2][1] ^= D[1]
  1055. veor d13, d13, d26 @ A[3][1] ^= D[1]
  1056. veor d21, d21, d26 @ A[4][1] ^= D[1]
  1057. vmov d26, d27
  1058. veor d6, d6, d28 @ A[0][3] ^= C[2]
  1059. veor d7, d7, d28 @ A[1][3] ^= C[2]
  1060. veor d16, d16, d28 @ A[2][3] ^= C[2]
  1061. veor d17, d17, d28 @ A[3][3] ^= C[2]
  1062. veor d23, d23, d28 @ A[4][3] ^= C[2]
  1063. vld1.64 {q4}, [r0:64] @ restore A[0..1][4]
  1064. vmov d28, d29
  1065. vld1.64 {d18}, [r1:64] @ restore A[2][4]
  1066. veor q2, q2, q13 @ A[0..1][2] ^= D[2]
  1067. veor q7, q7, q13 @ A[2..3][2] ^= D[2]
  1068. veor d22, d22, d27 @ A[4][2] ^= D[2]
  1069. veor q4, q4, q14 @ A[0..1][4] ^= C[3]
  1070. veor q9, q9, q14 @ A[2..3][4] ^= C[3]
  1071. veor d24, d24, d29 @ A[4][4] ^= C[3]
  1072. @ Rho + Pi
  1073. vmov d26, d2 @ C[1] = A[0][1]
  1074. vshl.u64 d2, d3, #44
  1075. vmov d27, d4 @ C[2] = A[0][2]
  1076. vshl.u64 d4, d14, #43
  1077. vmov d28, d6 @ C[3] = A[0][3]
  1078. vshl.u64 d6, d17, #21
  1079. vmov d29, d8 @ C[4] = A[0][4]
  1080. vshl.u64 d8, d24, #14
  1081. vsri.u64 d2, d3, #64-44 @ A[0][1] = ROL64(A[1][1], rhotates[1][1])
  1082. vsri.u64 d4, d14, #64-43 @ A[0][2] = ROL64(A[2][2], rhotates[2][2])
  1083. vsri.u64 d6, d17, #64-21 @ A[0][3] = ROL64(A[3][3], rhotates[3][3])
  1084. vsri.u64 d8, d24, #64-14 @ A[0][4] = ROL64(A[4][4], rhotates[4][4])
  1085. vshl.u64 d3, d9, #20
  1086. vshl.u64 d14, d16, #25
  1087. vshl.u64 d17, d15, #15
  1088. vshl.u64 d24, d21, #2
  1089. vsri.u64 d3, d9, #64-20 @ A[1][1] = ROL64(A[1][4], rhotates[1][4])
  1090. vsri.u64 d14, d16, #64-25 @ A[2][2] = ROL64(A[2][3], rhotates[2][3])
  1091. vsri.u64 d17, d15, #64-15 @ A[3][3] = ROL64(A[3][2], rhotates[3][2])
  1092. vsri.u64 d24, d21, #64-2 @ A[4][4] = ROL64(A[4][1], rhotates[4][1])
  1093. vshl.u64 d9, d22, #61
  1094. @ vshl.u64 d16, d19, #8
  1095. vshl.u64 d15, d12, #10
  1096. vshl.u64 d21, d7, #55
  1097. vsri.u64 d9, d22, #64-61 @ A[1][4] = ROL64(A[4][2], rhotates[4][2])
  1098. vext.8 d16, d19, d19, #8-1 @ A[2][3] = ROL64(A[3][4], rhotates[3][4])
  1099. vsri.u64 d15, d12, #64-10 @ A[3][2] = ROL64(A[2][1], rhotates[2][1])
  1100. vsri.u64 d21, d7, #64-55 @ A[4][1] = ROL64(A[1][3], rhotates[1][3])
  1101. vshl.u64 d22, d18, #39
  1102. @ vshl.u64 d19, d23, #56
  1103. vshl.u64 d12, d5, #6
  1104. vshl.u64 d7, d13, #45
  1105. vsri.u64 d22, d18, #64-39 @ A[4][2] = ROL64(A[2][4], rhotates[2][4])
  1106. vext.8 d19, d23, d23, #8-7 @ A[3][4] = ROL64(A[4][3], rhotates[4][3])
  1107. vsri.u64 d12, d5, #64-6 @ A[2][1] = ROL64(A[1][2], rhotates[1][2])
  1108. vsri.u64 d7, d13, #64-45 @ A[1][3] = ROL64(A[3][1], rhotates[3][1])
  1109. vshl.u64 d18, d20, #18
  1110. vshl.u64 d23, d11, #41
  1111. vshl.u64 d5, d10, #3
  1112. vshl.u64 d13, d1, #36
  1113. vsri.u64 d18, d20, #64-18 @ A[2][4] = ROL64(A[4][0], rhotates[4][0])
  1114. vsri.u64 d23, d11, #64-41 @ A[4][3] = ROL64(A[3][0], rhotates[3][0])
  1115. vsri.u64 d5, d10, #64-3 @ A[1][2] = ROL64(A[2][0], rhotates[2][0])
  1116. vsri.u64 d13, d1, #64-36 @ A[3][1] = ROL64(A[1][0], rhotates[1][0])
  1117. vshl.u64 d1, d28, #28
  1118. vshl.u64 d10, d26, #1
  1119. vshl.u64 d11, d29, #27
  1120. vshl.u64 d20, d27, #62
  1121. vsri.u64 d1, d28, #64-28 @ A[1][0] = ROL64(C[3], rhotates[0][3])
  1122. vsri.u64 d10, d26, #64-1 @ A[2][0] = ROL64(C[1], rhotates[0][1])
  1123. vsri.u64 d11, d29, #64-27 @ A[3][0] = ROL64(C[4], rhotates[0][4])
  1124. vsri.u64 d20, d27, #64-62 @ A[4][0] = ROL64(C[2], rhotates[0][2])
  1125. @ Chi + Iota
  1126. vbic q13, q2, q1
  1127. vbic q14, q3, q2
  1128. vbic q15, q4, q3
  1129. veor q13, q13, q0 @ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
  1130. veor q14, q14, q1 @ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
  1131. veor q2, q2, q15 @ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
  1132. vst1.64 {q13}, [r0:64] @ offload A[0..1][0]
  1133. vbic q13, q0, q4
  1134. vbic q15, q1, q0
  1135. vmov q1, q14 @ A[0..1][1]
  1136. veor q3, q3, q13 @ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
  1137. veor q4, q4, q15 @ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])
  1138. vbic q13, q7, q6
  1139. vmov q0, q5 @ A[2..3][0]
  1140. vbic q14, q8, q7
  1141. vmov q15, q6 @ A[2..3][1]
  1142. veor q5, q5, q13 @ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
  1143. vbic q13, q9, q8
  1144. veor q6, q6, q14 @ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
  1145. vbic q14, q0, q9
  1146. veor q7, q7, q13 @ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
  1147. vbic q13, q15, q0
  1148. veor q8, q8, q14 @ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
  1149. vmov q14, q10 @ A[4][0..1]
  1150. veor q9, q9, q13 @ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])
  1151. vld1.64 d25, [r2:64]! @ Iota[i++]
  1152. vbic d26, d22, d21
  1153. vbic d27, d23, d22
  1154. vld1.64 {q0}, [r0:64] @ restore A[0..1][0]
  1155. veor d20, d20, d26 @ A[4][0] ^= (~A[4][1] & A[4][2])
  1156. vbic d26, d24, d23
  1157. veor d21, d21, d27 @ A[4][1] ^= (~A[4][2] & A[4][3])
  1158. vbic d27, d28, d24
  1159. veor d22, d22, d26 @ A[4][2] ^= (~A[4][3] & A[4][4])
  1160. vbic d26, d29, d28
  1161. veor d23, d23, d27 @ A[4][3] ^= (~A[4][4] & A[4][0])
  1162. veor d0, d0, d25 @ A[0][0] ^= Iota[i]
  1163. veor d24, d24, d26 @ A[4][4] ^= (~A[4][0] & A[4][1])
  1164. subs r3, r3, #1
  1165. bne .Loop_neon
  1166. bx lr
  1167. .size KeccakF1600_neon,.-KeccakF1600_neon
  1168. .global SHA3_absorb_neon
  1169. .type SHA3_absorb_neon, %function
  1170. .align 5
  1171. SHA3_absorb_neon:
  1172. stmdb sp!, {r4-r6,lr}
  1173. vstmdb sp!, {d8-d15}
  1174. mov r4, r1 @ inp
  1175. mov r5, r2 @ len
  1176. mov r6, r3 @ bsz
  1177. vld1.32 {d0}, [r0:64]! @ A[0][0]
  1178. vld1.32 {d2}, [r0:64]! @ A[0][1]
  1179. vld1.32 {d4}, [r0:64]! @ A[0][2]
  1180. vld1.32 {d6}, [r0:64]! @ A[0][3]
  1181. vld1.32 {d8}, [r0:64]! @ A[0][4]
  1182. vld1.32 {d1}, [r0:64]! @ A[1][0]
  1183. vld1.32 {d3}, [r0:64]! @ A[1][1]
  1184. vld1.32 {d5}, [r0:64]! @ A[1][2]
  1185. vld1.32 {d7}, [r0:64]! @ A[1][3]
  1186. vld1.32 {d9}, [r0:64]! @ A[1][4]
  1187. vld1.32 {d10}, [r0:64]! @ A[2][0]
  1188. vld1.32 {d12}, [r0:64]! @ A[2][1]
  1189. vld1.32 {d14}, [r0:64]! @ A[2][2]
  1190. vld1.32 {d16}, [r0:64]! @ A[2][3]
  1191. vld1.32 {d18}, [r0:64]! @ A[2][4]
  1192. vld1.32 {d11}, [r0:64]! @ A[3][0]
  1193. vld1.32 {d13}, [r0:64]! @ A[3][1]
  1194. vld1.32 {d15}, [r0:64]! @ A[3][2]
  1195. vld1.32 {d17}, [r0:64]! @ A[3][3]
  1196. vld1.32 {d19}, [r0:64]! @ A[3][4]
  1197. vld1.32 {d20-d23}, [r0:64]! @ A[4][0..3]
  1198. vld1.32 {d24}, [r0:64] @ A[4][4]
  1199. sub r0, r0, #24*8 @ rewind
  1200. b .Loop_absorb_neon
  1201. .align 4
  1202. .Loop_absorb_neon:
  1203. subs r12, r5, r6 @ len - bsz
  1204. blo .Labsorbed_neon
  1205. mov r5, r12
  1206. vld1.8 {d31}, [r4]! @ endian-neutral loads...
  1207. cmp r6, #8*2
  1208. veor d0, d0, d31 @ A[0][0] ^= *inp++
  1209. blo .Lprocess_neon
  1210. vld1.8 {d31}, [r4]!
  1211. veor d2, d2, d31 @ A[0][1] ^= *inp++
  1212. beq .Lprocess_neon
  1213. vld1.8 {d31}, [r4]!
  1214. cmp r6, #8*4
  1215. veor d4, d4, d31 @ A[0][2] ^= *inp++
  1216. blo .Lprocess_neon
  1217. vld1.8 {d31}, [r4]!
  1218. veor d6, d6, d31 @ A[0][3] ^= *inp++
  1219. beq .Lprocess_neon
  1220. vld1.8 {d31},[r4]!
  1221. cmp r6, #8*6
  1222. veor d8, d8, d31 @ A[0][4] ^= *inp++
  1223. blo .Lprocess_neon
  1224. vld1.8 {d31}, [r4]!
  1225. veor d1, d1, d31 @ A[1][0] ^= *inp++
  1226. beq .Lprocess_neon
  1227. vld1.8 {d31}, [r4]!
  1228. cmp r6, #8*8
  1229. veor d3, d3, d31 @ A[1][1] ^= *inp++
  1230. blo .Lprocess_neon
  1231. vld1.8 {d31}, [r4]!
  1232. veor d5, d5, d31 @ A[1][2] ^= *inp++
  1233. beq .Lprocess_neon
  1234. vld1.8 {d31}, [r4]!
  1235. cmp r6, #8*10
  1236. veor d7, d7, d31 @ A[1][3] ^= *inp++
  1237. blo .Lprocess_neon
  1238. vld1.8 {d31}, [r4]!
  1239. veor d9, d9, d31 @ A[1][4] ^= *inp++
  1240. beq .Lprocess_neon
  1241. vld1.8 {d31}, [r4]!
  1242. cmp r6, #8*12
  1243. veor d10, d10, d31 @ A[2][0] ^= *inp++
  1244. blo .Lprocess_neon
  1245. vld1.8 {d31}, [r4]!
  1246. veor d12, d12, d31 @ A[2][1] ^= *inp++
  1247. beq .Lprocess_neon
  1248. vld1.8 {d31}, [r4]!
  1249. cmp r6, #8*14
  1250. veor d14, d14, d31 @ A[2][2] ^= *inp++
  1251. blo .Lprocess_neon
  1252. vld1.8 {d31}, [r4]!
  1253. veor d16, d16, d31 @ A[2][3] ^= *inp++
  1254. beq .Lprocess_neon
  1255. vld1.8 {d31}, [r4]!
  1256. cmp r6, #8*16
  1257. veor d18, d18, d31 @ A[2][4] ^= *inp++
  1258. blo .Lprocess_neon
  1259. vld1.8 {d31}, [r4]!
  1260. veor d11, d11, d31 @ A[3][0] ^= *inp++
  1261. beq .Lprocess_neon
  1262. vld1.8 {d31}, [r4]!
  1263. cmp r6, #8*18
  1264. veor d13, d13, d31 @ A[3][1] ^= *inp++
  1265. blo .Lprocess_neon
  1266. vld1.8 {d31}, [r4]!
  1267. veor d15, d15, d31 @ A[3][2] ^= *inp++
  1268. beq .Lprocess_neon
  1269. vld1.8 {d31}, [r4]!
  1270. cmp r6, #8*20
  1271. veor d17, d17, d31 @ A[3][3] ^= *inp++
  1272. blo .Lprocess_neon
  1273. vld1.8 {d31}, [r4]!
  1274. veor d19, d19, d31 @ A[3][4] ^= *inp++
  1275. beq .Lprocess_neon
  1276. vld1.8 {d31}, [r4]!
  1277. cmp r6, #8*22
  1278. veor d20, d20, d31 @ A[4][0] ^= *inp++
  1279. blo .Lprocess_neon
  1280. vld1.8 {d31}, [r4]!
  1281. veor d21, d21, d31 @ A[4][1] ^= *inp++
  1282. beq .Lprocess_neon
  1283. vld1.8 {d31}, [r4]!
  1284. cmp r6, #8*24
  1285. veor d22, d22, d31 @ A[4][2] ^= *inp++
  1286. blo .Lprocess_neon
  1287. vld1.8 {d31}, [r4]!
  1288. veor d23, d23, d31 @ A[4][3] ^= *inp++
  1289. beq .Lprocess_neon
  1290. vld1.8 {d31}, [r4]!
  1291. veor d24, d24, d31 @ A[4][4] ^= *inp++
  1292. .Lprocess_neon:
  1293. bl KeccakF1600_neon
  1294. b .Loop_absorb_neon
  1295. .align 4
  1296. .Labsorbed_neon:
  1297. vst1.32 {d0}, [r0:64]! @ A[0][0..4]
  1298. vst1.32 {d2}, [r0:64]!
  1299. vst1.32 {d4}, [r0:64]!
  1300. vst1.32 {d6}, [r0:64]!
  1301. vst1.32 {d8}, [r0:64]!
  1302. vst1.32 {d1}, [r0:64]! @ A[1][0..4]
  1303. vst1.32 {d3}, [r0:64]!
  1304. vst1.32 {d5}, [r0:64]!
  1305. vst1.32 {d7}, [r0:64]!
  1306. vst1.32 {d9}, [r0:64]!
  1307. vst1.32 {d10}, [r0:64]! @ A[2][0..4]
  1308. vst1.32 {d12}, [r0:64]!
  1309. vst1.32 {d14}, [r0:64]!
  1310. vst1.32 {d16}, [r0:64]!
  1311. vst1.32 {d18}, [r0:64]!
  1312. vst1.32 {d11}, [r0:64]! @ A[3][0..4]
  1313. vst1.32 {d13}, [r0:64]!
  1314. vst1.32 {d15}, [r0:64]!
  1315. vst1.32 {d17}, [r0:64]!
  1316. vst1.32 {d19}, [r0:64]!
  1317. vst1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
  1318. vst1.32 {d24}, [r0:64]
  1319. mov r0, r5 @ return value
  1320. vldmia sp!, {d8-d15}
  1321. ldmia sp!, {r4-r6,pc}
  1322. .size SHA3_absorb_neon,.-SHA3_absorb_neon
  1323. .global SHA3_squeeze_neon
  1324. .type SHA3_squeeze_neon, %function
  1325. .align 5
  1326. SHA3_squeeze_neon:
  1327. stmdb sp!, {r4-r6,lr}
  1328. mov r4, r1 @ out
  1329. mov r5, r2 @ len
  1330. mov r6, r3 @ bsz
  1331. mov r12, r0 @ A_flat
  1332. mov r14, r3 @ bsz
  1333. b .Loop_squeeze_neon
  1334. .align 4
  1335. .Loop_squeeze_neon:
  1336. cmp r5, #8
  1337. blo .Lsqueeze_neon_tail
  1338. vld1.32 {d0}, [r12]!
  1339. vst1.8 {d0}, [r4]! @ endian-neutral store
  1340. subs r5, r5, #8 @ len -= 8
  1341. beq .Lsqueeze_neon_done
  1342. subs r14, r14, #8 @ bsz -= 8
  1343. bhi .Loop_squeeze_neon
  1344. vstmdb sp!, {d8-d15}
  1345. vld1.32 {d0}, [r0:64]! @ A[0][0..4]
  1346. vld1.32 {d2}, [r0:64]!
  1347. vld1.32 {d4}, [r0:64]!
  1348. vld1.32 {d6}, [r0:64]!
  1349. vld1.32 {d8}, [r0:64]!
  1350. vld1.32 {d1}, [r0:64]! @ A[1][0..4]
  1351. vld1.32 {d3}, [r0:64]!
  1352. vld1.32 {d5}, [r0:64]!
  1353. vld1.32 {d7}, [r0:64]!
  1354. vld1.32 {d9}, [r0:64]!
  1355. vld1.32 {d10}, [r0:64]! @ A[2][0..4]
  1356. vld1.32 {d12}, [r0:64]!
  1357. vld1.32 {d14}, [r0:64]!
  1358. vld1.32 {d16}, [r0:64]!
  1359. vld1.32 {d18}, [r0:64]!
  1360. vld1.32 {d11}, [r0:64]! @ A[3][0..4]
  1361. vld1.32 {d13}, [r0:64]!
  1362. vld1.32 {d15}, [r0:64]!
  1363. vld1.32 {d17}, [r0:64]!
  1364. vld1.32 {d19}, [r0:64]!
  1365. vld1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
  1366. vld1.32 {d24}, [r0:64]
  1367. sub r0, r0, #24*8 @ rewind
  1368. bl KeccakF1600_neon
  1369. mov r12, r0 @ A_flat
  1370. vst1.32 {d0}, [r0:64]! @ A[0][0..4]
  1371. vst1.32 {d2}, [r0:64]!
  1372. vst1.32 {d4}, [r0:64]!
  1373. vst1.32 {d6}, [r0:64]!
  1374. vst1.32 {d8}, [r0:64]!
  1375. vst1.32 {d1}, [r0:64]! @ A[1][0..4]
  1376. vst1.32 {d3}, [r0:64]!
  1377. vst1.32 {d5}, [r0:64]!
  1378. vst1.32 {d7}, [r0:64]!
  1379. vst1.32 {d9}, [r0:64]!
  1380. vst1.32 {d10}, [r0:64]! @ A[2][0..4]
  1381. vst1.32 {d12}, [r0:64]!
  1382. vst1.32 {d14}, [r0:64]!
  1383. vst1.32 {d16}, [r0:64]!
  1384. vst1.32 {d18}, [r0:64]!
  1385. vst1.32 {d11}, [r0:64]! @ A[3][0..4]
  1386. vst1.32 {d13}, [r0:64]!
  1387. vst1.32 {d15}, [r0:64]!
  1388. vst1.32 {d17}, [r0:64]!
  1389. vst1.32 {d19}, [r0:64]!
  1390. vst1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
  1391. mov r14, r6 @ bsz
  1392. vst1.32 {d24}, [r0:64]
  1393. mov r0, r12 @ rewind
  1394. vldmia sp!, {d8-d15}
  1395. b .Loop_squeeze_neon
  1396. .align 4
  1397. .Lsqueeze_neon_tail:
  1398. ldmia r12, {r2,r3}
  1399. cmp r5, #2
  1400. strb r2, [r4],#1 @ endian-neutral store
  1401. lsr r2, r2, #8
  1402. blo .Lsqueeze_neon_done
  1403. strb r2, [r4], #1
  1404. lsr r2, r2, #8
  1405. beq .Lsqueeze_neon_done
  1406. strb r2, [r4], #1
  1407. lsr r2, r2, #8
  1408. cmp r5, #4
  1409. blo .Lsqueeze_neon_done
  1410. strb r2, [r4], #1
  1411. beq .Lsqueeze_neon_done
  1412. strb r3, [r4], #1
  1413. lsr r3, r3, #8
  1414. cmp r5, #6
  1415. blo .Lsqueeze_neon_done
  1416. strb r3, [r4], #1
  1417. lsr r3, r3, #8
  1418. beq .Lsqueeze_neon_done
  1419. strb r3, [r4], #1
  1420. .Lsqueeze_neon_done:
  1421. ldmia sp!, {r4-r6,pc}
  1422. .size SHA3_squeeze_neon,.-SHA3_squeeze_neon
  1423. #endif
  1424. .asciz "Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
  1425. .align 2
  1426. ___
  {
      # Operand of the most recent ".l" (low-half) pseudo-instruction, kept
      # separately for loads and for stores.  Keys: reg (register name) and
      # ea (effective-address text).
      #
      # The parentheses matter: "my %ldr, %str;" declares only %ldr and
      # leaves %str as an undeclared package global.
      my (%ldr, %str);

      # ldrd($mnemonic, $half, $reg, $ea)
      #
      # Expands one half of a paired 64-bit access written as
      # "ldr.l"/"ldr.h" or "str.l"/"str.h".
      #
      #   $mnemonic  "ldr" or "str" (anything other than "ldr" is tracked
      #              in the store slot)
      #   $half      "l" for the low word, anything else for the high word
      #   $reg       register operand, e.g. "r4"
      #   $ea        effective-address text, e.g. "[sp,#8]"
      #
      # Returns preprocessor-guarded assembly text:
      #   - low half:  the single-word instruction, emitted only when
      #     __thumb2__ is not defined; its operands are remembered;
      #   - high half: the single-word instruction when __thumb2__ is not
      #     defined, otherwise one "<mnemonic>d" instruction combining the
      #     remembered low register and address with this register.
      sub ldrd {
          my ($mnemonic, $half, $reg, $ea) = @_;
          my $op = $mnemonic eq "ldr" ? \%ldr : \%str;

          if ($half eq "l") {
              # Remember the operands for the matching high-half call.
              $op->{reg} = $reg;
              $op->{ea}  = $ea;
              return sprintf("#ifndef __thumb2__\n" .
                             " %s\t%s,%s\n" .
                             "#endif", $mnemonic, $reg, $ea);
          }

          return sprintf("#ifndef __thumb2__\n" .
                         " %s\t%s,%s\n" .
                         "#else\n" .
                         " %sd\t%s,%s,%s\n" .
                         "#endif", $mnemonic, $reg, $ea,
                                   $mnemonic, $op->{reg}, $reg, $op->{ea});
      }
  }
  # Post-process the accumulated assembly text one line at a time and write
  # it to STDOUT.
  foreach (split($/,$code)) {
      # Replace every `...` span with the result of evaluating it as Perl.
      s/\`([^\`]*)\`/eval $1/ge;

      # The rewrites below are chained with "or", so the first one that
      # matches is the only one applied to a given line:
      #   1. "ldr.l/.h" and "str.l/.h" pseudo-instructions go through ldrd();
      #   2. stand-alone "ror/lsr/lsl rX,...,#n" becomes "mov rX,...,<op>#n";
      #   3. "ret" becomes "bx lr";
      #   4. "bx lr" becomes its raw encoding.
      s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge or
      s/\b(ror|ls[rl])\s+(r[0-9]+.*)#/mov $2$1#/g or
      s/\bret\b/bx lr/g or
      s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4

      print $_,"\n";
  }

  # Closing forces the final flush; a failure here (full disk, closed pipe)
  # means the generated file is incomplete, so it must not pass silently.
  close STDOUT or die "error closing STDOUT: $!";