keccak1600-armv4.pl
#!/usr/bin/env perl
# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for ARMv4.
#
# June 2017.
#
# Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit
# interleaving. How does it compare to Keccak Code Package? It's as
# fast, but several times smaller, and is endian- and ISA-neutral. ISA
# neutrality means that the minimum ISA requirement is ARMv4, yet it
# can be assembled even as Thumb-2. The NEON code path is KECCAK_1X_ALT
# with the register layout taken from the Keccak Code Package. It's
# also as fast, in fact 10-15% faster on some processors, and
# endian-neutral.
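#
# A quick reminder on bit interleaving: each 64-bit lane is kept as
# two 32-bit words, one holding the even-numbered bits and one the
# odd-numbered ones, so that a 64-bit rotation costs only a pair of
# 32-bit rotations. In C-like pseudo-code, rotating a lane (even,odd)
# left by n amounts to
#
#	if (n%2 == 0)	{ even = ROL32(even,n/2); odd = ROL32(odd,n/2); }
#	else		{ tmp  = even;
#			  even = ROL32(odd,(n+1)/2);
#			  odd  = ROL32(tmp,n/2); }
#
# which is what keeps a 32-bit-only ISA competitive here.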
#
# August 2017.
#
# Switch to KECCAK_2X variant for non-NEON code and merge almost 1/2
# of the rotate instructions with logical ones. This resulted in ~10%
# improvement on most processors. Switching to KECCAK_2X effectively
# minimizes re-loads from temporary storage, and the merged rotates
# simply eliminate the corresponding instructions. As for the latter:
# when examining the code you'll notice commented-out ror instructions.
# These are the eliminated ones; trace the destination register below
# each to see what's going on. In case you wonder why not all rotates
# are eliminated: the trouble is that some operations require both
# inputs to be rotated, e.g. 'eor a,b>>>x,c>>>y'. This conundrum is
# resolved by using 'eor a,b,c>>>(x-y)' and then merge-rotating 'a' in
# the next operation that takes 'a' as input. The catch is that this
# next operation can be in the next round. It's entirely possible to
# "carry" rotate "factors" to the next round, but it makes the code
# more complex, and "complex" is the operative word here, i.e. "almost
# 1/2" is a kind of complexity cap [for the time being]...
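#
# To illustrate the merge itself: the second operand of most ARM
# data-processing instructions passes through the barrel shifter for
# free, so a pair like
#
#	ror	r2,r2,#22
#	eor	r0,r1,r2
#
# collapses into
#
#	eor	r0,r1,r2,ror#22
#
# leaving r2 un-rotated; every later consumer of r2 then has to apply
# the pending rotation itself, which is what the commented-out ror
# instructions keep track of.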
#
# Reduce per-round instruction count in Thumb-2 case by 16%. This is
# achieved by folding ldr/str pairs to their double-word counterparts.
# Theoretically this should have improved performance on single-issue
# cores, such as Cortex-A5/A7, by 19%. Reality is a bit different, as
# usual...
#
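# For example, a 64-bit lane loaded as two 32-bit halves,
#
#	ldr	r4,[sp,#8]
#	ldr	r5,[sp,#12]
#
# becomes a single double-word load in Thumb-2,
#
#	ldrd	r4,r5,[sp,#8]
#
# see the ldr.l/ldr.h markers below and the ldrd() helper at the
# bottom of this file that performs the folding.
#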
########################################################################
# Numbers are cycles per processed byte. Non-NEON results include even
# the cost of input bit interleaving.
#
#			r=1088(*)	Thumb-2(**)	NEON
#
# ARM11xx		82/+150%
# Cortex-A5		88/+160%,	86,		36
# Cortex-A7		78/+160%,	68,		34
# Cortex-A8		51/+230%,	57,		30
# Cortex-A9		53/+210%,	51,		26
# Cortex-A15		42/+160%,	38,		18
# Snapdragon S4		43/+210%,	38,		24
#
# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
#	over compiler-generated KECCAK_2X reference code.
# (**)	Thumb-2 results for Cortex-A5/A7 are likely to apply even to
#	Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable
#	processors are presented mostly for reference purposes.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}

my @C = map("r$_",(0..9));
my @E = map("r$_",(10..12,14));

########################################################################
# Stack layout
# ----->+-----------------------+
#       | uint64_t A[5][5]      |
#       | ...                   |
# +200->+-----------------------+
#       | uint64_t D[5]         |
#       | ...                   |
# +240->+-----------------------+
#       | uint64_t T[5][5]      |
#       | ...                   |
# +440->+-----------------------+
#       | saved lr              |
# +444->+-----------------------+
#       | loop counter          |
# +448->+-----------------------+
#       | ...

my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
my @D = map(8*$_, (25..29));
my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35,40,45,50));

$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.type	iotas32, %object
.align	5
iotas32:
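	@ The 24 Keccak round constants, stored bit-interleaved: the first
	@ word of each pair holds the even-numbered bits of the 64-bit RC
	@ value, the second one the odd-numbered bits.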
	.long	0x00000001, 0x00000000
	.long	0x00000000, 0x00000089
	.long	0x00000000, 0x8000008b
	.long	0x00000000, 0x80008080
	.long	0x00000001, 0x0000008b
	.long	0x00000001, 0x00008000
	.long	0x00000001, 0x80008088
	.long	0x00000001, 0x80000082
	.long	0x00000000, 0x0000000b
	.long	0x00000000, 0x0000000a
	.long	0x00000001, 0x00008082
	.long	0x00000000, 0x00008003
	.long	0x00000001, 0x0000808b
	.long	0x00000001, 0x8000000b
	.long	0x00000001, 0x8000008a
	.long	0x00000001, 0x80000081
	.long	0x00000000, 0x80000081
	.long	0x00000000, 0x80000008
	.long	0x00000000, 0x00000083
	.long	0x00000000, 0x80008003
	.long	0x00000001, 0x80008088
	.long	0x00000000, 0x80000088
	.long	0x00000001, 0x00008000
	.long	0x00000000, 0x80008082
.size	iotas32,.-iotas32

.type	KeccakF1600_int, %function
.align	5
KeccakF1600_int:
	add	@C[9],sp,#$A[4][2]
	add	@E[2],sp,#$A[0][0]
	add	@E[0],sp,#$A[1][0]
	ldmia	@C[9],{@C[4]-@C[9]}		@ A[4][2..4]
KeccakF1600_enter:
	str	lr,[sp,#440]
	eor	@E[1],@E[1],@E[1]
	str	@E[1],[sp,#444]
	b	.Lround2x

.align	4
.Lround2x:
___

sub Round {
my (@A,@R); (@A[0..4],@R) = @_;
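# One round, operating on bit-interleaved halves: the five column
# parities C[0..4] are accumulated in the register pairs @C[0,1] ..
# @C[8,9], with @E[] as scratch. Lanes are read from the @A-indexed
# slots and results land in the @R-indexed ones, so two back-to-back
# invocations bounce the state A[][] -> T[][] -> A[][], which is the
# KECCAK_2X arrangement mentioned at the top.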
$code.=<<___;
	ldmia	@E[2],{@C[0]-@C[3]}		@ A[0][0..1]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][0..1]
#ifdef	__thumb2__
	eor	@C[0],@C[0],@E[0]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[1][2]]
	eor	@C[3],@C[3],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[1][3]]
	eor	@C[4],@C[4],@E[0]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[1][4]]
	eor	@C[7],@C[7],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][0]]
	eor	@C[8],@C[8],@E[0]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[2][1]]
	eor	@C[1],@C[1],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][2]]
	eor	@C[2],@C[2],@E[0]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[2][3]]
	eor	@C[5],@C[5],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][4]]
	eor	@C[6],@C[6],@E[0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][0]]
	eor	@C[9],@C[9],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[3][1]]
	eor	@C[0],@C[0],@E[0]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][2]]
	eor	@C[3],@C[3],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[3][3]]
	eor	@C[4],@C[4],@E[0]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][4]]
	eor	@C[7],@C[7],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[4][0]]
	eor	@C[8],@C[8],@E[0]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[4][1]]
	eor	@C[1],@C[1],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[0][2]]
	eor	@C[2],@C[2],@E[0]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[0][3]]
	eor	@C[5],@C[5],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[0][4]]
#else
	eor	@C[0],@C[0],@E[0]
	add	@E[0],sp,#$A[1][2]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	eor	@C[3],@C[3],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][2..3]
	eor	@C[4],@C[4],@E[0]
	add	@E[0],sp,#$A[1][4]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	eor	@C[7],@C[7],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][4]..A[2][0]
	eor	@C[8],@C[8],@E[0]
	add	@E[0],sp,#$A[2][1]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	eor	@C[1],@C[1],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[2][1..2]
	eor	@C[2],@C[2],@E[0]
	add	@E[0],sp,#$A[2][3]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	eor	@C[5],@C[5],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[2][3..4]
	eor	@C[6],@C[6],@E[0]
	add	@E[0],sp,#$A[3][0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][0..1]
	eor	@C[0],@C[0],@E[0]
	add	@E[0],sp,#$A[3][2]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	eor	@C[3],@C[3],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][2..3]
	eor	@C[4],@C[4],@E[0]
	add	@E[0],sp,#$A[3][4]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	eor	@C[7],@C[7],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][4]..A[4][0]
	eor	@C[8],@C[8],@E[0]
	ldr	@E[0],[sp,#$A[4][1]]		@ A[4][1]
	eor	@C[9],@C[9],@E[1]
	ldr	@E[1],[sp,#$A[4][1]+4]
	eor	@C[0],@C[0],@E[2]
	ldr	@E[2],[sp,#$A[0][2]]		@ A[0][2]
	eor	@C[1],@C[1],@E[3]
	ldr	@E[3],[sp,#$A[0][2]+4]
	eor	@C[2],@C[2],@E[0]
	add	@E[0],sp,#$A[0][3]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	eor	@C[5],@C[5],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[0][3..4]
#endif
	eor	@C[6],@C[6],@E[0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]

	eor	@E[0],@C[0],@C[5],ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
	str.l	@E[0],[sp,#$D[1]]		@ D[1] = E[0]
	eor	@E[1],@C[1],@C[4]
	str.h	@E[1],[sp,#$D[1]+4]
	eor	@E[2],@C[6],@C[1],ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
	eor	@E[3],@C[7],@C[0]
	str.l	@E[2],[sp,#$D[4]]		@ D[4] = E[1]
	eor	@C[0],@C[8],@C[3],ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
	str.h	@E[3],[sp,#$D[4]+4]
	eor	@C[1],@C[9],@C[2]
	str.l	@C[0],[sp,#$D[0]]		@ D[0] = C[0]
	eor	@C[2],@C[2],@C[7],ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
	ldr.l	@C[7],[sp,#$A[3][3]]
	eor	@C[3],@C[3],@C[6]
	str.h	@C[1],[sp,#$D[0]+4]
	ldr.h	@C[6],[sp,#$A[3][3]+4]
	str.l	@C[2],[sp,#$D[2]]		@ D[2] = C[1]
	eor	@C[4],@C[4],@C[9],ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
	str.h	@C[3],[sp,#$D[2]+4]
	eor	@C[5],@C[5],@C[8]

	ldr.l	@C[8],[sp,#$A[4][4]]
	ldr.h	@C[9],[sp,#$A[4][4]+4]
	str.l	@C[4],[sp,#$D[3]]		@ D[3] = C[2]
	eor	@C[7],@C[7],@C[4]
	str.h	@C[5],[sp,#$D[3]+4]
	eor	@C[6],@C[6],@C[5]
	ldr.l	@C[4],[sp,#$A[0][0]]
	@ ror	@C[7],@C[7],#32-10		@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
	@ ror	@C[6],@C[6],#32-11
	ldr.h	@C[5],[sp,#$A[0][0]+4]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	ldr.l	@E[2],[sp,#$A[2][2]]
	eor	@C[0],@C[0],@C[4]
	ldr.h	@E[3],[sp,#$A[2][2]+4]
	@ ror	@C[8],@C[8],#32-7		@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
	@ ror	@C[9],@C[9],#32-7
	eor	@C[1],@C[1],@C[5]		@ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */  /* D[0] */
	eor	@E[2],@E[2],@C[2]
	ldr.l	@C[2],[sp,#$A[1][1]]
	eor	@E[3],@E[3],@C[3]
	ldr.h	@C[3],[sp,#$A[1][1]+4]
	ror	@C[5],@E[2],#32-21		@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);   /* D[2] */
	ldr	@E[2],[sp,#444]			@ load counter
	eor	@C[2],@C[2],@E[0]
	adr	@E[0],iotas32
	ror	@C[4],@E[3],#32-22
	add	@E[3],@E[0],@E[2]
	eor	@C[3],@C[3],@E[1]
___
$code.=<<___	if ($A[0][0] != $T[0][0]);
	ldmia	@E[3],{@E[0],@E[1]}		@ iotas[i]
___
$code.=<<___	if ($A[0][0] == $T[0][0]);
	ldr.l	@E[0],[@E[3],#8]		@ iotas[i].lo
	add	@E[2],@E[2],#16
	ldr.h	@E[1],[@E[3],#12]		@ iotas[i].hi
	cmp	@E[2],#192
	str	@E[2],[sp,#444]			@ store counter
___
$code.=<<___;
	bic	@E[2],@C[4],@C[2],ror#32-22
	bic	@E[3],@C[5],@C[3],ror#32-22
	ror	@C[2],@C[2],#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);   /* D[1] */
	ror	@C[3],@C[3],#32-22
	eor	@E[2],@E[2],@C[0]
	eor	@E[3],@E[3],@C[1]
	eor	@E[0],@E[0],@E[2]
	eor	@E[1],@E[1],@E[3]
	str.l	@E[0],[sp,#$R[0][0]]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
	bic	@E[2],@C[6],@C[4],ror#11
	str.h	@E[1],[sp,#$R[0][0]+4]
	bic	@E[3],@C[7],@C[5],ror#10
	bic	@E[0],@C[8],@C[6],ror#32-(11-7)
	bic	@E[1],@C[9],@C[7],ror#32-(10-7)
	eor	@E[2],@C[2],@E[2],ror#32-11
	str.l	@E[2],[sp,#$R[0][1]]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@C[3],@E[3],ror#32-10
	str.h	@E[3],[sp,#$R[0][1]+4]
	eor	@E[0],@C[4],@E[0],ror#32-7
	eor	@E[1],@C[5],@E[1],ror#32-7
	str.l	@E[0],[sp,#$R[0][2]]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
	bic	@E[2],@C[0],@C[8],ror#32-7
	str.h	@E[1],[sp,#$R[0][2]+4]
	bic	@E[3],@C[1],@C[9],ror#32-7
	eor	@E[2],@E[2],@C[6],ror#32-11
	str.l	@E[2],[sp,#$R[0][3]]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
	eor	@E[3],@E[3],@C[7],ror#32-10
	str.h	@E[3],[sp,#$R[0][3]+4]
	bic	@E[0],@C[2],@C[0]
	add	@E[3],sp,#$D[3]
	ldr.l	@C[0],[sp,#$A[0][3]]		@ A[0][3]
	bic	@E[1],@C[3],@C[1]
	ldr.h	@C[1],[sp,#$A[0][3]+4]
	eor	@E[0],@E[0],@C[8],ror#32-7
	eor	@E[1],@E[1],@C[9],ror#32-7
	str.l	@E[0],[sp,#$R[0][4]]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
	add	@C[9],sp,#$D[0]
	str.h	@E[1],[sp,#$R[0][4]+4]

	ldmia	@E[3],{@E[0]-@E[2],@E[3]}	@ D[3..4]
	ldmia	@C[9],{@C[6]-@C[9]}		@ D[0..1]

	ldr.l	@C[2],[sp,#$A[1][4]]		@ A[1][4]
	eor	@C[0],@C[0],@E[0]
	ldr.h	@C[3],[sp,#$A[1][4]+4]
	eor	@C[1],@C[1],@E[1]
	@ ror	@C[0],@C[0],#32-14		@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
	ldr.l	@E[0],[sp,#$A[3][1]]		@ A[3][1]
	@ ror	@C[1],@C[1],#32-14
	ldr.h	@E[1],[sp,#$A[3][1]+4]
	eor	@C[2],@C[2],@E[2]
	ldr.l	@C[4],[sp,#$A[2][0]]		@ A[2][0]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@C[5],[sp,#$A[2][0]+4]
	@ ror	@C[2],@C[2],#32-10		@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
	@ ror	@C[3],@C[3],#32-10
	eor	@C[6],@C[6],@C[4]
	ldr.l	@E[2],[sp,#$D[2]]		@ D[2]
	eor	@C[7],@C[7],@C[5]
	ldr.h	@E[3],[sp,#$D[2]+4]
	ror	@C[5],@C[6],#32-1		@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
	ror	@C[4],@C[7],#32-2
	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][2]]		@ A[4][2]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][2]+4]
	ror	@C[7],@E[0],#32-22		@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
	ror	@C[6],@E[1],#32-23
	bic	@E[0],@C[4],@C[2],ror#32-10
	bic	@E[1],@C[5],@C[3],ror#32-10
	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[9],@E[2],#32-30		@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
	ror	@C[8],@E[3],#32-31
	eor	@E[0],@E[0],@C[0],ror#32-14
	eor	@E[1],@E[1],@C[1],ror#32-14
	str.l	@E[0],[sp,#$R[1][0]]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
	bic	@E[2],@C[6],@C[4]
	str.h	@E[1],[sp,#$R[1][0]+4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2],ror#32-10
	str.l	@E[2],[sp,#$R[1][1]]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@E[3],@C[3],ror#32-10
	str.h	@E[3],[sp,#$R[1][1]+4]
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	bic	@E[2],@C[0],@C[8],ror#14
	bic	@E[3],@C[1],@C[9],ror#14
	eor	@E[0],@E[0],@C[4]
	eor	@E[1],@E[1],@C[5]
	str.l	@E[0],[sp,#$R[1][2]]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
	bic	@C[2],@C[2],@C[0],ror#32-(14-10)
	str.h	@E[1],[sp,#$R[1][2]+4]
	eor	@E[2],@C[6],@E[2],ror#32-14
	bic	@E[1],@C[3],@C[1],ror#32-(14-10)
	str.l	@E[2],[sp,#$R[1][3]]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
	eor	@E[3],@C[7],@E[3],ror#32-14
	str.h	@E[3],[sp,#$R[1][3]+4]
	add	@E[2],sp,#$D[1]
	ldr.l	@C[1],[sp,#$A[0][1]]		@ A[0][1]
	eor	@E[0],@C[8],@C[2],ror#32-10
	ldr.h	@C[0],[sp,#$A[0][1]+4]
	eor	@E[1],@C[9],@E[1],ror#32-10
	str.l	@E[0],[sp,#$R[1][4]]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
	str.h	@E[1],[sp,#$R[1][4]+4]

	add	@C[9],sp,#$D[3]
	ldmia	@E[2],{@E[0]-@E[2],@E[3]}	@ D[1..2]
	ldr.l	@C[2],[sp,#$A[1][2]]		@ A[1][2]
	ldr.h	@C[3],[sp,#$A[1][2]+4]
	ldmia	@C[9],{@C[6]-@C[9]}		@ D[3..4]

	eor	@C[1],@C[1],@E[0]
	ldr.l	@C[4],[sp,#$A[2][3]]		@ A[2][3]
	eor	@C[0],@C[0],@E[1]
	ldr.h	@C[5],[sp,#$A[2][3]+4]
	ror	@C[0],@C[0],#32-1		@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
	eor	@C[2],@C[2],@E[2]
	ldr.l	@E[0],[sp,#$A[3][4]]		@ A[3][4]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@E[1],[sp,#$A[3][4]+4]
	@ ror	@C[2],@C[2],#32-3		@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
	ldr.l	@E[2],[sp,#$D[0]]		@ D[0]
	@ ror	@C[3],@C[3],#32-3
	ldr.h	@E[3],[sp,#$D[0]+4]
	eor	@C[4],@C[4],@C[6]
	eor	@C[5],@C[5],@C[7]
	@ ror	@C[5],@C[6],#32-12		@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
	@ ror	@C[4],@C[7],#32-13		@ [track reverse order below]
	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][0]]		@ A[4][0]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][0]+4]
	ror	@C[6],@E[0],#32-4		@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
	ror	@C[7],@E[1],#32-4
	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[8],@E[2],#32-9		@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
	ror	@C[9],@E[3],#32-9
	bic	@E[0],@C[5],@C[2],ror#13-3
	bic	@E[1],@C[4],@C[3],ror#12-3
	bic	@E[2],@C[6],@C[5],ror#32-13
	bic	@E[3],@C[7],@C[4],ror#32-12
	eor	@E[0],@C[0],@E[0],ror#32-13
	eor	@E[1],@C[1],@E[1],ror#32-12
	str.l	@E[0],[sp,#$R[2][0]]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
	eor	@E[2],@E[2],@C[2],ror#32-3
	str.h	@E[1],[sp,#$R[2][0]+4]
	eor	@E[3],@E[3],@C[3],ror#32-3
	str.l	@E[2],[sp,#$R[2][1]]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	str.h	@E[3],[sp,#$R[2][1]+4]
	eor	@E[0],@E[0],@C[5],ror#32-13
	eor	@E[1],@E[1],@C[4],ror#32-12
	str.l	@E[0],[sp,#$R[2][2]]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
	bic	@E[2],@C[0],@C[8]
	str.h	@E[1],[sp,#$R[2][2]+4]
	bic	@E[3],@C[1],@C[9]
	eor	@E[2],@E[2],@C[6]
	eor	@E[3],@E[3],@C[7]
	str.l	@E[2],[sp,#$R[2][3]]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
	bic	@E[0],@C[2],@C[0],ror#3
	str.h	@E[3],[sp,#$R[2][3]+4]
	bic	@E[1],@C[3],@C[1],ror#3
	ldr.l	@C[1],[sp,#$A[0][4]]		@ A[0][4] [in reverse order]
	eor	@E[0],@C[8],@E[0],ror#32-3
	ldr.h	@C[0],[sp,#$A[0][4]+4]
	eor	@E[1],@C[9],@E[1],ror#32-3
	str.l	@E[0],[sp,#$R[2][4]]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
	add	@C[9],sp,#$D[1]
	str.h	@E[1],[sp,#$R[2][4]+4]
	ldr.l	@E[0],[sp,#$D[4]]		@ D[4]
	ldr.h	@E[1],[sp,#$D[4]+4]
	ldr.l	@E[2],[sp,#$D[0]]		@ D[0]
	ldr.h	@E[3],[sp,#$D[0]+4]
	ldmia	@C[9],{@C[6]-@C[9]}		@ D[1..2]

	eor	@C[1],@C[1],@E[0]
	ldr.l	@C[2],[sp,#$A[1][0]]		@ A[1][0]
	eor	@C[0],@C[0],@E[1]
	ldr.h	@C[3],[sp,#$A[1][0]+4]
	@ ror	@C[1],@E[0],#32-13		@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
	ldr.l	@C[4],[sp,#$A[2][1]]		@ A[2][1]
	@ ror	@C[0],@E[1],#32-14		@ [was loaded in reverse order]
	ldr.h	@C[5],[sp,#$A[2][1]+4]
	eor	@C[2],@C[2],@E[2]
	ldr.l	@E[0],[sp,#$A[3][2]]		@ A[3][2]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@E[1],[sp,#$A[3][2]+4]
	@ ror	@C[2],@C[2],#32-18		@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
	ldr.l	@E[2],[sp,#$D[3]]		@ D[3]
	@ ror	@C[3],@C[3],#32-18
	ldr.h	@E[3],[sp,#$D[3]+4]
	eor	@C[6],@C[6],@C[4]
	eor	@C[7],@C[7],@C[5]
	ror	@C[4],@C[6],#32-5		@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
	ror	@C[5],@C[7],#32-5
	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][3]]		@ A[4][3]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][3]+4]
	ror	@C[7],@E[0],#32-7		@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
	ror	@C[6],@E[1],#32-8
	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[8],@E[2],#32-28		@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
	ror	@C[9],@E[3],#32-28
	bic	@E[0],@C[4],@C[2],ror#32-18
	bic	@E[1],@C[5],@C[3],ror#32-18
	eor	@E[0],@E[0],@C[0],ror#32-14
	eor	@E[1],@E[1],@C[1],ror#32-13
	str.l	@E[0],[sp,#$R[3][0]]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
	bic	@E[2],@C[6],@C[4]
	str.h	@E[1],[sp,#$R[3][0]+4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2],ror#32-18
	str.l	@E[2],[sp,#$R[3][1]]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@E[3],@C[3],ror#32-18
	str.h	@E[3],[sp,#$R[3][1]+4]
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	bic	@E[2],@C[0],@C[8],ror#14
	bic	@E[3],@C[1],@C[9],ror#13
	eor	@E[0],@E[0],@C[4]
	eor	@E[1],@E[1],@C[5]
	str.l	@E[0],[sp,#$R[3][2]]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
	bic	@C[2],@C[2],@C[0],ror#18-14
	str.h	@E[1],[sp,#$R[3][2]+4]
	eor	@E[2],@C[6],@E[2],ror#32-14
	bic	@E[1],@C[3],@C[1],ror#18-13
	eor	@E[3],@C[7],@E[3],ror#32-13
	str.l	@E[2],[sp,#$R[3][3]]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
	str.h	@E[3],[sp,#$R[3][3]+4]
	add	@E[3],sp,#$D[2]
	ldr.l	@C[0],[sp,#$A[0][2]]		@ A[0][2]
	eor	@E[0],@C[8],@C[2],ror#32-18
	ldr.h	@C[1],[sp,#$A[0][2]+4]
	eor	@E[1],@C[9],@E[1],ror#32-18
	str.l	@E[0],[sp,#$R[3][4]]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
	str.h	@E[1],[sp,#$R[3][4]+4]

	ldmia	@E[3],{@E[0]-@E[2],@E[3]}	@ D[2..3]
	ldr.l	@C[2],[sp,#$A[1][3]]		@ A[1][3]
	ldr.h	@C[3],[sp,#$A[1][3]+4]
	ldr.l	@C[6],[sp,#$D[4]]		@ D[4]
	ldr.h	@C[7],[sp,#$D[4]+4]

	eor	@C[0],@C[0],@E[0]
	ldr.l	@C[4],[sp,#$A[2][4]]		@ A[2][4]
	eor	@C[1],@C[1],@E[1]
	ldr.h	@C[5],[sp,#$A[2][4]+4]
	@ ror	@C[0],@C[0],#32-31		@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
	ldr.l	@C[8],[sp,#$D[0]]		@ D[0]
	@ ror	@C[1],@C[1],#32-31
	ldr.h	@C[9],[sp,#$D[0]+4]
	eor	@E[2],@E[2],@C[2]
	ldr.l	@E[0],[sp,#$A[3][0]]		@ A[3][0]
	eor	@E[3],@E[3],@C[3]
	ldr.h	@E[1],[sp,#$A[3][0]+4]
	ror	@C[3],@E[2],#32-27		@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
	ldr.l	@E[2],[sp,#$D[1]]		@ D[1]
	ror	@C[2],@E[3],#32-28
	ldr.h	@E[3],[sp,#$D[1]+4]
	eor	@C[6],@C[6],@C[4]
	eor	@C[7],@C[7],@C[5]
	ror	@C[5],@C[6],#32-19		@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
	ror	@C[4],@C[7],#32-20
	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][1]]		@ A[4][1]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][1]+4]
	ror	@C[7],@E[0],#32-20		@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
	ror	@C[6],@E[1],#32-21
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	@ ror	@C[8],@C[2],#32-1		@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
	@ ror	@C[9],@C[3],#32-1
	bic	@E[0],@C[4],@C[2]
	bic	@E[1],@C[5],@C[3]
	eor	@E[0],@E[0],@C[0],ror#32-31
	str.l	@E[0],[sp,#$R[4][0]]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
	eor	@E[1],@E[1],@C[1],ror#32-31
	str.h	@E[1],[sp,#$R[4][0]+4]
	bic	@E[2],@C[6],@C[4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2]
	eor	@E[3],@E[3],@C[3]
	str.l	@E[2],[sp,#$R[4][1]]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
	bic	@E[0],@C[8],@C[6],ror#1
	str.h	@E[3],[sp,#$R[4][1]+4]
	bic	@E[1],@C[9],@C[7],ror#1
	bic	@E[2],@C[0],@C[8],ror#31-1
	bic	@E[3],@C[1],@C[9],ror#31-1
	eor	@C[4],@C[4],@E[0],ror#32-1
	str.l	@C[4],[sp,#$R[4][2]]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
	eor	@C[5],@C[5],@E[1],ror#32-1
	str.h	@C[5],[sp,#$R[4][2]+4]
	eor	@C[6],@C[6],@E[2],ror#32-31
	eor	@C[7],@C[7],@E[3],ror#32-31
	str.l	@C[6],[sp,#$R[4][3]]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
	bic	@E[0],@C[2],@C[0],ror#32-31
	str.h	@C[7],[sp,#$R[4][3]+4]
	bic	@E[1],@C[3],@C[1],ror#32-31
	add	@E[2],sp,#$R[0][0]
	eor	@C[8],@E[0],@C[8],ror#32-1
	add	@E[0],sp,#$R[1][0]
	eor	@C[9],@E[1],@C[9],ror#32-1
	str.l	@C[8],[sp,#$R[4][4]]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
	str.h	@C[9],[sp,#$R[4][4]+4]
___
}
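# Two rounds are stitched per loop iteration: the first writes A[][]
# to T[][], the second writes T[][] back to A[][]; the iota counter
# update and the loop-exit comparison are emitted in the second
# instance only (the $A[0][0] == $T[0][0] case above).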
Round(@A,@T);
Round(@T,@A);
$code.=<<___;
	blo	.Lround2x

#if __ARM_ARCH__>=5
	ldr	pc,[sp,#440]
#else
	ldr	lr,[sp,#440]
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600, %function
.align	5
KeccakF1600:
	stmdb	sp!,{r0,r4-r11,lr}
	sub	sp,sp,#440+16		@ space for A[5][5],D[5],T[5][5],...

	add	@E[0],r0,#$A[1][0]
	add	@E[1],sp,#$A[1][0]
	ldmia	r0,    {@C[0]-@C[9]}	@ copy A[5][5] to stack
	stmia	sp,    {@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0], {@C[0]-@C[9]}
	add	@E[2],sp,#$A[0][0]
	add	@E[0],sp,#$A[1][0]
	stmia	@E[1], {@C[0]-@C[9]}

	bl	KeccakF1600_enter

	ldr	@E[1], [sp,#440+16]	@ restore pointer to A
	ldmia	sp,    {@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}	@ return A[5][5]
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0], {@C[0]-@C[9]}
	stmia	@E[1], {@C[0]-@C[9]}

	add	sp,sp,#440+20
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	KeccakF1600,.-KeccakF1600
___
{ my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));

########################################################################
# Stack layout
# ----->+-----------------------+
#       | uint64_t A[5][5]      |
#       | ...                   |
#       | ...                   |
# +456->+-----------------------+
#       | 0x55555555            |
# +460->+-----------------------+
#       | 0x33333333            |
# +464->+-----------------------+
#       | 0x0f0f0f0f            |
# +468->+-----------------------+
#       | 0x00ff00ff            |
# +472->+-----------------------+
#       | uint64_t *A           |
# +476->+-----------------------+
#       | const void *inp       |
# +480->+-----------------------+
#       | size_t len            |
# +484->+-----------------------+
#       | size_t bs             |
# +488->+-----------------------+
#       | ....

$code.=<<___;
.global	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	stmdb	sp!,{r0-r12,lr}
	sub	sp,sp,#456+16

	add	$A_flat,r0,#$A[1][0]
	@ mov	$inp,r1
	mov	$len,r2
	mov	$bsz,r3
	cmp	r2,r3
	blo	.Labsorb_abort

	add	$inp,sp,#0
	ldmia	r0,      {@C[0]-@C[9]}	@ copy A[5][5] to stack
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp,    {@C[0]-@C[9]}

	ldr	$inp,[sp,#476]		@ restore $inp
#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ compose constants
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ 0x11111111
	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ 0x33333333
	orr	r6,r6,r6,lsl#2		@ 0x55555555
#endif
	str	r9,[sp,#468]
	str	r8,[sp,#464]
	str	r7,[sp,#460]
	str	r6,[sp,#456]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	r0,$len,$bsz
	blo	.Labsorbed
	add	$A_flat,sp,#0
	str	r0,[sp,#480]		@ save len - bsz

.align	4
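	@ Below, each 8-byte block of input is bit-interleaved on the fly:
	@ the even-numbered bits of the 64-bit lane are gathered in one
	@ 32-bit word and the odd-numbered bits in the other, matching the
	@ in-memory lane format expected by KeccakF1600_int.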
.Loop_block:
	ldrb	r0,[$inp],#1
	ldrb	r1,[$inp],#1
	ldrb	r2,[$inp],#1
	ldrb	r3,[$inp],#1
	ldrb	r4,[$inp],#1
	orr	r0,r0,r1,lsl#8
	ldrb	r1,[$inp],#1
	orr	r0,r0,r2,lsl#16
	ldrb	r2,[$inp],#1
	orr	r0,r0,r3,lsl#24		@ lo
	ldrb	r3,[$inp],#1
	orr	r1,r4,r1,lsl#8
	orr	r1,r1,r2,lsl#16
	orr	r1,r1,r3,lsl#24		@ hi

	and	r2,r0,r6		@ &=0x55555555
	and	r0,r0,r6,lsl#1		@ &=0xaaaaaaaa
	and	r3,r1,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa
	orr	r2,r2,r2,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r1,r1,r1,lsl#1
	and	r2,r2,r7		@ &=0x33333333
	and	r0,r0,r7,lsl#2		@ &=0xcccccccc
	and	r3,r3,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r1,r1,r1,lsl#2
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r0,r0,r8,lsl#4		@ &=0xf0f0f0f0
	and	r3,r3,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	ldmia	$A_flat,{r4-r5}		@ A_flat[i]
	orr	r2,r2,r2,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r1,r1,r1,lsl#4
	and	r2,r2,r9		@ &=0x00ff00ff
	and	r0,r0,r9,lsl#8		@ &=0xff00ff00
	and	r3,r3,r9		@ &=0x00ff00ff
	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
	orr	r2,r2,r2,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r1,r1,r1,lsl#8
	lsl	r2,r2,#16
	lsr	r1,r1,#16
	eor	r4,r4,r3,lsl#16
	eor	r5,r5,r0,lsr#16
	eor	r4,r4,r2,lsr#16
	eor	r5,r5,r1,lsl#16
	stmia	$A_flat!,{r4-r5}	@ A_flat[i++] ^= BitInterleave(inp[0..7])

	subs	$bsz,$bsz,#8
	bhi	.Loop_block

	str	$inp,[sp,#476]

	bl	KeccakF1600_int

	add	r14,sp,#456
	ldmia	r14,{r6-r12,r14}	@ restore constants and variables
	b	.Loop_absorb

.align	4
.Labsorbed:
	add	$inp,sp,#$A[1][0]
	ldmia	sp,      {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}	@ return A[5][5]
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp,    {@C[0]-@C[9]}
	stmia	$A_flat, {@C[0]-@C[9]}

.Labsorb_abort:
	add	sp,sp,#456+32
	mov	r0,$len			@ return value
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	SHA3_absorb,.-SHA3_absorb
___
}

{ my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12));

$code.=<<___;
.global	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	stmdb	sp!,{r0,r3-r10,lr}

	mov	$A_flat,r0
	mov	$out,r1
	mov	$len,r2
	mov	$bsz,r3

#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ compose constants
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ 0x11111111
	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ 0x33333333
	orr	r6,r6,r6,lsl#2		@ 0x55555555
#endif
	stmdb	sp!,{r6-r9}

	mov	r14,$A_flat
	b	.Loop_squeeze

.align	4
.Loop_squeeze:
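	@ Undo the bit interleaving: scatter the even- and odd-bit words
	@ back into a byte-ordered 64-bit lane before it is stored out.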
	ldmia	$A_flat!,{r0,r1}	@ A_flat[i++]

	lsl	r2,r0,#16
	lsl	r3,r1,#16		@ r3 = r1 << 16
	lsr	r2,r2,#16		@ r2 = r0 & 0x0000ffff
	lsr	r1,r1,#16
	lsr	r0,r0,#16		@ r0 = r0 >> 16
	lsl	r1,r1,#16		@ r1 = r1 & 0xffff0000

	orr	r2,r2,r2,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r1,r1,r1,lsr#8
	and	r2,r2,r9		@ &=0x00ff00ff
	and	r3,r3,r9,lsl#8		@ &=0xff00ff00
	and	r0,r0,r9		@ &=0x00ff00ff
	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
	orr	r2,r2,r2,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r1,r1,r1,lsr#4
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r3,r3,r8,lsl#4		@ &=0xf0f0f0f0
	and	r0,r0,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	orr	r2,r2,r2,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r1,r1,r1,lsr#2
	and	r2,r2,r7		@ &=0x33333333
	and	r3,r3,r7,lsl#2		@ &=0xcccccccc
	and	r0,r0,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r1,r1,r1,lsr#1
	and	r2,r2,r6		@ &=0x55555555
	and	r3,r3,r6,lsl#1		@ &=0xaaaaaaaa
	and	r0,r0,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa
	orr	r2,r2,r3
	orr	r0,r0,r1

	cmp	$len,#8
	blo	.Lsqueeze_tail
	lsr	r1,r2,#8
	strb	r2,[$out],#1
	lsr	r3,r2,#16
	strb	r1,[$out],#1
	lsr	r2,r2,#24
	strb	r3,[$out],#1
	strb	r2,[$out],#1

	lsr	r1,r0,#8
	strb	r0,[$out],#1
	lsr	r3,r0,#16
	strb	r1,[$out],#1
	lsr	r0,r0,#24
	strb	r3,[$out],#1
	strb	r0,[$out],#1
	subs	$len,$len,#8
	beq	.Lsqueeze_done

	subs	$bsz,$bsz,#8		@ bsz -= 8
	bhi	.Loop_squeeze

	mov	r0,r14			@ original $A_flat

	bl	KeccakF1600

	ldmia	sp,{r6-r10,r12}		@ restore constants and variables
	mov	r14,$A_flat
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	subs	$len,$len,#1
	beq	.Lsqueeze_done

	strb	r0,[$out],#1
	lsr	r0,r0,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r0,[$out],#1
	lsr	r0,r0,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r0,[$out]
	b	.Lsqueeze_done

.align	4
.Lsqueeze_done:
	add	sp,sp,#24
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r10,pc}
#else
	ldmia	sp!,{r4-r10,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	SHA3_squeeze,.-SHA3_squeeze
___
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.fpu	neon

.type	iotas64, %object
.align	5
iotas64:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas64,.-iotas64

.type	KeccakF1600_neon, %function
.align	5
KeccakF1600_neon:
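	@ Register layout, matching the loads in SHA3_absorb_neon below:
	@ q0-q4 hold A[0..1][0..4] (A[0][i] in the low d-register of each
	@ pair), q5-q9 hold A[2..3][0..4], d20-d24 hold A[4][0..4], and
	@ d25-d31 serve as scratch.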
	add	r1, r0, #16
	adr	r2, iotas64
	mov	r3, #24			@ loop counter
	b	.Loop_neon

.align	4
.Loop_neon:
	@ Theta
	vst1.64	{q4},  [r0,:64]		@ offload A[0..1][4]
	veor	q13, q0,  q5		@ A[0..1][0]^A[2..3][0]
	vst1.64	{d18}, [r1,:64]		@ offload A[2][4]
	veor	q14, q1,  q6		@ A[0..1][1]^A[2..3][1]
	veor	q15, q2,  q7		@ A[0..1][2]^A[2..3][2]
	veor	d26, d26, d27		@ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
	veor	d27, d28, d29		@ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
	veor	q14, q3,  q8		@ A[0..1][3]^A[2..3][3]
	veor	q4,  q4,  q9		@ A[0..1][4]^A[2..3][4]
	veor	d30, d30, d31		@ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
	veor	d31, d28, d29		@ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
	veor	d25, d8,  d9		@ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
	veor	q13, q13, q10		@ C[0..1]^=A[4][0..1]
	veor	q14, q15, q11		@ C[2..3]^=A[4][2..3]
	veor	d25, d25, d24		@ C[4]^=A[4][4]

	vadd.u64	q4,  q13, q13	@ C[0..1]<<1
	vadd.u64	q15, q14, q14	@ C[2..3]<<1
	vadd.u64	d18, d25, d25	@ C[4]<<1
	vsri.u64	q4,  q13, #63	@ ROL64(C[0..1],1)
	vsri.u64	q15, q14, #63	@ ROL64(C[2..3],1)
	vsri.u64	d18, d25, #63	@ ROL64(C[4],1)

	veor	d25, d25, d9		@ D[0] = C[4] ^= ROL64(C[1],1)
	veor	q13, q13, q15		@ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
	veor	d28, d28, d18		@ D[3] = C[2] ^= ROL64(C[4],1)
	veor	d29, d29, d8		@ D[4] = C[3] ^= ROL64(C[0],1)

	veor	d0,  d0,  d25		@ A[0][0] ^= C[4]
	veor	d1,  d1,  d25		@ A[1][0] ^= C[4]
	veor	d10, d10, d25		@ A[2][0] ^= C[4]
	veor	d11, d11, d25		@ A[3][0] ^= C[4]
	veor	d20, d20, d25		@ A[4][0] ^= C[4]

	veor	d2,  d2,  d26		@ A[0][1] ^= D[1]
	veor	d3,  d3,  d26		@ A[1][1] ^= D[1]
	veor	d12, d12, d26		@ A[2][1] ^= D[1]
	veor	d13, d13, d26		@ A[3][1] ^= D[1]
	veor	d21, d21, d26		@ A[4][1] ^= D[1]
	vmov	d26, d27

	veor	d6,  d6,  d28		@ A[0][3] ^= C[2]
	veor	d7,  d7,  d28		@ A[1][3] ^= C[2]
	veor	d16, d16, d28		@ A[2][3] ^= C[2]
	veor	d17, d17, d28		@ A[3][3] ^= C[2]
	veor	d23, d23, d28		@ A[4][3] ^= C[2]
	vld1.64	{q4},  [r0,:64]		@ restore A[0..1][4]
	vmov	d28, d29

	vld1.64	{d18}, [r1,:64]		@ restore A[2][4]

	veor	q2,  q2,  q13		@ A[0..1][2] ^= D[2]
	veor	q7,  q7,  q13		@ A[2..3][2] ^= D[2]
	veor	d22, d22, d27		@ A[4][2] ^= D[2]

	veor	q4,  q4,  q14		@ A[0..1][4] ^= C[3]
	veor	q9,  q9,  q14		@ A[2..3][4] ^= C[3]
	veor	d24, d24, d29		@ A[4][4] ^= C[3]

	@ Rho + Pi
	vmov	d26, d2			@ C[1] = A[0][1]
	vshl.u64	d2,  d3,  #44
	vmov	d27, d4			@ C[2] = A[0][2]
	vshl.u64	d4,  d14, #43
	vmov	d28, d6			@ C[3] = A[0][3]
	vshl.u64	d6,  d17, #21
	vmov	d29, d8			@ C[4] = A[0][4]
	vshl.u64	d8,  d24, #14
	vsri.u64	d2,  d3,  #64-44	@ A[0][1] = ROL64(A[1][1], rhotates[1][1])
	vsri.u64	d4,  d14, #64-43	@ A[0][2] = ROL64(A[2][2], rhotates[2][2])
	vsri.u64	d6,  d17, #64-21	@ A[0][3] = ROL64(A[3][3], rhotates[3][3])
	vsri.u64	d8,  d24, #64-14	@ A[0][4] = ROL64(A[4][4], rhotates[4][4])

	vshl.u64	d3,  d9,  #20
	vshl.u64	d14, d16, #25
	vshl.u64	d17, d15, #15
	vshl.u64	d24, d21, #2
	vsri.u64	d3,  d9,  #64-20	@ A[1][1] = ROL64(A[1][4], rhotates[1][4])
	vsri.u64	d14, d16, #64-25	@ A[2][2] = ROL64(A[2][3], rhotates[2][3])
	vsri.u64	d17, d15, #64-15	@ A[3][3] = ROL64(A[3][2], rhotates[3][2])
	vsri.u64	d24, d21, #64-2		@ A[4][4] = ROL64(A[4][1], rhotates[4][1])

	vshl.u64	d9,  d22, #61
	@ vshl.u64	d16, d19, #8
	vshl.u64	d15, d12, #10
	vshl.u64	d21, d7,  #55
	vsri.u64	d9,  d22, #64-61	@ A[1][4] = ROL64(A[4][2], rhotates[4][2])
	vext.8		d16, d19, d19, #8-1	@ A[2][3] = ROL64(A[3][4], rhotates[3][4])
	vsri.u64	d15, d12, #64-10	@ A[3][2] = ROL64(A[2][1], rhotates[2][1])
	vsri.u64	d21, d7,  #64-55	@ A[4][1] = ROL64(A[1][3], rhotates[1][3])

	vshl.u64	d22, d18, #39
	@ vshl.u64	d19, d23, #56
	vshl.u64	d12, d5,  #6
	vshl.u64	d7,  d13, #45
	vsri.u64	d22, d18, #64-39	@ A[4][2] = ROL64(A[2][4], rhotates[2][4])
	vext.8		d19, d23, d23, #8-7	@ A[3][4] = ROL64(A[4][3], rhotates[4][3])
	vsri.u64	d12, d5,  #64-6		@ A[2][1] = ROL64(A[1][2], rhotates[1][2])
	vsri.u64	d7,  d13, #64-45	@ A[1][3] = ROL64(A[3][1], rhotates[3][1])

	vshl.u64	d18, d20, #18
	vshl.u64	d23, d11, #41
	vshl.u64	d5,  d10, #3
	vshl.u64	d13, d1,  #36
	vsri.u64	d18, d20, #64-18	@ A[2][4] = ROL64(A[4][0], rhotates[4][0])
	vsri.u64	d23, d11, #64-41	@ A[4][3] = ROL64(A[3][0], rhotates[3][0])
	vsri.u64	d5,  d10, #64-3		@ A[1][2] = ROL64(A[2][0], rhotates[2][0])
	vsri.u64	d13, d1,  #64-36	@ A[3][1] = ROL64(A[1][0], rhotates[1][0])

	vshl.u64	d1,  d28, #28
	vshl.u64	d10, d26, #1
	vshl.u64	d11, d29, #27
	vshl.u64	d20, d27, #62
	vsri.u64	d1,  d28, #64-28	@ A[1][0] = ROL64(C[3], rhotates[0][3])
	vsri.u64	d10, d26, #64-1		@ A[2][0] = ROL64(C[1], rhotates[0][1])
	vsri.u64	d11, d29, #64-27	@ A[3][0] = ROL64(C[4], rhotates[0][4])
	vsri.u64	d20, d27, #64-62	@ A[4][0] = ROL64(C[2], rhotates[0][2])

	@ Chi + Iota
	vbic	q13, q2,  q1
	vbic	q14, q3,  q2
	vbic	q15, q4,  q3
	veor	q13, q13, q0		@ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
	veor	q14, q14, q1		@ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
	veor	q2,  q2,  q15		@ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vst1.64	{q13}, [r0,:64]		@ offload A[0..1][0]
	vbic	q13, q0,  q4
	vbic	q15, q1,  q0
	vmov	q1,  q14		@ A[0..1][1]
	veor	q3,  q3,  q13		@ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	veor	q4,  q4,  q15		@ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	vbic	q13, q7,  q6
	vmov	q0,  q5			@ A[2..3][0]
	vbic	q14, q8,  q7
	vmov	q15, q6			@ A[2..3][1]
	veor	q5,  q5,  q13		@ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vbic	q13, q9,  q8
	veor	q6,  q6,  q14		@ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vbic	q14, q0,  q9
	veor	q7,  q7,  q13		@ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vbic	q13, q15, q0
	veor	q8,  q8,  q14		@ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vmov	q14, q10		@ A[4][0..1]
	veor	q9,  q9,  q13		@ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vld1.64	{d25}, [r2,:64]!	@ Iota[i++]
	vbic	d26, d22, d21
	vbic	d27, d23, d22
	vld1.64	{q0},  [r0,:64]		@ restore A[0..1][0]
	veor	d20, d20, d26		@ A[4][0] ^= (~A[4][1] & A[4][2])
	vbic	d26, d24, d23
	veor	d21, d21, d27		@ A[4][1] ^= (~A[4][2] & A[4][3])
	vbic	d27, d28, d24
	veor	d22, d22, d26		@ A[4][2] ^= (~A[4][3] & A[4][4])
	vbic	d26, d29, d28
	veor	d23, d23, d27		@ A[4][3] ^= (~A[4][4] & A[4][0])
	veor	d0,  d0,  d25		@ A[0][0] ^= Iota[i]
	veor	d24, d24, d26		@ A[4][4] ^= (~A[4][0] & A[4][1])

	subs	r3, r3, #1
	bne	.Loop_neon

	ret
.size	KeccakF1600_neon,.-KeccakF1600_neon

.global	SHA3_absorb_neon
.type	SHA3_absorb_neon, %function
.align	5
SHA3_absorb_neon:
	stmdb	sp!, {r4-r6,lr}
	vstmdb	sp!, {d8-d15}

	mov	r4, r1			@ inp
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz

	vld1.32	{d0},  [r0,:64]!	@ A[0][0]
	vld1.32	{d2},  [r0,:64]!	@ A[0][1]
	vld1.32	{d4},  [r0,:64]!	@ A[0][2]
	vld1.32	{d6},  [r0,:64]!	@ A[0][3]
	vld1.32	{d8},  [r0,:64]!	@ A[0][4]

	vld1.32	{d1},  [r0,:64]!	@ A[1][0]
	vld1.32	{d3},  [r0,:64]!	@ A[1][1]
	vld1.32	{d5},  [r0,:64]!	@ A[1][2]
	vld1.32	{d7},  [r0,:64]!	@ A[1][3]
	vld1.32	{d9},  [r0,:64]!	@ A[1][4]

	vld1.32	{d10}, [r0,:64]!	@ A[2][0]
	vld1.32	{d12}, [r0,:64]!	@ A[2][1]
	vld1.32	{d14}, [r0,:64]!	@ A[2][2]
	vld1.32	{d16}, [r0,:64]!	@ A[2][3]
	vld1.32	{d18}, [r0,:64]!	@ A[2][4]

	vld1.32	{d11}, [r0,:64]!	@ A[3][0]
	vld1.32	{d13}, [r0,:64]!	@ A[3][1]
	vld1.32	{d15}, [r0,:64]!	@ A[3][2]
	vld1.32	{d17}, [r0,:64]!	@ A[3][3]
	vld1.32	{d19}, [r0,:64]!	@ A[3][4]

	vld1.32	{d20-d23}, [r0,:64]!	@ A[4][0..3]
	vld1.32	{d24}, [r0,:64]		@ A[4][4]
	sub	r0, r0, #24*8		@ rewind
	b	.Loop_absorb_neon

.align	4
.Loop_absorb_neon:
	subs	r12, r5, r6		@ len - bsz
	blo	.Labsorbed_neon
	mov	r5, r12

	vld1.8	{d31}, [r4]!		@ endian-neutral loads...
	cmp	r6, #8*2
	veor	d0, d0, d31		@ A[0][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d2, d2, d31		@ A[0][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*4
	veor	d4, d4, d31		@ A[0][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d6, d6, d31		@ A[0][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*6
	veor	d8, d8, d31		@ A[0][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d1, d1, d31		@ A[1][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*8
	veor	d3, d3, d31		@ A[1][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d5, d5, d31		@ A[1][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*10
	veor	d7, d7, d31		@ A[1][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d9, d9, d31		@ A[1][4] ^= *inp++
	beq	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	cmp	r6, #8*12
	veor	d10, d10, d31		@ A[2][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d12, d12, d31		@ A[2][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*14
	veor	d14, d14, d31		@ A[2][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d16, d16, d31		@ A[2][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*16
	veor	d18, d18, d31		@ A[2][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d11, d11, d31		@ A[3][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*18
	veor	d13, d13, d31		@ A[3][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d15, d15, d31		@ A[3][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*20
	veor	d17, d17, d31		@ A[3][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d19, d19, d31		@ A[3][4] ^= *inp++
	beq	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	cmp	r6, #8*22
	veor	d20, d20, d31		@ A[4][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d21, d21, d31		@ A[4][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*24
	veor	d22, d22, d31		@ A[4][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d23, d23, d31		@ A[4][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d24, d24, d31		@ A[4][4] ^= *inp++

.Lprocess_neon:
	bl	KeccakF1600_neon
	b	.Loop_absorb_neon

.align	4
.Labsorbed_neon:
	vst1.32	{d0},  [r0,:64]!	@ A[0][0..4]
	vst1.32	{d2},  [r0,:64]!
	vst1.32	{d4},  [r0,:64]!
	vst1.32	{d6},  [r0,:64]!
	vst1.32	{d8},  [r0,:64]!

	vst1.32	{d1},  [r0,:64]!	@ A[1][0..4]
	vst1.32	{d3},  [r0,:64]!
	vst1.32	{d5},  [r0,:64]!
	vst1.32	{d7},  [r0,:64]!
	vst1.32	{d9},  [r0,:64]!

	vst1.32	{d10}, [r0,:64]!	@ A[2][0..4]
	vst1.32	{d12}, [r0,:64]!
	vst1.32	{d14}, [r0,:64]!
	vst1.32	{d16}, [r0,:64]!
	vst1.32	{d18}, [r0,:64]!

	vst1.32	{d11}, [r0,:64]!	@ A[3][0..4]
	vst1.32	{d13}, [r0,:64]!
	vst1.32	{d15}, [r0,:64]!
	vst1.32	{d17}, [r0,:64]!
	vst1.32	{d19}, [r0,:64]!

	vst1.32	{d20-d23}, [r0,:64]!	@ A[4][0..4]
	vst1.32	{d24}, [r0,:64]

	mov	r0, r5			@ return value
	vldmia	sp!, {d8-d15}
	ldmia	sp!, {r4-r6,pc}
.size	SHA3_absorb_neon,.-SHA3_absorb_neon

.global	SHA3_squeeze_neon
.type	SHA3_squeeze_neon, %function
.align	5
SHA3_squeeze_neon:
	stmdb	sp!, {r4-r6,lr}

	mov	r4, r1			@ out
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz
	mov	r12, r0			@ A_flat
	mov	r14, r3			@ bsz
	b	.Loop_squeeze_neon

.align	4
.Loop_squeeze_neon:
	cmp	r5, #8
	blo	.Lsqueeze_neon_tail
	vld1.32	{d0}, [r12]!
	vst1.8	{d0}, [r4]!		@ endian-neutral store

	subs	r5, r5, #8		@ len -= 8
	beq	.Lsqueeze_neon_done

	subs	r14, r14, #8		@ bsz -= 8
	bhi	.Loop_squeeze_neon

	vstmdb	sp!, {d8-d15}

	vld1.32	{d0},  [r0,:64]!	@ A[0][0..4]
	vld1.32	{d2},  [r0,:64]!
	vld1.32	{d4},  [r0,:64]!
	vld1.32	{d6},  [r0,:64]!
	vld1.32	{d8},  [r0,:64]!

	vld1.32	{d1},  [r0,:64]!	@ A[1][0..4]
	vld1.32	{d3},  [r0,:64]!
	vld1.32	{d5},  [r0,:64]!
	vld1.32	{d7},  [r0,:64]!
	vld1.32	{d9},  [r0,:64]!

	vld1.32	{d10}, [r0,:64]!	@ A[2][0..4]
	vld1.32	{d12}, [r0,:64]!
	vld1.32	{d14}, [r0,:64]!
	vld1.32	{d16}, [r0,:64]!
	vld1.32	{d18}, [r0,:64]!

	vld1.32	{d11}, [r0,:64]!	@ A[3][0..4]
	vld1.32	{d13}, [r0,:64]!
	vld1.32	{d15}, [r0,:64]!
	vld1.32	{d17}, [r0,:64]!
	vld1.32	{d19}, [r0,:64]!

	vld1.32	{d20-d23}, [r0,:64]!	@ A[4][0..4]
	vld1.32	{d24}, [r0,:64]
	sub	r0, r0, #24*8		@ rewind

	bl	KeccakF1600_neon

	mov	r12, r0			@ A_flat
	vst1.32	{d0},  [r0,:64]!	@ A[0][0..4]
	vst1.32	{d2},  [r0,:64]!
	vst1.32	{d4},  [r0,:64]!
	vst1.32	{d6},  [r0,:64]!
	vst1.32	{d8},  [r0,:64]!

	vst1.32	{d1},  [r0,:64]!	@ A[1][0..4]
	vst1.32	{d3},  [r0,:64]!
	vst1.32	{d5},  [r0,:64]!
	vst1.32	{d7},  [r0,:64]!
	vst1.32	{d9},  [r0,:64]!

	vst1.32	{d10}, [r0,:64]!	@ A[2][0..4]
	vst1.32	{d12}, [r0,:64]!
	vst1.32	{d14}, [r0,:64]!
	vst1.32	{d16}, [r0,:64]!
	vst1.32	{d18}, [r0,:64]!

	vst1.32	{d11}, [r0,:64]!	@ A[3][0..4]
	vst1.32	{d13}, [r0,:64]!
	vst1.32	{d15}, [r0,:64]!
	vst1.32	{d17}, [r0,:64]!
	vst1.32	{d19}, [r0,:64]!

	vst1.32	{d20-d23}, [r0,:64]!	@ A[4][0..4]
	mov	r14, r6			@ bsz
	vst1.32	{d24}, [r0,:64]
	mov	r0, r12			@ rewind

	vldmia	sp!, {d8-d15}
	b	.Loop_squeeze_neon

.align	4
.Lsqueeze_neon_tail:
	ldmia	r12, {r2,r3}
	cmp	r5, #2
	strb	r2, [r4],#1		@ endian-neutral store
	lsr	r2, r2, #8
	blo	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	lsr	r2, r2, #8
	beq	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	lsr	r2, r2, #8
	cmp	r5, #4
	blo	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	beq	.Lsqueeze_neon_done

	strb	r3, [r4], #1
	lsr	r3, r3, #8
	cmp	r5, #6
	blo	.Lsqueeze_neon_done
	strb	r3, [r4], #1
	lsr	r3, r3, #8
	beq	.Lsqueeze_neon_done
	strb	r3, [r4], #1

.Lsqueeze_neon_done:
	ldmia	sp!, {r4-r6,pc}
.size	SHA3_squeeze_neon,.-SHA3_squeeze_neon
#endif

.asciz	"Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___
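
# What follows post-processes $code. ldrd() pairs the ldr.l/ldr.h
# (str.l/str.h) pseudo-instructions emitted above: the ".l" half is
# remembered, and on the matching ".h" the two are expanded either to
# discrete ldr/str instructions for ARM mode or to a single ldrd/strd
# for Thumb-2.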
{   my (%ldr, %str);

sub ldrd {
	my ($mnemonic,$half,$reg,$ea) = @_;
	my $op = $mnemonic eq "ldr" ? \%ldr : \%str;

	if ($half eq "l") {
	    $$op{reg} = $reg;
	    $$op{ea}  = $ea;
	    sprintf "#ifndef	__thumb2__\n"	.
		    "	%s\t%s,%s\n"		.
		    "#endif",	$mnemonic,$reg,$ea;
	} else {
	    sprintf "#ifndef	__thumb2__\n"	.
		    "	%s\t%s,%s\n"		.
		    "#else\n"			.
		    "	%sd\t%s,%s,%s\n"	.
		    "#endif",	$mnemonic,$reg,$ea,
				$mnemonic,$$op{reg},$reg,$$op{ea};
	}
}
}

foreach (split($/,$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge or	# fold ldr.l/ldr.h pairs via ldrd() above
	s/\b(ror|ls[rl])\s+(r[0-9]+.*)#/mov	$2$1#/g or			# rewrite ror/lsr/lsl as mov with shifted operand
	s/\bret\b/bx	lr/g or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush