#!/usr/bin/env perl
# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for ARMv4.
#
# June 2017.
#
# Non-NEON code is the KECCAK_1X variant (see sha/keccak1600.c) with
# bit interleaving. How does it compare to Keccak Code Package? It's
# as fast, but several times smaller, and is endian- and ISA-neutral.
# ISA neutrality means that the minimum ISA requirement is ARMv4, yet
# it can be assembled even as Thumb-2. The NEON code path is
# KECCAK_1X_ALT with register layout taken from Keccak Code Package.
# It's also as fast, in fact faster by 10-15% on some processors, and
# endian-neutral.
#
# August 2017.
#
# Switch to the KECCAK_2X variant for non-NEON code and merge almost
# 1/2 of the rotate instructions with logical ones. This resulted in
# ~10% improvement on most processors. The switch to KECCAK_2X
# effectively minimizes re-loads from temporary storage, and the
# merged rotates simply eliminate the corresponding instructions. As
# for the latter: when examining the code you'll notice commented-out
# ror instructions. These are the eliminated ones; trace the
# destination register below to see what's going on. In case you
# wonder why not all rotates are eliminated: the trouble is that some
# operations require both inputs to be rotated, e.g.
# 'eor a,b>>>x,c>>>y'. This conundrum is resolved by using
# 'eor a,b,c>>>(y-x)' and then merge-rotating 'a' in the next
# operation that takes 'a' as input. The catch is that this next
# operation can be in the next round. It's entirely possible to
# "carry" rotate "factors" to the next round, but it makes the code
# more complex, which is why "almost 1/2" serves as the complexity
# cap [for the time being]...
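#
# Schematically (an illustration of the idea, not a literal excerpt
# from the code below): to compute (b>>>x) ^ (c>>>y), instead of
# rotating both inputs one can issue
#
#	eor	a,b,c,ror#(y-x)		@ 'a' still carries a pending
#					@ rotation by x
#	...
#	eor	d,e,a,ror#x		@ rotation merged into consumer
#
# so an explicit ror survives only where a pending rotation can't be
# pushed into a later instruction.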
#
# Reduce the per-round instruction count in the Thumb-2 case by 16%.
# This is achieved by folding ldr/str pairs to their double-word
# counterparts. Theoretically this should have improved performance
# on single-issue cores, such as Cortex-A5/A7, by 19%. Reality is a
# bit different, as usual...
#
########################################################################
# Numbers are cycles per processed byte. Non-NEON results account even
# for input bit interleaving.
#
#		r=1088(*)   Thumb-2(**) NEON
#
# ARM11xx	82/+150%
# Cortex-A5	88/+160%,   86,         36
# Cortex-A7	78/+160%,   68,         34
# Cortex-A8	51/+230%,   57,         30
# Cortex-A9	53/+210%,   51,         26
# Cortex-A15	42/+160%,   38,         18
# Snapdragon S4	43/+210%,   38,         24
#
# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
#	over compiler-generated KECCAK_2X reference code.
# (**)	Thumb-2 results for Cortex-A5/A7 are likely to apply even to
#	Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable
#	processors are presented mostly for reference purposes.
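#
# @C (r0-r9) below carries five 64-bit lanes as ten 32-bit halves,
# while @E (r10-r12,r14) serves as scratch and address registers.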
my @C = map("r$_",(0..9));
my @E = map("r$_",(10..12,14));

########################################################################
# Stack layout
# ----->+-----------------------+
#       | uint64_t A[5][5]      |
#       | ...                   |
# +200->+-----------------------+
#       | uint64_t D[5]         |
#       | ...                   |
# +240->+-----------------------+
#       | uint64_t T[5][5]      |
#       | ...                   |
# +440->+-----------------------+
#       | saved lr              |
# +444->+-----------------------+
#       | loop counter          |
# +448->+-----------------------+
#       | ...

my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
my @D = map(8*$_, (25..29));
my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35,40,45,50));

$code.=<<___;
.text

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif
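
@ Keccak-f round constants, pre-split for the bit-interleaved
@ representation: each pair below holds the even-numbered bits of the
@ 64-bit constant in the first word and the odd-numbered bits in the
@ second.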
.type	iotas32, %object
.align	5
iotas32:
	.long	0x00000001, 0x00000000
	.long	0x00000000, 0x00000089
	.long	0x00000000, 0x8000008b
	.long	0x00000000, 0x80008080
	.long	0x00000001, 0x0000008b
	.long	0x00000001, 0x00008000
	.long	0x00000001, 0x80008088
	.long	0x00000001, 0x80000082
	.long	0x00000000, 0x0000000b
	.long	0x00000000, 0x0000000a
	.long	0x00000001, 0x00008082
	.long	0x00000000, 0x00008003
	.long	0x00000001, 0x0000808b
	.long	0x00000001, 0x8000000b
	.long	0x00000001, 0x8000008a
	.long	0x00000001, 0x80000081
	.long	0x00000000, 0x80000081
	.long	0x00000000, 0x80000008
	.long	0x00000000, 0x00000083
	.long	0x00000000, 0x80008003
	.long	0x00000001, 0x80008088
	.long	0x00000000, 0x80000088
	.long	0x00000001, 0x00008000
	.long	0x00000000, 0x80008082
.size	iotas32,.-iotas32

.type	KeccakF1600_int, %function
.align	5
KeccakF1600_int:
	add	@C[9],sp,#$A[4][2]
	add	@E[2],sp,#$A[0][0]
	add	@E[0],sp,#$A[1][0]
	ldmia	@C[9],{@C[4]-@C[9]}		@ A[4][2..4]
KeccakF1600_enter:
	str	lr,[sp,#440]
	eor	@E[1],@E[1],@E[1]
	str	@E[1],[sp,#444]
	b	.Lround2x

.align	4
.Lround2x:
___
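
# Round() emits the body of one Keccak-f[1600] round: it reads the 25
# lanes of @A, applies theta, rho, pi, chi and iota on their
# bit-interleaved halves, and writes the result to @R (the KECCAK_2X
# ping-pong).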
sub Round {
my (@A,@R); (@A[0..4],@R) = @_;

$code.=<<___;
	ldmia	@E[2],{@C[0]-@C[3]}		@ A[0][0..1]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][0..1]
#ifdef	__thumb2__
	eor	@C[0],@C[0],@E[0]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[1][2]]
	eor	@C[3],@C[3],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[1][3]]
	eor	@C[4],@C[4],@E[0]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[1][4]]
	eor	@C[7],@C[7],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][0]]
	eor	@C[8],@C[8],@E[0]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[2][1]]
	eor	@C[1],@C[1],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][2]]
	eor	@C[2],@C[2],@E[0]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[2][3]]
	eor	@C[5],@C[5],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][4]]
	eor	@C[6],@C[6],@E[0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][0]]
	eor	@C[9],@C[9],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[3][1]]
	eor	@C[0],@C[0],@E[0]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][2]]
	eor	@C[3],@C[3],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[3][3]]
	eor	@C[4],@C[4],@E[0]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][4]]
	eor	@C[7],@C[7],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[4][0]]
	eor	@C[8],@C[8],@E[0]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[4][1]]
	eor	@C[1],@C[1],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[0][2]]
	eor	@C[2],@C[2],@E[0]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[0][3]]
	eor	@C[5],@C[5],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[0][4]]
#else
	eor	@C[0],@C[0],@E[0]
	add	@E[0],sp,#$A[1][2]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	eor	@C[3],@C[3],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][2..3]
	eor	@C[4],@C[4],@E[0]
	add	@E[0],sp,#$A[1][4]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	eor	@C[7],@C[7],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][4]..A[2][0]
	eor	@C[8],@C[8],@E[0]
	add	@E[0],sp,#$A[2][1]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	eor	@C[1],@C[1],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[2][1..2]
	eor	@C[2],@C[2],@E[0]
	add	@E[0],sp,#$A[2][3]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	eor	@C[5],@C[5],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[2][3..4]
	eor	@C[6],@C[6],@E[0]
	add	@E[0],sp,#$A[3][0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][0..1]
	eor	@C[0],@C[0],@E[0]
	add	@E[0],sp,#$A[3][2]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	eor	@C[3],@C[3],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][2..3]
	eor	@C[4],@C[4],@E[0]
	add	@E[0],sp,#$A[3][4]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	eor	@C[7],@C[7],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][4]..A[4][0]
	eor	@C[8],@C[8],@E[0]
	ldr	@E[0],[sp,#$A[4][1]]		@ A[4][1]
	eor	@C[9],@C[9],@E[1]
	ldr	@E[1],[sp,#$A[4][1]+4]
	eor	@C[0],@C[0],@E[2]
	ldr	@E[2],[sp,#$A[0][2]]		@ A[0][2]
	eor	@C[1],@C[1],@E[3]
	ldr	@E[3],[sp,#$A[0][2]+4]
	eor	@C[2],@C[2],@E[0]
	add	@E[0],sp,#$A[0][3]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	eor	@C[5],@C[5],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[0][3..4]
#endif
	eor	@C[6],@C[6],@E[0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
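	@ C[0..9] now hold the five 64-bit column parities C[0..4],
	@ two interleaved 32-bit halves each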
	eor	@E[0],@C[0],@C[5],ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
	str.l	@E[0],[sp,#$D[1]]		@ D[1] = E[0]
	eor	@E[1],@C[1],@C[4]
	str.h	@E[1],[sp,#$D[1]+4]
	eor	@E[2],@C[6],@C[1],ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
	eor	@E[3],@C[7],@C[0]
	str.l	@E[2],[sp,#$D[4]]		@ D[4] = E[1]
	eor	@C[0],@C[8],@C[3],ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
	str.h	@E[3],[sp,#$D[4]+4]
	eor	@C[1],@C[9],@C[2]
	str.l	@C[0],[sp,#$D[0]]		@ D[0] = C[0]
	eor	@C[2],@C[2],@C[7],ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
	ldr.l	@C[7],[sp,#$A[3][3]]
	eor	@C[3],@C[3],@C[6]
	str.h	@C[1],[sp,#$D[0]+4]
	ldr.h	@C[6],[sp,#$A[3][3]+4]
	str.l	@C[2],[sp,#$D[2]]		@ D[2] = C[1]
	eor	@C[4],@C[4],@C[9],ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
	str.h	@C[3],[sp,#$D[2]+4]
	eor	@C[5],@C[5],@C[8]

	ldr.l	@C[8],[sp,#$A[4][4]]
	ldr.h	@C[9],[sp,#$A[4][4]+4]
	str.l	@C[4],[sp,#$D[3]]		@ D[3] = C[2]
	eor	@C[7],@C[7],@C[4]
	str.h	@C[5],[sp,#$D[3]+4]
	eor	@C[6],@C[6],@C[5]
	ldr.l	@C[4],[sp,#$A[0][0]]
	@ ror	@C[7],@C[7],#32-10		@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
	@ ror	@C[6],@C[6],#32-11
	ldr.h	@C[5],[sp,#$A[0][0]+4]

	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	ldr.l	@E[2],[sp,#$A[2][2]]
	eor	@C[0],@C[0],@C[4]
	ldr.h	@E[3],[sp,#$A[2][2]+4]
	@ ror	@C[8],@C[8],#32-7		@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
	@ ror	@C[9],@C[9],#32-7
	eor	@C[1],@C[1],@C[5]		@ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */  /* D[0] */

	eor	@E[2],@E[2],@C[2]
	ldr.l	@C[2],[sp,#$A[1][1]]
	eor	@E[3],@E[3],@C[3]
	ldr.h	@C[3],[sp,#$A[1][1]+4]
	ror	@C[5],@E[2],#32-21		@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);   /* D[2] */
	ldr	@E[2],[sp,#444]			@ load counter
	eor	@C[2],@C[2],@E[0]
	adr	@E[0],iotas32
	ror	@C[4],@E[3],#32-22
	add	@E[3],@E[0],@E[2]
	eor	@C[3],@C[3],@E[1]
___
$code.=<<___	if ($A[0][0] != $T[0][0]);
	ldmia	@E[3],{@E[0],@E[1]}		@ iotas[i]
___
$code.=<<___	if ($A[0][0] == $T[0][0]);
	ldr.l	@E[0],[@E[3],#8]		@ iotas[i].lo
	add	@E[2],@E[2],#16
	ldr.h	@E[1],[@E[3],#12]		@ iotas[i].hi
	cmp	@E[2],#192
	str	@E[2],[sp,#444]			@ store counter
___
$code.=<<___;
	bic	@E[2],@C[4],@C[2],ror#32-22
	bic	@E[3],@C[5],@C[3],ror#32-22
	ror	@C[2],@C[2],#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);   /* D[1] */
	ror	@C[3],@C[3],#32-22
	eor	@E[2],@E[2],@C[0]
	eor	@E[3],@E[3],@C[1]
	eor	@E[0],@E[0],@E[2]
	eor	@E[1],@E[1],@E[3]
	str.l	@E[0],[sp,#$R[0][0]]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
	bic	@E[2],@C[6],@C[4],ror#11
	str.h	@E[1],[sp,#$R[0][0]+4]
	bic	@E[3],@C[7],@C[5],ror#10
	bic	@E[0],@C[8],@C[6],ror#32-(11-7)
	bic	@E[1],@C[9],@C[7],ror#32-(10-7)
	eor	@E[2],@C[2],@E[2],ror#32-11
	str.l	@E[2],[sp,#$R[0][1]]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@C[3],@E[3],ror#32-10
	str.h	@E[3],[sp,#$R[0][1]+4]
	eor	@E[0],@C[4],@E[0],ror#32-7
	eor	@E[1],@C[5],@E[1],ror#32-7
	str.l	@E[0],[sp,#$R[0][2]]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
	bic	@E[2],@C[0],@C[8],ror#32-7
	str.h	@E[1],[sp,#$R[0][2]+4]
	bic	@E[3],@C[1],@C[9],ror#32-7
	eor	@E[2],@E[2],@C[6],ror#32-11
	str.l	@E[2],[sp,#$R[0][3]]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
	eor	@E[3],@E[3],@C[7],ror#32-10
	str.h	@E[3],[sp,#$R[0][3]+4]
	bic	@E[0],@C[2],@C[0]
	add	@E[3],sp,#$D[3]
	ldr.l	@C[0],[sp,#$A[0][3]]		@ A[0][3]
	bic	@E[1],@C[3],@C[1]
	ldr.h	@C[1],[sp,#$A[0][3]+4]
	eor	@E[0],@E[0],@C[8],ror#32-7
	eor	@E[1],@E[1],@C[9],ror#32-7
	str.l	@E[0],[sp,#$R[0][4]]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
	add	@C[9],sp,#$D[0]
	str.h	@E[1],[sp,#$R[0][4]+4]

	ldmia	@E[3],{@E[0]-@E[2],@E[3]}	@ D[3..4]
	ldmia	@C[9],{@C[6]-@C[9]}		@ D[0..1]

	ldr.l	@C[2],[sp,#$A[1][4]]		@ A[1][4]
	eor	@C[0],@C[0],@E[0]
	ldr.h	@C[3],[sp,#$A[1][4]+4]
	eor	@C[1],@C[1],@E[1]
	@ ror	@C[0],@C[0],#32-14		@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
	ldr.l	@E[0],[sp,#$A[3][1]]		@ A[3][1]
	@ ror	@C[1],@C[1],#32-14
	ldr.h	@E[1],[sp,#$A[3][1]+4]

	eor	@C[2],@C[2],@E[2]
	ldr.l	@C[4],[sp,#$A[2][0]]		@ A[2][0]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@C[5],[sp,#$A[2][0]+4]
	@ ror	@C[2],@C[2],#32-10		@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
	@ ror	@C[3],@C[3],#32-10

	eor	@C[6],@C[6],@C[4]
	ldr.l	@E[2],[sp,#$D[2]]		@ D[2]
	eor	@C[7],@C[7],@C[5]
	ldr.h	@E[3],[sp,#$D[2]+4]
	ror	@C[5],@C[6],#32-1		@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
	ror	@C[4],@C[7],#32-2

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][2]]		@ A[4][2]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][2]+4]
	ror	@C[7],@E[0],#32-22		@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
	ror	@C[6],@E[1],#32-23

	bic	@E[0],@C[4],@C[2],ror#32-10
	bic	@E[1],@C[5],@C[3],ror#32-10
	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[9],@E[2],#32-30		@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
	ror	@C[8],@E[3],#32-31
	eor	@E[0],@E[0],@C[0],ror#32-14
	eor	@E[1],@E[1],@C[1],ror#32-14
	str.l	@E[0],[sp,#$R[1][0]]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
	bic	@E[2],@C[6],@C[4]
	str.h	@E[1],[sp,#$R[1][0]+4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2],ror#32-10
	str.l	@E[2],[sp,#$R[1][1]]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@E[3],@C[3],ror#32-10
	str.h	@E[3],[sp,#$R[1][1]+4]
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	bic	@E[2],@C[0],@C[8],ror#14
	bic	@E[3],@C[1],@C[9],ror#14
	eor	@E[0],@E[0],@C[4]
	eor	@E[1],@E[1],@C[5]
	str.l	@E[0],[sp,#$R[1][2]]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
	bic	@C[2],@C[2],@C[0],ror#32-(14-10)
	str.h	@E[1],[sp,#$R[1][2]+4]
	eor	@E[2],@C[6],@E[2],ror#32-14
	bic	@E[1],@C[3],@C[1],ror#32-(14-10)
	str.l	@E[2],[sp,#$R[1][3]]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
	eor	@E[3],@C[7],@E[3],ror#32-14
	str.h	@E[3],[sp,#$R[1][3]+4]
	add	@E[2],sp,#$D[1]
	ldr.l	@C[1],[sp,#$A[0][1]]		@ A[0][1]
	eor	@E[0],@C[8],@C[2],ror#32-10
	ldr.h	@C[0],[sp,#$A[0][1]+4]
	eor	@E[1],@C[9],@E[1],ror#32-10
	str.l	@E[0],[sp,#$R[1][4]]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
	str.h	@E[1],[sp,#$R[1][4]+4]

	add	@C[9],sp,#$D[3]
	ldmia	@E[2],{@E[0]-@E[2],@E[3]}	@ D[1..2]
	ldr.l	@C[2],[sp,#$A[1][2]]		@ A[1][2]
	ldr.h	@C[3],[sp,#$A[1][2]+4]
	ldmia	@C[9],{@C[6]-@C[9]}		@ D[3..4]

	eor	@C[1],@C[1],@E[0]
	ldr.l	@C[4],[sp,#$A[2][3]]		@ A[2][3]
	eor	@C[0],@C[0],@E[1]
	ldr.h	@C[5],[sp,#$A[2][3]+4]
	ror	@C[0],@C[0],#32-1		@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);

	eor	@C[2],@C[2],@E[2]
	ldr.l	@E[0],[sp,#$A[3][4]]		@ A[3][4]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@E[1],[sp,#$A[3][4]+4]
	@ ror	@C[2],@C[2],#32-3		@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
	ldr.l	@E[2],[sp,#$D[0]]		@ D[0]
	@ ror	@C[3],@C[3],#32-3
	ldr.h	@E[3],[sp,#$D[0]+4]

	eor	@C[4],@C[4],@C[6]
	eor	@C[5],@C[5],@C[7]
	@ ror	@C[5],@C[6],#32-12		@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
	@ ror	@C[4],@C[7],#32-13		@ [track reverse order below]
	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][0]]		@ A[4][0]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][0]+4]
	ror	@C[6],@E[0],#32-4		@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
	ror	@C[7],@E[1],#32-4

	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[8],@E[2],#32-9		@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
	ror	@C[9],@E[3],#32-9

	bic	@E[0],@C[5],@C[2],ror#13-3
	bic	@E[1],@C[4],@C[3],ror#12-3
	bic	@E[2],@C[6],@C[5],ror#32-13
	bic	@E[3],@C[7],@C[4],ror#32-12
	eor	@E[0],@C[0],@E[0],ror#32-13
	eor	@E[1],@C[1],@E[1],ror#32-12
	str.l	@E[0],[sp,#$R[2][0]]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
	eor	@E[2],@E[2],@C[2],ror#32-3
	str.h	@E[1],[sp,#$R[2][0]+4]
	eor	@E[3],@E[3],@C[3],ror#32-3
	str.l	@E[2],[sp,#$R[2][1]]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	str.h	@E[3],[sp,#$R[2][1]+4]
	eor	@E[0],@E[0],@C[5],ror#32-13
	eor	@E[1],@E[1],@C[4],ror#32-12
	str.l	@E[0],[sp,#$R[2][2]]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
	bic	@E[2],@C[0],@C[8]
	str.h	@E[1],[sp,#$R[2][2]+4]
	bic	@E[3],@C[1],@C[9]
	eor	@E[2],@E[2],@C[6]
	eor	@E[3],@E[3],@C[7]
	str.l	@E[2],[sp,#$R[2][3]]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
	bic	@E[0],@C[2],@C[0],ror#3
	str.h	@E[3],[sp,#$R[2][3]+4]
	bic	@E[1],@C[3],@C[1],ror#3
	ldr.l	@C[1],[sp,#$A[0][4]]		@ A[0][4] [in reverse order]
	eor	@E[0],@C[8],@E[0],ror#32-3
	ldr.h	@C[0],[sp,#$A[0][4]+4]
	eor	@E[1],@C[9],@E[1],ror#32-3
	str.l	@E[0],[sp,#$R[2][4]]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
	add	@C[9],sp,#$D[1]
	str.h	@E[1],[sp,#$R[2][4]+4]

	ldr.l	@E[0],[sp,#$D[4]]		@ D[4]
	ldr.h	@E[1],[sp,#$D[4]+4]
	ldr.l	@E[2],[sp,#$D[0]]		@ D[0]
	ldr.h	@E[3],[sp,#$D[0]+4]
	ldmia	@C[9],{@C[6]-@C[9]}		@ D[1..2]

	eor	@C[1],@C[1],@E[0]
	ldr.l	@C[2],[sp,#$A[1][0]]		@ A[1][0]
	eor	@C[0],@C[0],@E[1]
	ldr.h	@C[3],[sp,#$A[1][0]+4]
	@ ror	@C[1],@E[0],#32-13		@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
	ldr.l	@C[4],[sp,#$A[2][1]]		@ A[2][1]
	@ ror	@C[0],@E[1],#32-14		@  [was loaded in reverse order]
	ldr.h	@C[5],[sp,#$A[2][1]+4]

	eor	@C[2],@C[2],@E[2]
	ldr.l	@E[0],[sp,#$A[3][2]]		@ A[3][2]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@E[1],[sp,#$A[3][2]+4]
	@ ror	@C[2],@C[2],#32-18		@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
	ldr.l	@E[2],[sp,#$D[3]]		@ D[3]
	@ ror	@C[3],@C[3],#32-18
	ldr.h	@E[3],[sp,#$D[3]+4]

	eor	@C[6],@C[6],@C[4]
	eor	@C[7],@C[7],@C[5]
	ror	@C[4],@C[6],#32-5		@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
	ror	@C[5],@C[7],#32-5

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][3]]		@ A[4][3]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][3]+4]
	ror	@C[7],@E[0],#32-7		@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
	ror	@C[6],@E[1],#32-8

	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[8],@E[2],#32-28		@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
	ror	@C[9],@E[3],#32-28

	bic	@E[0],@C[4],@C[2],ror#32-18
	bic	@E[1],@C[5],@C[3],ror#32-18
	eor	@E[0],@E[0],@C[0],ror#32-14
	eor	@E[1],@E[1],@C[1],ror#32-13
	str.l	@E[0],[sp,#$R[3][0]]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
	bic	@E[2],@C[6],@C[4]
	str.h	@E[1],[sp,#$R[3][0]+4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2],ror#32-18
	str.l	@E[2],[sp,#$R[3][1]]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@E[3],@C[3],ror#32-18
	str.h	@E[3],[sp,#$R[3][1]+4]
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	bic	@E[2],@C[0],@C[8],ror#14
	bic	@E[3],@C[1],@C[9],ror#13
	eor	@E[0],@E[0],@C[4]
	eor	@E[1],@E[1],@C[5]
	str.l	@E[0],[sp,#$R[3][2]]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
	bic	@C[2],@C[2],@C[0],ror#18-14
	str.h	@E[1],[sp,#$R[3][2]+4]
	eor	@E[2],@C[6],@E[2],ror#32-14
	bic	@E[1],@C[3],@C[1],ror#18-13
	eor	@E[3],@C[7],@E[3],ror#32-13
	str.l	@E[2],[sp,#$R[3][3]]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
	str.h	@E[3],[sp,#$R[3][3]+4]
	add	@E[3],sp,#$D[2]
	ldr.l	@C[0],[sp,#$A[0][2]]		@ A[0][2]
	eor	@E[0],@C[8],@C[2],ror#32-18
	ldr.h	@C[1],[sp,#$A[0][2]+4]
	eor	@E[1],@C[9],@E[1],ror#32-18
	str.l	@E[0],[sp,#$R[3][4]]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
	str.h	@E[1],[sp,#$R[3][4]+4]

	ldmia	@E[3],{@E[0]-@E[2],@E[3]}	@ D[2..3]
	ldr.l	@C[2],[sp,#$A[1][3]]		@ A[1][3]
	ldr.h	@C[3],[sp,#$A[1][3]+4]
	ldr.l	@C[6],[sp,#$D[4]]		@ D[4]
	ldr.h	@C[7],[sp,#$D[4]+4]

	eor	@C[0],@C[0],@E[0]
	ldr.l	@C[4],[sp,#$A[2][4]]		@ A[2][4]
	eor	@C[1],@C[1],@E[1]
	ldr.h	@C[5],[sp,#$A[2][4]+4]
	@ ror	@C[0],@C[0],#32-31		@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
	ldr.l	@C[8],[sp,#$D[0]]		@ D[0]
	@ ror	@C[1],@C[1],#32-31
	ldr.h	@C[9],[sp,#$D[0]+4]

	eor	@E[2],@E[2],@C[2]
	ldr.l	@E[0],[sp,#$A[3][0]]		@ A[3][0]
	eor	@E[3],@E[3],@C[3]
	ldr.h	@E[1],[sp,#$A[3][0]+4]
	ror	@C[3],@E[2],#32-27		@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
	ldr.l	@E[2],[sp,#$D[1]]		@ D[1]
	ror	@C[2],@E[3],#32-28
	ldr.h	@E[3],[sp,#$D[1]+4]

	eor	@C[6],@C[6],@C[4]
	eor	@C[7],@C[7],@C[5]
	ror	@C[5],@C[6],#32-19		@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
	ror	@C[4],@C[7],#32-20

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][1]]		@ A[4][1]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][1]+4]
	ror	@C[7],@E[0],#32-20		@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
	ror	@C[6],@E[1],#32-21

	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	@ ror	@C[8],@C[2],#32-1		@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
	@ ror	@C[9],@C[3],#32-1

	bic	@E[0],@C[4],@C[2]
	bic	@E[1],@C[5],@C[3]
	eor	@E[0],@E[0],@C[0],ror#32-31
	str.l	@E[0],[sp,#$R[4][0]]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
	eor	@E[1],@E[1],@C[1],ror#32-31
	str.h	@E[1],[sp,#$R[4][0]+4]
	bic	@E[2],@C[6],@C[4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2]
	eor	@E[3],@E[3],@C[3]
	str.l	@E[2],[sp,#$R[4][1]]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
	bic	@E[0],@C[8],@C[6],ror#1
	str.h	@E[3],[sp,#$R[4][1]+4]
	bic	@E[1],@C[9],@C[7],ror#1
	bic	@E[2],@C[0],@C[8],ror#31-1
	bic	@E[3],@C[1],@C[9],ror#31-1
	eor	@C[4],@C[4],@E[0],ror#32-1
	str.l	@C[4],[sp,#$R[4][2]]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
	eor	@C[5],@C[5],@E[1],ror#32-1
	str.h	@C[5],[sp,#$R[4][2]+4]
	eor	@C[6],@C[6],@E[2],ror#32-31
	eor	@C[7],@C[7],@E[3],ror#32-31
	str.l	@C[6],[sp,#$R[4][3]]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
	bic	@E[0],@C[2],@C[0],ror#32-31
	str.h	@C[7],[sp,#$R[4][3]+4]
	bic	@E[1],@C[3],@C[1],ror#32-31
	add	@E[2],sp,#$R[0][0]
	eor	@C[8],@E[0],@C[8],ror#32-1
	add	@E[0],sp,#$R[1][0]
	eor	@C[9],@E[1],@C[9],ror#32-1
	str.l	@C[8],[sp,#$R[4][4]]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
	str.h	@C[9],[sp,#$R[4][4]+4]
___
}
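
# Two rounds per loop iteration, ping-ponging between A and T, so the
# state never has to be copied back between rounds; the loop-exit
# flags are set by the counter comparison inside the second Round.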
Round(@A,@T);
Round(@T,@A);
$code.=<<___;
	blo	.Lround2x

	ldr	pc,[sp,#440]
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600, %function
.align	5
KeccakF1600:
	stmdb	sp!,{r0,r4-r11,lr}
	sub	sp,sp,#440+16			@ space for A[5][5],D[5],T[5][5],...

	add	@E[0],r0,#$A[1][0]
	add	@E[1],sp,#$A[1][0]
	ldmia	r0,    {@C[0]-@C[9]}		@ copy A[5][5] to stack
	stmia	sp,    {@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0], {@C[0]-@C[9]}
	add	@E[2],sp,#$A[0][0]
	add	@E[0],sp,#$A[1][0]
	stmia	@E[1], {@C[0]-@C[9]}

	bl	KeccakF1600_enter

	ldr	@E[1], [sp,#440+16]		@ restore pointer to A
	ldmia	sp,    {@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}		@ return A[5][5]
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0], {@C[0]-@C[9]}
	stmia	@E[1], {@C[0]-@C[9]}

	add	sp,sp,#440+20
	ldmia	sp!,{r4-r11,pc}
.size	KeccakF1600,.-KeccakF1600
___

{ my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));

########################################################################
# Stack layout
# ----->+-----------------------+
#       | uint64_t A[5][5]      |
#       | ...                   |
#       | ...                   |
# +456->+-----------------------+
#       | 0x55555555            |
# +460->+-----------------------+
#       | 0x33333333            |
# +464->+-----------------------+
#       | 0x0f0f0f0f            |
# +468->+-----------------------+
#       | 0x00ff00ff            |
# +472->+-----------------------+
#       | uint64_t *A           |
# +476->+-----------------------+
#       | const void *inp       |
# +480->+-----------------------+
#       | size_t len            |
# +484->+-----------------------+
#       | size_t bs             |
# +488->+-----------------------+
#       | ....

$code.=<<___;
.global	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	stmdb	sp!,{r0-r12,lr}
	sub	sp,sp,#456+16

	add	$A_flat,r0,#$A[1][0]
	@ mov	$inp,r1
	mov	$len,r2
	mov	$bsz,r3
	cmp	r2,r3
	blo	.Labsorb_abort

	add	$inp,sp,#0
	ldmia	r0,      {@C[0]-@C[9]}	@ copy A[5][5] to stack
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp,    {@C[0]-@C[9]}
	ldr	$inp,[sp,#476]		@ restore $inp

#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ compose constants
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ 0x11111111
	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ 0x33333333
	orr	r6,r6,r6,lsl#2		@ 0x55555555
#endif
	str	r9,[sp,#468]
	str	r8,[sp,#464]
	str	r7,[sp,#460]
	str	r6,[sp,#456]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	r0,$len,$bsz
	blo	.Labsorbed

	add	$A_flat,sp,#0
	str	r0,[sp,#480]		@ save len - bsz

.align	4
.Loop_block:
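	@ Load 8 input bytes and bit-interleave them: even-numbered bits
	@ of the 64-bit word are gathered into one 32-bit half and
	@ odd-numbered bits into the other by the
	@ 0x55555555/0x33333333/0x0f0f0f0f/0x00ff00ff mask-and-merge
	@ network below, then the result is XORed into A_flat[i].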
	ldrb	r0,[$inp],#1
	ldrb	r1,[$inp],#1
	ldrb	r2,[$inp],#1
	ldrb	r3,[$inp],#1
	ldrb	r4,[$inp],#1
	orr	r0,r0,r1,lsl#8
	ldrb	r1,[$inp],#1
	orr	r0,r0,r2,lsl#16
	ldrb	r2,[$inp],#1
	orr	r0,r0,r3,lsl#24		@ lo
	ldrb	r3,[$inp],#1
	orr	r1,r4,r1,lsl#8
	orr	r1,r1,r2,lsl#16
	orr	r1,r1,r3,lsl#24		@ hi

	and	r2,r0,r6		@ &=0x55555555
	and	r0,r0,r6,lsl#1		@ &=0xaaaaaaaa
	and	r3,r1,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa
	orr	r2,r2,r2,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r1,r1,r1,lsl#1
	and	r2,r2,r7		@ &=0x33333333
	and	r0,r0,r7,lsl#2		@ &=0xcccccccc
	and	r3,r3,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r1,r1,r1,lsl#2
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r0,r0,r8,lsl#4		@ &=0xf0f0f0f0
	and	r3,r3,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	ldmia	$A_flat,{r4-r5}		@ A_flat[i]
	orr	r2,r2,r2,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r1,r1,r1,lsl#4
	and	r2,r2,r9		@ &=0x00ff00ff
	and	r0,r0,r9,lsl#8		@ &=0xff00ff00
	and	r3,r3,r9		@ &=0x00ff00ff
	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
	orr	r2,r2,r2,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r1,r1,r1,lsl#8

	lsl	r2,r2,#16
	lsr	r1,r1,#16
	eor	r4,r4,r3,lsl#16
	eor	r5,r5,r0,lsr#16
	eor	r4,r4,r2,lsr#16
	eor	r5,r5,r1,lsl#16
	stmia	$A_flat!,{r4-r5}	@ A_flat[i++] ^= BitInterleave(inp[0..7])

	subs	$bsz,$bsz,#8
	bhi	.Loop_block

	str	$inp,[sp,#476]

	bl	KeccakF1600_int

	add	r14,sp,#456
	ldmia	r14,{r6-r12,r14}	@ restore constants and variables
	b	.Loop_absorb

.align	4
.Labsorbed:
	add	$inp,sp,#$A[1][0]
	ldmia	sp,      {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}	@ return A[5][5]
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp,    {@C[0]-@C[9]}
	stmia	$A_flat, {@C[0]-@C[9]}

.Labsorb_abort:
	add	sp,sp,#456+32
	mov	r0,$len			@ return value
	ldmia	sp!,{r4-r12,pc}
.size	SHA3_absorb,.-SHA3_absorb
___
}

{ my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12));

$code.=<<___;
.global	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	stmdb	sp!,{r0,r3-r10,lr}

	mov	$A_flat,r0
	mov	$out,r1
	mov	$len,r2
	mov	$bsz,r3

#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ compose constants
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ 0x11111111
	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ 0x33333333
	orr	r6,r6,r6,lsl#2		@ 0x55555555
#endif
	stmdb	sp!,{r6-r9}

	mov	r14,$A_flat
	b	.Loop_squeeze

.align	4
.Loop_squeeze:
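	@ Undo the absorb-time bit interleaving: knit the even/odd 32-bit
	@ halves of A_flat[i] back into a 64-bit lane before storing it
	@ out byte by byte.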
	ldmia	$A_flat!,{r0,r1}	@ A_flat[i++]

	lsl	r2,r0,#16
	lsl	r3,r1,#16		@ r3 = r1 << 16
	lsr	r2,r2,#16		@ r2 = r0 & 0x0000ffff
	lsr	r1,r1,#16
	lsr	r0,r0,#16		@ r0 = r0 >> 16
	lsl	r1,r1,#16		@ r1 = r1 & 0xffff0000

	orr	r2,r2,r2,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r1,r1,r1,lsr#8
	and	r2,r2,r9		@ &=0x00ff00ff
	and	r3,r3,r9,lsl#8		@ &=0xff00ff00
	and	r0,r0,r9		@ &=0x00ff00ff
	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
	orr	r2,r2,r2,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r1,r1,r1,lsr#4
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r3,r3,r8,lsl#4		@ &=0xf0f0f0f0
	and	r0,r0,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	orr	r2,r2,r2,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r1,r1,r1,lsr#2
	and	r2,r2,r7		@ &=0x33333333
	and	r3,r3,r7,lsl#2		@ &=0xcccccccc
	and	r0,r0,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r1,r1,r1,lsr#1
	and	r2,r2,r6		@ &=0x55555555
	and	r3,r3,r6,lsl#1		@ &=0xaaaaaaaa
	and	r0,r0,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa

	orr	r2,r2,r3
	orr	r0,r0,r1

	cmp	$len,#8
	blo	.Lsqueeze_tail
	lsr	r1,r2,#8
	strb	r2,[$out],#1
	lsr	r3,r2,#16
	strb	r1,[$out],#1
	lsr	r2,r2,#24
	strb	r3,[$out],#1
	strb	r2,[$out],#1

	lsr	r1,r0,#8
	strb	r0,[$out],#1
	lsr	r3,r0,#16
	strb	r1,[$out],#1
	lsr	r0,r0,#24
	strb	r3,[$out],#1
	strb	r0,[$out],#1

	subs	$len,$len,#8
	beq	.Lsqueeze_done

	subs	$bsz,$bsz,#8		@ bsz -= 8
	bhi	.Loop_squeeze

	mov	r0,r14			@ original $A_flat

	bl	KeccakF1600

	ldmia	sp,{r6-r10,r12}		@ restore constants and variables
	mov	r14,$A_flat
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	subs	$len,$len,#1
	beq	.Lsqueeze_done

	strb	r0,[$out],#1
	lsr	r0,r0,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r0,[$out],#1
	lsr	r0,r0,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r0,[$out]
	b	.Lsqueeze_done

.align	4
.Lsqueeze_done:
	add	sp,sp,#24
	ldmia	sp!,{r4-r10,pc}
.size	SHA3_squeeze,.-SHA3_squeeze
___
}

$code.=<<___;
.fpu	neon
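
@ The NEON code path keeps each lane in natural 64-bit form, so the
@ round constants are plain .quad values, with no bit interleaving.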
.type	iotas64, %object
.align	5
iotas64:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas64,.-iotas64

.type	KeccakF1600_neon, %function
.align	5
KeccakF1600_neon:
	add	r1, r0, #16
	adr	r2, iotas64
	mov	r3, #24			@ loop counter
	b	.Loop_neon

.align	4
.Loop_neon:
	@ Theta
	vst1.64	{q4},  [r0:64]		@ offload A[0..1][4]
	veor	q13, q0, q5		@ A[0..1][0]^A[2..3][0]
	vst1.64	{d18}, [r1:64]		@ offload A[2][4]
	veor	q14, q1, q6		@ A[0..1][1]^A[2..3][1]
	veor	q15, q2, q7		@ A[0..1][2]^A[2..3][2]
	veor	d26, d26, d27		@ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
	veor	d27, d28, d29		@ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
	veor	q14, q3, q8		@ A[0..1][3]^A[2..3][3]
	veor	q4, q4, q9		@ A[0..1][4]^A[2..3][4]
	veor	d30, d30, d31		@ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
	veor	d31, d28, d29		@ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
	veor	d25, d8, d9		@ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
	veor	q13, q13, q10		@ C[0..1]^=A[4][0..1]
	veor	q14, q15, q11		@ C[2..3]^=A[4][2..3]
	veor	d25, d25, d24		@ C[4]^=A[4][4]

	vadd.u64	q4, q13, q13	@ C[0..1]<<1
	vadd.u64	q15, q14, q14	@ C[2..3]<<1
	vadd.u64	d18, d25, d25	@ C[4]<<1
	vsri.u64	q4, q13, #63	@ ROL64(C[0..1],1)
	vsri.u64	q15, q14, #63	@ ROL64(C[2..3],1)
	vsri.u64	d18, d25, #63	@ ROL64(C[4],1)
	veor	d25, d25, d9		@ D[0] = C[4] ^= ROL64(C[1],1)
	veor	q13, q13, q15		@ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
	veor	d28, d28, d18		@ D[3] = C[2] ^= ROL64(C[4],1)
	veor	d29, d29, d8		@ D[4] = C[3] ^= ROL64(C[0],1)

	veor	d0, d0, d25		@ A[0][0] ^= C[4]
	veor	d1, d1, d25		@ A[1][0] ^= C[4]
	veor	d10, d10, d25		@ A[2][0] ^= C[4]
	veor	d11, d11, d25		@ A[3][0] ^= C[4]
	veor	d20, d20, d25		@ A[4][0] ^= C[4]

	veor	d2, d2, d26		@ A[0][1] ^= D[1]
	veor	d3, d3, d26		@ A[1][1] ^= D[1]
	veor	d12, d12, d26		@ A[2][1] ^= D[1]
	veor	d13, d13, d26		@ A[3][1] ^= D[1]
	veor	d21, d21, d26		@ A[4][1] ^= D[1]
	vmov	d26, d27

	veor	d6, d6, d28		@ A[0][3] ^= C[2]
	veor	d7, d7, d28		@ A[1][3] ^= C[2]
	veor	d16, d16, d28		@ A[2][3] ^= C[2]
	veor	d17, d17, d28		@ A[3][3] ^= C[2]
	veor	d23, d23, d28		@ A[4][3] ^= C[2]
	vld1.64	{q4},  [r0:64]		@ restore A[0..1][4]
	vmov	d28, d29

	vld1.64	{d18}, [r1:64]		@ restore A[2][4]
	veor	q2, q2, q13		@ A[0..1][2] ^= D[2]
	veor	q7, q7, q13		@ A[2..3][2] ^= D[2]
	veor	d22, d22, d27		@ A[4][2] ^= D[2]

	veor	q4, q4, q14		@ A[0..1][4] ^= C[3]
	veor	q9, q9, q14		@ A[2..3][4] ^= C[3]
	veor	d24, d24, d29		@ A[4][4] ^= C[3]

	@ Rho + Pi
	vmov	d26, d2			@ C[1] = A[0][1]
	vshl.u64	d2, d3, #44
	vmov	d27, d4			@ C[2] = A[0][2]
	vshl.u64	d4, d14, #43
	vmov	d28, d6			@ C[3] = A[0][3]
	vshl.u64	d6, d17, #21
	vmov	d29, d8			@ C[4] = A[0][4]
	vshl.u64	d8, d24, #14
	vsri.u64	d2, d3, #64-44	@ A[0][1] = ROL64(A[1][1], rhotates[1][1])
	vsri.u64	d4, d14, #64-43	@ A[0][2] = ROL64(A[2][2], rhotates[2][2])
	vsri.u64	d6, d17, #64-21	@ A[0][3] = ROL64(A[3][3], rhotates[3][3])
	vsri.u64	d8, d24, #64-14	@ A[0][4] = ROL64(A[4][4], rhotates[4][4])

	vshl.u64	d3, d9, #20
	vshl.u64	d14, d16, #25
	vshl.u64	d17, d15, #15
	vshl.u64	d24, d21, #2
	vsri.u64	d3, d9, #64-20	@ A[1][1] = ROL64(A[1][4], rhotates[1][4])
	vsri.u64	d14, d16, #64-25	@ A[2][2] = ROL64(A[2][3], rhotates[2][3])
	vsri.u64	d17, d15, #64-15	@ A[3][3] = ROL64(A[3][2], rhotates[3][2])
	vsri.u64	d24, d21, #64-2	@ A[4][4] = ROL64(A[4][1], rhotates[4][1])

	vshl.u64	d9, d22, #61
	@ vshl.u64	d16, d19, #8
	vshl.u64	d15, d12, #10
	vshl.u64	d21, d7, #55
	vsri.u64	d9, d22, #64-61	@ A[1][4] = ROL64(A[4][2], rhotates[4][2])
	vext.8	d16, d19, d19, #8-1	@ A[2][3] = ROL64(A[3][4], rhotates[3][4])
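	@ (rotations by a multiple of 8 bits are done with vext byte
	@ shuffles rather than vshl/vsri pairs)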
	vsri.u64	d15, d12, #64-10	@ A[3][2] = ROL64(A[2][1], rhotates[2][1])
	vsri.u64	d21, d7, #64-55	@ A[4][1] = ROL64(A[1][3], rhotates[1][3])

	vshl.u64	d22, d18, #39
	@ vshl.u64	d19, d23, #56
	vshl.u64	d12, d5, #6
	vshl.u64	d7, d13, #45
	vsri.u64	d22, d18, #64-39	@ A[4][2] = ROL64(A[2][4], rhotates[2][4])
	vext.8	d19, d23, d23, #8-7	@ A[3][4] = ROL64(A[4][3], rhotates[4][3])
	vsri.u64	d12, d5, #64-6	@ A[2][1] = ROL64(A[1][2], rhotates[1][2])
	vsri.u64	d7, d13, #64-45	@ A[1][3] = ROL64(A[3][1], rhotates[3][1])

	vshl.u64	d18, d20, #18
	vshl.u64	d23, d11, #41
	vshl.u64	d5, d10, #3
	vshl.u64	d13, d1, #36
	vsri.u64	d18, d20, #64-18	@ A[2][4] = ROL64(A[4][0], rhotates[4][0])
	vsri.u64	d23, d11, #64-41	@ A[4][3] = ROL64(A[3][0], rhotates[3][0])
	vsri.u64	d5, d10, #64-3	@ A[1][2] = ROL64(A[2][0], rhotates[2][0])
	vsri.u64	d13, d1, #64-36	@ A[3][1] = ROL64(A[1][0], rhotates[1][0])

	vshl.u64	d1, d28, #28
	vshl.u64	d10, d26, #1
	vshl.u64	d11, d29, #27
	vshl.u64	d20, d27, #62
	vsri.u64	d1, d28, #64-28	@ A[1][0] = ROL64(C[3], rhotates[0][3])
	vsri.u64	d10, d26, #64-1	@ A[2][0] = ROL64(C[1], rhotates[0][1])
	vsri.u64	d11, d29, #64-27	@ A[3][0] = ROL64(C[4], rhotates[0][4])
	vsri.u64	d20, d27, #64-62	@ A[4][0] = ROL64(C[2], rhotates[0][2])

	@ Chi + Iota
	vbic	q13, q2, q1
	vbic	q14, q3, q2
	vbic	q15, q4, q3
	veor	q13, q13, q0		@ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
	veor	q14, q14, q1		@ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
	veor	q2, q2, q15		@ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vst1.64	{q13}, [r0:64]		@ offload A[0..1][0]
	vbic	q13, q0, q4
	vbic	q15, q1, q0
	vmov	q1, q14			@ A[0..1][1]
	veor	q3, q3, q13		@ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	veor	q4, q4, q15		@ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	vbic	q13, q7, q6
	vmov	q0, q5			@ A[2..3][0]
	vbic	q14, q8, q7
	vmov	q15, q6			@ A[2..3][1]
	veor	q5, q5, q13		@ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vbic	q13, q9, q8
	veor	q6, q6, q14		@ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vbic	q14, q0, q9
	veor	q7, q7, q13		@ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vbic	q13, q15, q0
	veor	q8, q8, q14		@ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vmov	q14, q10		@ A[4][0..1]
	veor	q9, q9, q13		@ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vld1.64	{d25}, [r2:64]!		@ Iota[i++]
	vbic	d26, d22, d21
	vbic	d27, d23, d22
	vld1.64	{q0},  [r0:64]		@ restore A[0..1][0]
	veor	d20, d20, d26		@ A[4][0] ^= (~A[4][1] & A[4][2])
	vbic	d26, d24, d23
	veor	d21, d21, d27		@ A[4][1] ^= (~A[4][2] & A[4][3])
	vbic	d27, d28, d24
	veor	d22, d22, d26		@ A[4][2] ^= (~A[4][3] & A[4][4])
	vbic	d26, d29, d28
	veor	d23, d23, d27		@ A[4][3] ^= (~A[4][4] & A[4][0])
	veor	d0, d0, d25		@ A[0][0] ^= Iota[i]
	veor	d24, d24, d26		@ A[4][4] ^= (~A[4][0] & A[4][1])

	subs	r3, r3, #1
	bne	.Loop_neon

	bx	lr
.size	KeccakF1600_neon,.-KeccakF1600_neon

.global	SHA3_absorb_neon
.type	SHA3_absorb_neon, %function
.align	5
SHA3_absorb_neon:
	stmdb	sp!, {r4-r6,lr}
	vstmdb	sp!, {d8-d15}

	mov	r4, r1			@ inp
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz

	vld1.32	{d0}, [r0:64]!		@ A[0][0]
	vld1.32	{d2}, [r0:64]!		@ A[0][1]
	vld1.32	{d4}, [r0:64]!		@ A[0][2]
	vld1.32	{d6}, [r0:64]!		@ A[0][3]
	vld1.32	{d8}, [r0:64]!		@ A[0][4]

	vld1.32	{d1}, [r0:64]!		@ A[1][0]
	vld1.32	{d3}, [r0:64]!		@ A[1][1]
	vld1.32	{d5}, [r0:64]!		@ A[1][2]
	vld1.32	{d7}, [r0:64]!		@ A[1][3]
	vld1.32	{d9}, [r0:64]!		@ A[1][4]

	vld1.32	{d10}, [r0:64]!		@ A[2][0]
	vld1.32	{d12}, [r0:64]!		@ A[2][1]
	vld1.32	{d14}, [r0:64]!		@ A[2][2]
	vld1.32	{d16}, [r0:64]!		@ A[2][3]
	vld1.32	{d18}, [r0:64]!		@ A[2][4]

	vld1.32	{d11}, [r0:64]!		@ A[3][0]
	vld1.32	{d13}, [r0:64]!		@ A[3][1]
	vld1.32	{d15}, [r0:64]!		@ A[3][2]
	vld1.32	{d17}, [r0:64]!		@ A[3][3]
	vld1.32	{d19}, [r0:64]!		@ A[3][4]

	vld1.32	{d20-d23}, [r0:64]!	@ A[4][0..3]
	vld1.32	{d24}, [r0:64]		@ A[4][4]
	sub	r0, r0, #24*8		@ rewind
	b	.Loop_absorb_neon

.align	4
.Loop_absorb_neon:
	subs	r12, r5, r6		@ len - bsz
	blo	.Labsorbed_neon
	mov	r5, r12

	vld1.8	{d31}, [r4]!		@ endian-neutral loads...
	cmp	r6, #8*2
	veor	d0, d0, d31		@ A[0][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d2, d2, d31		@ A[0][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*4
	veor	d4, d4, d31		@ A[0][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d6, d6, d31		@ A[0][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*6
	veor	d8, d8, d31		@ A[0][4] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d1, d1, d31		@ A[1][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*8
	veor	d3, d3, d31		@ A[1][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d5, d5, d31		@ A[1][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*10
	veor	d7, d7, d31		@ A[1][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d9, d9, d31		@ A[1][4] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*12
	veor	d10, d10, d31		@ A[2][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d12, d12, d31		@ A[2][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*14
	veor	d14, d14, d31		@ A[2][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d16, d16, d31		@ A[2][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*16
	veor	d18, d18, d31		@ A[2][4] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d11, d11, d31		@ A[3][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*18
	veor	d13, d13, d31		@ A[3][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d15, d15, d31		@ A[3][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*20
	veor	d17, d17, d31		@ A[3][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d19, d19, d31		@ A[3][4] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*22
	veor	d20, d20, d31		@ A[4][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d21, d21, d31		@ A[4][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*24
	veor	d22, d22, d31		@ A[4][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d23, d23, d31		@ A[4][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d24, d24, d31		@ A[4][4] ^= *inp++

.Lprocess_neon:
	bl	KeccakF1600_neon
	b	.Loop_absorb_neon

.align	4
.Labsorbed_neon:
	vst1.32	{d0}, [r0:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0:64]!
	vst1.32	{d4}, [r0:64]!
	vst1.32	{d6}, [r0:64]!
	vst1.32	{d8}, [r0:64]!

	vst1.32	{d1}, [r0:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0:64]!
	vst1.32	{d5}, [r0:64]!
	vst1.32	{d7}, [r0:64]!
	vst1.32	{d9}, [r0:64]!

	vst1.32	{d10}, [r0:64]!		@ A[2][0..4]
	vst1.32	{d12}, [r0:64]!
	vst1.32	{d14}, [r0:64]!
	vst1.32	{d16}, [r0:64]!
	vst1.32	{d18}, [r0:64]!

	vst1.32	{d11}, [r0:64]!		@ A[3][0..4]
	vst1.32	{d13}, [r0:64]!
	vst1.32	{d15}, [r0:64]!
	vst1.32	{d17}, [r0:64]!
	vst1.32	{d19}, [r0:64]!

	vst1.32	{d20-d23}, [r0:64]!	@ A[4][0..4]
	vst1.32	{d24}, [r0:64]

	mov	r0, r5			@ return value
	vldmia	sp!, {d8-d15}
	ldmia	sp!, {r4-r6,pc}
.size	SHA3_absorb_neon,.-SHA3_absorb_neon

.global	SHA3_squeeze_neon
.type	SHA3_squeeze_neon, %function
.align	5
SHA3_squeeze_neon:
	stmdb	sp!, {r4-r6,lr}

	mov	r4, r1			@ out
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz
	mov	r12, r0			@ A_flat
	mov	r14, r3			@ bsz
	b	.Loop_squeeze_neon

.align	4
.Loop_squeeze_neon:
	cmp	r5, #8
	blo	.Lsqueeze_neon_tail
	vld1.32	{d0}, [r12]!
	vst1.8	{d0}, [r4]!		@ endian-neutral store

	subs	r5, r5, #8		@ len -= 8
	beq	.Lsqueeze_neon_done

	subs	r14, r14, #8		@ bsz -= 8
	bhi	.Loop_squeeze_neon

	vstmdb	sp!, {d8-d15}

	vld1.32	{d0}, [r0:64]!		@ A[0][0..4]
	vld1.32	{d2}, [r0:64]!
	vld1.32	{d4}, [r0:64]!
	vld1.32	{d6}, [r0:64]!
	vld1.32	{d8}, [r0:64]!

	vld1.32	{d1}, [r0:64]!		@ A[1][0..4]
	vld1.32	{d3}, [r0:64]!
	vld1.32	{d5}, [r0:64]!
	vld1.32	{d7}, [r0:64]!
	vld1.32	{d9}, [r0:64]!

	vld1.32	{d10}, [r0:64]!		@ A[2][0..4]
	vld1.32	{d12}, [r0:64]!
	vld1.32	{d14}, [r0:64]!
	vld1.32	{d16}, [r0:64]!
	vld1.32	{d18}, [r0:64]!

	vld1.32	{d11}, [r0:64]!		@ A[3][0..4]
	vld1.32	{d13}, [r0:64]!
	vld1.32	{d15}, [r0:64]!
	vld1.32	{d17}, [r0:64]!
	vld1.32	{d19}, [r0:64]!

	vld1.32	{d20-d23}, [r0:64]!	@ A[4][0..4]
	vld1.32	{d24}, [r0:64]
	sub	r0, r0, #24*8		@ rewind

	bl	KeccakF1600_neon

	mov	r12, r0			@ A_flat
	vst1.32	{d0}, [r0:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0:64]!
	vst1.32	{d4}, [r0:64]!
	vst1.32	{d6}, [r0:64]!
	vst1.32	{d8}, [r0:64]!

	vst1.32	{d1}, [r0:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0:64]!
	vst1.32	{d5}, [r0:64]!
	vst1.32	{d7}, [r0:64]!
	vst1.32	{d9}, [r0:64]!

	vst1.32	{d10}, [r0:64]!		@ A[2][0..4]
	vst1.32	{d12}, [r0:64]!
	vst1.32	{d14}, [r0:64]!
	vst1.32	{d16}, [r0:64]!
	vst1.32	{d18}, [r0:64]!

	vst1.32	{d11}, [r0:64]!		@ A[3][0..4]
	vst1.32	{d13}, [r0:64]!
	vst1.32	{d15}, [r0:64]!
	vst1.32	{d17}, [r0:64]!
	vst1.32	{d19}, [r0:64]!

	vst1.32	{d20-d23}, [r0:64]!	@ A[4][0..4]
	mov	r14, r6			@ bsz
	vst1.32	{d24}, [r0:64]
	mov	r0, r12			@ rewind

	vldmia	sp!, {d8-d15}
	b	.Loop_squeeze_neon

.align	4
.Lsqueeze_neon_tail:
	ldmia	r12, {r2,r3}
	cmp	r5, #2
	strb	r2, [r4],#1		@ endian-neutral store
	lsr	r2, r2, #8
	blo	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	lsr	r2, r2, #8
	beq	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	lsr	r2, r2, #8
	cmp	r5, #4
	blo	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	beq	.Lsqueeze_neon_done

	strb	r3, [r4], #1
	lsr	r3, r3, #8
	cmp	r5, #6
	blo	.Lsqueeze_neon_done
	strb	r3, [r4], #1
	lsr	r3, r3, #8
	beq	.Lsqueeze_neon_done

	strb	r3, [r4], #1

.Lsqueeze_neon_done:
	ldmia	sp!, {r4-r6,pc}
.size	SHA3_squeeze_neon,.-SHA3_squeeze_neon

.asciz	"Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

{
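# Post-process the generated code: ldr.l/str.l records one half of a
# 64-bit transfer and emits it for ARM mode only, while the matching
# ldr.h/str.h emits the other half for ARM mode, or a single fused
# ldrd/strd covering both halves for Thumb-2.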
my (%ldr, %str);

sub ldrd {
    my ($mnemonic,$half,$reg,$ea) = @_;
    my $op = $mnemonic eq "ldr" ? \%ldr : \%str;

    if ($half eq "l") {
	$$op{reg} = $reg;
	$$op{ea}  = $ea;
	sprintf "#ifndef	__thumb2__\n"	.
		"	%s\t%s,%s\n"		.
		"#endif",	$mnemonic,$reg,$ea;
    } else {
	sprintf "#ifndef	__thumb2__\n"	.
		"	%s\t%s,%s\n"		.
		"#else\n"			.
		"	%sd\t%s,%s,%s\n"	.
		"#endif",	$mnemonic,$reg,$ea,
			$mnemonic,$$op{reg},$reg,$$op{ea};
    }
}
}

$output = pop;
open STDOUT, ">$output";

foreach (split($/, $code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge	or
	s/\bret\b/bx	lr/g						or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4

	print $_, "\n";
}

close STDOUT; # enforce flush