#!/usr/bin/env perl
# Copyright 2020-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
use strict;
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
my $xlate;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
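# Everything after __END__ is slurped by data() below and piped through
# arm-xlate.pl, which adapts the assembly to the requested flavour
# (e.g. linux64 or win64).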
my $code = data();
print $code;
close STDOUT or die "error closing STDOUT: $!"; # enforce flush
sub data
{
    local $/;
    return <DATA>;
}
__END__
// Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// ====================================================================
// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL
// project. Rights for redistribution and usage in source and binary
// forms are granted according to the OpenSSL license.
// ====================================================================
//
// This implementation is a translation of bsaes-armv7 for AArch64.
// No attempt has been made to carry across the build switches for
// kernel targets, since the Linux kernel crypto support has moved on
// from when it was based on OpenSSL.
// A lot of hand-scheduling has been performed. Consequently, this code
// doesn't factor out neatly into macros in the same way that the
// AArch32 version did; there is little to be gained by wrapping it
// up in Perl, so it is presented as pure assembly.
#include "crypto/arm_arch.h"
.text
.extern AES_cbc_encrypt
.extern AES_encrypt
.extern AES_decrypt
.type _bsaes_decrypt8,%function
.align 4
// On entry:
// x9 -> key (previously expanded using _bsaes_key_convert)
// x10 = number of rounds
// v0-v7 input data
// On exit:
// x9-x11 corrupted
// other general-purpose registers preserved
// v0-v7 output data
// v11-v15 preserved
// other SIMD registers corrupted
_bsaes_decrypt8:
ldr q8, [x9], #16
adr x11, .LM0ISR
movi v9.16b, #0x55
ldr q10, [x11], #16
movi v16.16b, #0x33
movi v17.16b, #0x0f
sub x10, x10, #1
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v8.16b
eor v2.16b, v2.16b, v8.16b
eor v4.16b, v4.16b, v8.16b
eor v3.16b, v3.16b, v8.16b
eor v5.16b, v5.16b, v8.16b
tbl v0.16b, {v0.16b}, v10.16b
tbl v1.16b, {v1.16b}, v10.16b
tbl v2.16b, {v2.16b}, v10.16b
tbl v4.16b, {v4.16b}, v10.16b
eor v6.16b, v6.16b, v8.16b
eor v7.16b, v7.16b, v8.16b
tbl v3.16b, {v3.16b}, v10.16b
tbl v5.16b, {v5.16b}, v10.16b
tbl v6.16b, {v6.16b}, v10.16b
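// Transpose the eight blocks into bit-sliced form: pairs of registers
// exchange bit groups via masked shift-and-XOR ("swapmove") steps,
// using the 0x55, 0x33 and 0x0f masks for 1-, 2- and 4-bit swaps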
ushr v8.2d, v0.2d, #1
tbl v7.16b, {v7.16b}, v10.16b
ushr v10.2d, v4.2d, #1
ushr v18.2d, v2.2d, #1
eor v8.16b, v8.16b, v1.16b
ushr v19.2d, v6.2d, #1
eor v10.16b, v10.16b, v5.16b
eor v18.16b, v18.16b, v3.16b
and v8.16b, v8.16b, v9.16b
eor v19.16b, v19.16b, v7.16b
and v10.16b, v10.16b, v9.16b
and v18.16b, v18.16b, v9.16b
eor v1.16b, v1.16b, v8.16b
shl v8.2d, v8.2d, #1
and v9.16b, v19.16b, v9.16b
eor v5.16b, v5.16b, v10.16b
shl v10.2d, v10.2d, #1
eor v3.16b, v3.16b, v18.16b
shl v18.2d, v18.2d, #1
eor v0.16b, v0.16b, v8.16b
shl v8.2d, v9.2d, #1
eor v7.16b, v7.16b, v9.16b
eor v4.16b, v4.16b, v10.16b
eor v2.16b, v2.16b, v18.16b
ushr v9.2d, v1.2d, #2
eor v6.16b, v6.16b, v8.16b
ushr v8.2d, v0.2d, #2
ushr v10.2d, v5.2d, #2
ushr v18.2d, v4.2d, #2
eor v9.16b, v9.16b, v3.16b
eor v8.16b, v8.16b, v2.16b
eor v10.16b, v10.16b, v7.16b
eor v18.16b, v18.16b, v6.16b
and v9.16b, v9.16b, v16.16b
and v8.16b, v8.16b, v16.16b
and v10.16b, v10.16b, v16.16b
and v16.16b, v18.16b, v16.16b
eor v3.16b, v3.16b, v9.16b
shl v9.2d, v9.2d, #2
eor v2.16b, v2.16b, v8.16b
shl v8.2d, v8.2d, #2
eor v7.16b, v7.16b, v10.16b
shl v10.2d, v10.2d, #2
eor v6.16b, v6.16b, v16.16b
shl v16.2d, v16.2d, #2
eor v1.16b, v1.16b, v9.16b
eor v0.16b, v0.16b, v8.16b
eor v5.16b, v5.16b, v10.16b
eor v4.16b, v4.16b, v16.16b
ushr v8.2d, v3.2d, #4
ushr v9.2d, v2.2d, #4
ushr v10.2d, v1.2d, #4
ushr v16.2d, v0.2d, #4
eor v8.16b, v8.16b, v7.16b
eor v9.16b, v9.16b, v6.16b
eor v10.16b, v10.16b, v5.16b
eor v16.16b, v16.16b, v4.16b
and v8.16b, v8.16b, v17.16b
and v9.16b, v9.16b, v17.16b
and v10.16b, v10.16b, v17.16b
and v16.16b, v16.16b, v17.16b
eor v7.16b, v7.16b, v8.16b
shl v8.2d, v8.2d, #4
eor v6.16b, v6.16b, v9.16b
shl v9.2d, v9.2d, #4
eor v5.16b, v5.16b, v10.16b
shl v10.2d, v10.2d, #4
eor v4.16b, v4.16b, v16.16b
shl v16.2d, v16.2d, #4
eor v3.16b, v3.16b, v8.16b
eor v2.16b, v2.16b, v9.16b
eor v1.16b, v1.16b, v10.16b
eor v0.16b, v0.16b, v16.16b
b .Ldec_sbox
.align 4
.Ldec_loop:
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
ldp q8, q9, [x9], #32
eor v0.16b, v16.16b, v0.16b
ldr q10, [x9], #16
eor v1.16b, v17.16b, v1.16b
ldr q16, [x9], #16
eor v2.16b, v18.16b, v2.16b
eor v3.16b, v19.16b, v3.16b
eor v4.16b, v8.16b, v4.16b
eor v5.16b, v9.16b, v5.16b
eor v6.16b, v10.16b, v6.16b
eor v7.16b, v16.16b, v7.16b
tbl v0.16b, {v0.16b}, v28.16b
tbl v1.16b, {v1.16b}, v28.16b
tbl v2.16b, {v2.16b}, v28.16b
tbl v3.16b, {v3.16b}, v28.16b
tbl v4.16b, {v4.16b}, v28.16b
tbl v5.16b, {v5.16b}, v28.16b
tbl v6.16b, {v6.16b}, v28.16b
tbl v7.16b, {v7.16b}, v28.16b
.Ldec_sbox:
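// Inverse S-box, computed as a Boolean circuit (AND/OR/XOR/BSL) over
// the bit-sliced state; one pass substitutes all 128 bytes at once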
eor v1.16b, v1.16b, v4.16b
eor v3.16b, v3.16b, v4.16b
subs x10, x10, #1
eor v4.16b, v4.16b, v7.16b
eor v2.16b, v2.16b, v7.16b
eor v1.16b, v1.16b, v6.16b
eor v6.16b, v6.16b, v4.16b
eor v2.16b, v2.16b, v5.16b
eor v0.16b, v0.16b, v1.16b
eor v7.16b, v7.16b, v6.16b
eor v8.16b, v6.16b, v2.16b
and v9.16b, v4.16b, v6.16b
eor v10.16b, v2.16b, v6.16b
eor v3.16b, v3.16b, v0.16b
eor v5.16b, v5.16b, v0.16b
eor v16.16b, v7.16b, v4.16b
eor v17.16b, v4.16b, v0.16b
and v18.16b, v0.16b, v2.16b
eor v19.16b, v7.16b, v4.16b
eor v1.16b, v1.16b, v3.16b
eor v20.16b, v3.16b, v0.16b
eor v21.16b, v5.16b, v2.16b
eor v22.16b, v3.16b, v7.16b
and v8.16b, v17.16b, v8.16b
orr v17.16b, v3.16b, v5.16b
eor v23.16b, v1.16b, v6.16b
eor v24.16b, v20.16b, v16.16b
eor v25.16b, v1.16b, v5.16b
orr v26.16b, v20.16b, v21.16b
and v20.16b, v20.16b, v21.16b
and v27.16b, v7.16b, v1.16b
eor v21.16b, v21.16b, v23.16b
orr v28.16b, v16.16b, v23.16b
orr v29.16b, v22.16b, v25.16b
eor v26.16b, v26.16b, v8.16b
and v16.16b, v16.16b, v23.16b
and v22.16b, v22.16b, v25.16b
and v21.16b, v24.16b, v21.16b
eor v8.16b, v28.16b, v8.16b
eor v23.16b, v5.16b, v2.16b
eor v24.16b, v1.16b, v6.16b
eor v16.16b, v16.16b, v22.16b
eor v22.16b, v3.16b, v0.16b
eor v25.16b, v29.16b, v21.16b
eor v21.16b, v26.16b, v21.16b
eor v8.16b, v8.16b, v20.16b
eor v26.16b, v23.16b, v24.16b
eor v16.16b, v16.16b, v20.16b
eor v28.16b, v22.16b, v19.16b
eor v20.16b, v25.16b, v20.16b
eor v9.16b, v21.16b, v9.16b
eor v8.16b, v8.16b, v18.16b
eor v18.16b, v5.16b, v1.16b
eor v21.16b, v16.16b, v17.16b
eor v16.16b, v16.16b, v17.16b
eor v17.16b, v20.16b, v27.16b
eor v20.16b, v3.16b, v7.16b
eor v25.16b, v9.16b, v8.16b
eor v27.16b, v0.16b, v4.16b
and v29.16b, v9.16b, v17.16b
eor v30.16b, v8.16b, v29.16b
eor v31.16b, v21.16b, v29.16b
eor v29.16b, v21.16b, v29.16b
bsl v30.16b, v17.16b, v21.16b
bsl v31.16b, v9.16b, v8.16b
bsl v16.16b, v30.16b, v29.16b
bsl v21.16b, v29.16b, v30.16b
eor v8.16b, v31.16b, v30.16b
and v1.16b, v1.16b, v31.16b
and v9.16b, v16.16b, v31.16b
and v6.16b, v6.16b, v30.16b
eor v16.16b, v17.16b, v21.16b
and v4.16b, v4.16b, v30.16b
eor v17.16b, v8.16b, v30.16b
and v21.16b, v24.16b, v8.16b
eor v9.16b, v9.16b, v25.16b
and v19.16b, v19.16b, v8.16b
eor v24.16b, v30.16b, v16.16b
eor v25.16b, v30.16b, v16.16b
and v7.16b, v7.16b, v17.16b
and v10.16b, v10.16b, v16.16b
eor v29.16b, v9.16b, v16.16b
eor v30.16b, v31.16b, v9.16b
and v0.16b, v24.16b, v0.16b
and v9.16b, v18.16b, v9.16b
and v2.16b, v25.16b, v2.16b
eor v10.16b, v10.16b, v6.16b
eor v18.16b, v29.16b, v16.16b
and v5.16b, v30.16b, v5.16b
eor v24.16b, v8.16b, v29.16b
and v25.16b, v26.16b, v29.16b
and v26.16b, v28.16b, v29.16b
eor v8.16b, v8.16b, v29.16b
eor v17.16b, v17.16b, v18.16b
eor v5.16b, v1.16b, v5.16b
and v23.16b, v24.16b, v23.16b
eor v21.16b, v21.16b, v25.16b
eor v19.16b, v19.16b, v26.16b
eor v0.16b, v4.16b, v0.16b
and v3.16b, v17.16b, v3.16b
eor v1.16b, v9.16b, v1.16b
eor v9.16b, v25.16b, v23.16b
eor v5.16b, v5.16b, v21.16b
eor v2.16b, v6.16b, v2.16b
and v6.16b, v8.16b, v22.16b
eor v3.16b, v7.16b, v3.16b
and v8.16b, v20.16b, v18.16b
eor v10.16b, v10.16b, v9.16b
eor v0.16b, v0.16b, v19.16b
eor v9.16b, v1.16b, v9.16b
eor v1.16b, v2.16b, v21.16b
eor v3.16b, v3.16b, v19.16b
and v16.16b, v27.16b, v16.16b
eor v17.16b, v26.16b, v6.16b
eor v6.16b, v8.16b, v7.16b
eor v7.16b, v1.16b, v9.16b
eor v1.16b, v5.16b, v3.16b
eor v2.16b, v10.16b, v3.16b
eor v4.16b, v16.16b, v4.16b
eor v8.16b, v6.16b, v17.16b
eor v5.16b, v9.16b, v3.16b
eor v9.16b, v0.16b, v1.16b
eor v6.16b, v7.16b, v1.16b
eor v0.16b, v4.16b, v17.16b
eor v4.16b, v8.16b, v7.16b
eor v7.16b, v9.16b, v2.16b
eor v8.16b, v3.16b, v0.16b
eor v7.16b, v7.16b, v5.16b
eor v3.16b, v4.16b, v7.16b
eor v4.16b, v7.16b, v0.16b
eor v7.16b, v8.16b, v3.16b
bcc .Ldec_done
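// Middle rounds: inverse MixColumns, implemented with byte rotations
// (ext) and XORs on the bit-sliced state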
ext v8.16b, v0.16b, v0.16b, #8
ext v9.16b, v1.16b, v1.16b, #8
ldr q28, [x11] // load from .LISR in common case (x10 > 0)
ext v10.16b, v6.16b, v6.16b, #8
ext v16.16b, v3.16b, v3.16b, #8
ext v17.16b, v5.16b, v5.16b, #8
ext v18.16b, v4.16b, v4.16b, #8
eor v8.16b, v8.16b, v0.16b
eor v9.16b, v9.16b, v1.16b
eor v10.16b, v10.16b, v6.16b
eor v16.16b, v16.16b, v3.16b
eor v17.16b, v17.16b, v5.16b
ext v19.16b, v2.16b, v2.16b, #8
ext v20.16b, v7.16b, v7.16b, #8
eor v18.16b, v18.16b, v4.16b
eor v6.16b, v6.16b, v8.16b
eor v8.16b, v2.16b, v10.16b
eor v4.16b, v4.16b, v9.16b
eor v2.16b, v19.16b, v2.16b
eor v9.16b, v20.16b, v7.16b
eor v0.16b, v0.16b, v16.16b
eor v1.16b, v1.16b, v16.16b
eor v6.16b, v6.16b, v17.16b
eor v8.16b, v8.16b, v16.16b
eor v7.16b, v7.16b, v18.16b
eor v4.16b, v4.16b, v16.16b
eor v2.16b, v3.16b, v2.16b
eor v1.16b, v1.16b, v17.16b
eor v3.16b, v5.16b, v9.16b
eor v5.16b, v8.16b, v17.16b
eor v7.16b, v7.16b, v17.16b
ext v8.16b, v0.16b, v0.16b, #12
ext v9.16b, v6.16b, v6.16b, #12
ext v10.16b, v4.16b, v4.16b, #12
ext v16.16b, v1.16b, v1.16b, #12
ext v17.16b, v5.16b, v5.16b, #12
ext v18.16b, v7.16b, v7.16b, #12
eor v0.16b, v0.16b, v8.16b
eor v6.16b, v6.16b, v9.16b
eor v4.16b, v4.16b, v10.16b
ext v19.16b, v2.16b, v2.16b, #12
ext v20.16b, v3.16b, v3.16b, #12
eor v1.16b, v1.16b, v16.16b
eor v5.16b, v5.16b, v17.16b
eor v7.16b, v7.16b, v18.16b
eor v2.16b, v2.16b, v19.16b
eor v16.16b, v16.16b, v0.16b
eor v3.16b, v3.16b, v20.16b
eor v17.16b, v17.16b, v4.16b
eor v10.16b, v10.16b, v6.16b
ext v0.16b, v0.16b, v0.16b, #8
eor v9.16b, v9.16b, v1.16b
ext v1.16b, v1.16b, v1.16b, #8
eor v8.16b, v8.16b, v3.16b
eor v16.16b, v16.16b, v3.16b
eor v18.16b, v18.16b, v5.16b
eor v19.16b, v19.16b, v7.16b
ext v21.16b, v5.16b, v5.16b, #8
ext v5.16b, v7.16b, v7.16b, #8
eor v7.16b, v20.16b, v2.16b
ext v4.16b, v4.16b, v4.16b, #8
ext v20.16b, v3.16b, v3.16b, #8
eor v17.16b, v17.16b, v3.16b
ext v2.16b, v2.16b, v2.16b, #8
eor v3.16b, v10.16b, v3.16b
ext v10.16b, v6.16b, v6.16b, #8
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v16.16b
eor v5.16b, v5.16b, v18.16b
eor v3.16b, v3.16b, v4.16b
eor v7.16b, v20.16b, v7.16b
eor v6.16b, v2.16b, v19.16b
eor v4.16b, v21.16b, v17.16b
eor v2.16b, v10.16b, v9.16b
bne .Ldec_loop
ldr q28, [x11, #16]! // load from .LISRM0 on last round (x10 == 0)
b .Ldec_loop
.align 4
.Ldec_done:
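// Convert back from bit-sliced form (inverse of the entry transpose)
// and XOR in the final round key, loaded into q10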
ushr v8.2d, v0.2d, #1
movi v9.16b, #0x55
ldr q10, [x9]
ushr v16.2d, v2.2d, #1
movi v17.16b, #0x33
ushr v18.2d, v6.2d, #1
movi v19.16b, #0x0f
eor v8.16b, v8.16b, v1.16b
ushr v20.2d, v3.2d, #1
eor v16.16b, v16.16b, v7.16b
eor v18.16b, v18.16b, v4.16b
and v8.16b, v8.16b, v9.16b
eor v20.16b, v20.16b, v5.16b
and v16.16b, v16.16b, v9.16b
and v18.16b, v18.16b, v9.16b
shl v21.2d, v8.2d, #1
eor v1.16b, v1.16b, v8.16b
and v8.16b, v20.16b, v9.16b
eor v7.16b, v7.16b, v16.16b
shl v9.2d, v16.2d, #1
eor v4.16b, v4.16b, v18.16b
shl v16.2d, v18.2d, #1
eor v0.16b, v0.16b, v21.16b
shl v18.2d, v8.2d, #1
eor v5.16b, v5.16b, v8.16b
eor v2.16b, v2.16b, v9.16b
eor v6.16b, v6.16b, v16.16b
ushr v8.2d, v1.2d, #2
eor v3.16b, v3.16b, v18.16b
ushr v9.2d, v0.2d, #2
ushr v16.2d, v7.2d, #2
ushr v18.2d, v2.2d, #2
eor v8.16b, v8.16b, v4.16b
eor v9.16b, v9.16b, v6.16b
eor v16.16b, v16.16b, v5.16b
eor v18.16b, v18.16b, v3.16b
and v8.16b, v8.16b, v17.16b
and v9.16b, v9.16b, v17.16b
and v16.16b, v16.16b, v17.16b
and v17.16b, v18.16b, v17.16b
eor v4.16b, v4.16b, v8.16b
shl v8.2d, v8.2d, #2
eor v6.16b, v6.16b, v9.16b
shl v9.2d, v9.2d, #2
eor v5.16b, v5.16b, v16.16b
shl v16.2d, v16.2d, #2
eor v3.16b, v3.16b, v17.16b
shl v17.2d, v17.2d, #2
eor v1.16b, v1.16b, v8.16b
eor v0.16b, v0.16b, v9.16b
eor v7.16b, v7.16b, v16.16b
eor v2.16b, v2.16b, v17.16b
ushr v8.2d, v4.2d, #4
ushr v9.2d, v6.2d, #4
ushr v16.2d, v1.2d, #4
ushr v17.2d, v0.2d, #4
eor v8.16b, v8.16b, v5.16b
eor v9.16b, v9.16b, v3.16b
eor v16.16b, v16.16b, v7.16b
eor v17.16b, v17.16b, v2.16b
and v8.16b, v8.16b, v19.16b
and v9.16b, v9.16b, v19.16b
and v16.16b, v16.16b, v19.16b
and v17.16b, v17.16b, v19.16b
eor v5.16b, v5.16b, v8.16b
shl v8.2d, v8.2d, #4
eor v3.16b, v3.16b, v9.16b
shl v9.2d, v9.2d, #4
eor v7.16b, v7.16b, v16.16b
shl v16.2d, v16.2d, #4
eor v2.16b, v2.16b, v17.16b
shl v17.2d, v17.2d, #4
eor v4.16b, v4.16b, v8.16b
eor v6.16b, v6.16b, v9.16b
eor v7.16b, v7.16b, v10.16b
eor v1.16b, v1.16b, v16.16b
eor v2.16b, v2.16b, v10.16b
eor v0.16b, v0.16b, v17.16b
eor v4.16b, v4.16b, v10.16b
eor v6.16b, v6.16b, v10.16b
eor v3.16b, v3.16b, v10.16b
eor v5.16b, v5.16b, v10.16b
eor v1.16b, v1.16b, v10.16b
eor v0.16b, v0.16b, v10.16b
ret
.size _bsaes_decrypt8,.-_bsaes_decrypt8
.type _bsaes_const,%object
.align 6
_bsaes_const:
// InvShiftRows constants
// Used in _bsaes_decrypt8, which assumes contiguity
// .LM0ISR used with round 0 key
// .LISR used with middle round keys
// .LISRM0 used with final round key
.LM0ISR:
.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
.LISR:
.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
.LISRM0:
.quad 0x01040b0e0205080f, 0x0306090c00070a0d
// ShiftRows constants
// Used in _bsaes_encrypt8, which assumes contiguity
// .LM0SR used with round 0 key
// .LSR used with middle round keys
// .LSRM0 used with final round key
.LM0SR:
.quad 0x0a0e02060f03070b, 0x0004080c05090d01
.LSR:
.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
.quad 0x0304090e00050a0f, 0x01060b0c0207080d
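// Byte permutations applied by _bsaes_key_convert to each round key
// as it is loaded (tbl indices; variant chosen by target endianness)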
.LM0_bigendian:
.quad 0x02060a0e03070b0f, 0x0004080c0105090d
.LM0_littleendian:
.quad 0x0105090d0004080c, 0x03070b0f02060a0e
// Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into
// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
.LREVM0SR:
.quad 0x090d01050c000408, 0x03070b0f060a0e02
.align 6
.size _bsaes_const,.-_bsaes_const
.type _bsaes_encrypt8,%function
.align 4
// On entry:
// x9 -> key (previously expanded using _bsaes_key_convert)
// x10 = number of rounds
// v0-v7 input data
// On exit:
// x9-x11 corrupted
// other general-purpose registers preserved
// v0-v7 output data
// v11-v15 preserved
// other SIMD registers corrupted
_bsaes_encrypt8:
ldr q8, [x9], #16
adr x11, .LM0SR
ldr q9, [x11], #16
_bsaes_encrypt8_alt:
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v8.16b
sub x10, x10, #1
eor v2.16b, v2.16b, v8.16b
eor v4.16b, v4.16b, v8.16b
eor v3.16b, v3.16b, v8.16b
eor v5.16b, v5.16b, v8.16b
tbl v0.16b, {v0.16b}, v9.16b
tbl v1.16b, {v1.16b}, v9.16b
tbl v2.16b, {v2.16b}, v9.16b
tbl v4.16b, {v4.16b}, v9.16b
eor v6.16b, v6.16b, v8.16b
eor v7.16b, v7.16b, v8.16b
tbl v3.16b, {v3.16b}, v9.16b
tbl v5.16b, {v5.16b}, v9.16b
tbl v6.16b, {v6.16b}, v9.16b
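// Bit-slice transpose of the eight blocks, as in _bsaes_decrypt8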
ushr v8.2d, v0.2d, #1
movi v10.16b, #0x55
tbl v7.16b, {v7.16b}, v9.16b
ushr v9.2d, v4.2d, #1
movi v16.16b, #0x33
ushr v17.2d, v2.2d, #1
eor v8.16b, v8.16b, v1.16b
movi v18.16b, #0x0f
ushr v19.2d, v6.2d, #1
eor v9.16b, v9.16b, v5.16b
eor v17.16b, v17.16b, v3.16b
and v8.16b, v8.16b, v10.16b
eor v19.16b, v19.16b, v7.16b
and v9.16b, v9.16b, v10.16b
and v17.16b, v17.16b, v10.16b
eor v1.16b, v1.16b, v8.16b
shl v8.2d, v8.2d, #1
and v10.16b, v19.16b, v10.16b
eor v5.16b, v5.16b, v9.16b
shl v9.2d, v9.2d, #1
eor v3.16b, v3.16b, v17.16b
shl v17.2d, v17.2d, #1
eor v0.16b, v0.16b, v8.16b
shl v8.2d, v10.2d, #1
eor v7.16b, v7.16b, v10.16b
eor v4.16b, v4.16b, v9.16b
eor v2.16b, v2.16b, v17.16b
ushr v9.2d, v1.2d, #2
eor v6.16b, v6.16b, v8.16b
ushr v8.2d, v0.2d, #2
ushr v10.2d, v5.2d, #2
ushr v17.2d, v4.2d, #2
eor v9.16b, v9.16b, v3.16b
eor v8.16b, v8.16b, v2.16b
eor v10.16b, v10.16b, v7.16b
eor v17.16b, v17.16b, v6.16b
and v9.16b, v9.16b, v16.16b
and v8.16b, v8.16b, v16.16b
and v10.16b, v10.16b, v16.16b
and v16.16b, v17.16b, v16.16b
eor v3.16b, v3.16b, v9.16b
shl v9.2d, v9.2d, #2
eor v2.16b, v2.16b, v8.16b
shl v8.2d, v8.2d, #2
eor v7.16b, v7.16b, v10.16b
shl v10.2d, v10.2d, #2
eor v6.16b, v6.16b, v16.16b
shl v16.2d, v16.2d, #2
eor v1.16b, v1.16b, v9.16b
eor v0.16b, v0.16b, v8.16b
eor v5.16b, v5.16b, v10.16b
eor v4.16b, v4.16b, v16.16b
ushr v8.2d, v3.2d, #4
ushr v9.2d, v2.2d, #4
ushr v10.2d, v1.2d, #4
ushr v16.2d, v0.2d, #4
eor v8.16b, v8.16b, v7.16b
eor v9.16b, v9.16b, v6.16b
eor v10.16b, v10.16b, v5.16b
eor v16.16b, v16.16b, v4.16b
and v8.16b, v8.16b, v18.16b
and v9.16b, v9.16b, v18.16b
and v10.16b, v10.16b, v18.16b
and v16.16b, v16.16b, v18.16b
eor v7.16b, v7.16b, v8.16b
shl v8.2d, v8.2d, #4
eor v6.16b, v6.16b, v9.16b
shl v9.2d, v9.2d, #4
eor v5.16b, v5.16b, v10.16b
shl v10.2d, v10.2d, #4
eor v4.16b, v4.16b, v16.16b
shl v16.2d, v16.2d, #4
eor v3.16b, v3.16b, v8.16b
eor v2.16b, v2.16b, v9.16b
eor v1.16b, v1.16b, v10.16b
eor v0.16b, v0.16b, v16.16b
b .Lenc_sbox
.align 4
.Lenc_loop:
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
ldp q8, q9, [x9], #32
eor v0.16b, v16.16b, v0.16b
ldr q10, [x9], #16
eor v1.16b, v17.16b, v1.16b
ldr q16, [x9], #16
eor v2.16b, v18.16b, v2.16b
eor v3.16b, v19.16b, v3.16b
eor v4.16b, v8.16b, v4.16b
eor v5.16b, v9.16b, v5.16b
eor v6.16b, v10.16b, v6.16b
eor v7.16b, v16.16b, v7.16b
tbl v0.16b, {v0.16b}, v28.16b
tbl v1.16b, {v1.16b}, v28.16b
tbl v2.16b, {v2.16b}, v28.16b
tbl v3.16b, {v3.16b}, v28.16b
tbl v4.16b, {v4.16b}, v28.16b
tbl v5.16b, {v5.16b}, v28.16b
tbl v6.16b, {v6.16b}, v28.16b
tbl v7.16b, {v7.16b}, v28.16b
.Lenc_sbox:
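// Forward S-box, computed as a Boolean circuit (AND/OR/XOR/BSL) over
// the bit-sliced state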
eor v5.16b, v5.16b, v6.16b
eor v3.16b, v3.16b, v0.16b
subs x10, x10, #1
eor v2.16b, v2.16b, v1.16b
eor v5.16b, v5.16b, v0.16b
eor v8.16b, v3.16b, v7.16b
eor v6.16b, v6.16b, v2.16b
eor v7.16b, v7.16b, v5.16b
eor v8.16b, v8.16b, v4.16b
eor v3.16b, v6.16b, v3.16b
eor v4.16b, v4.16b, v5.16b
eor v6.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v7.16b
eor v1.16b, v8.16b, v1.16b
eor v8.16b, v7.16b, v4.16b
eor v9.16b, v3.16b, v0.16b
eor v10.16b, v7.16b, v6.16b
eor v16.16b, v5.16b, v3.16b
eor v17.16b, v6.16b, v2.16b
eor v18.16b, v5.16b, v1.16b
eor v19.16b, v2.16b, v4.16b
eor v20.16b, v1.16b, v0.16b
orr v21.16b, v8.16b, v9.16b
orr v22.16b, v10.16b, v16.16b
eor v23.16b, v8.16b, v17.16b
eor v24.16b, v9.16b, v18.16b
and v19.16b, v19.16b, v20.16b
orr v20.16b, v17.16b, v18.16b
and v8.16b, v8.16b, v9.16b
and v9.16b, v17.16b, v18.16b
and v17.16b, v23.16b, v24.16b
and v10.16b, v10.16b, v16.16b
eor v16.16b, v21.16b, v19.16b
eor v18.16b, v20.16b, v19.16b
and v19.16b, v2.16b, v1.16b
and v20.16b, v6.16b, v5.16b
eor v21.16b, v22.16b, v17.16b
eor v9.16b, v9.16b, v10.16b
eor v10.16b, v16.16b, v17.16b
eor v16.16b, v18.16b, v8.16b
and v17.16b, v4.16b, v0.16b
orr v18.16b, v7.16b, v3.16b
eor v21.16b, v21.16b, v8.16b
eor v8.16b, v9.16b, v8.16b
eor v9.16b, v10.16b, v19.16b
eor v10.16b, v3.16b, v0.16b
eor v16.16b, v16.16b, v17.16b
eor v17.16b, v5.16b, v1.16b
eor v19.16b, v21.16b, v20.16b
eor v20.16b, v8.16b, v18.16b
eor v8.16b, v8.16b, v18.16b
eor v18.16b, v7.16b, v4.16b
eor v21.16b, v9.16b, v16.16b
eor v22.16b, v6.16b, v2.16b
and v23.16b, v9.16b, v19.16b
eor v24.16b, v10.16b, v17.16b
eor v25.16b, v0.16b, v1.16b
eor v26.16b, v7.16b, v6.16b
eor v27.16b, v18.16b, v22.16b
eor v28.16b, v3.16b, v5.16b
eor v29.16b, v16.16b, v23.16b
eor v30.16b, v20.16b, v23.16b
eor v23.16b, v20.16b, v23.16b
eor v31.16b, v4.16b, v2.16b
bsl v29.16b, v19.16b, v20.16b
bsl v30.16b, v9.16b, v16.16b
bsl v8.16b, v29.16b, v23.16b
bsl v20.16b, v23.16b, v29.16b
eor v9.16b, v30.16b, v29.16b
and v5.16b, v5.16b, v30.16b
and v8.16b, v8.16b, v30.16b
and v1.16b, v1.16b, v29.16b
eor v16.16b, v19.16b, v20.16b
and v2.16b, v2.16b, v29.16b
eor v19.16b, v9.16b, v29.16b
and v17.16b, v17.16b, v9.16b
eor v8.16b, v8.16b, v21.16b
and v20.16b, v22.16b, v9.16b
eor v21.16b, v29.16b, v16.16b
eor v22.16b, v29.16b, v16.16b
and v23.16b, v25.16b, v16.16b
and v6.16b, v6.16b, v19.16b
eor v25.16b, v8.16b, v16.16b
eor v29.16b, v30.16b, v8.16b
and v4.16b, v21.16b, v4.16b
and v8.16b, v28.16b, v8.16b
and v0.16b, v22.16b, v0.16b
eor v21.16b, v23.16b, v1.16b
eor v22.16b, v9.16b, v25.16b
eor v9.16b, v9.16b, v25.16b
eor v23.16b, v25.16b, v16.16b
and v3.16b, v29.16b, v3.16b
and v24.16b, v24.16b, v25.16b
and v25.16b, v27.16b, v25.16b
and v10.16b, v22.16b, v10.16b
and v9.16b, v9.16b, v18.16b
eor v18.16b, v19.16b, v23.16b
and v19.16b, v26.16b, v23.16b
eor v3.16b, v5.16b, v3.16b
eor v17.16b, v17.16b, v24.16b
eor v10.16b, v24.16b, v10.16b
and v16.16b, v31.16b, v16.16b
eor v20.16b, v20.16b, v25.16b
eor v9.16b, v25.16b, v9.16b
eor v4.16b, v2.16b, v4.16b
and v7.16b, v18.16b, v7.16b
eor v18.16b, v19.16b, v6.16b
eor v5.16b, v8.16b, v5.16b
eor v0.16b, v1.16b, v0.16b
eor v1.16b, v21.16b, v10.16b
eor v8.16b, v3.16b, v17.16b
eor v2.16b, v16.16b, v2.16b
eor v3.16b, v6.16b, v7.16b
eor v6.16b, v18.16b, v9.16b
eor v4.16b, v4.16b, v20.16b
eor v10.16b, v5.16b, v10.16b
eor v0.16b, v0.16b, v17.16b
eor v9.16b, v2.16b, v9.16b
eor v3.16b, v3.16b, v20.16b
eor v7.16b, v6.16b, v1.16b
eor v5.16b, v8.16b, v4.16b
eor v6.16b, v10.16b, v1.16b
eor v2.16b, v4.16b, v0.16b
eor v4.16b, v3.16b, v10.16b
eor v9.16b, v9.16b, v7.16b
eor v3.16b, v0.16b, v5.16b
eor v0.16b, v1.16b, v4.16b
eor v1.16b, v4.16b, v8.16b
eor v4.16b, v9.16b, v5.16b
eor v6.16b, v6.16b, v3.16b
bcc .Lenc_done
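// Middle rounds: MixColumns, implemented with byte rotations (ext)
// and XORs on the bit-sliced state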
ext v8.16b, v0.16b, v0.16b, #12
ext v9.16b, v4.16b, v4.16b, #12
ldr q28, [x11]
ext v10.16b, v6.16b, v6.16b, #12
ext v16.16b, v1.16b, v1.16b, #12
ext v17.16b, v3.16b, v3.16b, #12
ext v18.16b, v7.16b, v7.16b, #12
eor v0.16b, v0.16b, v8.16b
eor v4.16b, v4.16b, v9.16b
eor v6.16b, v6.16b, v10.16b
ext v19.16b, v2.16b, v2.16b, #12
ext v20.16b, v5.16b, v5.16b, #12
eor v1.16b, v1.16b, v16.16b
eor v3.16b, v3.16b, v17.16b
eor v7.16b, v7.16b, v18.16b
eor v2.16b, v2.16b, v19.16b
eor v16.16b, v16.16b, v0.16b
eor v5.16b, v5.16b, v20.16b
eor v17.16b, v17.16b, v6.16b
eor v10.16b, v10.16b, v4.16b
ext v0.16b, v0.16b, v0.16b, #8
eor v9.16b, v9.16b, v1.16b
ext v1.16b, v1.16b, v1.16b, #8
eor v8.16b, v8.16b, v5.16b
eor v16.16b, v16.16b, v5.16b
eor v18.16b, v18.16b, v3.16b
eor v19.16b, v19.16b, v7.16b
ext v3.16b, v3.16b, v3.16b, #8
ext v7.16b, v7.16b, v7.16b, #8
eor v20.16b, v20.16b, v2.16b
ext v6.16b, v6.16b, v6.16b, #8
ext v21.16b, v5.16b, v5.16b, #8
eor v17.16b, v17.16b, v5.16b
ext v2.16b, v2.16b, v2.16b, #8
eor v10.16b, v10.16b, v5.16b
ext v22.16b, v4.16b, v4.16b, #8
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v16.16b
eor v5.16b, v7.16b, v18.16b
eor v4.16b, v3.16b, v17.16b
eor v3.16b, v6.16b, v10.16b
eor v7.16b, v21.16b, v20.16b
eor v6.16b, v2.16b, v19.16b
eor v2.16b, v22.16b, v9.16b
bne .Lenc_loop
ldr q28, [x11, #16]! // load from .LSRM0 on last round (x10 == 0)
b .Lenc_loop
.align 4
.Lenc_done:
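// Convert back from bit-sliced form and XOR in the final round key,
// loaded into q10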
ushr v8.2d, v0.2d, #1
movi v9.16b, #0x55
ldr q10, [x9]
ushr v16.2d, v3.2d, #1
movi v17.16b, #0x33
ushr v18.2d, v4.2d, #1
movi v19.16b, #0x0f
eor v8.16b, v8.16b, v1.16b
ushr v20.2d, v2.2d, #1
eor v16.16b, v16.16b, v7.16b
eor v18.16b, v18.16b, v6.16b
and v8.16b, v8.16b, v9.16b
eor v20.16b, v20.16b, v5.16b
and v16.16b, v16.16b, v9.16b
and v18.16b, v18.16b, v9.16b
shl v21.2d, v8.2d, #1
eor v1.16b, v1.16b, v8.16b
and v8.16b, v20.16b, v9.16b
eor v7.16b, v7.16b, v16.16b
shl v9.2d, v16.2d, #1
eor v6.16b, v6.16b, v18.16b
shl v16.2d, v18.2d, #1
eor v0.16b, v0.16b, v21.16b
shl v18.2d, v8.2d, #1
eor v5.16b, v5.16b, v8.16b
eor v3.16b, v3.16b, v9.16b
eor v4.16b, v4.16b, v16.16b
ushr v8.2d, v1.2d, #2
eor v2.16b, v2.16b, v18.16b
ushr v9.2d, v0.2d, #2
ushr v16.2d, v7.2d, #2
ushr v18.2d, v3.2d, #2
eor v8.16b, v8.16b, v6.16b
eor v9.16b, v9.16b, v4.16b
eor v16.16b, v16.16b, v5.16b
eor v18.16b, v18.16b, v2.16b
and v8.16b, v8.16b, v17.16b
and v9.16b, v9.16b, v17.16b
and v16.16b, v16.16b, v17.16b
and v17.16b, v18.16b, v17.16b
eor v6.16b, v6.16b, v8.16b
shl v8.2d, v8.2d, #2
eor v4.16b, v4.16b, v9.16b
shl v9.2d, v9.2d, #2
eor v5.16b, v5.16b, v16.16b
shl v16.2d, v16.2d, #2
eor v2.16b, v2.16b, v17.16b
shl v17.2d, v17.2d, #2
eor v1.16b, v1.16b, v8.16b
eor v0.16b, v0.16b, v9.16b
eor v7.16b, v7.16b, v16.16b
eor v3.16b, v3.16b, v17.16b
ushr v8.2d, v6.2d, #4
ushr v9.2d, v4.2d, #4
ushr v16.2d, v1.2d, #4
ushr v17.2d, v0.2d, #4
eor v8.16b, v8.16b, v5.16b
eor v9.16b, v9.16b, v2.16b
eor v16.16b, v16.16b, v7.16b
eor v17.16b, v17.16b, v3.16b
and v8.16b, v8.16b, v19.16b
and v9.16b, v9.16b, v19.16b
and v16.16b, v16.16b, v19.16b
and v17.16b, v17.16b, v19.16b
eor v5.16b, v5.16b, v8.16b
shl v8.2d, v8.2d, #4
eor v2.16b, v2.16b, v9.16b
shl v9.2d, v9.2d, #4
eor v7.16b, v7.16b, v16.16b
shl v16.2d, v16.2d, #4
eor v3.16b, v3.16b, v17.16b
shl v17.2d, v17.2d, #4
eor v6.16b, v6.16b, v8.16b
eor v4.16b, v4.16b, v9.16b
eor v7.16b, v7.16b, v10.16b
eor v1.16b, v1.16b, v16.16b
eor v3.16b, v3.16b, v10.16b
eor v0.16b, v0.16b, v17.16b
eor v6.16b, v6.16b, v10.16b
eor v4.16b, v4.16b, v10.16b
eor v2.16b, v2.16b, v10.16b
eor v5.16b, v5.16b, v10.16b
eor v1.16b, v1.16b, v10.16b
eor v0.16b, v0.16b, v10.16b
ret
.size _bsaes_encrypt8,.-_bsaes_encrypt8
.type _bsaes_key_convert,%function
.align 4
// On entry:
// x9 -> input key (big-endian)
// x10 = number of rounds
// x17 -> output key (native endianness)
// On exit:
// x9, x10 corrupted
// x11 -> .LM0_bigendian
// x17 -> last quadword of output key
// other general-purpose registers preserved
// v2-v6 preserved
// v7.16b[] = 0x63
// v8-v14 preserved
// v15 = last round key (converted to native endianness)
// other SIMD registers corrupted
_bsaes_key_convert:
#ifdef __AARCH64EL__
adr x11, .LM0_littleendian
#else
adr x11, .LM0_bigendian
#endif
ldr q0, [x9], #16 // load round 0 key
ldr q1, [x11] // .LM0
ldr q15, [x9], #16 // load round 1 key
movi v7.16b, #0x63 // compose .L63
movi v16.16b, #0x01 // bit masks
movi v17.16b, #0x02
movi v18.16b, #0x04
movi v19.16b, #0x08
movi v20.16b, #0x10
movi v21.16b, #0x20
movi v22.16b, #0x40
movi v23.16b, #0x80
#ifdef __AARCH64EL__
rev32 v0.16b, v0.16b
#endif
sub x10, x10, #1
str q0, [x17], #16 // save round 0 key
.align 4
.Lkey_loop:
tbl v0.16b, {v15.16b}, v1.16b
ldr q15, [x9], #16 // load next round key
eor v0.16b, v0.16b, v7.16b
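// Bit-slice the round key: each cmtst broadcasts one bit position of
// every (0x63-adjusted) key byte across a full byte lane, yielding
// eight mask vectors per round key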
cmtst v24.16b, v0.16b, v16.16b
cmtst v25.16b, v0.16b, v17.16b
cmtst v26.16b, v0.16b, v18.16b
cmtst v27.16b, v0.16b, v19.16b
cmtst v28.16b, v0.16b, v20.16b
cmtst v29.16b, v0.16b, v21.16b
cmtst v30.16b, v0.16b, v22.16b
cmtst v31.16b, v0.16b, v23.16b
sub x10, x10, #1
st1 {v24.16b-v27.16b}, [x17], #64 // write bit-sliced round key
st1 {v28.16b-v31.16b}, [x17], #64
cbnz x10, .Lkey_loop
// don't save last round key
#ifdef __AARCH64EL__
rev32 v15.16b, v15.16b
adr x11, .LM0_bigendian
#endif
ret
.size _bsaes_key_convert,.-_bsaes_key_convert
.globl ossl_bsaes_cbc_encrypt
.type ossl_bsaes_cbc_encrypt,%function
.align 4
// On entry:
// x0 -> input ciphertext
// x1 -> output plaintext
// x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16)
// x3 -> key
// x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call)
// w5 must be == 0
// On exit:
// Output plaintext filled in
// Initialisation vector overwritten with last quadword of ciphertext
// No output registers, usual AAPCS64 register preservation
ossl_bsaes_cbc_encrypt:
cmp x2, #128
bhs .Lcbc_do_bsaes
b AES_cbc_encrypt
.Lcbc_do_bsaes:
// it is up to the caller to make sure we are called with enc == 0
stp x29, x30, [sp, #-48]!
stp d8, d9, [sp, #16]
stp d10, d15, [sp, #32]
lsr x2, x2, #4 // len in 16 byte blocks
ldr w15, [x3, #240] // get # of rounds
mov x14, sp
// allocate the key schedule on the stack
add x17, sp, #96
sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes
// populate the key schedule
mov x9, x3 // pass key
mov x10, x15 // pass # of rounds
mov sp, x17 // sp is sp
bl _bsaes_key_convert
ldr q6, [sp]
str q15, [x17] // save last round key
eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63)
str q6, [sp]
ldr q15, [x4] // load IV
b .Lcbc_dec_loop
.align 4
.Lcbc_dec_loop:
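// Decrypt eight blocks per iteration; input loads are interleaved
// with the chaining XORs and output stores to help scheduling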
subs x2, x2, #0x8
bmi .Lcbc_dec_loop_finish
ldr q0, [x0], #16 // load input
mov x9, sp // pass the key
ldr q1, [x0], #16
mov x10, x15
ldr q2, [x0], #16
ldr q3, [x0], #16
ldr q4, [x0], #16
ldr q5, [x0], #16
ldr q6, [x0], #16
ldr q7, [x0], #-7*16
bl _bsaes_decrypt8
ldr q16, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
eor v1.16b, v1.16b, v16.16b
str q0, [x1], #16 // write output
ldr q0, [x0], #16
str q1, [x1], #16
ldr q1, [x0], #16
eor v1.16b, v4.16b, v1.16b
ldr q4, [x0], #16
eor v2.16b, v2.16b, v4.16b
eor v0.16b, v6.16b, v0.16b
ldr q4, [x0], #16
str q0, [x1], #16
str q1, [x1], #16
eor v0.16b, v7.16b, v4.16b
ldr q1, [x0], #16
str q2, [x1], #16
ldr q2, [x0], #16
ldr q15, [x0], #16
str q0, [x1], #16
eor v0.16b, v5.16b, v2.16b
eor v1.16b, v3.16b, v1.16b
str q1, [x1], #16
str q0, [x1], #16
b .Lcbc_dec_loop
.Lcbc_dec_loop_finish:
adds x2, x2, #8
beq .Lcbc_dec_done
ldr q0, [x0], #16 // load input
cmp x2, #2
blo .Lcbc_dec_one
ldr q1, [x0], #16
mov x9, sp // pass the key
mov x10, x15
beq .Lcbc_dec_two
ldr q2, [x0], #16
cmp x2, #4
blo .Lcbc_dec_three
ldr q3, [x0], #16
beq .Lcbc_dec_four
ldr q4, [x0], #16
cmp x2, #6
blo .Lcbc_dec_five
ldr q5, [x0], #16
beq .Lcbc_dec_six
ldr q6, [x0], #-6*16
bl _bsaes_decrypt8
ldr q5, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
ldr q8, [x0], #16
ldr q9, [x0], #16
ldr q10, [x0], #16
str q0, [x1], #16 // write output
ldr q0, [x0], #16
eor v1.16b, v1.16b, v5.16b
ldr q5, [x0], #16
eor v6.16b, v6.16b, v8.16b
ldr q15, [x0]
eor v4.16b, v4.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
str q1, [x1], #16
eor v0.16b, v7.16b, v0.16b
str q6, [x1], #16
eor v1.16b, v3.16b, v5.16b
str q4, [x1], #16
str q2, [x1], #16
str q0, [x1], #16
str q1, [x1]
b .Lcbc_dec_done
.align 4
.Lcbc_dec_six:
sub x0, x0, #0x60
bl _bsaes_decrypt8
ldr q3, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
ldr q5, [x0], #16
ldr q8, [x0], #16
ldr q9, [x0], #16
str q0, [x1], #16 // write output
ldr q0, [x0], #16
eor v1.16b, v1.16b, v3.16b
ldr q15, [x0]
eor v3.16b, v6.16b, v5.16b
eor v4.16b, v4.16b, v8.16b
eor v2.16b, v2.16b, v9.16b
str q1, [x1], #16
eor v0.16b, v7.16b, v0.16b
str q3, [x1], #16
str q4, [x1], #16
str q2, [x1], #16
str q0, [x1]
b .Lcbc_dec_done
.align 4
.Lcbc_dec_five:
sub x0, x0, #0x50
bl _bsaes_decrypt8
ldr q3, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
ldr q5, [x0], #16
ldr q7, [x0], #16
ldr q8, [x0], #16
str q0, [x1], #16 // write output
ldr q15, [x0]
eor v0.16b, v1.16b, v3.16b
eor v1.16b, v6.16b, v5.16b
eor v3.16b, v4.16b, v7.16b
str q0, [x1], #16
eor v0.16b, v2.16b, v8.16b
str q1, [x1], #16
str q3, [x1], #16
str q0, [x1]
b .Lcbc_dec_done
.align 4
.Lcbc_dec_four:
sub x0, x0, #0x40
bl _bsaes_decrypt8
ldr q2, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
ldr q3, [x0], #16
ldr q5, [x0], #16
str q0, [x1], #16 // write output
ldr q15, [x0]
eor v0.16b, v1.16b, v2.16b
eor v1.16b, v6.16b, v3.16b
eor v2.16b, v4.16b, v5.16b
str q0, [x1], #16
str q1, [x1], #16
str q2, [x1]
b .Lcbc_dec_done
.align 4
.Lcbc_dec_three:
sub x0, x0, #0x30
bl _bsaes_decrypt8
ldr q2, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
ldr q3, [x0], #16
ldr q15, [x0]
str q0, [x1], #16 // write output
eor v0.16b, v1.16b, v2.16b
eor v1.16b, v6.16b, v3.16b
str q0, [x1], #16
str q1, [x1]
b .Lcbc_dec_done
.align 4
.Lcbc_dec_two:
sub x0, x0, #0x20
bl _bsaes_decrypt8
ldr q2, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
ldr q15, [x0]
str q0, [x1], #16 // write output
eor v0.16b, v1.16b, v2.16b
str q0, [x1]
b .Lcbc_dec_done
.align 4
.Lcbc_dec_one:
sub x0, x0, #0x10
stp x1, x4, [sp, #-32]!
str x14, [sp, #16]
mov v8.16b, v15.16b
mov v15.16b, v0.16b
mov x2, x3
bl AES_decrypt
ldr x14, [sp, #16]
ldp x1, x4, [sp], #32
ldr q0, [x1] // load result
eor v0.16b, v0.16b, v8.16b // ^= IV
str q0, [x1] // write output
.align 4
.Lcbc_dec_done:
movi v0.16b, #0
movi v1.16b, #0
.Lcbc_dec_bzero: // wipe key schedule [if any]
stp q0, q1, [sp], #32
cmp sp, x14
bne .Lcbc_dec_bzero
str q15, [x4] // return IV
ldp d8, d9, [sp, #16]
ldp d10, d15, [sp, #32]
ldp x29, x30, [sp], #48
ret
.size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt
.globl ossl_bsaes_ctr32_encrypt_blocks
.type ossl_bsaes_ctr32_encrypt_blocks,%function
.align 4
// On entry:
// x0 -> input text (whole 16-byte blocks)
// x1 -> output text (whole 16-byte blocks)
// x2 = number of 16-byte blocks to encrypt/decrypt (> 0)
// x3 -> key
// x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block
// On exit:
// Output text filled in
// No output registers, usual AAPCS64 register preservation
ossl_bsaes_ctr32_encrypt_blocks:
cmp x2, #8 // use plain AES for
blo .Lctr_enc_short // small sizes
stp x29, x30, [sp, #-80]!
stp d8, d9, [sp, #16]
stp d10, d11, [sp, #32]
stp d12, d13, [sp, #48]
stp d14, d15, [sp, #64]
ldr w15, [x3, #240] // get # of rounds
mov x14, sp
// allocate the key schedule on the stack
add x17, sp, #96
sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes
// populate the key schedule
mov x9, x3 // pass key
mov x10, x15 // pass # of rounds
mov sp, x17 // sp is sp
bl _bsaes_key_convert
eor v7.16b, v7.16b, v15.16b // fix up last round key
str q7, [x17] // save last round key
ldr q0, [x4] // load counter
add x13, x11, #.LREVM0SR-.LM0_bigendian
ldr q4, [sp] // load round0 key
movi v8.4s, #1 // compose 1<<96
movi v9.16b, #0
rev32 v15.16b, v0.16b
rev32 v0.16b, v0.16b
ext v11.16b, v9.16b, v8.16b, #4
rev32 v4.16b, v4.16b
add v12.4s, v11.4s, v11.4s // compose 2<<96
str q4, [sp] // save adjusted round0 key
add v13.4s, v11.4s, v12.4s // compose 3<<96
add v14.4s, v12.4s, v12.4s // compose 4<<96
b .Lctr_enc_loop
.align 4
.Lctr_enc_loop:
// Intermix prologue from _bsaes_encrypt8 to use the opportunity
// to flip byte order in 32-bit counter
add v1.4s, v15.4s, v11.4s // +1
add x9, sp, #0x10 // pass next round key
add v2.4s, v15.4s, v12.4s // +2
ldr q9, [x13] // .LREVM0SR
ldr q8, [sp] // load round0 key
add v3.4s, v15.4s, v13.4s // +3
mov x10, x15 // pass rounds
sub x11, x13, #.LREVM0SR-.LSR // pass constants
add v6.4s, v2.4s, v14.4s
add v4.4s, v15.4s, v14.4s // +4
add v7.4s, v3.4s, v14.4s
add v15.4s, v4.4s, v14.4s // next counter
add v5.4s, v1.4s, v14.4s
bl _bsaes_encrypt8_alt
subs x2, x2, #8
blo .Lctr_enc_loop_done
ldr q16, [x0], #16
ldr q17, [x0], #16
eor v1.16b, v1.16b, v17.16b
ldr q17, [x0], #16
eor v0.16b, v0.16b, v16.16b
eor v4.16b, v4.16b, v17.16b
str q0, [x1], #16
ldr q16, [x0], #16
str q1, [x1], #16
mov v0.16b, v15.16b
str q4, [x1], #16
ldr q1, [x0], #16
eor v4.16b, v6.16b, v16.16b
eor v1.16b, v3.16b, v1.16b
ldr q3, [x0], #16
eor v3.16b, v7.16b, v3.16b
ldr q6, [x0], #16
eor v2.16b, v2.16b, v6.16b
ldr q6, [x0], #16
eor v5.16b, v5.16b, v6.16b
str q4, [x1], #16
str q1, [x1], #16
str q3, [x1], #16
str q2, [x1], #16
str q5, [x1], #16
bne .Lctr_enc_loop
b .Lctr_enc_done
.align 4
.Lctr_enc_loop_done:
add x2, x2, #8
ldr q16, [x0], #16 // load input
eor v0.16b, v0.16b, v16.16b
str q0, [x1], #16 // write output
cmp x2, #2
blo .Lctr_enc_done
ldr q17, [x0], #16
eor v1.16b, v1.16b, v17.16b
str q1, [x1], #16
beq .Lctr_enc_done
ldr q18, [x0], #16
eor v4.16b, v4.16b, v18.16b
str q4, [x1], #16
cmp x2, #4
blo .Lctr_enc_done
ldr q19, [x0], #16
eor v6.16b, v6.16b, v19.16b
str q6, [x1], #16
beq .Lctr_enc_done
ldr q20, [x0], #16
eor v3.16b, v3.16b, v20.16b
str q3, [x1], #16
cmp x2, #6
blo .Lctr_enc_done
ldr q21, [x0], #16
eor v7.16b, v7.16b, v21.16b
str q7, [x1], #16
beq .Lctr_enc_done
ldr q22, [x0]
eor v2.16b, v2.16b, v22.16b
str q2, [x1], #16
.Lctr_enc_done:
movi v0.16b, #0
movi v1.16b, #0
.Lctr_enc_bzero: // wipe key schedule [if any]
stp q0, q1, [sp], #32
cmp sp, x14
bne .Lctr_enc_bzero
ldp d8, d9, [sp, #16]
ldp d10, d11, [sp, #32]
ldp d12, d13, [sp, #48]
ldp d14, d15, [sp, #64]
ldp x29, x30, [sp], #80
ret
.Lctr_enc_short:
stp x29, x30, [sp, #-96]!
stp x19, x20, [sp, #16]
stp x21, x22, [sp, #32]
str x23, [sp, #48]
mov x19, x0 // copy arguments
mov x20, x1
mov x21, x2
mov x22, x3
ldr w23, [x4, #12] // load counter .LSW
ldr q1, [x4] // load whole counter value
#ifdef __AARCH64EL__
rev w23, w23
#endif
str q1, [sp, #80] // copy counter value
.Lctr_enc_short_loop:
add x0, sp, #80 // input counter value
add x1, sp, #64 // output on the stack
mov x2, x22 // key
bl AES_encrypt
ldr q0, [x19], #16 // load input
ldr q1, [sp, #64] // load encrypted counter
add x23, x23, #1
#ifdef __AARCH64EL__
rev w0, w23
str w0, [sp, #80+12] // next counter value
#else
str w23, [sp, #80+12] // next counter value
#endif
eor v0.16b, v0.16b, v1.16b
str q0, [x20], #16 // store output
subs x21, x21, #1
bne .Lctr_enc_short_loop
movi v0.16b, #0
movi v1.16b, #0
stp q0, q1, [sp, #64]
ldr x23, [sp, #48]
ldp x21, x22, [sp, #32]
ldp x19, x20, [sp, #16]
ldp x29, x30, [sp], #96
ret
.size ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks
.globl ossl_bsaes_xts_encrypt
.type ossl_bsaes_xts_encrypt,%function
.align 4
// On entry:
// x0 -> input plaintext
// x1 -> output ciphertext
// x2 = length of text in bytes (must be at least 16)
  1404. // x3 -> key1 (used to encrypt the XORed plaintext blocks)
  1405. // x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
  1406. // x5 -> 16-byte initial vector (typically, sector number)
  1407. // On exit:
  1408. // Output ciphertext filled in
  1409. // No output registers, usual AAPCS64 register preservation
  1410. ossl_bsaes_xts_encrypt:
  1411. // Stack layout:
  1412. // sp ->
  1413. // nrounds*128-96 bytes: key schedule
  1414. // x19 ->
  1415. // 16 bytes: frame record
  1416. // 4*16 bytes: tweak storage across _bsaes_encrypt8
  1417. // 6*8 bytes: storage for 5 callee-saved general-purpose registers
//	8*8 bytes: storage for 8 callee-saved SIMD registers
	stp	x29, x30, [sp, #-192]!
	stp	x19, x20, [sp, #80]
	stp	x21, x22, [sp, #96]
	str	x23, [sp, #112]
	stp	d8, d9, [sp, #128]
	stp	d10, d11, [sp, #144]
	stp	d12, d13, [sp, #160]
	stp	d14, d15, [sp, #176]
	mov	x19, sp
	mov	x20, x0
	mov	x21, x1
	mov	x22, x2
	mov	x23, x3
	// generate initial tweak
	sub	sp, sp, #16
	mov	x0, x5			// iv[]
	mov	x1, sp
	mov	x2, x4			// key2
	bl	AES_encrypt
	ldr	q11, [sp], #16
	ldr	w1, [x23, #240]		// get # of rounds
	// allocate the key schedule on the stack
	add	x17, sp, #96
	sub	x17, x17, x1, lsl #7	// 128 bytes per inner round key, less 96 bytes
	// populate the key schedule
	mov	x9, x23			// pass key
	mov	x10, x1			// pass # of rounds
	mov	sp, x17
	bl	_bsaes_key_convert
	eor	v15.16b, v15.16b, v7.16b // fix up last round key
	str	q15, [x17]		// save last round key
	subs	x22, x22, #0x80
	blo	.Lxts_enc_short
	b	.Lxts_enc_loop
.align 4
.Lxts_enc_loop:
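	// Generate the eight tweaks for this iteration, each the previous
	// one doubled in GF(2^128), interleaved with the plaintext loads.
	// Five tweaks live in v11-v15; the remaining three, plus the first
	// tweak of the next iteration, are spilled to [x2] (== x19 + 16)
	// and reloaded via x0 after _bsaes_encrypt8 returns.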
	ldr	q8, .Lxts_magic
	mov	x10, x1			// pass rounds
	add	x2, x19, #16
	ldr	q0, [x20], #16
	sshr	v1.2d, v11.2d, #63
	mov	x9, sp			// pass key schedule
	ldr	q6, .Lxts_magic+16
	add	v2.2d, v11.2d, v11.2d
	cmtst	v3.2d, v11.2d, v6.2d
	and	v1.16b, v1.16b, v8.16b
	ext	v1.16b, v1.16b, v1.16b, #8
	and	v3.16b, v3.16b, v8.16b
	ldr	q4, [x20], #16
	eor	v12.16b, v2.16b, v1.16b
	eor	v1.16b, v4.16b, v12.16b
	eor	v0.16b, v0.16b, v11.16b
	cmtst	v2.2d, v12.2d, v6.2d
	add	v4.2d, v12.2d, v12.2d
	add	x0, x19, #16
	ext	v3.16b, v3.16b, v3.16b, #8
	and	v2.16b, v2.16b, v8.16b
	eor	v13.16b, v4.16b, v3.16b
	ldr	q3, [x20], #16
	ext	v4.16b, v2.16b, v2.16b, #8
	eor	v2.16b, v3.16b, v13.16b
	ldr	q3, [x20], #16
	add	v5.2d, v13.2d, v13.2d
	cmtst	v7.2d, v13.2d, v6.2d
	and	v7.16b, v7.16b, v8.16b
	ldr	q9, [x20], #16
	ext	v7.16b, v7.16b, v7.16b, #8
	ldr	q10, [x20], #16
	eor	v14.16b, v5.16b, v4.16b
	ldr	q16, [x20], #16
	add	v4.2d, v14.2d, v14.2d
	eor	v3.16b, v3.16b, v14.16b
	eor	v15.16b, v4.16b, v7.16b
	add	v5.2d, v15.2d, v15.2d
	ldr	q7, [x20], #16
	cmtst	v4.2d, v14.2d, v6.2d
	and	v17.16b, v4.16b, v8.16b
	cmtst	v18.2d, v15.2d, v6.2d
	eor	v4.16b, v9.16b, v15.16b
	ext	v9.16b, v17.16b, v17.16b, #8
	eor	v9.16b, v5.16b, v9.16b
	add	v17.2d, v9.2d, v9.2d
	and	v18.16b, v18.16b, v8.16b
	eor	v5.16b, v10.16b, v9.16b
	str	q9, [x2], #16
	ext	v10.16b, v18.16b, v18.16b, #8
	cmtst	v9.2d, v9.2d, v6.2d
	and	v9.16b, v9.16b, v8.16b
	eor	v10.16b, v17.16b, v10.16b
	cmtst	v17.2d, v10.2d, v6.2d
	eor	v6.16b, v16.16b, v10.16b
	str	q10, [x2], #16
	ext	v9.16b, v9.16b, v9.16b, #8
	add	v10.2d, v10.2d, v10.2d
	eor	v9.16b, v10.16b, v9.16b
	str	q9, [x2], #16
	eor	v7.16b, v7.16b, v9.16b
	add	v9.2d, v9.2d, v9.2d
	and	v8.16b, v17.16b, v8.16b
	ext	v8.16b, v8.16b, v8.16b, #8
	eor	v8.16b, v9.16b, v8.16b
	str	q8, [x2]		// next round tweak
	bl	_bsaes_encrypt8
	ldr	q8, [x0], #16
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	ldr	q9, [x0], #16
	eor	v4.16b, v4.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	ldr	q10, [x0], #16
	eor	v3.16b, v3.16b, v15.16b
	subs	x22, x22, #0x80
	str	q0, [x21], #16
	ldr	q11, [x0]		// next round tweak
	str	q1, [x21], #16
	eor	v0.16b, v7.16b, v8.16b
	eor	v1.16b, v2.16b, v9.16b
	str	q4, [x21], #16
	eor	v2.16b, v5.16b, v10.16b
	str	q6, [x21], #16
	str	q3, [x21], #16
	str	q0, [x21], #16
	str	q1, [x21], #16
	str	q2, [x21], #16
	bpl	.Lxts_enc_loop
.Lxts_enc_short:
	adds	x22, x22, #0x70
	bmi	.Lxts_enc_done
	ldr	q8, .Lxts_magic
	sshr	v1.2d, v11.2d, #63
	add	v2.2d, v11.2d, v11.2d
	ldr	q9, .Lxts_magic+16
	subs	x22, x22, #0x10
	ldr	q0, [x20], #16
	and	v1.16b, v1.16b, v8.16b
	cmtst	v3.2d, v11.2d, v9.2d
	ext	v1.16b, v1.16b, v1.16b, #8
	and	v3.16b, v3.16b, v8.16b
	eor	v12.16b, v2.16b, v1.16b
	ext	v1.16b, v3.16b, v3.16b, #8
	add	v2.2d, v12.2d, v12.2d
	cmtst	v3.2d, v12.2d, v9.2d
	eor	v13.16b, v2.16b, v1.16b
	and	v22.16b, v3.16b, v8.16b
	bmi	.Lxts_enc_1
	ext	v2.16b, v22.16b, v22.16b, #8
	add	v3.2d, v13.2d, v13.2d
	ldr	q1, [x20], #16
	cmtst	v4.2d, v13.2d, v9.2d
	subs	x22, x22, #0x10
	eor	v14.16b, v3.16b, v2.16b
	and	v23.16b, v4.16b, v8.16b
	bmi	.Lxts_enc_2
	ext	v3.16b, v23.16b, v23.16b, #8
	add	v4.2d, v14.2d, v14.2d
	ldr	q2, [x20], #16
	cmtst	v5.2d, v14.2d, v9.2d
	eor	v0.16b, v0.16b, v11.16b
	subs	x22, x22, #0x10
	eor	v15.16b, v4.16b, v3.16b
	and	v24.16b, v5.16b, v8.16b
	bmi	.Lxts_enc_3
	ext	v4.16b, v24.16b, v24.16b, #8
	add	v5.2d, v15.2d, v15.2d
	ldr	q3, [x20], #16
	cmtst	v6.2d, v15.2d, v9.2d
	eor	v1.16b, v1.16b, v12.16b
	subs	x22, x22, #0x10
	eor	v16.16b, v5.16b, v4.16b
	and	v25.16b, v6.16b, v8.16b
	bmi	.Lxts_enc_4
	ext	v5.16b, v25.16b, v25.16b, #8
	add	v6.2d, v16.2d, v16.2d
	add	x0, x19, #16
	cmtst	v7.2d, v16.2d, v9.2d
	ldr	q4, [x20], #16
	eor	v2.16b, v2.16b, v13.16b
	str	q16, [x0], #16
	subs	x22, x22, #0x10
	eor	v17.16b, v6.16b, v5.16b
	and	v26.16b, v7.16b, v8.16b
	bmi	.Lxts_enc_5
	ext	v7.16b, v26.16b, v26.16b, #8
	add	v18.2d, v17.2d, v17.2d
	ldr	q5, [x20], #16
	eor	v3.16b, v3.16b, v14.16b
	str	q17, [x0], #16
	subs	x22, x22, #0x10
	eor	v18.16b, v18.16b, v7.16b
	bmi	.Lxts_enc_6
	ldr	q6, [x20], #16
	eor	v4.16b, v4.16b, v15.16b
	eor	v5.16b, v5.16b, v16.16b
	str	q18, [x0]		// next round tweak
	mov	x9, sp			// pass key schedule
	mov	x10, x1			// pass rounds
	add	x0, x19, #16
	sub	x22, x22, #0x10
	eor	v6.16b, v6.16b, v17.16b
	bl	_bsaes_encrypt8
	ldr	q16, [x0], #16
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	ldr	q17, [x0], #16
	eor	v4.16b, v4.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v3.16b, v3.16b, v15.16b
	ldr	q11, [x0]		// next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	eor	v0.16b, v7.16b, v16.16b
	eor	v1.16b, v2.16b, v17.16b
	str	q4, [x21], #16
	str	q6, [x21], #16
	str	q3, [x21], #16
	str	q0, [x21], #16
	str	q1, [x21], #16
	b	.Lxts_enc_done
.align 4
.Lxts_enc_6:
	eor	v4.16b, v4.16b, v15.16b
	eor	v5.16b, v5.16b, v16.16b
	mov	x9, sp			// pass key schedule
	mov	x10, x1			// pass rounds
	add	x0, x19, #16
	bl	_bsaes_encrypt8
	ldr	q16, [x0], #16
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v4.16b, v4.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	ldr	q11, [x0]		// next round tweak
	eor	v3.16b, v3.16b, v15.16b
	str	q0, [x21], #16
	str	q1, [x21], #16
	eor	v0.16b, v7.16b, v16.16b
	str	q4, [x21], #16
	str	q6, [x21], #16
	str	q3, [x21], #16
	str	q0, [x21], #16
	b	.Lxts_enc_done
.align 4
.Lxts_enc_5:
	eor	v3.16b, v3.16b, v14.16b
	eor	v4.16b, v4.16b, v15.16b
	mov	x9, sp			// pass key schedule
	mov	x10, x1			// pass rounds
	add	x0, x19, #16
	bl	_bsaes_encrypt8
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	ldr	q11, [x0]		// next round tweak
	eor	v4.16b, v4.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v3.16b, v3.16b, v15.16b
	str	q0, [x21], #16
	str	q1, [x21], #16
	str	q4, [x21], #16
	str	q6, [x21], #16
	str	q3, [x21], #16
	b	.Lxts_enc_done
.align 4
.Lxts_enc_4:
	eor	v2.16b, v2.16b, v13.16b
	eor	v3.16b, v3.16b, v14.16b
	mov	x9, sp			// pass key schedule
	mov	x10, x1			// pass rounds
	add	x0, x19, #16
	bl	_bsaes_encrypt8
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v4.16b, v4.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	mov	v11.16b, v15.16b	// next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	str	q4, [x21], #16
	str	q6, [x21], #16
	b	.Lxts_enc_done
.align 4
.Lxts_enc_3:
	eor	v1.16b, v1.16b, v12.16b
	eor	v2.16b, v2.16b, v13.16b
	mov	x9, sp			// pass key schedule
	mov	x10, x1			// pass rounds
	add	x0, x19, #16
	bl	_bsaes_encrypt8
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v4.16b, v4.16b, v13.16b
	mov	v11.16b, v14.16b	// next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	str	q4, [x21], #16
	b	.Lxts_enc_done
.align 4
.Lxts_enc_2:
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	mov	x9, sp			// pass key schedule
	mov	x10, x1			// pass rounds
	add	x0, x19, #16
	bl	_bsaes_encrypt8
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	mov	v11.16b, v13.16b	// next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	b	.Lxts_enc_done
.align 4
.Lxts_enc_1:
	eor	v0.16b, v0.16b, v11.16b
	sub	x0, sp, #16
	sub	x1, sp, #16
	mov	x2, x23
	mov	v13.d[0], v11.d[1]	// just in case AES_encrypt corrupts top half of callee-saved SIMD registers
	mov	v14.d[0], v12.d[1]
	str	q0, [sp, #-16]!
	bl	AES_encrypt
	ldr	q0, [sp], #16
	trn1	v13.2d, v11.2d, v13.2d
	trn1	v11.2d, v12.2d, v14.2d	// next round tweak
	eor	v0.16b, v0.16b, v13.16b
	str	q0, [x21], #16
.Lxts_enc_done:
	adds	x22, x22, #0x10
	beq	.Lxts_enc_ret
	sub	x6, x21, #0x10
	// Penultimate plaintext block produces final ciphertext part-block
	// plus remaining part of final plaintext block. Move ciphertext part
	// to final position and reuse penultimate ciphertext block buffer to
	// construct final plaintext block
.Lxts_enc_steal:
	ldrb	w0, [x20], #1
	ldrb	w1, [x21, #-0x10]
	strb	w0, [x21, #-0x10]
	strb	w1, [x21], #1
	subs	x22, x22, #1
	bhi	.Lxts_enc_steal
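	// The byte loop above is standard XTS ciphertext stealing; in C
	// terms (illustrative model; variable names are ours):
	//
	//	/* tail = remaining bytes; block m-1 is the last full block,
	//	 * already encrypted to out + 16 * (m - 1) */
	//	for (size_t i = 0; i < tail; i++) {
	//		out[16 * m + i] = out[16 * (m - 1) + i]; /* move C up */
	//		out[16 * (m - 1) + i] = in[16 * m + i];  /* pull in P tail */
	//	}
	//	/* out + 16 * (m - 1) is then re-encrypted with the last tweak */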
	// Finally encrypt the penultimate ciphertext block using the
	// last tweak
	ldr	q0, [x6]
	eor	v0.16b, v0.16b, v11.16b
	str	q0, [sp, #-16]!
	mov	x0, sp
	mov	x1, sp
	mov	x2, x23
	mov	x21, x6
	mov	v13.d[0], v11.d[1]	// just in case AES_encrypt corrupts top half of callee-saved SIMD registers
	bl	AES_encrypt
	trn1	v11.2d, v11.2d, v13.2d
	ldr	q0, [sp], #16
	eor	v0.16b, v0.16b, v11.16b
	str	q0, [x21]
.Lxts_enc_ret:
	movi	v0.16b, #0
	movi	v1.16b, #0
.Lxts_enc_bzero: // wipe key schedule
	stp	q0, q1, [sp], #32
	cmp	sp, x19
	bne	.Lxts_enc_bzero
	ldp	x19, x20, [sp, #80]
	ldp	x21, x22, [sp, #96]
	ldr	x23, [sp, #112]
	ldp	d8, d9, [sp, #128]
	ldp	d10, d11, [sp, #144]
	ldp	d12, d13, [sp, #160]
	ldp	d14, d15, [sp, #176]
	ldp	x29, x30, [sp], #192
	ret
.size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt
// The assembler doesn't seem capable of de-duplicating these when expressed
// using `ldr qd,=` syntax, so assign a symbolic address
.align 5
.Lxts_magic:
.quad 1, 0x87, 0x4000000000000000, 0x4000000000000000
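// The first pair, {1, 0x87}, is the reduction pattern XORed in when a tweak
// is doubled in GF(2^128); the second pair lets cmtst test bit 62 of each
// 64-bit half, i.e. predict the carry of the *next* doubling one step early,
// which is what lets the tweak chains in the XTS code be software-pipelined.
// One doubling, as a rough C model (illustrative only; the name is ours):
//
//	void gf128_double(uint64_t t[2])	/* t[0] is the low half */
//	{
//		uint64_t carry = (uint64_t)((int64_t)t[1] >> 63);
//		t[1] = (t[1] << 1) | (t[0] >> 63);
//		t[0] = (t[0] << 1) ^ (carry & 0x87);
//	}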
.globl ossl_bsaes_xts_decrypt
.type ossl_bsaes_xts_decrypt,%function
.align 4
// On entry:
//   x0 -> input ciphertext
//   x1 -> output plaintext
//   x2 -> length of text in bytes (must be at least 16)
//   x3 -> key1 (used to decrypt the XORed ciphertext blocks)
//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
//   x5 -> 16-byte initial vector (typically, sector number)
// On exit:
//   Output plaintext filled in
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_xts_decrypt:
// Stack layout:
// sp ->
//	nrounds*128-96 bytes: key schedule
// x19 ->
//	16 bytes: frame record
//	4*16 bytes: tweak storage across _bsaes_decrypt8
//	6*8 bytes: storage for 5 callee-saved general-purpose registers (x19-x23; one slot is alignment padding)
//	8*8 bytes: storage for 8 callee-saved SIMD registers
	stp	x29, x30, [sp, #-192]!
	stp	x19, x20, [sp, #80]
	stp	x21, x22, [sp, #96]
	str	x23, [sp, #112]
	stp	d8, d9, [sp, #128]
	stp	d10, d11, [sp, #144]
	stp	d12, d13, [sp, #160]
	stp	d14, d15, [sp, #176]
	mov	x19, sp
	mov	x20, x0
	mov	x21, x1
	mov	x22, x2
	mov	x23, x3
	// generate initial tweak
	sub	sp, sp, #16
	mov	x0, x5			// iv[]
	mov	x1, sp
	mov	x2, x4			// key2
	bl	AES_encrypt
	ldr	q11, [sp], #16
	ldr	w1, [x23, #240]		// get # of rounds
	// allocate the key schedule on the stack
	add	x17, sp, #96
	sub	x17, x17, x1, lsl #7	// 128 bytes per inner round key, less 96 bytes
	// populate the key schedule
	mov	x9, x23			// pass key
	mov	x10, x1			// pass # of rounds
	mov	sp, x17
	bl	_bsaes_key_convert
	ldr	q6, [sp]
	str	q15, [x17]		// save last round key
	eor	v6.16b, v6.16b, v7.16b	// fix up round 0 key (by XORing with 0x63)
	str	q6, [sp]
	sub	x30, x22, #0x10
	tst	x22, #0xf		// if not multiple of 16
	csel	x22, x30, x22, ne	// subtract another 16 bytes
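	// If the length is not a multiple of 16, the last complete
	// ciphertext block is held back here: with ciphertext stealing it
	// must be decrypted separately, using the tweak *after* the one the
	// bulk loop would give it (see .Lxts_dec_done).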
	subs	x22, x22, #0x80
	blo	.Lxts_dec_short
	b	.Lxts_dec_loop
.align 4
.Lxts_dec_loop:
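	// Tweak generation below is identical to the encrypt-side
	// .Lxts_enc_loop above; only the final _bsaes_decrypt8 call and
	// the output ordering differ.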
	ldr	q8, .Lxts_magic
	mov	x10, x1			// pass rounds
	add	x2, x19, #16
	ldr	q0, [x20], #16
	sshr	v1.2d, v11.2d, #63
	mov	x9, sp			// pass key schedule
	ldr	q6, .Lxts_magic+16
	add	v2.2d, v11.2d, v11.2d
	cmtst	v3.2d, v11.2d, v6.2d
	and	v1.16b, v1.16b, v8.16b
	ext	v1.16b, v1.16b, v1.16b, #8
	and	v3.16b, v3.16b, v8.16b
	ldr	q4, [x20], #16
	eor	v12.16b, v2.16b, v1.16b
	eor	v1.16b, v4.16b, v12.16b
	eor	v0.16b, v0.16b, v11.16b
	cmtst	v2.2d, v12.2d, v6.2d
	add	v4.2d, v12.2d, v12.2d
	add	x0, x19, #16
	ext	v3.16b, v3.16b, v3.16b, #8
	and	v2.16b, v2.16b, v8.16b
	eor	v13.16b, v4.16b, v3.16b
	ldr	q3, [x20], #16
	ext	v4.16b, v2.16b, v2.16b, #8
	eor	v2.16b, v3.16b, v13.16b
	ldr	q3, [x20], #16
	add	v5.2d, v13.2d, v13.2d
	cmtst	v7.2d, v13.2d, v6.2d
	and	v7.16b, v7.16b, v8.16b
	ldr	q9, [x20], #16
	ext	v7.16b, v7.16b, v7.16b, #8
	ldr	q10, [x20], #16
	eor	v14.16b, v5.16b, v4.16b
	ldr	q16, [x20], #16
	add	v4.2d, v14.2d, v14.2d
	eor	v3.16b, v3.16b, v14.16b
	eor	v15.16b, v4.16b, v7.16b
	add	v5.2d, v15.2d, v15.2d
	ldr	q7, [x20], #16
	cmtst	v4.2d, v14.2d, v6.2d
	and	v17.16b, v4.16b, v8.16b
	cmtst	v18.2d, v15.2d, v6.2d
	eor	v4.16b, v9.16b, v15.16b
	ext	v9.16b, v17.16b, v17.16b, #8
	eor	v9.16b, v5.16b, v9.16b
	add	v17.2d, v9.2d, v9.2d
	and	v18.16b, v18.16b, v8.16b
	eor	v5.16b, v10.16b, v9.16b
	str	q9, [x2], #16
	ext	v10.16b, v18.16b, v18.16b, #8
	cmtst	v9.2d, v9.2d, v6.2d
	and	v9.16b, v9.16b, v8.16b
	eor	v10.16b, v17.16b, v10.16b
	cmtst	v17.2d, v10.2d, v6.2d
	eor	v6.16b, v16.16b, v10.16b
	str	q10, [x2], #16
	ext	v9.16b, v9.16b, v9.16b, #8
	add	v10.2d, v10.2d, v10.2d
	eor	v9.16b, v10.16b, v9.16b
	str	q9, [x2], #16
	eor	v7.16b, v7.16b, v9.16b
	add	v9.2d, v9.2d, v9.2d
	and	v8.16b, v17.16b, v8.16b
	ext	v8.16b, v8.16b, v8.16b, #8
	eor	v8.16b, v9.16b, v8.16b
	str	q8, [x2]		// next round tweak
	bl	_bsaes_decrypt8
	eor	v6.16b, v6.16b, v13.16b
	eor	v0.16b, v0.16b, v11.16b
	ldr	q8, [x0], #16
	eor	v7.16b, v7.16b, v8.16b
	str	q0, [x21], #16
	eor	v0.16b, v1.16b, v12.16b
	ldr	q1, [x0], #16
	eor	v1.16b, v3.16b, v1.16b
	subs	x22, x22, #0x80
	eor	v2.16b, v2.16b, v15.16b
	eor	v3.16b, v4.16b, v14.16b
	ldr	q4, [x0], #16
	str	q0, [x21], #16
	ldr	q11, [x0]		// next round tweak
	eor	v0.16b, v5.16b, v4.16b
	str	q6, [x21], #16
	str	q3, [x21], #16
	str	q2, [x21], #16
	str	q7, [x21], #16
	str	q1, [x21], #16
	str	q0, [x21], #16
	bpl	.Lxts_dec_loop
.Lxts_dec_short:
	adds	x22, x22, #0x70
	bmi	.Lxts_dec_done
	ldr	q8, .Lxts_magic
	sshr	v1.2d, v11.2d, #63
	add	v2.2d, v11.2d, v11.2d
	ldr	q9, .Lxts_magic+16
	subs	x22, x22, #0x10
	ldr	q0, [x20], #16
	and	v1.16b, v1.16b, v8.16b
	cmtst	v3.2d, v11.2d, v9.2d
	ext	v1.16b, v1.16b, v1.16b, #8
	and	v3.16b, v3.16b, v8.16b
	eor	v12.16b, v2.16b, v1.16b
	ext	v1.16b, v3.16b, v3.16b, #8
	add	v2.2d, v12.2d, v12.2d
	cmtst	v3.2d, v12.2d, v9.2d
	eor	v13.16b, v2.16b, v1.16b
	and	v22.16b, v3.16b, v8.16b
	bmi	.Lxts_dec_1
	ext	v2.16b, v22.16b, v22.16b, #8
	add	v3.2d, v13.2d, v13.2d
	ldr	q1, [x20], #16
	cmtst	v4.2d, v13.2d, v9.2d
	subs	x22, x22, #0x10
	eor	v14.16b, v3.16b, v2.16b
	and	v23.16b, v4.16b, v8.16b
	bmi	.Lxts_dec_2
	ext	v3.16b, v23.16b, v23.16b, #8
	add	v4.2d, v14.2d, v14.2d
	ldr	q2, [x20], #16
	cmtst	v5.2d, v14.2d, v9.2d
	eor	v0.16b, v0.16b, v11.16b
	subs	x22, x22, #0x10
	eor	v15.16b, v4.16b, v3.16b
	and	v24.16b, v5.16b, v8.16b
	bmi	.Lxts_dec_3
	ext	v4.16b, v24.16b, v24.16b, #8
	add	v5.2d, v15.2d, v15.2d
	ldr	q3, [x20], #16
	cmtst	v6.2d, v15.2d, v9.2d
	eor	v1.16b, v1.16b, v12.16b
	subs	x22, x22, #0x10
	eor	v16.16b, v5.16b, v4.16b
	and	v25.16b, v6.16b, v8.16b
	bmi	.Lxts_dec_4
	ext	v5.16b, v25.16b, v25.16b, #8
	add	v6.2d, v16.2d, v16.2d
	add	x0, x19, #16
	cmtst	v7.2d, v16.2d, v9.2d
	ldr	q4, [x20], #16
	eor	v2.16b, v2.16b, v13.16b
	str	q16, [x0], #16
	subs	x22, x22, #0x10
	eor	v17.16b, v6.16b, v5.16b
	and	v26.16b, v7.16b, v8.16b
	bmi	.Lxts_dec_5
	ext	v7.16b, v26.16b, v26.16b, #8
	add	v18.2d, v17.2d, v17.2d
	ldr	q5, [x20], #16
	eor	v3.16b, v3.16b, v14.16b
	str	q17, [x0], #16
	subs	x22, x22, #0x10
	eor	v18.16b, v18.16b, v7.16b
	bmi	.Lxts_dec_6
	ldr	q6, [x20], #16
	eor	v4.16b, v4.16b, v15.16b
	eor	v5.16b, v5.16b, v16.16b
	str	q18, [x0]		// next round tweak
	mov	x9, sp			// pass key schedule
	mov	x10, x1			// pass rounds
	add	x0, x19, #16
	sub	x22, x22, #0x10
	eor	v6.16b, v6.16b, v17.16b
	bl	_bsaes_decrypt8
	ldr	q16, [x0], #16
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	ldr	q17, [x0], #16
	eor	v6.16b, v6.16b, v13.16b
	eor	v4.16b, v4.16b, v14.16b
	eor	v2.16b, v2.16b, v15.16b
	ldr	q11, [x0]		// next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	eor	v0.16b, v7.16b, v16.16b
	eor	v1.16b, v3.16b, v17.16b
	str	q6, [x21], #16
	str	q4, [x21], #16
	str	q2, [x21], #16
	str	q0, [x21], #16
	str	q1, [x21], #16
	b	.Lxts_dec_done
.align 4
.Lxts_dec_6:
	eor	v4.16b, v4.16b, v15.16b
	eor	v5.16b, v5.16b, v16.16b
	mov	x9, sp			// pass key schedule
	mov	x10, x1			// pass rounds
	add	x0, x19, #16
	bl	_bsaes_decrypt8
	ldr	q16, [x0], #16
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v6.16b, v6.16b, v13.16b
	eor	v4.16b, v4.16b, v14.16b
	ldr	q11, [x0]		// next round tweak
	eor	v2.16b, v2.16b, v15.16b
	str	q0, [x21], #16
	str	q1, [x21], #16
	eor	v0.16b, v7.16b, v16.16b
	str	q6, [x21], #16
	str	q4, [x21], #16
	str	q2, [x21], #16
	str	q0, [x21], #16
	b	.Lxts_dec_done
.align 4
.Lxts_dec_5:
	eor	v3.16b, v3.16b, v14.16b
	eor	v4.16b, v4.16b, v15.16b
	mov	x9, sp			// pass key schedule
	mov	x10, x1			// pass rounds
	add	x0, x19, #16
	bl	_bsaes_decrypt8
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	ldr	q11, [x0]		// next round tweak
	eor	v6.16b, v6.16b, v13.16b
	eor	v4.16b, v4.16b, v14.16b
	eor	v2.16b, v2.16b, v15.16b
	str	q0, [x21], #16
	str	q1, [x21], #16
	str	q6, [x21], #16
	str	q4, [x21], #16
	str	q2, [x21], #16
	b	.Lxts_dec_done
.align 4
.Lxts_dec_4:
	eor	v2.16b, v2.16b, v13.16b
	eor	v3.16b, v3.16b, v14.16b
	mov	x9, sp			// pass key schedule
	mov	x10, x1			// pass rounds
	add	x0, x19, #16
	bl	_bsaes_decrypt8
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v6.16b, v6.16b, v13.16b
	eor	v4.16b, v4.16b, v14.16b
	mov	v11.16b, v15.16b	// next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	str	q6, [x21], #16
	str	q4, [x21], #16
	b	.Lxts_dec_done
.align 4
.Lxts_dec_3:
	eor	v1.16b, v1.16b, v12.16b
	eor	v2.16b, v2.16b, v13.16b
	mov	x9, sp			// pass key schedule
	mov	x10, x1			// pass rounds
	add	x0, x19, #16
	bl	_bsaes_decrypt8
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v6.16b, v6.16b, v13.16b
	mov	v11.16b, v14.16b	// next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	str	q6, [x21], #16
	b	.Lxts_dec_done
.align 4
.Lxts_dec_2:
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	mov	x9, sp			// pass key schedule
	mov	x10, x1			// pass rounds
	add	x0, x19, #16
	bl	_bsaes_decrypt8
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	mov	v11.16b, v13.16b	// next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	b	.Lxts_dec_done
.align 4
.Lxts_dec_1:
	eor	v0.16b, v0.16b, v11.16b
	sub	x0, sp, #16
	sub	x1, sp, #16
	mov	x2, x23
	mov	v13.d[0], v11.d[1]	// just in case AES_decrypt corrupts top half of callee-saved SIMD registers
	mov	v14.d[0], v12.d[1]
	str	q0, [sp, #-16]!
	bl	AES_decrypt
	ldr	q0, [sp], #16
	trn1	v13.2d, v11.2d, v13.2d
	trn1	v11.2d, v12.2d, v14.2d	// next round tweak
	eor	v0.16b, v0.16b, v13.16b
	str	q0, [x21], #16
.Lxts_dec_done:
	adds	x22, x22, #0x10
	beq	.Lxts_dec_ret
	// calculate one round of extra tweak for the stolen ciphertext
	ldr	q8, .Lxts_magic
	sshr	v6.2d, v11.2d, #63
	and	v6.16b, v6.16b, v8.16b
	add	v12.2d, v11.2d, v11.2d
	ext	v6.16b, v6.16b, v6.16b, #8
	eor	v12.16b, v12.16b, v6.16b
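	// With ciphertext stealing, decryption consumes the tweaks out of
	// order: the last full ciphertext block (handled just below) uses
	// the extra tweak v12 computed above, while the part-block
	// reassembled by .Lxts_dec_steal is decrypted afterwards with the
	// previous tweak v11.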
	// perform the final decryption with the last tweak value
	ldr	q0, [x20], #16
	eor	v0.16b, v0.16b, v12.16b
	str	q0, [sp, #-16]!
	mov	x0, sp
	mov	x1, sp
	mov	x2, x23
	mov	v13.d[0], v11.d[1]	// just in case AES_decrypt corrupts top half of callee-saved SIMD registers
	mov	v14.d[0], v12.d[1]
	bl	AES_decrypt
	trn1	v12.2d, v12.2d, v14.2d
	trn1	v11.2d, v11.2d, v13.2d
	ldr	q0, [sp], #16
	eor	v0.16b, v0.16b, v12.16b
	str	q0, [x21]
	mov	x6, x21
	// Penultimate ciphertext block produces final plaintext part-block
	// plus remaining part of final ciphertext block. Move plaintext part
	// to final position and reuse penultimate plaintext block buffer to
	// construct final ciphertext block
.Lxts_dec_steal:
	ldrb	w1, [x21]
	ldrb	w0, [x20], #1
	strb	w1, [x21, #0x10]
	strb	w0, [x21], #1
	subs	x22, x22, #1
	bhi	.Lxts_dec_steal
	// Finally decrypt the penultimate plaintext block using the
	// penultimate tweak
	ldr	q0, [x6]
	eor	v0.16b, v0.16b, v11.16b
	str	q0, [sp, #-16]!
	mov	x0, sp
	mov	x1, sp
	mov	x2, x23
	mov	x21, x6
	bl	AES_decrypt
	trn1	v11.2d, v11.2d, v13.2d
	ldr	q0, [sp], #16
	eor	v0.16b, v0.16b, v11.16b
	str	q0, [x21]
.Lxts_dec_ret:
	movi	v0.16b, #0
	movi	v1.16b, #0
.Lxts_dec_bzero: // wipe key schedule
	stp	q0, q1, [sp], #32
	cmp	sp, x19
	bne	.Lxts_dec_bzero
	ldp	x19, x20, [sp, #80]
	ldp	x21, x22, [sp, #96]
	ldr	x23, [sp, #112]
	ldp	d8, d9, [sp, #128]
	ldp	d10, d11, [sp, #144]
	ldp	d12, d13, [sp, #160]
	ldp	d14, d15, [sp, #176]
	ldp	x29, x30, [sp], #192
	ret
.size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt