#!/usr/bin/env perl
# Copyright 2020-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
use strict;
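# Command-line handling follows the usual perlasm convention: an optional
# output path (recognised by its file extension) and an optional flavour.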
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
my $xlate;
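# Locate the arm-xlate.pl translator, either alongside this script or in
# the shared perlasm directory.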
$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour \"$output\"" or die "can't call $xlate: $!";
*STDOUT=*OUT;
my $code = data();
print $code;
close STDOUT or die "error closing STDOUT: $!"; # enforce flush
sub data
{
local $/;
return <DATA>;
}
__END__
// Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// ====================================================================
// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL
// project. Rights for redistribution and usage in source and binary
// forms are granted according to the OpenSSL license.
// ====================================================================
//
// This implementation is a translation of bsaes-armv7 for AArch64.
// No attempt has been made to carry across the build switches for
// kernel targets, since the Linux kernel crypto support has moved on
// from when it was based on OpenSSL.
// A lot of hand-scheduling has been performed. Consequently, this code
// doesn't factor out neatly into macros in the same way that the
// AArch32 version did, so there is little to be gained by wrapping it
// up in Perl; it is presented here as pure assembly.
#include "crypto/arm_arch.h"
.text
.extern AES_cbc_encrypt
.extern AES_encrypt
.extern AES_decrypt
.type _bsaes_decrypt8,%function
.align 4
// On entry:
// x9 -> key (previously expanded using _bsaes_key_convert)
// x10 = number of rounds
// v0-v7 input data
// On exit:
// x9-x11 corrupted
// other general-purpose registers preserved
// v0-v7 output data
// v11-v15 preserved
// other SIMD registers corrupted
_bsaes_decrypt8:
ldr q8, [x9], #16
adr x11, .LM0ISR
movi v9.16b, #0x55
ldr q10, [x11], #16
movi v16.16b, #0x33
movi v17.16b, #0x0f
sub x10, x10, #1
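// Round 0: XOR the round 0 key (v8) into all eight blocks, then permute
// the bytes of each block with .LM0ISR, which folds the inverse
// ShiftRows into the bit-slice input ordering.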
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v8.16b
eor v2.16b, v2.16b, v8.16b
eor v4.16b, v4.16b, v8.16b
eor v3.16b, v3.16b, v8.16b
eor v5.16b, v5.16b, v8.16b
tbl v0.16b, {v0.16b}, v10.16b
tbl v1.16b, {v1.16b}, v10.16b
tbl v2.16b, {v2.16b}, v10.16b
tbl v4.16b, {v4.16b}, v10.16b
eor v6.16b, v6.16b, v8.16b
eor v7.16b, v7.16b, v8.16b
tbl v3.16b, {v3.16b}, v10.16b
tbl v5.16b, {v5.16b}, v10.16b
tbl v6.16b, {v6.16b}, v10.16b
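// Bit-slice the eight blocks (interleaved with the final tbl for
// scheduling): three swap-move passes with shifts of 1, 2 and 4 and
// masks 0x55/0x33/0x0f transpose the state so that each of v0-v7 holds
// a single bit position from all eight blocks.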
ushr v8.2d, v0.2d, #1
tbl v7.16b, {v7.16b}, v10.16b
ushr v10.2d, v4.2d, #1
ushr v18.2d, v2.2d, #1
eor v8.16b, v8.16b, v1.16b
ushr v19.2d, v6.2d, #1
eor v10.16b, v10.16b, v5.16b
eor v18.16b, v18.16b, v3.16b
and v8.16b, v8.16b, v9.16b
eor v19.16b, v19.16b, v7.16b
and v10.16b, v10.16b, v9.16b
and v18.16b, v18.16b, v9.16b
eor v1.16b, v1.16b, v8.16b
shl v8.2d, v8.2d, #1
and v9.16b, v19.16b, v9.16b
eor v5.16b, v5.16b, v10.16b
shl v10.2d, v10.2d, #1
eor v3.16b, v3.16b, v18.16b
shl v18.2d, v18.2d, #1
eor v0.16b, v0.16b, v8.16b
shl v8.2d, v9.2d, #1
eor v7.16b, v7.16b, v9.16b
eor v4.16b, v4.16b, v10.16b
eor v2.16b, v2.16b, v18.16b
ushr v9.2d, v1.2d, #2
eor v6.16b, v6.16b, v8.16b
ushr v8.2d, v0.2d, #2
ushr v10.2d, v5.2d, #2
ushr v18.2d, v4.2d, #2
eor v9.16b, v9.16b, v3.16b
eor v8.16b, v8.16b, v2.16b
eor v10.16b, v10.16b, v7.16b
eor v18.16b, v18.16b, v6.16b
and v9.16b, v9.16b, v16.16b
and v8.16b, v8.16b, v16.16b
and v10.16b, v10.16b, v16.16b
and v16.16b, v18.16b, v16.16b
eor v3.16b, v3.16b, v9.16b
shl v9.2d, v9.2d, #2
eor v2.16b, v2.16b, v8.16b
shl v8.2d, v8.2d, #2
eor v7.16b, v7.16b, v10.16b
shl v10.2d, v10.2d, #2
eor v6.16b, v6.16b, v16.16b
shl v16.2d, v16.2d, #2
eor v1.16b, v1.16b, v9.16b
eor v0.16b, v0.16b, v8.16b
eor v5.16b, v5.16b, v10.16b
eor v4.16b, v4.16b, v16.16b
ushr v8.2d, v3.2d, #4
ushr v9.2d, v2.2d, #4
ushr v10.2d, v1.2d, #4
ushr v16.2d, v0.2d, #4
eor v8.16b, v8.16b, v7.16b
eor v9.16b, v9.16b, v6.16b
eor v10.16b, v10.16b, v5.16b
eor v16.16b, v16.16b, v4.16b
and v8.16b, v8.16b, v17.16b
and v9.16b, v9.16b, v17.16b
and v10.16b, v10.16b, v17.16b
and v16.16b, v16.16b, v17.16b
eor v7.16b, v7.16b, v8.16b
shl v8.2d, v8.2d, #4
eor v6.16b, v6.16b, v9.16b
shl v9.2d, v9.2d, #4
eor v5.16b, v5.16b, v10.16b
shl v10.2d, v10.2d, #4
eor v4.16b, v4.16b, v16.16b
shl v16.2d, v16.2d, #4
eor v3.16b, v3.16b, v8.16b
eor v2.16b, v2.16b, v9.16b
eor v1.16b, v1.16b, v10.16b
eor v0.16b, v0.16b, v16.16b
b .Ldec_sbox
.align 4
.Ldec_loop:
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
ldp q8, q9, [x9], #32
eor v0.16b, v16.16b, v0.16b
ldr q10, [x9], #16
eor v1.16b, v17.16b, v1.16b
ldr q16, [x9], #16
eor v2.16b, v18.16b, v2.16b
eor v3.16b, v19.16b, v3.16b
eor v4.16b, v8.16b, v4.16b
eor v5.16b, v9.16b, v5.16b
eor v6.16b, v10.16b, v6.16b
eor v7.16b, v16.16b, v7.16b
tbl v0.16b, {v0.16b}, v28.16b
tbl v1.16b, {v1.16b}, v28.16b
tbl v2.16b, {v2.16b}, v28.16b
tbl v3.16b, {v3.16b}, v28.16b
tbl v4.16b, {v4.16b}, v28.16b
tbl v5.16b, {v5.16b}, v28.16b
tbl v6.16b, {v6.16b}, v28.16b
tbl v7.16b, {v7.16b}, v28.16b
.Ldec_sbox:
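// InvSubBytes: the inverse AES S-box evaluated as a Boolean circuit
// (eor/and/orr/bsl) on the bit-sliced state, covering all 128 byte
// positions of the eight blocks at once.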
eor v1.16b, v1.16b, v4.16b
eor v3.16b, v3.16b, v4.16b
subs x10, x10, #1
eor v4.16b, v4.16b, v7.16b
eor v2.16b, v2.16b, v7.16b
eor v1.16b, v1.16b, v6.16b
eor v6.16b, v6.16b, v4.16b
eor v2.16b, v2.16b, v5.16b
eor v0.16b, v0.16b, v1.16b
eor v7.16b, v7.16b, v6.16b
eor v8.16b, v6.16b, v2.16b
and v9.16b, v4.16b, v6.16b
eor v10.16b, v2.16b, v6.16b
eor v3.16b, v3.16b, v0.16b
eor v5.16b, v5.16b, v0.16b
eor v16.16b, v7.16b, v4.16b
eor v17.16b, v4.16b, v0.16b
and v18.16b, v0.16b, v2.16b
eor v19.16b, v7.16b, v4.16b
eor v1.16b, v1.16b, v3.16b
eor v20.16b, v3.16b, v0.16b
eor v21.16b, v5.16b, v2.16b
eor v22.16b, v3.16b, v7.16b
and v8.16b, v17.16b, v8.16b
orr v17.16b, v3.16b, v5.16b
eor v23.16b, v1.16b, v6.16b
eor v24.16b, v20.16b, v16.16b
eor v25.16b, v1.16b, v5.16b
orr v26.16b, v20.16b, v21.16b
and v20.16b, v20.16b, v21.16b
and v27.16b, v7.16b, v1.16b
eor v21.16b, v21.16b, v23.16b
orr v28.16b, v16.16b, v23.16b
orr v29.16b, v22.16b, v25.16b
eor v26.16b, v26.16b, v8.16b
and v16.16b, v16.16b, v23.16b
and v22.16b, v22.16b, v25.16b
and v21.16b, v24.16b, v21.16b
eor v8.16b, v28.16b, v8.16b
eor v23.16b, v5.16b, v2.16b
eor v24.16b, v1.16b, v6.16b
eor v16.16b, v16.16b, v22.16b
eor v22.16b, v3.16b, v0.16b
eor v25.16b, v29.16b, v21.16b
eor v21.16b, v26.16b, v21.16b
eor v8.16b, v8.16b, v20.16b
eor v26.16b, v23.16b, v24.16b
eor v16.16b, v16.16b, v20.16b
eor v28.16b, v22.16b, v19.16b
eor v20.16b, v25.16b, v20.16b
eor v9.16b, v21.16b, v9.16b
eor v8.16b, v8.16b, v18.16b
eor v18.16b, v5.16b, v1.16b
eor v21.16b, v16.16b, v17.16b
eor v16.16b, v16.16b, v17.16b
eor v17.16b, v20.16b, v27.16b
eor v20.16b, v3.16b, v7.16b
eor v25.16b, v9.16b, v8.16b
eor v27.16b, v0.16b, v4.16b
and v29.16b, v9.16b, v17.16b
eor v30.16b, v8.16b, v29.16b
eor v31.16b, v21.16b, v29.16b
eor v29.16b, v21.16b, v29.16b
bsl v30.16b, v17.16b, v21.16b
bsl v31.16b, v9.16b, v8.16b
bsl v16.16b, v30.16b, v29.16b
bsl v21.16b, v29.16b, v30.16b
eor v8.16b, v31.16b, v30.16b
and v1.16b, v1.16b, v31.16b
and v9.16b, v16.16b, v31.16b
and v6.16b, v6.16b, v30.16b
eor v16.16b, v17.16b, v21.16b
and v4.16b, v4.16b, v30.16b
eor v17.16b, v8.16b, v30.16b
and v21.16b, v24.16b, v8.16b
eor v9.16b, v9.16b, v25.16b
and v19.16b, v19.16b, v8.16b
eor v24.16b, v30.16b, v16.16b
eor v25.16b, v30.16b, v16.16b
and v7.16b, v7.16b, v17.16b
and v10.16b, v10.16b, v16.16b
eor v29.16b, v9.16b, v16.16b
eor v30.16b, v31.16b, v9.16b
and v0.16b, v24.16b, v0.16b
and v9.16b, v18.16b, v9.16b
and v2.16b, v25.16b, v2.16b
eor v10.16b, v10.16b, v6.16b
eor v18.16b, v29.16b, v16.16b
and v5.16b, v30.16b, v5.16b
eor v24.16b, v8.16b, v29.16b
and v25.16b, v26.16b, v29.16b
and v26.16b, v28.16b, v29.16b
eor v8.16b, v8.16b, v29.16b
eor v17.16b, v17.16b, v18.16b
eor v5.16b, v1.16b, v5.16b
and v23.16b, v24.16b, v23.16b
eor v21.16b, v21.16b, v25.16b
eor v19.16b, v19.16b, v26.16b
eor v0.16b, v4.16b, v0.16b
and v3.16b, v17.16b, v3.16b
eor v1.16b, v9.16b, v1.16b
eor v9.16b, v25.16b, v23.16b
eor v5.16b, v5.16b, v21.16b
eor v2.16b, v6.16b, v2.16b
and v6.16b, v8.16b, v22.16b
eor v3.16b, v7.16b, v3.16b
and v8.16b, v20.16b, v18.16b
eor v10.16b, v10.16b, v9.16b
eor v0.16b, v0.16b, v19.16b
eor v9.16b, v1.16b, v9.16b
eor v1.16b, v2.16b, v21.16b
eor v3.16b, v3.16b, v19.16b
and v16.16b, v27.16b, v16.16b
eor v17.16b, v26.16b, v6.16b
eor v6.16b, v8.16b, v7.16b
eor v7.16b, v1.16b, v9.16b
eor v1.16b, v5.16b, v3.16b
eor v2.16b, v10.16b, v3.16b
eor v4.16b, v16.16b, v4.16b
eor v8.16b, v6.16b, v17.16b
eor v5.16b, v9.16b, v3.16b
eor v9.16b, v0.16b, v1.16b
eor v6.16b, v7.16b, v1.16b
eor v0.16b, v4.16b, v17.16b
eor v4.16b, v8.16b, v7.16b
eor v7.16b, v9.16b, v2.16b
eor v8.16b, v3.16b, v0.16b
eor v7.16b, v7.16b, v5.16b
eor v3.16b, v4.16b, v7.16b
eor v4.16b, v7.16b, v0.16b
eor v7.16b, v8.16b, v3.16b
bcc .Ldec_done
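// InvMixColumns on the bit-sliced state: ext rotates each 128-bit
// register by whole 32-bit multiples (8 or 12 bytes) and eor folds the
// rotated copies together.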
ext v8.16b, v0.16b, v0.16b, #8
ext v9.16b, v1.16b, v1.16b, #8
ldr q28, [x11] // load from .LISR in common case (x10 > 0)
ext v10.16b, v6.16b, v6.16b, #8
ext v16.16b, v3.16b, v3.16b, #8
ext v17.16b, v5.16b, v5.16b, #8
ext v18.16b, v4.16b, v4.16b, #8
eor v8.16b, v8.16b, v0.16b
eor v9.16b, v9.16b, v1.16b
eor v10.16b, v10.16b, v6.16b
eor v16.16b, v16.16b, v3.16b
eor v17.16b, v17.16b, v5.16b
ext v19.16b, v2.16b, v2.16b, #8
ext v20.16b, v7.16b, v7.16b, #8
eor v18.16b, v18.16b, v4.16b
eor v6.16b, v6.16b, v8.16b
eor v8.16b, v2.16b, v10.16b
eor v4.16b, v4.16b, v9.16b
eor v2.16b, v19.16b, v2.16b
eor v9.16b, v20.16b, v7.16b
eor v0.16b, v0.16b, v16.16b
eor v1.16b, v1.16b, v16.16b
eor v6.16b, v6.16b, v17.16b
eor v8.16b, v8.16b, v16.16b
eor v7.16b, v7.16b, v18.16b
eor v4.16b, v4.16b, v16.16b
eor v2.16b, v3.16b, v2.16b
eor v1.16b, v1.16b, v17.16b
eor v3.16b, v5.16b, v9.16b
eor v5.16b, v8.16b, v17.16b
eor v7.16b, v7.16b, v17.16b
ext v8.16b, v0.16b, v0.16b, #12
ext v9.16b, v6.16b, v6.16b, #12
ext v10.16b, v4.16b, v4.16b, #12
ext v16.16b, v1.16b, v1.16b, #12
ext v17.16b, v5.16b, v5.16b, #12
ext v18.16b, v7.16b, v7.16b, #12
eor v0.16b, v0.16b, v8.16b
eor v6.16b, v6.16b, v9.16b
eor v4.16b, v4.16b, v10.16b
ext v19.16b, v2.16b, v2.16b, #12
ext v20.16b, v3.16b, v3.16b, #12
eor v1.16b, v1.16b, v16.16b
eor v5.16b, v5.16b, v17.16b
eor v7.16b, v7.16b, v18.16b
eor v2.16b, v2.16b, v19.16b
eor v16.16b, v16.16b, v0.16b
eor v3.16b, v3.16b, v20.16b
eor v17.16b, v17.16b, v4.16b
eor v10.16b, v10.16b, v6.16b
ext v0.16b, v0.16b, v0.16b, #8
eor v9.16b, v9.16b, v1.16b
ext v1.16b, v1.16b, v1.16b, #8
eor v8.16b, v8.16b, v3.16b
eor v16.16b, v16.16b, v3.16b
eor v18.16b, v18.16b, v5.16b
eor v19.16b, v19.16b, v7.16b
ext v21.16b, v5.16b, v5.16b, #8
ext v5.16b, v7.16b, v7.16b, #8
eor v7.16b, v20.16b, v2.16b
ext v4.16b, v4.16b, v4.16b, #8
ext v20.16b, v3.16b, v3.16b, #8
eor v17.16b, v17.16b, v3.16b
ext v2.16b, v2.16b, v2.16b, #8
eor v3.16b, v10.16b, v3.16b
ext v10.16b, v6.16b, v6.16b, #8
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v16.16b
eor v5.16b, v5.16b, v18.16b
eor v3.16b, v3.16b, v4.16b
eor v7.16b, v20.16b, v7.16b
eor v6.16b, v2.16b, v19.16b
eor v4.16b, v21.16b, v17.16b
eor v2.16b, v10.16b, v9.16b
bne .Ldec_loop
ldr q28, [x11, #16]! // load from .LISRM0 on last round (x10 == 0)
b .Ldec_loop
.align 4
.Ldec_done:
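// Undo the bit-slicing with the same swap-move transpose as on entry,
// then XOR the final round key (q10, loaded from [x9]) into every block.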
ushr v8.2d, v0.2d, #1
movi v9.16b, #0x55
ldr q10, [x9]
ushr v16.2d, v2.2d, #1
movi v17.16b, #0x33
ushr v18.2d, v6.2d, #1
movi v19.16b, #0x0f
eor v8.16b, v8.16b, v1.16b
ushr v20.2d, v3.2d, #1
eor v16.16b, v16.16b, v7.16b
eor v18.16b, v18.16b, v4.16b
and v8.16b, v8.16b, v9.16b
eor v20.16b, v20.16b, v5.16b
and v16.16b, v16.16b, v9.16b
and v18.16b, v18.16b, v9.16b
shl v21.2d, v8.2d, #1
eor v1.16b, v1.16b, v8.16b
and v8.16b, v20.16b, v9.16b
eor v7.16b, v7.16b, v16.16b
shl v9.2d, v16.2d, #1
eor v4.16b, v4.16b, v18.16b
shl v16.2d, v18.2d, #1
eor v0.16b, v0.16b, v21.16b
shl v18.2d, v8.2d, #1
eor v5.16b, v5.16b, v8.16b
eor v2.16b, v2.16b, v9.16b
eor v6.16b, v6.16b, v16.16b
ushr v8.2d, v1.2d, #2
eor v3.16b, v3.16b, v18.16b
ushr v9.2d, v0.2d, #2
ushr v16.2d, v7.2d, #2
ushr v18.2d, v2.2d, #2
eor v8.16b, v8.16b, v4.16b
eor v9.16b, v9.16b, v6.16b
eor v16.16b, v16.16b, v5.16b
eor v18.16b, v18.16b, v3.16b
and v8.16b, v8.16b, v17.16b
and v9.16b, v9.16b, v17.16b
and v16.16b, v16.16b, v17.16b
and v17.16b, v18.16b, v17.16b
eor v4.16b, v4.16b, v8.16b
shl v8.2d, v8.2d, #2
eor v6.16b, v6.16b, v9.16b
shl v9.2d, v9.2d, #2
eor v5.16b, v5.16b, v16.16b
shl v16.2d, v16.2d, #2
eor v3.16b, v3.16b, v17.16b
shl v17.2d, v17.2d, #2
eor v1.16b, v1.16b, v8.16b
eor v0.16b, v0.16b, v9.16b
eor v7.16b, v7.16b, v16.16b
eor v2.16b, v2.16b, v17.16b
ushr v8.2d, v4.2d, #4
ushr v9.2d, v6.2d, #4
ushr v16.2d, v1.2d, #4
ushr v17.2d, v0.2d, #4
eor v8.16b, v8.16b, v5.16b
eor v9.16b, v9.16b, v3.16b
eor v16.16b, v16.16b, v7.16b
eor v17.16b, v17.16b, v2.16b
and v8.16b, v8.16b, v19.16b
and v9.16b, v9.16b, v19.16b
and v16.16b, v16.16b, v19.16b
and v17.16b, v17.16b, v19.16b
eor v5.16b, v5.16b, v8.16b
shl v8.2d, v8.2d, #4
eor v3.16b, v3.16b, v9.16b
shl v9.2d, v9.2d, #4
eor v7.16b, v7.16b, v16.16b
shl v16.2d, v16.2d, #4
eor v2.16b, v2.16b, v17.16b
shl v17.2d, v17.2d, #4
eor v4.16b, v4.16b, v8.16b
eor v6.16b, v6.16b, v9.16b
eor v7.16b, v7.16b, v10.16b
eor v1.16b, v1.16b, v16.16b
eor v2.16b, v2.16b, v10.16b
eor v0.16b, v0.16b, v17.16b
eor v4.16b, v4.16b, v10.16b
eor v6.16b, v6.16b, v10.16b
eor v3.16b, v3.16b, v10.16b
eor v5.16b, v5.16b, v10.16b
eor v1.16b, v1.16b, v10.16b
eor v0.16b, v0.16b, v10.16b
ret
.size _bsaes_decrypt8,.-_bsaes_decrypt8
.type _bsaes_const,%object
.align 6
_bsaes_const:
// InvShiftRows constants
// Used in _bsaes_decrypt8, which assumes contiguity
// .LM0ISR used with round 0 key
// .LISR used with middle round keys
// .LISRM0 used with final round key
.LM0ISR:
.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
.LISR:
.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
.LISRM0:
.quad 0x01040b0e0205080f, 0x0306090c00070a0d
// ShiftRows constants
// Used in _bsaes_encrypt8, which assumes contiguity
// .LM0SR used with round 0 key
// .LSR used with middle round keys
// .LSRM0 used with final round key
.LM0SR:
.quad 0x0a0e02060f03070b, 0x0004080c05090d01
.LSR:
.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
.quad 0x0304090e00050a0f, 0x01060b0c0207080d
.LM0_bigendian:
.quad 0x02060a0e03070b0f, 0x0004080c0105090d
.LM0_littleendian:
.quad 0x0105090d0004080c, 0x03070b0f02060a0e
// Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into
// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
.LREVM0SR:
.quad 0x090d01050c000408, 0x03070b0f060a0e02
.align 6
.size _bsaes_const,.-_bsaes_const
.type _bsaes_encrypt8,%function
.align 4
// On entry:
// x9 -> key (previously expanded using _bsaes_key_convert)
// x10 = number of rounds
// v0-v7 input data
// On exit:
// x9-x11 corrupted
// other general-purpose registers preserved
// v0-v7 output data
// v11-v15 preserved
// other SIMD registers corrupted
_bsaes_encrypt8:
ldr q8, [x9], #16
adr x11, .LM0SR
ldr q9, [x11], #16
_bsaes_encrypt8_alt:
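// Alternative entry point used by ossl_bsaes_ctr32_encrypt_blocks: the
// caller supplies the round 0 key in v8 and a substitute input
// permutation (.LREVM0SR) in v9.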
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v8.16b
sub x10, x10, #1
eor v2.16b, v2.16b, v8.16b
eor v4.16b, v4.16b, v8.16b
eor v3.16b, v3.16b, v8.16b
eor v5.16b, v5.16b, v8.16b
tbl v0.16b, {v0.16b}, v9.16b
tbl v1.16b, {v1.16b}, v9.16b
tbl v2.16b, {v2.16b}, v9.16b
tbl v4.16b, {v4.16b}, v9.16b
eor v6.16b, v6.16b, v8.16b
eor v7.16b, v7.16b, v8.16b
tbl v3.16b, {v3.16b}, v9.16b
tbl v5.16b, {v5.16b}, v9.16b
tbl v6.16b, {v6.16b}, v9.16b
ushr v8.2d, v0.2d, #1
movi v10.16b, #0x55
tbl v7.16b, {v7.16b}, v9.16b
ushr v9.2d, v4.2d, #1
movi v16.16b, #0x33
ushr v17.2d, v2.2d, #1
eor v8.16b, v8.16b, v1.16b
movi v18.16b, #0x0f
ushr v19.2d, v6.2d, #1
eor v9.16b, v9.16b, v5.16b
eor v17.16b, v17.16b, v3.16b
and v8.16b, v8.16b, v10.16b
eor v19.16b, v19.16b, v7.16b
and v9.16b, v9.16b, v10.16b
and v17.16b, v17.16b, v10.16b
eor v1.16b, v1.16b, v8.16b
shl v8.2d, v8.2d, #1
and v10.16b, v19.16b, v10.16b
eor v5.16b, v5.16b, v9.16b
shl v9.2d, v9.2d, #1
eor v3.16b, v3.16b, v17.16b
shl v17.2d, v17.2d, #1
eor v0.16b, v0.16b, v8.16b
shl v8.2d, v10.2d, #1
eor v7.16b, v7.16b, v10.16b
eor v4.16b, v4.16b, v9.16b
eor v2.16b, v2.16b, v17.16b
ushr v9.2d, v1.2d, #2
eor v6.16b, v6.16b, v8.16b
ushr v8.2d, v0.2d, #2
ushr v10.2d, v5.2d, #2
ushr v17.2d, v4.2d, #2
eor v9.16b, v9.16b, v3.16b
eor v8.16b, v8.16b, v2.16b
eor v10.16b, v10.16b, v7.16b
eor v17.16b, v17.16b, v6.16b
and v9.16b, v9.16b, v16.16b
and v8.16b, v8.16b, v16.16b
and v10.16b, v10.16b, v16.16b
and v16.16b, v17.16b, v16.16b
eor v3.16b, v3.16b, v9.16b
shl v9.2d, v9.2d, #2
eor v2.16b, v2.16b, v8.16b
shl v8.2d, v8.2d, #2
eor v7.16b, v7.16b, v10.16b
shl v10.2d, v10.2d, #2
eor v6.16b, v6.16b, v16.16b
shl v16.2d, v16.2d, #2
eor v1.16b, v1.16b, v9.16b
eor v0.16b, v0.16b, v8.16b
eor v5.16b, v5.16b, v10.16b
eor v4.16b, v4.16b, v16.16b
ushr v8.2d, v3.2d, #4
ushr v9.2d, v2.2d, #4
ushr v10.2d, v1.2d, #4
ushr v16.2d, v0.2d, #4
eor v8.16b, v8.16b, v7.16b
eor v9.16b, v9.16b, v6.16b
eor v10.16b, v10.16b, v5.16b
eor v16.16b, v16.16b, v4.16b
and v8.16b, v8.16b, v18.16b
and v9.16b, v9.16b, v18.16b
and v10.16b, v10.16b, v18.16b
and v16.16b, v16.16b, v18.16b
eor v7.16b, v7.16b, v8.16b
shl v8.2d, v8.2d, #4
eor v6.16b, v6.16b, v9.16b
shl v9.2d, v9.2d, #4
eor v5.16b, v5.16b, v10.16b
shl v10.2d, v10.2d, #4
eor v4.16b, v4.16b, v16.16b
shl v16.2d, v16.2d, #4
eor v3.16b, v3.16b, v8.16b
eor v2.16b, v2.16b, v9.16b
eor v1.16b, v1.16b, v10.16b
eor v0.16b, v0.16b, v16.16b
b .Lenc_sbox
.align 4
.Lenc_loop:
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
ldp q8, q9, [x9], #32
eor v0.16b, v16.16b, v0.16b
ldr q10, [x9], #16
eor v1.16b, v17.16b, v1.16b
ldr q16, [x9], #16
eor v2.16b, v18.16b, v2.16b
eor v3.16b, v19.16b, v3.16b
eor v4.16b, v8.16b, v4.16b
eor v5.16b, v9.16b, v5.16b
eor v6.16b, v10.16b, v6.16b
eor v7.16b, v16.16b, v7.16b
tbl v0.16b, {v0.16b}, v28.16b
tbl v1.16b, {v1.16b}, v28.16b
tbl v2.16b, {v2.16b}, v28.16b
tbl v3.16b, {v3.16b}, v28.16b
tbl v4.16b, {v4.16b}, v28.16b
tbl v5.16b, {v5.16b}, v28.16b
tbl v6.16b, {v6.16b}, v28.16b
tbl v7.16b, {v7.16b}, v28.16b
.Lenc_sbox:
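// SubBytes: the forward AES S-box as a Boolean circuit on the
// bit-sliced state, the counterpart of the .Ldec_sbox circuit above.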
eor v5.16b, v5.16b, v6.16b
eor v3.16b, v3.16b, v0.16b
subs x10, x10, #1
eor v2.16b, v2.16b, v1.16b
eor v5.16b, v5.16b, v0.16b
eor v8.16b, v3.16b, v7.16b
eor v6.16b, v6.16b, v2.16b
eor v7.16b, v7.16b, v5.16b
eor v8.16b, v8.16b, v4.16b
eor v3.16b, v6.16b, v3.16b
eor v4.16b, v4.16b, v5.16b
eor v6.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v7.16b
eor v1.16b, v8.16b, v1.16b
eor v8.16b, v7.16b, v4.16b
eor v9.16b, v3.16b, v0.16b
eor v10.16b, v7.16b, v6.16b
eor v16.16b, v5.16b, v3.16b
eor v17.16b, v6.16b, v2.16b
eor v18.16b, v5.16b, v1.16b
eor v19.16b, v2.16b, v4.16b
eor v20.16b, v1.16b, v0.16b
orr v21.16b, v8.16b, v9.16b
orr v22.16b, v10.16b, v16.16b
eor v23.16b, v8.16b, v17.16b
eor v24.16b, v9.16b, v18.16b
and v19.16b, v19.16b, v20.16b
orr v20.16b, v17.16b, v18.16b
and v8.16b, v8.16b, v9.16b
and v9.16b, v17.16b, v18.16b
and v17.16b, v23.16b, v24.16b
and v10.16b, v10.16b, v16.16b
eor v16.16b, v21.16b, v19.16b
eor v18.16b, v20.16b, v19.16b
and v19.16b, v2.16b, v1.16b
and v20.16b, v6.16b, v5.16b
eor v21.16b, v22.16b, v17.16b
eor v9.16b, v9.16b, v10.16b
eor v10.16b, v16.16b, v17.16b
eor v16.16b, v18.16b, v8.16b
and v17.16b, v4.16b, v0.16b
orr v18.16b, v7.16b, v3.16b
eor v21.16b, v21.16b, v8.16b
eor v8.16b, v9.16b, v8.16b
eor v9.16b, v10.16b, v19.16b
eor v10.16b, v3.16b, v0.16b
eor v16.16b, v16.16b, v17.16b
eor v17.16b, v5.16b, v1.16b
eor v19.16b, v21.16b, v20.16b
eor v20.16b, v8.16b, v18.16b
eor v8.16b, v8.16b, v18.16b
eor v18.16b, v7.16b, v4.16b
eor v21.16b, v9.16b, v16.16b
eor v22.16b, v6.16b, v2.16b
and v23.16b, v9.16b, v19.16b
eor v24.16b, v10.16b, v17.16b
eor v25.16b, v0.16b, v1.16b
eor v26.16b, v7.16b, v6.16b
eor v27.16b, v18.16b, v22.16b
eor v28.16b, v3.16b, v5.16b
eor v29.16b, v16.16b, v23.16b
eor v30.16b, v20.16b, v23.16b
eor v23.16b, v20.16b, v23.16b
eor v31.16b, v4.16b, v2.16b
bsl v29.16b, v19.16b, v20.16b
bsl v30.16b, v9.16b, v16.16b
bsl v8.16b, v29.16b, v23.16b
bsl v20.16b, v23.16b, v29.16b
eor v9.16b, v30.16b, v29.16b
and v5.16b, v5.16b, v30.16b
and v8.16b, v8.16b, v30.16b
and v1.16b, v1.16b, v29.16b
eor v16.16b, v19.16b, v20.16b
and v2.16b, v2.16b, v29.16b
eor v19.16b, v9.16b, v29.16b
and v17.16b, v17.16b, v9.16b
eor v8.16b, v8.16b, v21.16b
and v20.16b, v22.16b, v9.16b
eor v21.16b, v29.16b, v16.16b
eor v22.16b, v29.16b, v16.16b
and v23.16b, v25.16b, v16.16b
and v6.16b, v6.16b, v19.16b
eor v25.16b, v8.16b, v16.16b
eor v29.16b, v30.16b, v8.16b
and v4.16b, v21.16b, v4.16b
and v8.16b, v28.16b, v8.16b
and v0.16b, v22.16b, v0.16b
eor v21.16b, v23.16b, v1.16b
eor v22.16b, v9.16b, v25.16b
eor v9.16b, v9.16b, v25.16b
eor v23.16b, v25.16b, v16.16b
and v3.16b, v29.16b, v3.16b
and v24.16b, v24.16b, v25.16b
and v25.16b, v27.16b, v25.16b
and v10.16b, v22.16b, v10.16b
and v9.16b, v9.16b, v18.16b
eor v18.16b, v19.16b, v23.16b
and v19.16b, v26.16b, v23.16b
eor v3.16b, v5.16b, v3.16b
eor v17.16b, v17.16b, v24.16b
eor v10.16b, v24.16b, v10.16b
and v16.16b, v31.16b, v16.16b
eor v20.16b, v20.16b, v25.16b
eor v9.16b, v25.16b, v9.16b
eor v4.16b, v2.16b, v4.16b
and v7.16b, v18.16b, v7.16b
eor v18.16b, v19.16b, v6.16b
eor v5.16b, v8.16b, v5.16b
eor v0.16b, v1.16b, v0.16b
eor v1.16b, v21.16b, v10.16b
eor v8.16b, v3.16b, v17.16b
eor v2.16b, v16.16b, v2.16b
eor v3.16b, v6.16b, v7.16b
eor v6.16b, v18.16b, v9.16b
eor v4.16b, v4.16b, v20.16b
eor v10.16b, v5.16b, v10.16b
eor v0.16b, v0.16b, v17.16b
eor v9.16b, v2.16b, v9.16b
eor v3.16b, v3.16b, v20.16b
eor v7.16b, v6.16b, v1.16b
eor v5.16b, v8.16b, v4.16b
eor v6.16b, v10.16b, v1.16b
eor v2.16b, v4.16b, v0.16b
eor v4.16b, v3.16b, v10.16b
eor v9.16b, v9.16b, v7.16b
eor v3.16b, v0.16b, v5.16b
eor v0.16b, v1.16b, v4.16b
eor v1.16b, v4.16b, v8.16b
eor v4.16b, v9.16b, v5.16b
eor v6.16b, v6.16b, v3.16b
bcc .Lenc_done
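// MixColumns via ext/eor on the bit-sliced state, mirroring the
// InvMixColumns block in _bsaes_decrypt8.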
ext v8.16b, v0.16b, v0.16b, #12
ext v9.16b, v4.16b, v4.16b, #12
ldr q28, [x11]
ext v10.16b, v6.16b, v6.16b, #12
ext v16.16b, v1.16b, v1.16b, #12
ext v17.16b, v3.16b, v3.16b, #12
ext v18.16b, v7.16b, v7.16b, #12
eor v0.16b, v0.16b, v8.16b
eor v4.16b, v4.16b, v9.16b
eor v6.16b, v6.16b, v10.16b
ext v19.16b, v2.16b, v2.16b, #12
ext v20.16b, v5.16b, v5.16b, #12
eor v1.16b, v1.16b, v16.16b
eor v3.16b, v3.16b, v17.16b
eor v7.16b, v7.16b, v18.16b
eor v2.16b, v2.16b, v19.16b
eor v16.16b, v16.16b, v0.16b
eor v5.16b, v5.16b, v20.16b
eor v17.16b, v17.16b, v6.16b
eor v10.16b, v10.16b, v4.16b
ext v0.16b, v0.16b, v0.16b, #8
eor v9.16b, v9.16b, v1.16b
ext v1.16b, v1.16b, v1.16b, #8
eor v8.16b, v8.16b, v5.16b
eor v16.16b, v16.16b, v5.16b
eor v18.16b, v18.16b, v3.16b
eor v19.16b, v19.16b, v7.16b
ext v3.16b, v3.16b, v3.16b, #8
ext v7.16b, v7.16b, v7.16b, #8
eor v20.16b, v20.16b, v2.16b
ext v6.16b, v6.16b, v6.16b, #8
ext v21.16b, v5.16b, v5.16b, #8
eor v17.16b, v17.16b, v5.16b
ext v2.16b, v2.16b, v2.16b, #8
eor v10.16b, v10.16b, v5.16b
ext v22.16b, v4.16b, v4.16b, #8
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v16.16b
eor v5.16b, v7.16b, v18.16b
eor v4.16b, v3.16b, v17.16b
eor v3.16b, v6.16b, v10.16b
eor v7.16b, v21.16b, v20.16b
eor v6.16b, v2.16b, v19.16b
eor v2.16b, v22.16b, v9.16b
bne .Lenc_loop
ldr q28, [x11, #16]! // load from .LSRM0 on last round (x10 == 0)
b .Lenc_loop
.align 4
.Lenc_done:
ushr v8.2d, v0.2d, #1
movi v9.16b, #0x55
ldr q10, [x9]
ushr v16.2d, v3.2d, #1
movi v17.16b, #0x33
ushr v18.2d, v4.2d, #1
movi v19.16b, #0x0f
eor v8.16b, v8.16b, v1.16b
ushr v20.2d, v2.2d, #1
eor v16.16b, v16.16b, v7.16b
eor v18.16b, v18.16b, v6.16b
and v8.16b, v8.16b, v9.16b
eor v20.16b, v20.16b, v5.16b
and v16.16b, v16.16b, v9.16b
and v18.16b, v18.16b, v9.16b
shl v21.2d, v8.2d, #1
eor v1.16b, v1.16b, v8.16b
and v8.16b, v20.16b, v9.16b
eor v7.16b, v7.16b, v16.16b
shl v9.2d, v16.2d, #1
eor v6.16b, v6.16b, v18.16b
shl v16.2d, v18.2d, #1
eor v0.16b, v0.16b, v21.16b
shl v18.2d, v8.2d, #1
eor v5.16b, v5.16b, v8.16b
eor v3.16b, v3.16b, v9.16b
eor v4.16b, v4.16b, v16.16b
ushr v8.2d, v1.2d, #2
eor v2.16b, v2.16b, v18.16b
ushr v9.2d, v0.2d, #2
ushr v16.2d, v7.2d, #2
ushr v18.2d, v3.2d, #2
eor v8.16b, v8.16b, v6.16b
eor v9.16b, v9.16b, v4.16b
eor v16.16b, v16.16b, v5.16b
eor v18.16b, v18.16b, v2.16b
and v8.16b, v8.16b, v17.16b
and v9.16b, v9.16b, v17.16b
and v16.16b, v16.16b, v17.16b
and v17.16b, v18.16b, v17.16b
eor v6.16b, v6.16b, v8.16b
shl v8.2d, v8.2d, #2
eor v4.16b, v4.16b, v9.16b
shl v9.2d, v9.2d, #2
eor v5.16b, v5.16b, v16.16b
shl v16.2d, v16.2d, #2
eor v2.16b, v2.16b, v17.16b
shl v17.2d, v17.2d, #2
eor v1.16b, v1.16b, v8.16b
eor v0.16b, v0.16b, v9.16b
eor v7.16b, v7.16b, v16.16b
eor v3.16b, v3.16b, v17.16b
ushr v8.2d, v6.2d, #4
ushr v9.2d, v4.2d, #4
ushr v16.2d, v1.2d, #4
ushr v17.2d, v0.2d, #4
eor v8.16b, v8.16b, v5.16b
eor v9.16b, v9.16b, v2.16b
eor v16.16b, v16.16b, v7.16b
eor v17.16b, v17.16b, v3.16b
and v8.16b, v8.16b, v19.16b
and v9.16b, v9.16b, v19.16b
and v16.16b, v16.16b, v19.16b
and v17.16b, v17.16b, v19.16b
eor v5.16b, v5.16b, v8.16b
shl v8.2d, v8.2d, #4
eor v2.16b, v2.16b, v9.16b
shl v9.2d, v9.2d, #4
eor v7.16b, v7.16b, v16.16b
shl v16.2d, v16.2d, #4
eor v3.16b, v3.16b, v17.16b
shl v17.2d, v17.2d, #4
eor v6.16b, v6.16b, v8.16b
eor v4.16b, v4.16b, v9.16b
eor v7.16b, v7.16b, v10.16b
eor v1.16b, v1.16b, v16.16b
eor v3.16b, v3.16b, v10.16b
eor v0.16b, v0.16b, v17.16b
eor v6.16b, v6.16b, v10.16b
eor v4.16b, v4.16b, v10.16b
eor v2.16b, v2.16b, v10.16b
eor v5.16b, v5.16b, v10.16b
eor v1.16b, v1.16b, v10.16b
eor v0.16b, v0.16b, v10.16b
ret
.size _bsaes_encrypt8,.-_bsaes_encrypt8
.type _bsaes_key_convert,%function
.align 4
// On entry:
// x9 -> input key (big-endian)
// x10 = number of rounds
// x17 -> output key (native endianness)
// On exit:
// x9, x10 corrupted
// x11 -> .LM0_bigendian
// x17 -> last quadword of output key
// other general-purpose registers preserved
// v2-v6 preserved
// v7.16b[] = 0x63
// v8-v14 preserved
// v15 = last round key (converted to native endianness)
// other SIMD registers corrupted
_bsaes_key_convert:
#ifdef __AARCH64EL__
adr x11, .LM0_littleendian
#else
adr x11, .LM0_bigendian
#endif
ldr q0, [x9], #16 // load round 0 key
ldr q1, [x11] // .LM0
ldr q15, [x9], #16 // load round 1 key
movi v7.16b, #0x63 // compose .L63
movi v16.16b, #0x01 // bit masks
movi v17.16b, #0x02
movi v18.16b, #0x04
movi v19.16b, #0x08
movi v20.16b, #0x10
movi v21.16b, #0x20
movi v22.16b, #0x40
movi v23.16b, #0x80
#ifdef __AARCH64EL__
rev32 v0.16b, v0.16b
#endif
sub x10, x10, #1
str q0, [x17], #16 // save round 0 key
.align 4
.Lkey_loop:
tbl v0.16b, {v15.16b}, v1.16b
ldr q15, [x9], #16 // load next round key
eor v0.16b, v0.16b, v7.16b
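// cmtst against the single-bit masks broadcasts each bit of the
// permuted, 0x63-adjusted key bytes to a full 0x00/0xff byte, giving
// the eight bit-planes of this round key in v24-v31.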
cmtst v24.16b, v0.16b, v16.16b
cmtst v25.16b, v0.16b, v17.16b
cmtst v26.16b, v0.16b, v18.16b
cmtst v27.16b, v0.16b, v19.16b
cmtst v28.16b, v0.16b, v20.16b
cmtst v29.16b, v0.16b, v21.16b
cmtst v30.16b, v0.16b, v22.16b
cmtst v31.16b, v0.16b, v23.16b
sub x10, x10, #1
st1 {v24.16b-v27.16b}, [x17], #64 // write bit-sliced round key
st1 {v28.16b-v31.16b}, [x17], #64
cbnz x10, .Lkey_loop
// don't save last round key
#ifdef __AARCH64EL__
rev32 v15.16b, v15.16b
adr x11, .LM0_bigendian
#endif
ret
.size _bsaes_key_convert,.-_bsaes_key_convert
.globl ossl_bsaes_cbc_encrypt
.type ossl_bsaes_cbc_encrypt,%function
.align 4
// On entry:
// x0 -> input ciphertext
// x1 -> output plaintext
// x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16)
// x3 -> key
// x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call)
// w5 must be == 0
// On exit:
// Output plaintext filled in
// Initialisation vector overwritten with last quadword of ciphertext
// No output registers, usual AAPCS64 register preservation
ossl_bsaes_cbc_encrypt:
AARCH64_VALID_CALL_TARGET
cmp x2, #128
bhs .Lcbc_do_bsaes
b AES_cbc_encrypt
.Lcbc_do_bsaes:
// it is up to the caller to make sure we are called with enc == 0
stp x29, x30, [sp, #-48]!
stp d8, d9, [sp, #16]
stp d10, d15, [sp, #32]
lsr x2, x2, #4 // len in 16 byte blocks
ldr w15, [x3, #240] // get # of rounds
mov x14, sp
// allocate the key schedule on the stack
add x17, sp, #96
sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes
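// rounds*128 - 96 = 16 (round 0 key) + (rounds-1)*128 (bit-sliced
// inner round keys) + 16 (last round key) bytes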
// populate the key schedule
mov x9, x3 // pass key
mov x10, x15 // pass # of rounds
mov sp, x17 // sp is sp
bl _bsaes_key_convert
ldr q6, [sp]
str q15, [x17] // save last round key
eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63)
str q6, [sp]
ldr q15, [x4] // load IV
b .Lcbc_dec_loop
.align 4
.Lcbc_dec_loop:
subs x2, x2, #0x8
bmi .Lcbc_dec_loop_finish
ldr q0, [x0], #16 // load input
mov x9, sp // pass the key
ldr q1, [x0], #16
mov x10, x15
ldr q2, [x0], #16
ldr q3, [x0], #16
ldr q4, [x0], #16
ldr q5, [x0], #16
ldr q6, [x0], #16
ldr q7, [x0], #-7*16
bl _bsaes_decrypt8
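// _bsaes_decrypt8 returns the blocks in the order v0,v1,v6,v4,v2,v7,v3,v5;
// the loads and stores below undo that permutation while XORing each
// block with the preceding ciphertext block (CBC chaining).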
ldr q16, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
eor v1.16b, v1.16b, v16.16b
str q0, [x1], #16 // write output
ldr q0, [x0], #16
str q1, [x1], #16
ldr q1, [x0], #16
eor v1.16b, v4.16b, v1.16b
ldr q4, [x0], #16
eor v2.16b, v2.16b, v4.16b
eor v0.16b, v6.16b, v0.16b
ldr q4, [x0], #16
str q0, [x1], #16
str q1, [x1], #16
eor v0.16b, v7.16b, v4.16b
ldr q1, [x0], #16
str q2, [x1], #16
ldr q2, [x0], #16
ldr q15, [x0], #16
str q0, [x1], #16
eor v0.16b, v5.16b, v2.16b
eor v1.16b, v3.16b, v1.16b
str q1, [x1], #16
str q0, [x1], #16
b .Lcbc_dec_loop
.Lcbc_dec_loop_finish:
adds x2, x2, #8
beq .Lcbc_dec_done
ldr q0, [x0], #16 // load input
cmp x2, #2
blo .Lcbc_dec_one
ldr q1, [x0], #16
mov x9, sp // pass the key
mov x10, x15
beq .Lcbc_dec_two
ldr q2, [x0], #16
cmp x2, #4
blo .Lcbc_dec_three
ldr q3, [x0], #16
beq .Lcbc_dec_four
ldr q4, [x0], #16
cmp x2, #6
blo .Lcbc_dec_five
ldr q5, [x0], #16
beq .Lcbc_dec_six
ldr q6, [x0], #-6*16
bl _bsaes_decrypt8
ldr q5, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
ldr q8, [x0], #16
ldr q9, [x0], #16
ldr q10, [x0], #16
str q0, [x1], #16 // write output
ldr q0, [x0], #16
eor v1.16b, v1.16b, v5.16b
ldr q5, [x0], #16
eor v6.16b, v6.16b, v8.16b
ldr q15, [x0]
eor v4.16b, v4.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
str q1, [x1], #16
eor v0.16b, v7.16b, v0.16b
str q6, [x1], #16
eor v1.16b, v3.16b, v5.16b
str q4, [x1], #16
str q2, [x1], #16
str q0, [x1], #16
str q1, [x1]
b .Lcbc_dec_done
.align 4
.Lcbc_dec_six:
sub x0, x0, #0x60
bl _bsaes_decrypt8
ldr q3, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
ldr q5, [x0], #16
ldr q8, [x0], #16
ldr q9, [x0], #16
str q0, [x1], #16 // write output
ldr q0, [x0], #16
eor v1.16b, v1.16b, v3.16b
ldr q15, [x0]
eor v3.16b, v6.16b, v5.16b
eor v4.16b, v4.16b, v8.16b
eor v2.16b, v2.16b, v9.16b
str q1, [x1], #16
eor v0.16b, v7.16b, v0.16b
str q3, [x1], #16
str q4, [x1], #16
str q2, [x1], #16
str q0, [x1]
b .Lcbc_dec_done
.align 4
.Lcbc_dec_five:
sub x0, x0, #0x50
bl _bsaes_decrypt8
ldr q3, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
ldr q5, [x0], #16
ldr q7, [x0], #16
ldr q8, [x0], #16
str q0, [x1], #16 // write output
ldr q15, [x0]
eor v0.16b, v1.16b, v3.16b
eor v1.16b, v6.16b, v5.16b
eor v3.16b, v4.16b, v7.16b
str q0, [x1], #16
eor v0.16b, v2.16b, v8.16b
str q1, [x1], #16
str q3, [x1], #16
str q0, [x1]
b .Lcbc_dec_done
.align 4
.Lcbc_dec_four:
sub x0, x0, #0x40
bl _bsaes_decrypt8
ldr q2, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
ldr q3, [x0], #16
ldr q5, [x0], #16
str q0, [x1], #16 // write output
ldr q15, [x0]
eor v0.16b, v1.16b, v2.16b
eor v1.16b, v6.16b, v3.16b
eor v2.16b, v4.16b, v5.16b
str q0, [x1], #16
str q1, [x1], #16
str q2, [x1]
b .Lcbc_dec_done
.align 4
.Lcbc_dec_three:
sub x0, x0, #0x30
bl _bsaes_decrypt8
ldr q2, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
ldr q3, [x0], #16
ldr q15, [x0]
str q0, [x1], #16 // write output
eor v0.16b, v1.16b, v2.16b
eor v1.16b, v6.16b, v3.16b
str q0, [x1], #16
str q1, [x1]
b .Lcbc_dec_done
.align 4
.Lcbc_dec_two:
sub x0, x0, #0x20
bl _bsaes_decrypt8
ldr q2, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
ldr q15, [x0]
str q0, [x1], #16 // write output
eor v0.16b, v1.16b, v2.16b
str q0, [x1]
b .Lcbc_dec_done
.align 4
.Lcbc_dec_one:
sub x0, x0, #0x10
stp x1, x4, [sp, #-32]!
str x14, [sp, #16]
mov v8.16b, v15.16b
mov v15.16b, v0.16b
mov x2, x3
bl AES_decrypt
ldr x14, [sp, #16]
ldp x1, x4, [sp], #32
ldr q0, [x1] // load result
eor v0.16b, v0.16b, v8.16b // ^= IV
str q0, [x1] // write output
.align 4
.Lcbc_dec_done:
movi v0.16b, #0
movi v1.16b, #0
.Lcbc_dec_bzero:// wipe key schedule [if any]
stp q0, q1, [sp], #32
cmp sp, x14
bne .Lcbc_dec_bzero
str q15, [x4] // return IV
ldp d8, d9, [sp, #16]
ldp d10, d15, [sp, #32]
ldp x29, x30, [sp], #48
ret
.size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt
.globl ossl_bsaes_ctr32_encrypt_blocks
.type ossl_bsaes_ctr32_encrypt_blocks,%function
.align 4
// On entry:
// x0 -> input text (whole 16-byte blocks)
// x1 -> output text (whole 16-byte blocks)
// x2 = number of 16-byte blocks to encrypt/decrypt (> 0)
// x3 -> key
// x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block
// On exit:
// Output text filled in
// No output registers, usual AAPCS64 register preservation
ossl_bsaes_ctr32_encrypt_blocks:
AARCH64_VALID_CALL_TARGET
cmp x2, #8 // use plain AES for
blo .Lctr_enc_short // small sizes
stp x29, x30, [sp, #-80]!
stp d8, d9, [sp, #16]
stp d10, d11, [sp, #32]
stp d12, d13, [sp, #48]
stp d14, d15, [sp, #64]
ldr w15, [x3, #240] // get # of rounds
mov x14, sp
// allocate the key schedule on the stack
add x17, sp, #96
sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes
// populate the key schedule
mov x9, x3 // pass key
mov x10, x15 // pass # of rounds
mov sp, x17 // sp is sp
bl _bsaes_key_convert
eor v7.16b, v7.16b, v15.16b // fix up last round key
str q7, [x17] // save last round key
ldr q0, [x4] // load counter
add x13, x11, #.LREVM0SR-.LM0_bigendian
ldr q4, [sp] // load round0 key
movi v8.4s, #1 // compose 1<<96
movi v9.16b, #0
rev32 v15.16b, v0.16b
rev32 v0.16b, v0.16b
ext v11.16b, v9.16b, v8.16b, #4
rev32 v4.16b, v4.16b
add v12.4s, v11.4s, v11.4s // compose 2<<96
str q4, [sp] // save adjusted round0 key
add v13.4s, v11.4s, v12.4s // compose 3<<96
add v14.4s, v12.4s, v12.4s // compose 4<<96
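// The counter was rev32'd, so its big-endian 32-bit words are now in
// host order; adding the n<<96 constants above therefore increments the
// 32-bit counter word, and the .LREVM0SR permutation restores the byte
// order inside _bsaes_encrypt8_alt.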
b .Lctr_enc_loop
.align 4
.Lctr_enc_loop:
// Intermix prologue from _bsaes_encrypt8 to use the opportunity
// to flip byte order in 32-bit counter
add v1.4s, v15.4s, v11.4s // +1
add x9, sp, #0x10 // pass next round key
add v2.4s, v15.4s, v12.4s // +2
ldr q9, [x13] // .LREVM0SR
ldr q8, [sp] // load round0 key
add v3.4s, v15.4s, v13.4s // +3
mov x10, x15 // pass rounds
sub x11, x13, #.LREVM0SR-.LSR // pass constants
add v6.4s, v2.4s, v14.4s
add v4.4s, v15.4s, v14.4s // +4
add v7.4s, v3.4s, v14.4s
add v15.4s, v4.4s, v14.4s // next counter
add v5.4s, v1.4s, v14.4s
bl _bsaes_encrypt8_alt
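// The keystream blocks come back in the order v0,v1,v4,v6,v3,v7,v2,v5;
// XOR them with the input blocks in that order.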
subs x2, x2, #8
blo .Lctr_enc_loop_done
ldr q16, [x0], #16
ldr q17, [x0], #16
eor v1.16b, v1.16b, v17.16b
ldr q17, [x0], #16
eor v0.16b, v0.16b, v16.16b
eor v4.16b, v4.16b, v17.16b
str q0, [x1], #16
ldr q16, [x0], #16
str q1, [x1], #16
mov v0.16b, v15.16b
str q4, [x1], #16
ldr q1, [x0], #16
eor v4.16b, v6.16b, v16.16b
eor v1.16b, v3.16b, v1.16b
ldr q3, [x0], #16
eor v3.16b, v7.16b, v3.16b
ldr q6, [x0], #16
eor v2.16b, v2.16b, v6.16b
ldr q6, [x0], #16
eor v5.16b, v5.16b, v6.16b
str q4, [x1], #16
str q1, [x1], #16
str q3, [x1], #16
str q2, [x1], #16
str q5, [x1], #16
bne .Lctr_enc_loop
b .Lctr_enc_done
.align 4
.Lctr_enc_loop_done:
add x2, x2, #8
ldr q16, [x0], #16 // load input
eor v0.16b, v0.16b, v16.16b
str q0, [x1], #16 // write output
cmp x2, #2
blo .Lctr_enc_done
ldr q17, [x0], #16
eor v1.16b, v1.16b, v17.16b
str q1, [x1], #16
beq .Lctr_enc_done
ldr q18, [x0], #16
eor v4.16b, v4.16b, v18.16b
str q4, [x1], #16
cmp x2, #4
blo .Lctr_enc_done
ldr q19, [x0], #16
eor v6.16b, v6.16b, v19.16b
str q6, [x1], #16
beq .Lctr_enc_done
ldr q20, [x0], #16
eor v3.16b, v3.16b, v20.16b
str q3, [x1], #16
cmp x2, #6
blo .Lctr_enc_done
ldr q21, [x0], #16
eor v7.16b, v7.16b, v21.16b
str q7, [x1], #16
beq .Lctr_enc_done
ldr q22, [x0]
eor v2.16b, v2.16b, v22.16b
str q2, [x1], #16
.Lctr_enc_done:
movi v0.16b, #0
movi v1.16b, #0
.Lctr_enc_bzero: // wipe key schedule [if any]
stp q0, q1, [sp], #32
cmp sp, x14
bne .Lctr_enc_bzero
ldp d8, d9, [sp, #16]
ldp d10, d11, [sp, #32]
ldp d12, d13, [sp, #48]
ldp d14, d15, [sp, #64]
ldp x29, x30, [sp], #80
ret
.Lctr_enc_short:
stp x29, x30, [sp, #-96]!
stp x19, x20, [sp, #16]
stp x21, x22, [sp, #32]
str x23, [sp, #48]
mov x19, x0 // copy arguments
mov x20, x1
mov x21, x2
mov x22, x3
ldr w23, [x4, #12] // load counter LSW
ldr q1, [x4] // load whole counter value
#ifdef __AARCH64EL__
rev w23, w23
#endif
str q1, [sp, #80] // copy counter value
.Lctr_enc_short_loop:
add x0, sp, #80 // input counter value
add x1, sp, #64 // output on the stack
mov x2, x22 // key
bl AES_encrypt
ldr q0, [x19], #16 // load input
ldr q1, [sp, #64] // load encrypted counter
add x23, x23, #1
#ifdef __AARCH64EL__
rev w0, w23
str w0, [sp, #80+12] // next counter value
#else
str w23, [sp, #80+12] // next counter value
#endif
eor v0.16b, v0.16b, v1.16b
str q0, [x20], #16 // store output
subs x21, x21, #1
bne .Lctr_enc_short_loop
movi v0.16b, #0
movi v1.16b, #0
stp q0, q1, [sp, #64]
ldr x23, [sp, #48]
ldp x21, x22, [sp, #32]
ldp x19, x20, [sp, #16]
ldp x29, x30, [sp], #96
ret
.size ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks
.globl ossl_bsaes_xts_encrypt
.type ossl_bsaes_xts_encrypt,%function
.align 4
// On entry:
// x0 -> input plaintext
// x1 -> output ciphertext
// x2 = length of text in bytes (must be at least 16)
// x3 -> key1 (used to encrypt the XORed plaintext blocks)
// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
// x5 -> 16-byte initial vector (typically, sector number)
// On exit:
// Output ciphertext filled in
// No output registers, usual AAPCS64 register preservation
ossl_bsaes_xts_encrypt:
AARCH64_VALID_CALL_TARGET
// Stack layout:
// sp ->
// nrounds*128-96 bytes: key schedule
// x19 ->
// 16 bytes: frame record
// 4*16 bytes: tweak storage across _bsaes_encrypt8
// 6*8 bytes: storage for 5 callee-saved general-purpose registers
// 8*8 bytes: storage for 8 callee-saved SIMD registers
stp x29, x30, [sp, #-192]!
stp x19, x20, [sp, #80]
stp x21, x22, [sp, #96]
str x23, [sp, #112]
stp d8, d9, [sp, #128]
stp d10, d11, [sp, #144]
stp d12, d13, [sp, #160]
stp d14, d15, [sp, #176]
mov x19, sp
mov x20, x0
mov x21, x1
mov x22, x2
mov x23, x3
// generate initial tweak
sub sp, sp, #16
mov x0, x5 // iv[]
mov x1, sp
mov x2, x4 // key2
bl AES_encrypt
ldr q11, [sp], #16
ldr w1, [x23, #240] // get # of rounds
// allocate the key schedule on the stack
add x17, sp, #96
sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes
// populate the key schedule
mov x9, x23 // pass key
mov x10, x1 // pass # of rounds
mov sp, x17
bl _bsaes_key_convert
eor v15.16b, v15.16b, v7.16b // fix up last round key
str q15, [x17] // save last round key
subs x22, x22, #0x80
blo .Lxts_enc_short
b .Lxts_enc_loop
.align 4
.Lxts_enc_loop:
  1458. ldr q8, .Lxts_magic
  1459. mov x10, x1 // pass rounds
  1460. add x2, x19, #16
  1461. ldr q0, [x20], #16
  1462. sshr v1.2d, v11.2d, #63
  1463. mov x9, sp // pass key schedule
  1464. ldr q6, .Lxts_magic+16
  1465. add v2.2d, v11.2d, v11.2d
  1466. cmtst v3.2d, v11.2d, v6.2d
  1467. and v1.16b, v1.16b, v8.16b
  1468. ext v1.16b, v1.16b, v1.16b, #8
  1469. and v3.16b, v3.16b, v8.16b
  1470. ldr q4, [x20], #16
  1471. eor v12.16b, v2.16b, v1.16b
  1472. eor v1.16b, v4.16b, v12.16b
  1473. eor v0.16b, v0.16b, v11.16b
  1474. cmtst v2.2d, v12.2d, v6.2d
  1475. add v4.2d, v12.2d, v12.2d
  1476. add x0, x19, #16
  1477. ext v3.16b, v3.16b, v3.16b, #8
  1478. and v2.16b, v2.16b, v8.16b
  1479. eor v13.16b, v4.16b, v3.16b
  1480. ldr q3, [x20], #16
  1481. ext v4.16b, v2.16b, v2.16b, #8
  1482. eor v2.16b, v3.16b, v13.16b
  1483. ldr q3, [x20], #16
  1484. add v5.2d, v13.2d, v13.2d
  1485. cmtst v7.2d, v13.2d, v6.2d
  1486. and v7.16b, v7.16b, v8.16b
  1487. ldr q9, [x20], #16
  1488. ext v7.16b, v7.16b, v7.16b, #8
  1489. ldr q10, [x20], #16
  1490. eor v14.16b, v5.16b, v4.16b
  1491. ldr q16, [x20], #16
  1492. add v4.2d, v14.2d, v14.2d
  1493. eor v3.16b, v3.16b, v14.16b
  1494. eor v15.16b, v4.16b, v7.16b
  1495. add v5.2d, v15.2d, v15.2d
  1496. ldr q7, [x20], #16
  1497. cmtst v4.2d, v14.2d, v6.2d
  1498. and v17.16b, v4.16b, v8.16b
  1499. cmtst v18.2d, v15.2d, v6.2d
  1500. eor v4.16b, v9.16b, v15.16b
  1501. ext v9.16b, v17.16b, v17.16b, #8
  1502. eor v9.16b, v5.16b, v9.16b
  1503. add v17.2d, v9.2d, v9.2d
  1504. and v18.16b, v18.16b, v8.16b
  1505. eor v5.16b, v10.16b, v9.16b
  1506. str q9, [x2], #16
  1507. ext v10.16b, v18.16b, v18.16b, #8
  1508. cmtst v9.2d, v9.2d, v6.2d
  1509. and v9.16b, v9.16b, v8.16b
  1510. eor v10.16b, v17.16b, v10.16b
  1511. cmtst v17.2d, v10.2d, v6.2d
  1512. eor v6.16b, v16.16b, v10.16b
  1513. str q10, [x2], #16
  1514. ext v9.16b, v9.16b, v9.16b, #8
  1515. add v10.2d, v10.2d, v10.2d
  1516. eor v9.16b, v10.16b, v9.16b
  1517. str q9, [x2], #16
  1518. eor v7.16b, v7.16b, v9.16b
  1519. add v9.2d, v9.2d, v9.2d
  1520. and v8.16b, v17.16b, v8.16b
  1521. ext v8.16b, v8.16b, v8.16b, #8
  1522. eor v8.16b, v9.16b, v8.16b
  1523. str q8, [x2] // next round tweak
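        // Each tweak above is the previous one multiplied by x in
        // GF(2^128) modulo x^128 + x^7 + x^2 + x + 1. A hedged C sketch
        // of one doubling, assuming the tweak is held as two
        // little-endian 64-bit lanes exactly as in v11:
        //
        //     void xts_double(uint64_t t[2]) {
        //         uint64_t carry_lo = (uint64_t)((int64_t)t[0] >> 63); // bit 63 out
        //         uint64_t carry_hi = (uint64_t)((int64_t)t[1] >> 63); // bit 127 out
        //         t[0] = (t[0] << 1) ^ (carry_hi & 0x87); // fold reduction back in
        //         t[1] = (t[1] << 1) ^ (carry_lo & 1);    // carry into high lane
        //     }
        //
        // The cmtst against .Lxts_magic+16 (bit 62) computes the *next*
        // doubling's carry masks one step early, which is what lets the
        // eight doublings above interleave.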
        bl      _bsaes_encrypt8

        ldr     q8, [x0], #16
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        ldr     q9, [x0], #16
        eor     v4.16b, v4.16b, v13.16b
        eor     v6.16b, v6.16b, v14.16b
        ldr     q10, [x0], #16
        eor     v3.16b, v3.16b, v15.16b
        subs    x22, x22, #0x80
        str     q0, [x21], #16
        ldr     q11, [x0]               // next round tweak
        str     q1, [x21], #16
        eor     v0.16b, v7.16b, v8.16b
        eor     v1.16b, v2.16b, v9.16b
        str     q4, [x21], #16
        eor     v2.16b, v5.16b, v10.16b
        str     q6, [x21], #16
        str     q3, [x21], #16
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q2, [x21], #16
        bpl     .Lxts_enc_loop

.Lxts_enc_short:
        adds    x22, x22, #0x70
        bmi     .Lxts_enc_done

        ldr     q8, .Lxts_magic
        sshr    v1.2d, v11.2d, #63
        add     v2.2d, v11.2d, v11.2d
        ldr     q9, .Lxts_magic+16
        subs    x22, x22, #0x10
        ldr     q0, [x20], #16
        and     v1.16b, v1.16b, v8.16b
        cmtst   v3.2d, v11.2d, v9.2d
        ext     v1.16b, v1.16b, v1.16b, #8
        and     v3.16b, v3.16b, v8.16b
        eor     v12.16b, v2.16b, v1.16b
        ext     v1.16b, v3.16b, v3.16b, #8
        add     v2.2d, v12.2d, v12.2d
        cmtst   v3.2d, v12.2d, v9.2d
        eor     v13.16b, v2.16b, v1.16b
        and     v22.16b, v3.16b, v8.16b
        bmi     .Lxts_enc_1

        ext     v2.16b, v22.16b, v22.16b, #8
        add     v3.2d, v13.2d, v13.2d
        ldr     q1, [x20], #16
        cmtst   v4.2d, v13.2d, v9.2d
        subs    x22, x22, #0x10
        eor     v14.16b, v3.16b, v2.16b
        and     v23.16b, v4.16b, v8.16b
        bmi     .Lxts_enc_2

        ext     v3.16b, v23.16b, v23.16b, #8
        add     v4.2d, v14.2d, v14.2d
        ldr     q2, [x20], #16
        cmtst   v5.2d, v14.2d, v9.2d
        eor     v0.16b, v0.16b, v11.16b
        subs    x22, x22, #0x10
        eor     v15.16b, v4.16b, v3.16b
        and     v24.16b, v5.16b, v8.16b
        bmi     .Lxts_enc_3

        ext     v4.16b, v24.16b, v24.16b, #8
        add     v5.2d, v15.2d, v15.2d
        ldr     q3, [x20], #16
        cmtst   v6.2d, v15.2d, v9.2d
        eor     v1.16b, v1.16b, v12.16b
        subs    x22, x22, #0x10
        eor     v16.16b, v5.16b, v4.16b
        and     v25.16b, v6.16b, v8.16b
        bmi     .Lxts_enc_4

        ext     v5.16b, v25.16b, v25.16b, #8
        add     v6.2d, v16.2d, v16.2d
        add     x0, x19, #16
        cmtst   v7.2d, v16.2d, v9.2d
        ldr     q4, [x20], #16
        eor     v2.16b, v2.16b, v13.16b
        str     q16, [x0], #16
        subs    x22, x22, #0x10
        eor     v17.16b, v6.16b, v5.16b
        and     v26.16b, v7.16b, v8.16b
        bmi     .Lxts_enc_5

        ext     v7.16b, v26.16b, v26.16b, #8
        add     v18.2d, v17.2d, v17.2d
        ldr     q5, [x20], #16
        eor     v3.16b, v3.16b, v14.16b
        str     q17, [x0], #16
        subs    x22, x22, #0x10
        eor     v18.16b, v18.16b, v7.16b
        bmi     .Lxts_enc_6

        ldr     q6, [x20], #16
        eor     v4.16b, v4.16b, v15.16b
        eor     v5.16b, v5.16b, v16.16b
        str     q18, [x0]               // next round tweak
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16
        sub     x22, x22, #0x10
        eor     v6.16b, v6.16b, v17.16b
        bl      _bsaes_encrypt8
        ldr     q16, [x0], #16
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        ldr     q17, [x0], #16
        eor     v4.16b, v4.16b, v13.16b
        eor     v6.16b, v6.16b, v14.16b
        eor     v3.16b, v3.16b, v15.16b
        ldr     q11, [x0]               // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        eor     v0.16b, v7.16b, v16.16b
        eor     v1.16b, v2.16b, v17.16b
        str     q4, [x21], #16
        str     q6, [x21], #16
        str     q3, [x21], #16
        str     q0, [x21], #16
        str     q1, [x21], #16
        b       .Lxts_enc_done

.align  4
.Lxts_enc_6:
        eor     v4.16b, v4.16b, v15.16b
        eor     v5.16b, v5.16b, v16.16b
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16
        bl      _bsaes_encrypt8
        ldr     q16, [x0], #16
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v4.16b, v4.16b, v13.16b
        eor     v6.16b, v6.16b, v14.16b
        ldr     q11, [x0]               // next round tweak
        eor     v3.16b, v3.16b, v15.16b
        str     q0, [x21], #16
        str     q1, [x21], #16
        eor     v0.16b, v7.16b, v16.16b
        str     q4, [x21], #16
        str     q6, [x21], #16
        str     q3, [x21], #16
        str     q0, [x21], #16
        b       .Lxts_enc_done

.align  4
.Lxts_enc_5:
        eor     v3.16b, v3.16b, v14.16b
        eor     v4.16b, v4.16b, v15.16b
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16
        bl      _bsaes_encrypt8
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        ldr     q11, [x0]               // next round tweak
        eor     v4.16b, v4.16b, v13.16b
        eor     v6.16b, v6.16b, v14.16b
        eor     v3.16b, v3.16b, v15.16b
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q4, [x21], #16
        str     q6, [x21], #16
        str     q3, [x21], #16
        b       .Lxts_enc_done

.align  4
.Lxts_enc_4:
        eor     v2.16b, v2.16b, v13.16b
        eor     v3.16b, v3.16b, v14.16b
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16
        bl      _bsaes_encrypt8
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v4.16b, v4.16b, v13.16b
        eor     v6.16b, v6.16b, v14.16b
        mov     v11.16b, v15.16b        // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q4, [x21], #16
        str     q6, [x21], #16
        b       .Lxts_enc_done

.align  4
.Lxts_enc_3:
        eor     v1.16b, v1.16b, v12.16b
        eor     v2.16b, v2.16b, v13.16b
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16
        bl      _bsaes_encrypt8
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v4.16b, v4.16b, v13.16b
        mov     v11.16b, v14.16b        // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q4, [x21], #16
        b       .Lxts_enc_done

.align  4
.Lxts_enc_2:
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16
        bl      _bsaes_encrypt8
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        mov     v11.16b, v13.16b        // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        b       .Lxts_enc_done

.align  4
.Lxts_enc_1:
        eor     v0.16b, v0.16b, v11.16b
        sub     x0, sp, #16
        sub     x1, sp, #16
        mov     x2, x23
        mov     v13.d[0], v11.d[1]      // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
        mov     v14.d[0], v12.d[1]
        str     q0, [sp, #-16]!
        bl      AES_encrypt
        ldr     q0, [sp], #16
        trn1    v13.2d, v11.2d, v13.2d
        trn1    v11.2d, v12.2d, v14.2d  // next round tweak
        eor     v0.16b, v0.16b, v13.16b
        str     q0, [x21], #16

.Lxts_enc_done:
        adds    x22, x22, #0x10
        beq     .Lxts_enc_ret
        sub     x6, x21, #0x10
        // Penultimate plaintext block produces final ciphertext part-block
        // plus remaining part of final plaintext block. Move ciphertext part
        // to final position and reuse penultimate ciphertext block buffer to
        // construct final plaintext block
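        // A hedged C sketch of the swap loop below, with p standing for
        // x21 (just past the last full output block) and tail = len & 15:
        //
        //     for (size_t i = 0; i < tail; i++, p++) {
        //         unsigned char c = p[-16]; // ciphertext byte from penultimate block
        //         p[-16] = *in++;           // trailing plaintext byte takes its place
        //         p[0] = c;                 // ciphertext byte moves to final position
        //     }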
.Lxts_enc_steal:
        ldrb    w0, [x20], #1
        ldrb    w1, [x21, #-0x10]
        strb    w0, [x21, #-0x10]
        strb    w1, [x21], #1
        subs    x22, x22, #1
        bhi     .Lxts_enc_steal

        // Finally encrypt the block reassembled in the penultimate
        // output position, using the last tweak
        ldr     q0, [x6]
        eor     v0.16b, v0.16b, v11.16b
        str     q0, [sp, #-16]!
        mov     x0, sp
        mov     x1, sp
        mov     x2, x23
        mov     x21, x6
        mov     v13.d[0], v11.d[1]      // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
        bl      AES_encrypt
        trn1    v11.2d, v11.2d, v13.2d
        ldr     q0, [sp], #16
        eor     v0.16b, v0.16b, v11.16b
        str     q0, [x21]

.Lxts_enc_ret:
        movi    v0.16b, #0
        movi    v1.16b, #0
.Lxts_enc_bzero:                        // wipe key schedule
        stp     q0, q1, [sp], #32
        cmp     sp, x19
        bne     .Lxts_enc_bzero

        ldp     x19, x20, [sp, #80]
        ldp     x21, x22, [sp, #96]
        ldr     x23, [sp, #112]
        ldp     d8, d9, [sp, #128]
        ldp     d10, d11, [sp, #144]
        ldp     d12, d13, [sp, #160]
        ldp     d14, d15, [sp, #176]
        ldp     x29, x30, [sp], #192
        ret
.size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt

// The assembler doesn't seem capable of de-duplicating these when expressed
// using `ldr qd,=` syntax, so assign a symbolic address
.align  5
.Lxts_magic:
.quad   1, 0x87, 0x4000000000000000, 0x4000000000000000
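// .Lxts_magic holds two 16-byte constants: lanes {1, 0x87} are the
// reduction masks applied after each tweak doubling (1 propagates the
// low-lane carry, 0x87 folds the bit shifted out of bit 127 back in),
// and the second pair (bit 62 set in both lanes) is the cmtst mask used
// to predict the following doubling's carries one step ahead.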
.globl  ossl_bsaes_xts_decrypt
.type   ossl_bsaes_xts_decrypt,%function
.align  4
// On entry:
//   x0 -> input ciphertext
//   x1 -> output plaintext
//   x2 -> length of text in bytes (must be at least 16)
//   x3 -> key1 (used to decrypt the XORed ciphertext blocks)
//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
//   x5 -> 16-byte initial vector (typically, sector number)
// On exit:
//   Output plaintext filled in
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_xts_decrypt:
        AARCH64_VALID_CALL_TARGET
        // Stack layout:
        // sp ->
        //      nrounds*128-96 bytes: key schedule
        // x19 ->
        //      16 bytes: frame record
        //      4*16 bytes: tweak storage across _bsaes_decrypt8
        //      6*8 bytes: storage for 5 callee-saved general-purpose
        //                 registers (one 8-byte slot is padding)
        //      8*8 bytes: storage for 8 callee-saved SIMD registers
        stp     x29, x30, [sp, #-192]!
        stp     x19, x20, [sp, #80]
        stp     x21, x22, [sp, #96]
        str     x23, [sp, #112]
        stp     d8, d9, [sp, #128]
        stp     d10, d11, [sp, #144]
        stp     d12, d13, [sp, #160]
        stp     d14, d15, [sp, #176]

        mov     x19, sp
        mov     x20, x0
        mov     x21, x1
        mov     x22, x2
        mov     x23, x3

        // generate initial tweak
        sub     sp, sp, #16
        mov     x0, x5                  // iv[]
        mov     x1, sp
        mov     x2, x4                  // key2
        bl      AES_encrypt
        ldr     q11, [sp], #16

        ldr     w1, [x23, #240]         // get # of rounds
        // allocate the key schedule on the stack
        add     x17, sp, #96
        sub     x17, x17, x1, lsl #7    // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
        mov     x9, x23                 // pass key
        mov     x10, x1                 // pass # of rounds
        mov     sp, x17
        bl      _bsaes_key_convert
        ldr     q6, [sp]
        str     q15, [x17]              // save last round key
        eor     v6.16b, v6.16b, v7.16b  // fix up round 0 key (by XORing with 0x63)
        str     q6, [sp]

        sub     x30, x22, #0x10
        tst     x22, #0xf               // if not multiple of 16
        csel    x22, x30, x22, ne       // subtract another 16 bytes
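        // i.e. if (len & 15) len -= 16: when ciphertext stealing is
        // needed, hold back the final full block so .Lxts_dec_done can
        // process it with the adjusted tweak order.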
        subs    x22, x22, #0x80
        blo     .Lxts_dec_short
        b       .Lxts_dec_loop

.align  4
.Lxts_dec_loop:
        ldr     q8, .Lxts_magic
        mov     x10, x1                 // pass rounds
        add     x2, x19, #16
        ldr     q0, [x20], #16
        sshr    v1.2d, v11.2d, #63
        mov     x9, sp                  // pass key schedule
        ldr     q6, .Lxts_magic+16
        add     v2.2d, v11.2d, v11.2d
        cmtst   v3.2d, v11.2d, v6.2d
        and     v1.16b, v1.16b, v8.16b
        ext     v1.16b, v1.16b, v1.16b, #8
        and     v3.16b, v3.16b, v8.16b
        ldr     q4, [x20], #16
        eor     v12.16b, v2.16b, v1.16b
        eor     v1.16b, v4.16b, v12.16b
        eor     v0.16b, v0.16b, v11.16b
        cmtst   v2.2d, v12.2d, v6.2d
        add     v4.2d, v12.2d, v12.2d
        add     x0, x19, #16
        ext     v3.16b, v3.16b, v3.16b, #8
        and     v2.16b, v2.16b, v8.16b
        eor     v13.16b, v4.16b, v3.16b
        ldr     q3, [x20], #16
        ext     v4.16b, v2.16b, v2.16b, #8
        eor     v2.16b, v3.16b, v13.16b
        ldr     q3, [x20], #16
        add     v5.2d, v13.2d, v13.2d
        cmtst   v7.2d, v13.2d, v6.2d
        and     v7.16b, v7.16b, v8.16b
        ldr     q9, [x20], #16
        ext     v7.16b, v7.16b, v7.16b, #8
        ldr     q10, [x20], #16
        eor     v14.16b, v5.16b, v4.16b
        ldr     q16, [x20], #16
        add     v4.2d, v14.2d, v14.2d
        eor     v3.16b, v3.16b, v14.16b
        eor     v15.16b, v4.16b, v7.16b
        add     v5.2d, v15.2d, v15.2d
        ldr     q7, [x20], #16
        cmtst   v4.2d, v14.2d, v6.2d
        and     v17.16b, v4.16b, v8.16b
        cmtst   v18.2d, v15.2d, v6.2d
        eor     v4.16b, v9.16b, v15.16b
        ext     v9.16b, v17.16b, v17.16b, #8
        eor     v9.16b, v5.16b, v9.16b
        add     v17.2d, v9.2d, v9.2d
        and     v18.16b, v18.16b, v8.16b
        eor     v5.16b, v10.16b, v9.16b
        str     q9, [x2], #16
        ext     v10.16b, v18.16b, v18.16b, #8
        cmtst   v9.2d, v9.2d, v6.2d
        and     v9.16b, v9.16b, v8.16b
        eor     v10.16b, v17.16b, v10.16b
        cmtst   v17.2d, v10.2d, v6.2d
        eor     v6.16b, v16.16b, v10.16b
        str     q10, [x2], #16
        ext     v9.16b, v9.16b, v9.16b, #8
        add     v10.2d, v10.2d, v10.2d
        eor     v9.16b, v10.16b, v9.16b
        str     q9, [x2], #16
        eor     v7.16b, v7.16b, v9.16b
        add     v9.2d, v9.2d, v9.2d
        and     v8.16b, v17.16b, v8.16b
        ext     v8.16b, v8.16b, v8.16b, #8
        eor     v8.16b, v9.16b, v8.16b
        str     q8, [x2]                // next round tweak

        bl      _bsaes_decrypt8

        eor     v6.16b, v6.16b, v13.16b
        eor     v0.16b, v0.16b, v11.16b
        ldr     q8, [x0], #16
        eor     v7.16b, v7.16b, v8.16b
        str     q0, [x21], #16
        eor     v0.16b, v1.16b, v12.16b
        ldr     q1, [x0], #16
        eor     v1.16b, v3.16b, v1.16b
        subs    x22, x22, #0x80
        eor     v2.16b, v2.16b, v15.16b
        eor     v3.16b, v4.16b, v14.16b
        ldr     q4, [x0], #16
        str     q0, [x21], #16
        ldr     q11, [x0]               // next round tweak
        eor     v0.16b, v5.16b, v4.16b
        str     q6, [x21], #16
        str     q3, [x21], #16
        str     q2, [x21], #16
        str     q7, [x21], #16
        str     q1, [x21], #16
        str     q0, [x21], #16
        bpl     .Lxts_dec_loop

.Lxts_dec_short:
        adds    x22, x22, #0x70
        bmi     .Lxts_dec_done

        ldr     q8, .Lxts_magic
        sshr    v1.2d, v11.2d, #63
        add     v2.2d, v11.2d, v11.2d
        ldr     q9, .Lxts_magic+16
        subs    x22, x22, #0x10
        ldr     q0, [x20], #16
        and     v1.16b, v1.16b, v8.16b
        cmtst   v3.2d, v11.2d, v9.2d
        ext     v1.16b, v1.16b, v1.16b, #8
        and     v3.16b, v3.16b, v8.16b
        eor     v12.16b, v2.16b, v1.16b
        ext     v1.16b, v3.16b, v3.16b, #8
        add     v2.2d, v12.2d, v12.2d
        cmtst   v3.2d, v12.2d, v9.2d
        eor     v13.16b, v2.16b, v1.16b
        and     v22.16b, v3.16b, v8.16b
        bmi     .Lxts_dec_1

        ext     v2.16b, v22.16b, v22.16b, #8
        add     v3.2d, v13.2d, v13.2d
        ldr     q1, [x20], #16
        cmtst   v4.2d, v13.2d, v9.2d
        subs    x22, x22, #0x10
        eor     v14.16b, v3.16b, v2.16b
        and     v23.16b, v4.16b, v8.16b
        bmi     .Lxts_dec_2

        ext     v3.16b, v23.16b, v23.16b, #8
        add     v4.2d, v14.2d, v14.2d
        ldr     q2, [x20], #16
        cmtst   v5.2d, v14.2d, v9.2d
        eor     v0.16b, v0.16b, v11.16b
        subs    x22, x22, #0x10
        eor     v15.16b, v4.16b, v3.16b
        and     v24.16b, v5.16b, v8.16b
        bmi     .Lxts_dec_3

        ext     v4.16b, v24.16b, v24.16b, #8
        add     v5.2d, v15.2d, v15.2d
        ldr     q3, [x20], #16
        cmtst   v6.2d, v15.2d, v9.2d
        eor     v1.16b, v1.16b, v12.16b
        subs    x22, x22, #0x10
        eor     v16.16b, v5.16b, v4.16b
        and     v25.16b, v6.16b, v8.16b
        bmi     .Lxts_dec_4

        ext     v5.16b, v25.16b, v25.16b, #8
        add     v6.2d, v16.2d, v16.2d
        add     x0, x19, #16
        cmtst   v7.2d, v16.2d, v9.2d
        ldr     q4, [x20], #16
        eor     v2.16b, v2.16b, v13.16b
        str     q16, [x0], #16
        subs    x22, x22, #0x10
        eor     v17.16b, v6.16b, v5.16b
        and     v26.16b, v7.16b, v8.16b
        bmi     .Lxts_dec_5

        ext     v7.16b, v26.16b, v26.16b, #8
        add     v18.2d, v17.2d, v17.2d
        ldr     q5, [x20], #16
        eor     v3.16b, v3.16b, v14.16b
        str     q17, [x0], #16
        subs    x22, x22, #0x10
        eor     v18.16b, v18.16b, v7.16b
        bmi     .Lxts_dec_6

        ldr     q6, [x20], #16
        eor     v4.16b, v4.16b, v15.16b
        eor     v5.16b, v5.16b, v16.16b
        str     q18, [x0]               // next round tweak
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16
        sub     x22, x22, #0x10
        eor     v6.16b, v6.16b, v17.16b
        bl      _bsaes_decrypt8
        ldr     q16, [x0], #16
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        ldr     q17, [x0], #16
        eor     v6.16b, v6.16b, v13.16b
        eor     v4.16b, v4.16b, v14.16b
        eor     v2.16b, v2.16b, v15.16b
        ldr     q11, [x0]               // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        eor     v0.16b, v7.16b, v16.16b
        eor     v1.16b, v3.16b, v17.16b
        str     q6, [x21], #16
        str     q4, [x21], #16
        str     q2, [x21], #16
        str     q0, [x21], #16
        str     q1, [x21], #16
        b       .Lxts_dec_done

.align  4
.Lxts_dec_6:
        eor     v4.16b, v4.16b, v15.16b
        eor     v5.16b, v5.16b, v16.16b
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16
        bl      _bsaes_decrypt8
        ldr     q16, [x0], #16
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v6.16b, v6.16b, v13.16b
        eor     v4.16b, v4.16b, v14.16b
        ldr     q11, [x0]               // next round tweak
        eor     v2.16b, v2.16b, v15.16b
        str     q0, [x21], #16
        str     q1, [x21], #16
        eor     v0.16b, v7.16b, v16.16b
        str     q6, [x21], #16
        str     q4, [x21], #16
        str     q2, [x21], #16
        str     q0, [x21], #16
        b       .Lxts_dec_done

.align  4
.Lxts_dec_5:
        eor     v3.16b, v3.16b, v14.16b
        eor     v4.16b, v4.16b, v15.16b
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16
        bl      _bsaes_decrypt8
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        ldr     q11, [x0]               // next round tweak
        eor     v6.16b, v6.16b, v13.16b
        eor     v4.16b, v4.16b, v14.16b
        eor     v2.16b, v2.16b, v15.16b
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q6, [x21], #16
        str     q4, [x21], #16
        str     q2, [x21], #16
        b       .Lxts_dec_done

.align  4
.Lxts_dec_4:
        eor     v2.16b, v2.16b, v13.16b
        eor     v3.16b, v3.16b, v14.16b
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16
        bl      _bsaes_decrypt8
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v6.16b, v6.16b, v13.16b
        eor     v4.16b, v4.16b, v14.16b
        mov     v11.16b, v15.16b        // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q6, [x21], #16
        str     q4, [x21], #16
        b       .Lxts_dec_done

.align  4
.Lxts_dec_3:
        eor     v1.16b, v1.16b, v12.16b
        eor     v2.16b, v2.16b, v13.16b
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16
        bl      _bsaes_decrypt8
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v6.16b, v6.16b, v13.16b
        mov     v11.16b, v14.16b        // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q6, [x21], #16
        b       .Lxts_dec_done

.align  4
.Lxts_dec_2:
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16
        bl      _bsaes_decrypt8
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        mov     v11.16b, v13.16b        // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        b       .Lxts_dec_done

.align  4
.Lxts_dec_1:
        eor     v0.16b, v0.16b, v11.16b
        sub     x0, sp, #16
        sub     x1, sp, #16
        mov     x2, x23
        mov     v13.d[0], v11.d[1]      // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
        mov     v14.d[0], v12.d[1]
        str     q0, [sp, #-16]!
        bl      AES_decrypt
        ldr     q0, [sp], #16
        trn1    v13.2d, v11.2d, v13.2d
        trn1    v11.2d, v12.2d, v14.2d  // next round tweak
        eor     v0.16b, v0.16b, v13.16b
        str     q0, [x21], #16

.Lxts_dec_done:
        adds    x22, x22, #0x10
        beq     .Lxts_dec_ret

        // calculate one round of extra tweak for the stolen ciphertext
        ldr     q8, .Lxts_magic
        sshr    v6.2d, v11.2d, #63
        and     v6.16b, v6.16b, v8.16b
        add     v12.2d, v11.2d, v11.2d
        ext     v6.16b, v6.16b, v6.16b, #8
        eor     v12.16b, v12.16b, v6.16b
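        // Note the tweak order is the reverse of encryption: the last
        // full ciphertext block is decrypted with the *next* tweak
        // (v12), and the block reassembled by .Lxts_dec_steal below
        // with the current one (v11).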
        // perform the final decryption with the last tweak value
        ldr     q0, [x20], #16
        eor     v0.16b, v0.16b, v12.16b
        str     q0, [sp, #-16]!
        mov     x0, sp
        mov     x1, sp
        mov     x2, x23
        mov     v13.d[0], v11.d[1]      // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
        mov     v14.d[0], v12.d[1]
        bl      AES_decrypt
        trn1    v12.2d, v12.2d, v14.2d
        trn1    v11.2d, v11.2d, v13.2d
        ldr     q0, [sp], #16
        eor     v0.16b, v0.16b, v12.16b
        str     q0, [x21]
        mov     x6, x21
        // Penultimate ciphertext block produces final plaintext part-block
        // plus remaining part of final ciphertext block. Move plaintext part
        // to final position and reuse penultimate plaintext block buffer to
        // construct final ciphertext block
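        // This is the mirror image of .Lxts_enc_steal (see the sketch
        // there): plaintext bytes move forward to the final position
        // while trailing ciphertext bytes backfill the penultimate
        // block buffer.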
.Lxts_dec_steal:
        ldrb    w1, [x21]
        ldrb    w0, [x20], #1
        strb    w1, [x21, #0x10]
        strb    w0, [x21], #1
        subs    x22, x22, #1
        bhi     .Lxts_dec_steal

        // Finally decrypt the block reassembled in the penultimate
        // output position, using the penultimate tweak
        ldr     q0, [x6]
        eor     v0.16b, v0.16b, v11.16b
        str     q0, [sp, #-16]!
        mov     x0, sp
        mov     x1, sp
        mov     x2, x23
        mov     x21, x6
        bl      AES_decrypt
        trn1    v11.2d, v11.2d, v13.2d
        ldr     q0, [sp], #16
        eor     v0.16b, v0.16b, v11.16b
        str     q0, [x21]

.Lxts_dec_ret:
        movi    v0.16b, #0
        movi    v1.16b, #0
.Lxts_dec_bzero:                        // wipe key schedule
        stp     q0, q1, [sp], #32
        cmp     sp, x19
        bne     .Lxts_dec_bzero

        ldp     x19, x20, [sp, #80]
        ldp     x21, x22, [sp, #96]
        ldr     x23, [sp, #112]
        ldp     d8, d9, [sp, #128]
        ldp     d10, d11, [sp, #144]
        ldp     d12, d13, [sp, #160]
        ldp     d14, d15, [sp, #176]
        ldp     x29, x30, [sp], #192
        ret
.size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt