bsaes-armv7.pl

  1. #! /usr/bin/env perl
  2. # Copyright 2012-2018 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. #
  14. # Specific modes and adaptation for Linux kernel by Ard Biesheuvel
  15. # of Linaro. Permission to use under GPL terms is granted.
  16. # ====================================================================
  17. # Bit-sliced AES for ARM NEON
  18. #
  19. # February 2012.
  20. #
  21. # This implementation is a direct adaptation of the bsaes-x86_64 module for
  22. # ARM NEON, except that this module is endian-neutral [in the sense that
  23. # it can be compiled for either endianness] courtesy of vld1.8's
  24. # neutrality. The initial version doesn't implement an interface to OpenSSL,
  25. # only low-level primitives and unsupported entry points, just enough
  26. # to collect performance results, which for Cortex-A8 core are:
  27. #
  28. # encrypt 19.5 cycles per byte processed with 128-bit key
  29. # decrypt 22.1 cycles per byte processed with 128-bit key
  30. # key conv. 440 cycles per 128-bit key/0.18 of 8x block
  31. #
  32. # Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts in 19.7,
  33. # which is [much] worse than anticipated (for further details see
  34. # http://www.openssl.org/~appro/Snapdragon-S4.html).
  35. #
  36. # Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
  37. # manages in 20.0 cycles].
  38. #
  39. # When comparing to x86_64 results keep in mind that the NEON unit is
  40. # [mostly] single-issue and thus can't [fully] benefit from
  41. # instruction-level parallelism. And when comparing to aes-armv4
  42. # results keep in mind key schedule conversion overhead (see
  43. # bsaes-x86_64.pl for further details)...
  44. #
  45. # <appro@openssl.org>
  46. # April-August 2013
  47. # Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard.
  48. $flavour = shift;
  49. if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
  50. else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
  51. if ($flavour && $flavour ne "void") {
  52. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  53. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  54. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  55. die "can't locate arm-xlate.pl";
  56. open STDOUT,"| \"$^X\" $xlate $flavour $output";
  57. } else {
  58. open STDOUT,">$output";
  59. }
  60. my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
  61. my @XMM=map("q$_",(0..15));
  62. {
  63. my ($key,$rounds,$const)=("r4","r5","r6");
  64. sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
  65. sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
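# Dlo/Dhi map a NEON quad register name to its double-register aliases:
# qN overlays d(2N) and d(2N+1), so e.g. Dlo("q8") is "d16" and Dhi("q8")
# is "d17".  They exist because the vtbl.8 destination and index operands
# used below must be d registers.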
  66. sub Sbox {
  67. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  68. # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
  69. my @b=@_[0..7];
  70. my @t=@_[8..11];
  71. my @s=@_[12..15];
  72. &InBasisChange (@b);
  73. &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
  74. &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
  75. }
  76. sub InBasisChange {
  77. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  78. # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
  79. my @b=@_[0..7];
  80. $code.=<<___;
  81. veor @b[2], @b[2], @b[1]
  82. veor @b[5], @b[5], @b[6]
  83. veor @b[3], @b[3], @b[0]
  84. veor @b[6], @b[6], @b[2]
  85. veor @b[5], @b[5], @b[0]
  86. veor @b[6], @b[6], @b[3]
  87. veor @b[3], @b[3], @b[7]
  88. veor @b[7], @b[7], @b[5]
  89. veor @b[3], @b[3], @b[4]
  90. veor @b[4], @b[4], @b[5]
  91. veor @b[2], @b[2], @b[7]
  92. veor @b[3], @b[3], @b[1]
  93. veor @b[1], @b[1], @b[5]
  94. ___
  95. }
  96. sub OutBasisChange {
  97. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  98. # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
  99. my @b=@_[0..7];
  100. $code.=<<___;
  101. veor @b[0], @b[0], @b[6]
  102. veor @b[1], @b[1], @b[4]
  103. veor @b[4], @b[4], @b[6]
  104. veor @b[2], @b[2], @b[0]
  105. veor @b[6], @b[6], @b[1]
  106. veor @b[1], @b[1], @b[5]
  107. veor @b[5], @b[5], @b[3]
  108. veor @b[3], @b[3], @b[7]
  109. veor @b[7], @b[7], @b[5]
  110. veor @b[2], @b[2], @b[5]
  111. veor @b[4], @b[4], @b[7]
  112. ___
  113. }
  114. sub InvSbox {
  115. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  116. # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
  117. my @b=@_[0..7];
  118. my @t=@_[8..11];
  119. my @s=@_[12..15];
  120. &InvInBasisChange (@b);
  121. &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
  122. &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
  123. }
  124. sub InvInBasisChange { # OutBasisChange in reverse (with twist)
  125. my @b=@_[5,1,2,6,3,7,0,4];
  126. $code.=<<___
  127. veor @b[1], @b[1], @b[7]
  128. veor @b[4], @b[4], @b[7]
  129. veor @b[7], @b[7], @b[5]
  130. veor @b[1], @b[1], @b[3]
  131. veor @b[2], @b[2], @b[5]
  132. veor @b[3], @b[3], @b[7]
  133. veor @b[6], @b[6], @b[1]
  134. veor @b[2], @b[2], @b[0]
  135. veor @b[5], @b[5], @b[3]
  136. veor @b[4], @b[4], @b[6]
  137. veor @b[0], @b[0], @b[6]
  138. veor @b[1], @b[1], @b[4]
  139. ___
  140. }
  141. sub InvOutBasisChange { # InBasisChange in reverse
  142. my @b=@_[2,5,7,3,6,1,0,4];
  143. $code.=<<___;
  144. veor @b[1], @b[1], @b[5]
  145. veor @b[2], @b[2], @b[7]
  146. veor @b[3], @b[3], @b[1]
  147. veor @b[4], @b[4], @b[5]
  148. veor @b[7], @b[7], @b[5]
  149. veor @b[3], @b[3], @b[4]
  150. veor @b[5], @b[5], @b[0]
  151. veor @b[3], @b[3], @b[7]
  152. veor @b[6], @b[6], @b[2]
  153. veor @b[2], @b[2], @b[1]
  154. veor @b[6], @b[6], @b[3]
  155. veor @b[3], @b[3], @b[0]
  156. veor @b[5], @b[5], @b[6]
  157. ___
  158. }
  159. sub Mul_GF4 {
  160. #;*************************************************************
  161. #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
  162. #;*************************************************************
  163. my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
  164. $code.=<<___;
  165. veor $t0, $y0, $y1
  166. vand $t0, $t0, $x0
  167. veor $x0, $x0, $x1
  168. vand $t1, $x1, $y0
  169. vand $x0, $x0, $y1
  170. veor $x1, $t1, $t0
  171. veor $x0, $x0, $t1
  172. ___
  173. }
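# Scalar model of the Boolean circuit above, for off-line cross-checking
# only (this helper is illustrative and never called by the module): each
# variable here is a single bit, whereas in the NEON code every variable
# is a 128-bit slice carrying that bit for 128 byte lanes at once.
sub _ref_mul_gf4 {
	my ($x0,$x1,$y0,$y1)=@_;	# bit planes of two GF(2^2) elements
	my $t0=($y0^$y1)&$x0;
	my $t1=$x1&$y0;
	my $nx0=(($x0^$x1)&$y1)^$t1;
	my $nx1=$t1^$t0;
	return ($nx0,$nx1);		# product, same bit-plane representation
}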
  174. sub Mul_GF4_N { # not used, see next subroutine
  175. # multiply and scale by N
  176. my ($x0,$x1,$y0,$y1,$t0)=@_;
  177. $code.=<<___;
  178. veor $t0, $y0, $y1
  179. vand $t0, $t0, $x0
  180. veor $x0, $x0, $x1
  181. vand $x1, $x1, $y0
  182. vand $x0, $x0, $y1
  183. veor $x1, $x1, $x0
  184. veor $x0, $x0, $t0
  185. ___
  186. }
  187. sub Mul_GF4_N_GF4 {
  188. # interleaved Mul_GF4_N and Mul_GF4
  189. my ($x0,$x1,$y0,$y1,$t0,
  190. $x2,$x3,$y2,$y3,$t1)=@_;
  191. $code.=<<___;
  192. veor $t0, $y0, $y1
  193. veor $t1, $y2, $y3
  194. vand $t0, $t0, $x0
  195. vand $t1, $t1, $x2
  196. veor $x0, $x0, $x1
  197. veor $x2, $x2, $x3
  198. vand $x1, $x1, $y0
  199. vand $x3, $x3, $y2
  200. vand $x0, $x0, $y1
  201. vand $x2, $x2, $y3
  202. veor $x1, $x1, $x0
  203. veor $x2, $x2, $x3
  204. veor $x0, $x0, $t0
  205. veor $x3, $x3, $t1
  206. ___
  207. }
  208. sub Mul_GF16_2 {
  209. my @x=@_[0..7];
  210. my @y=@_[8..11];
  211. my @t=@_[12..15];
  212. $code.=<<___;
  213. veor @t[0], @x[0], @x[2]
  214. veor @t[1], @x[1], @x[3]
  215. ___
  216. &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
  217. $code.=<<___;
  218. veor @y[0], @y[0], @y[2]
  219. veor @y[1], @y[1], @y[3]
  220. ___
  221. &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
  222. @x[2], @x[3], @y[2], @y[3], @t[2]);
  223. $code.=<<___;
  224. veor @x[0], @x[0], @t[0]
  225. veor @x[2], @x[2], @t[0]
  226. veor @x[1], @x[1], @t[1]
  227. veor @x[3], @x[3], @t[1]
  228. veor @t[0], @x[4], @x[6]
  229. veor @t[1], @x[5], @x[7]
  230. ___
  231. &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
  232. @x[6], @x[7], @y[2], @y[3], @t[2]);
  233. $code.=<<___;
  234. veor @y[0], @y[0], @y[2]
  235. veor @y[1], @y[1], @y[3]
  236. ___
  237. &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
  238. $code.=<<___;
  239. veor @x[4], @x[4], @t[0]
  240. veor @x[6], @x[6], @t[0]
  241. veor @x[5], @x[5], @t[1]
  242. veor @x[7], @x[7], @t[1]
  243. ___
  244. }
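# Mul_GF16_2 multiplies the two GF(2^4) elements held in @x[0..3] and
# @x[4..7] by the common factor @y[0..3].  Each GF(2^4) element is a pair
# of GF(2^2) halves, and each product costs three GF(2^2) multiplications
# (low, high, and a shared cross term handled by the Mul_GF4_N part),
# which is why @y is folded in place between the calls.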
  245. sub Inv_GF256 {
  246. #;********************************************************************
  247. #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
  248. #;********************************************************************
  249. my @x=@_[0..7];
  250. my @t=@_[8..11];
  251. my @s=@_[12..15];
  252. # direct optimizations from hardware
  253. $code.=<<___;
  254. veor @t[3], @x[4], @x[6]
  255. veor @t[2], @x[5], @x[7]
  256. veor @t[1], @x[1], @x[3]
  257. veor @s[1], @x[7], @x[6]
  258. vmov @t[0], @t[2]
  259. veor @s[0], @x[0], @x[2]
  260. vorr @t[2], @t[2], @t[1]
  261. veor @s[3], @t[3], @t[0]
  262. vand @s[2], @t[3], @s[0]
  263. vorr @t[3], @t[3], @s[0]
  264. veor @s[0], @s[0], @t[1]
  265. vand @t[0], @t[0], @t[1]
  266. veor @t[1], @x[3], @x[2]
  267. vand @s[3], @s[3], @s[0]
  268. vand @s[1], @s[1], @t[1]
  269. veor @t[1], @x[4], @x[5]
  270. veor @s[0], @x[1], @x[0]
  271. veor @t[3], @t[3], @s[1]
  272. veor @t[2], @t[2], @s[1]
  273. vand @s[1], @t[1], @s[0]
  274. vorr @t[1], @t[1], @s[0]
  275. veor @t[3], @t[3], @s[3]
  276. veor @t[0], @t[0], @s[1]
  277. veor @t[2], @t[2], @s[2]
  278. veor @t[1], @t[1], @s[3]
  279. veor @t[0], @t[0], @s[2]
  280. vand @s[0], @x[7], @x[3]
  281. veor @t[1], @t[1], @s[2]
  282. vand @s[1], @x[6], @x[2]
  283. vand @s[2], @x[5], @x[1]
  284. vorr @s[3], @x[4], @x[0]
  285. veor @t[3], @t[3], @s[0]
  286. veor @t[1], @t[1], @s[2]
  287. veor @t[0], @t[0], @s[3]
  288. veor @t[2], @t[2], @s[1]
  289. @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
  290. @ new smaller inversion
  291. vand @s[2], @t[3], @t[1]
  292. vmov @s[0], @t[0]
  293. veor @s[1], @t[2], @s[2]
  294. veor @s[3], @t[0], @s[2]
  295. veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
  296. vbsl @s[1], @t[1], @t[0]
  297. vbsl @s[3], @t[3], @t[2]
  298. veor @t[3], @t[3], @t[2]
  299. vbsl @s[0], @s[1], @s[2]
  300. vbsl @t[0], @s[2], @s[1]
  301. vand @s[2], @s[0], @s[3]
  302. veor @t[1], @t[1], @t[0]
  303. veor @s[2], @s[2], @t[3]
  304. ___
  305. # output in s3, s2, s1, t1
  306. # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
  307. # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
  308. &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
  309. ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
  310. }
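# For reference, the function realized by the S-box core is the
# multiplicative inverse in GF(2^8); the sketch below (illustrative only,
# never called by the module) computes it the brute-force way, as a^254
# modulo the AES polynomial 0x11b, with 0 mapping to 0.  Note that the
# bit-sliced circuit above works in the tower-field basis set up by
# InBasisChange, so comparing the two also requires that basis change.
sub _ref_gf256_mul {
	my ($a,$b)=@_; my $r=0;
	for (0..7) {
		$r^=$a if ($b>>$_)&1;
		$a=(($a<<1)^(($a>>7)&1 ? 0x1b : 0))&0xff;
	}
	return $r;
}
sub _ref_gf256_inv {
	my $a=shift; my $r=1;
	$r=_ref_gf256_mul($r,$a) for (1..254);	# a^254 == a^-1, 0 -> 0
	return $r;
}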
  311. # AES linear components
  312. sub ShiftRows {
  313. my @x=@_[0..7];
  314. my @t=@_[8..11];
  315. my $mask=pop;
  316. $code.=<<___;
  317. vldmia $key!, {@t[0]-@t[3]}
  318. veor @t[0], @t[0], @x[0]
  319. veor @t[1], @t[1], @x[1]
  320. vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
  321. vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
  322. vldmia $key!, {@t[0]}
  323. veor @t[2], @t[2], @x[2]
  324. vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
  325. vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
  326. vldmia $key!, {@t[1]}
  327. veor @t[3], @t[3], @x[3]
  328. vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
  329. vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
  330. vldmia $key!, {@t[2]}
  331. vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
  332. vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
  333. vldmia $key!, {@t[3]}
  334. veor @t[0], @t[0], @x[4]
  335. veor @t[1], @t[1], @x[5]
  336. vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
  337. vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
  338. veor @t[2], @t[2], @x[6]
  339. vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
  340. vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
  341. veor @t[3], @t[3], @x[7]
  342. vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
  343. vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
  344. vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
  345. vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
  346. ___
  347. }
  348. sub MixColumns {
  349. # modified to emit output in order suitable for feeding back to aesenc[last]
  350. my @x=@_[0..7];
  351. my @t=@_[8..15];
  352. my $inv=@_[16]; # optional
  353. $code.=<<___;
  354. vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
  355. vext.8 @t[1], @x[1], @x[1], #12
  356. veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
  357. vext.8 @t[2], @x[2], @x[2], #12
  358. veor @x[1], @x[1], @t[1]
  359. vext.8 @t[3], @x[3], @x[3], #12
  360. veor @x[2], @x[2], @t[2]
  361. vext.8 @t[4], @x[4], @x[4], #12
  362. veor @x[3], @x[3], @t[3]
  363. vext.8 @t[5], @x[5], @x[5], #12
  364. veor @x[4], @x[4], @t[4]
  365. vext.8 @t[6], @x[6], @x[6], #12
  366. veor @x[5], @x[5], @t[5]
  367. vext.8 @t[7], @x[7], @x[7], #12
  368. veor @x[6], @x[6], @t[6]
  369. veor @t[1], @t[1], @x[0]
  370. veor @x[7], @x[7], @t[7]
  371. vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64
  372. veor @t[2], @t[2], @x[1]
  373. veor @t[0], @t[0], @x[7]
  374. veor @t[1], @t[1], @x[7]
  375. vext.8 @x[1], @x[1], @x[1], #8
  376. veor @t[5], @t[5], @x[4]
  377. veor @x[0], @x[0], @t[0]
  378. veor @t[6], @t[6], @x[5]
  379. veor @x[1], @x[1], @t[1]
  380. vext.8 @t[0], @x[4], @x[4], #8
  381. veor @t[4], @t[4], @x[3]
  382. vext.8 @t[1], @x[5], @x[5], #8
  383. veor @t[7], @t[7], @x[6]
  384. vext.8 @x[4], @x[3], @x[3], #8
  385. veor @t[3], @t[3], @x[2]
  386. vext.8 @x[5], @x[7], @x[7], #8
  387. veor @t[4], @t[4], @x[7]
  388. vext.8 @x[3], @x[6], @x[6], #8
  389. veor @t[3], @t[3], @x[7]
  390. vext.8 @x[6], @x[2], @x[2], #8
  391. veor @x[7], @t[1], @t[5]
  392. ___
  393. $code.=<<___ if (!$inv);
  394. veor @x[2], @t[0], @t[4]
  395. veor @x[4], @x[4], @t[3]
  396. veor @x[5], @x[5], @t[7]
  397. veor @x[3], @x[3], @t[6]
  398. @ vmov @x[2], @t[0]
  399. veor @x[6], @x[6], @t[2]
  400. @ vmov @x[7], @t[1]
  401. ___
  402. $code.=<<___ if ($inv);
  403. veor @t[3], @t[3], @x[4]
  404. veor @x[5], @x[5], @t[7]
  405. veor @x[2], @x[3], @t[6]
  406. veor @x[3], @t[0], @t[4]
  407. veor @x[4], @x[6], @t[2]
  408. vmov @x[6], @t[3]
  409. @ vmov @x[7], @t[1]
  410. ___
  411. }
  412. sub InvMixColumns_orig {
  413. my @x=@_[0..7];
  414. my @t=@_[8..15];
  415. $code.=<<___;
  416. @ multiplication by 0x0e
  417. vext.8 @t[7], @x[7], @x[7], #12
  418. vmov @t[2], @x[2]
  419. veor @x[2], @x[2], @x[5] @ 2 5
  420. veor @x[7], @x[7], @x[5] @ 7 5
  421. vext.8 @t[0], @x[0], @x[0], #12
  422. vmov @t[5], @x[5]
  423. veor @x[5], @x[5], @x[0] @ 5 0 [1]
  424. veor @x[0], @x[0], @x[1] @ 0 1
  425. vext.8 @t[1], @x[1], @x[1], #12
  426. veor @x[1], @x[1], @x[2] @ 1 25
  427. veor @x[0], @x[0], @x[6] @ 01 6 [2]
  428. vext.8 @t[3], @x[3], @x[3], #12
  429. veor @x[1], @x[1], @x[3] @ 125 3 [4]
  430. veor @x[2], @x[2], @x[0] @ 25 016 [3]
  431. veor @x[3], @x[3], @x[7] @ 3 75
  432. veor @x[7], @x[7], @x[6] @ 75 6 [0]
  433. vext.8 @t[6], @x[6], @x[6], #12
  434. vmov @t[4], @x[4]
  435. veor @x[6], @x[6], @x[4] @ 6 4
  436. veor @x[4], @x[4], @x[3] @ 4 375 [6]
  437. veor @x[3], @x[3], @x[7] @ 375 756=36
  438. veor @x[6], @x[6], @t[5] @ 64 5 [7]
  439. veor @x[3], @x[3], @t[2] @ 36 2
  440. vext.8 @t[5], @t[5], @t[5], #12
  441. veor @x[3], @x[3], @t[4] @ 362 4 [5]
  442. ___
  443. my @y = @x[7,5,0,2,1,3,4,6];
  444. $code.=<<___;
  445. @ multiplication by 0x0b
  446. veor @y[1], @y[1], @y[0]
  447. veor @y[0], @y[0], @t[0]
  448. vext.8 @t[2], @t[2], @t[2], #12
  449. veor @y[1], @y[1], @t[1]
  450. veor @y[0], @y[0], @t[5]
  451. vext.8 @t[4], @t[4], @t[4], #12
  452. veor @y[1], @y[1], @t[6]
  453. veor @y[0], @y[0], @t[7]
  454. veor @t[7], @t[7], @t[6] @ clobber t[7]
  455. veor @y[3], @y[3], @t[0]
  456. veor @y[1], @y[1], @y[0]
  457. vext.8 @t[0], @t[0], @t[0], #12
  458. veor @y[2], @y[2], @t[1]
  459. veor @y[4], @y[4], @t[1]
  460. vext.8 @t[1], @t[1], @t[1], #12
  461. veor @y[2], @y[2], @t[2]
  462. veor @y[3], @y[3], @t[2]
  463. veor @y[5], @y[5], @t[2]
  464. veor @y[2], @y[2], @t[7]
  465. vext.8 @t[2], @t[2], @t[2], #12
  466. veor @y[3], @y[3], @t[3]
  467. veor @y[6], @y[6], @t[3]
  468. veor @y[4], @y[4], @t[3]
  469. veor @y[7], @y[7], @t[4]
  470. vext.8 @t[3], @t[3], @t[3], #12
  471. veor @y[5], @y[5], @t[4]
  472. veor @y[7], @y[7], @t[7]
  473. veor @t[7], @t[7], @t[5] @ clobber t[7] even more
  474. veor @y[3], @y[3], @t[5]
  475. veor @y[4], @y[4], @t[4]
  476. veor @y[5], @y[5], @t[7]
  477. vext.8 @t[4], @t[4], @t[4], #12
  478. veor @y[6], @y[6], @t[7]
  479. veor @y[4], @y[4], @t[7]
  480. veor @t[7], @t[7], @t[5]
  481. vext.8 @t[5], @t[5], @t[5], #12
  482. @ multiplication by 0x0d
  483. veor @y[4], @y[4], @y[7]
  484. veor @t[7], @t[7], @t[6] @ restore t[7]
  485. veor @y[7], @y[7], @t[4]
  486. vext.8 @t[6], @t[6], @t[6], #12
  487. veor @y[2], @y[2], @t[0]
  488. veor @y[7], @y[7], @t[5]
  489. vext.8 @t[7], @t[7], @t[7], #12
  490. veor @y[2], @y[2], @t[2]
  491. veor @y[3], @y[3], @y[1]
  492. veor @y[1], @y[1], @t[1]
  493. veor @y[0], @y[0], @t[0]
  494. veor @y[3], @y[3], @t[0]
  495. veor @y[1], @y[1], @t[5]
  496. veor @y[0], @y[0], @t[5]
  497. vext.8 @t[0], @t[0], @t[0], #12
  498. veor @y[1], @y[1], @t[7]
  499. veor @y[0], @y[0], @t[6]
  500. veor @y[3], @y[3], @y[1]
  501. veor @y[4], @y[4], @t[1]
  502. vext.8 @t[1], @t[1], @t[1], #12
  503. veor @y[7], @y[7], @t[7]
  504. veor @y[4], @y[4], @t[2]
  505. veor @y[5], @y[5], @t[2]
  506. veor @y[2], @y[2], @t[6]
  507. veor @t[6], @t[6], @t[3] @ clobber t[6]
  508. vext.8 @t[2], @t[2], @t[2], #12
  509. veor @y[4], @y[4], @y[7]
  510. veor @y[3], @y[3], @t[6]
  511. veor @y[6], @y[6], @t[6]
  512. veor @y[5], @y[5], @t[5]
  513. vext.8 @t[5], @t[5], @t[5], #12
  514. veor @y[6], @y[6], @t[4]
  515. vext.8 @t[4], @t[4], @t[4], #12
  516. veor @y[5], @y[5], @t[6]
  517. veor @y[6], @y[6], @t[7]
  518. vext.8 @t[7], @t[7], @t[7], #12
  519. veor @t[6], @t[6], @t[3] @ restore t[6]
  520. vext.8 @t[3], @t[3], @t[3], #12
  521. @ multiplication by 0x09
  522. veor @y[4], @y[4], @y[1]
  523. veor @t[1], @t[1], @y[1] @ t[1]=y[1]
  524. veor @t[0], @t[0], @t[5] @ clobber t[0]
  525. vext.8 @t[6], @t[6], @t[6], #12
  526. veor @t[1], @t[1], @t[5]
  527. veor @y[3], @y[3], @t[0]
  528. veor @t[0], @t[0], @y[0] @ t[0]=y[0]
  529. veor @t[1], @t[1], @t[6]
  530. veor @t[6], @t[6], @t[7] @ clobber t[6]
  531. veor @y[4], @y[4], @t[1]
  532. veor @y[7], @y[7], @t[4]
  533. veor @y[6], @y[6], @t[3]
  534. veor @y[5], @y[5], @t[2]
  535. veor @t[4], @t[4], @y[4] @ t[4]=y[4]
  536. veor @t[3], @t[3], @y[3] @ t[3]=y[3]
  537. veor @t[5], @t[5], @y[5] @ t[5]=y[5]
  538. veor @t[2], @t[2], @y[2] @ t[2]=y[2]
  539. veor @t[3], @t[3], @t[7]
  540. veor @XMM[5], @t[5], @t[6]
  541. veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
  542. veor @XMM[2], @t[2], @t[6]
  543. veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
  544. vmov @XMM[0], @t[0]
  545. vmov @XMM[1], @t[1]
  546. @ vmov @XMM[2], @t[2]
  547. vmov @XMM[3], @t[3]
  548. vmov @XMM[4], @t[4]
  549. @ vmov @XMM[5], @t[5]
  550. @ vmov @XMM[6], @t[6]
  551. @ vmov @XMM[7], @t[7]
  552. ___
  553. }
  554. sub InvMixColumns {
  555. my @x=@_[0..7];
  556. my @t=@_[8..15];
  557. # Thanks to Jussi Kivilinna for providing pointer to
  558. #
  559. # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
  560. # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
  561. # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
  562. # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
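# A throwaway check of that factorization (kept under if (0) like the
# benchmarking stubs further down, so it is never executed): multiplying
# the circulant matrices with first rows (02 03 01 01) and (05 00 04 00)
# over GF(2^8)/0x11b must reproduce the InvMixColumns row (0e 0b 0d 09).
if (0) {
	my $gmul = sub {
		my ($a,$b)=@_; my $r=0;
		for (0..7) {
			$r^=$a if ($b>>$_)&1;
			$a=(($a<<1)^(($a>>7)&1 ? 0x1b : 0))&0xff;
		}
		$r;
	};
	my @mc  = (0x02,0x03,0x01,0x01);	# MixColumns circulant
	my @pre = (0x05,0x00,0x04,0x00);	# pre-multiplier circulant
	for my $j (0..3) {			# j-th entry of the product's first row
		my $acc=0;
		$acc ^= $gmul->($mc[$_],$pre[($j-$_)%4]) for (0..3);
		printf "0x%02x\n", $acc;	# expect 0e, 0b, 0d, 09
	}
}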
  563. $code.=<<___;
  564. @ multiplication by 0x05-0x00-0x04-0x00
  565. vext.8 @t[0], @x[0], @x[0], #8
  566. vext.8 @t[6], @x[6], @x[6], #8
  567. vext.8 @t[7], @x[7], @x[7], #8
  568. veor @t[0], @t[0], @x[0]
  569. vext.8 @t[1], @x[1], @x[1], #8
  570. veor @t[6], @t[6], @x[6]
  571. vext.8 @t[2], @x[2], @x[2], #8
  572. veor @t[7], @t[7], @x[7]
  573. vext.8 @t[3], @x[3], @x[3], #8
  574. veor @t[1], @t[1], @x[1]
  575. vext.8 @t[4], @x[4], @x[4], #8
  576. veor @t[2], @t[2], @x[2]
  577. vext.8 @t[5], @x[5], @x[5], #8
  578. veor @t[3], @t[3], @x[3]
  579. veor @t[4], @t[4], @x[4]
  580. veor @t[5], @t[5], @x[5]
  581. veor @x[0], @x[0], @t[6]
  582. veor @x[1], @x[1], @t[6]
  583. veor @x[2], @x[2], @t[0]
  584. veor @x[4], @x[4], @t[2]
  585. veor @x[3], @x[3], @t[1]
  586. veor @x[1], @x[1], @t[7]
  587. veor @x[2], @x[2], @t[7]
  588. veor @x[4], @x[4], @t[6]
  589. veor @x[5], @x[5], @t[3]
  590. veor @x[3], @x[3], @t[6]
  591. veor @x[6], @x[6], @t[4]
  592. veor @x[4], @x[4], @t[7]
  593. veor @x[5], @x[5], @t[7]
  594. veor @x[7], @x[7], @t[5]
  595. ___
  596. &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
  597. }
  598. sub swapmove {
  599. my ($a,$b,$n,$mask,$t)=@_;
  600. $code.=<<___;
  601. vshr.u64 $t, $b, #$n
  602. veor $t, $t, $a
  603. vand $t, $t, $mask
  604. veor $a, $a, $t
  605. vshl.u64 $t, $t, #$n
  606. veor $b, $b, $t
  607. ___
  608. }
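# swapmove is the classic delta swap: it exchanges the bits of $a selected
# by $mask with the bits of $b sitting $n positions above them.  Scalar
# model (illustrative only, never called by the module):
sub _ref_swapmove {
	my ($a,$b,$n,$mask)=@_;
	my $t=(($b>>$n)^$a)&$mask;
	return ($a^$t, $b^($t<<$n));	# (new a, new b)
}
# swapmove2x below is simply two independent such swaps interleaved for
# better scheduling.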
  609. sub swapmove2x {
  610. my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
  611. $code.=<<___;
  612. vshr.u64 $t0, $b0, #$n
  613. vshr.u64 $t1, $b1, #$n
  614. veor $t0, $t0, $a0
  615. veor $t1, $t1, $a1
  616. vand $t0, $t0, $mask
  617. vand $t1, $t1, $mask
  618. veor $a0, $a0, $t0
  619. vshl.u64 $t0, $t0, #$n
  620. veor $a1, $a1, $t1
  621. vshl.u64 $t1, $t1, #$n
  622. veor $b0, $b0, $t0
  623. veor $b1, $b1, $t1
  624. ___
  625. }
  626. sub bitslice {
  627. my @x=reverse(@_[0..7]);
  628. my ($t0,$t1,$t2,$t3)=@_[8..11];
  629. $code.=<<___;
  630. vmov.i8 $t0,#0x55 @ compose .LBS0
  631. vmov.i8 $t1,#0x33 @ compose .LBS1
  632. ___
  633. &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
  634. &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
  635. $code.=<<___;
  636. vmov.i8 $t0,#0x0f @ compose .LBS2
  637. ___
  638. &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
  639. &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
  640. &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
  641. &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
  642. }
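# Taken together, the three swapmove2x passes (masks 0x55, 0x33, 0x0f with
# shifts 1, 2, 4) amount to an 8x8 bit-matrix transpose across the eight
# input registers, converting between eight packed AES blocks and eight
# bit planes.  The same routine converts back again; the .Lenc_done and
# .Ldec_done call sites merely pass the registers in a permuted order to
# absorb the S-box output ordering.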
  643. $code.=<<___;
  644. #ifndef __KERNEL__
  645. # include "arm_arch.h"
  646. # define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
  647. # define VFP_ABI_POP vldmia sp!,{d8-d15}
  648. # define VFP_ABI_FRAME 0x40
  649. #else
  650. # define VFP_ABI_PUSH
  651. # define VFP_ABI_POP
  652. # define VFP_ABI_FRAME 0
  653. # define BSAES_ASM_EXTENDED_KEY
  654. # define XTS_CHAIN_TWEAK
  655. # define __ARM_ARCH__ __LINUX_ARM_ARCH__
  656. # define __ARM_MAX_ARCH__ 7
  657. #endif
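@ d8-d15 are callee-saved under the AAPCS, hence VFP_ABI_PUSH/POP in the
@ userspace build; the kernel build leaves them empty because NEON state
@ is managed by kernel_neon_begin()/kernel_neon_end() around the call
@ instead.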
  658. #ifdef __thumb__
  659. # define adrl adr
  660. #endif
  661. #if __ARM_MAX_ARCH__>=7
  662. .arch armv7-a
  663. .fpu neon
  664. .text
  665. .syntax unified @ ARMv7-capable assembler is expected to handle this
  666. #if defined(__thumb2__) && !defined(__APPLE__)
  667. .thumb
  668. #else
  669. .code 32
  670. # undef __thumb2__
  671. #endif
  672. .type _bsaes_decrypt8,%function
  673. .align 4
  674. _bsaes_decrypt8:
  675. adr $const,.
  676. vldmia $key!, {@XMM[9]} @ round 0 key
  677. #if defined(__thumb2__) || defined(__APPLE__)
  678. adr $const,.LM0ISR
  679. #else
  680. add $const,$const,#.LM0ISR-_bsaes_decrypt8
  681. #endif
  682. vldmia $const!, {@XMM[8]} @ .LM0ISR
  683. veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
  684. veor @XMM[11], @XMM[1], @XMM[9]
  685. vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
  686. vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
  687. veor @XMM[12], @XMM[2], @XMM[9]
  688. vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
  689. vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
  690. veor @XMM[13], @XMM[3], @XMM[9]
  691. vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
  692. vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
  693. veor @XMM[14], @XMM[4], @XMM[9]
  694. vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
  695. vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
  696. veor @XMM[15], @XMM[5], @XMM[9]
  697. vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
  698. vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
  699. veor @XMM[10], @XMM[6], @XMM[9]
  700. vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
  701. vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
  702. veor @XMM[11], @XMM[7], @XMM[9]
  703. vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
  704. vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
  705. vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
  706. vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
  707. ___
  708. &bitslice (@XMM[0..7, 8..11]);
  709. $code.=<<___;
  710. sub $rounds,$rounds,#1
  711. b .Ldec_sbox
  712. .align 4
  713. .Ldec_loop:
  714. ___
  715. &ShiftRows (@XMM[0..7, 8..12]);
  716. $code.=".Ldec_sbox:\n";
  717. &InvSbox (@XMM[0..7, 8..15]);
  718. $code.=<<___;
  719. subs $rounds,$rounds,#1
  720. bcc .Ldec_done
  721. ___
  722. &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
  723. $code.=<<___;
  724. vldmia $const, {@XMM[12]} @ .LISR
  725. ite eq @ Thumb2 thing, sanity check in ARM
  726. addeq $const,$const,#0x10
  727. bne .Ldec_loop
  728. vldmia $const, {@XMM[12]} @ .LISRM0
  729. b .Ldec_loop
  730. .align 4
  731. .Ldec_done:
  732. ___
  733. &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
  734. $code.=<<___;
  735. vldmia $key, {@XMM[8]} @ last round key
  736. veor @XMM[6], @XMM[6], @XMM[8]
  737. veor @XMM[4], @XMM[4], @XMM[8]
  738. veor @XMM[2], @XMM[2], @XMM[8]
  739. veor @XMM[7], @XMM[7], @XMM[8]
  740. veor @XMM[3], @XMM[3], @XMM[8]
  741. veor @XMM[5], @XMM[5], @XMM[8]
  742. veor @XMM[0], @XMM[0], @XMM[8]
  743. veor @XMM[1], @XMM[1], @XMM[8]
  744. bx lr
  745. .size _bsaes_decrypt8,.-_bsaes_decrypt8
  746. .type _bsaes_const,%object
  747. .align 6
  748. _bsaes_const:
  749. .LM0ISR: @ InvShiftRows constants
  750. .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
  751. .LISR:
  752. .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
  753. .LISRM0:
  754. .quad 0x01040b0e0205080f, 0x0306090c00070a0d
  755. .LM0SR: @ ShiftRows constants
  756. .quad 0x0a0e02060f03070b, 0x0004080c05090d01
  757. .LSR:
  758. .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
  759. .LSRM0:
  760. .quad 0x0304090e00050a0f, 0x01060b0c0207080d
  761. .LM0:
  762. .quad 0x02060a0e03070b0f, 0x0004080c0105090d
  763. .LREVM0SR:
  764. .quad 0x090d01050c000408, 0x03070b0f060a0e02
  765. .asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
  766. .align 6
  767. .size _bsaes_const,.-_bsaes_const
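@ All of the .quad constants above are byte-shuffle index vectors consumed
@ through vtbl.8: they implement the (Inv)ShiftRows byte permutations and
@ the input/output byte reordering used by the bit-sliced representation;
@ .LREVM0SR additionally flips the byte order of the 32-bit counter words
@ for the CTR path.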
  768. .type _bsaes_encrypt8,%function
  769. .align 4
  770. _bsaes_encrypt8:
  771. adr $const,.
  772. vldmia $key!, {@XMM[9]} @ round 0 key
  773. #if defined(__thumb2__) || defined(__APPLE__)
  774. adr $const,.LM0SR
  775. #else
  776. sub $const,$const,#_bsaes_encrypt8-.LM0SR
  777. #endif
  778. vldmia $const!, {@XMM[8]} @ .LM0SR
  779. _bsaes_encrypt8_alt:
  780. veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
  781. veor @XMM[11], @XMM[1], @XMM[9]
  782. vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
  783. vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
  784. veor @XMM[12], @XMM[2], @XMM[9]
  785. vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
  786. vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
  787. veor @XMM[13], @XMM[3], @XMM[9]
  788. vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
  789. vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
  790. veor @XMM[14], @XMM[4], @XMM[9]
  791. vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
  792. vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
  793. veor @XMM[15], @XMM[5], @XMM[9]
  794. vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
  795. vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
  796. veor @XMM[10], @XMM[6], @XMM[9]
  797. vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
  798. vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
  799. veor @XMM[11], @XMM[7], @XMM[9]
  800. vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
  801. vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
  802. vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
  803. vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
  804. _bsaes_encrypt8_bitslice:
  805. ___
  806. &bitslice (@XMM[0..7, 8..11]);
  807. $code.=<<___;
  808. sub $rounds,$rounds,#1
  809. b .Lenc_sbox
  810. .align 4
  811. .Lenc_loop:
  812. ___
  813. &ShiftRows (@XMM[0..7, 8..12]);
  814. $code.=".Lenc_sbox:\n";
  815. &Sbox (@XMM[0..7, 8..15]);
  816. $code.=<<___;
  817. subs $rounds,$rounds,#1
  818. bcc .Lenc_done
  819. ___
  820. &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
  821. $code.=<<___;
  822. vldmia $const, {@XMM[12]} @ .LSR
  823. ite eq @ Thumb2 thing, sanity check in ARM
  824. addeq $const,$const,#0x10
  825. bne .Lenc_loop
  826. vldmia $const, {@XMM[12]} @ .LSRM0
  827. b .Lenc_loop
  828. .align 4
  829. .Lenc_done:
  830. ___
  831. # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
  832. &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
  833. $code.=<<___;
  834. vldmia $key, {@XMM[8]} @ last round key
  835. veor @XMM[4], @XMM[4], @XMM[8]
  836. veor @XMM[6], @XMM[6], @XMM[8]
  837. veor @XMM[3], @XMM[3], @XMM[8]
  838. veor @XMM[7], @XMM[7], @XMM[8]
  839. veor @XMM[2], @XMM[2], @XMM[8]
  840. veor @XMM[5], @XMM[5], @XMM[8]
  841. veor @XMM[0], @XMM[0], @XMM[8]
  842. veor @XMM[1], @XMM[1], @XMM[8]
  843. bx lr
  844. .size _bsaes_encrypt8,.-_bsaes_encrypt8
  845. ___
  846. }
  847. {
  848. my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
  849. sub bitslice_key {
  850. my @x=reverse(@_[0..7]);
  851. my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
  852. &swapmove (@x[0,1],1,$bs0,$t2,$t3);
  853. $code.=<<___;
  854. @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
  855. vmov @x[2], @x[0]
  856. vmov @x[3], @x[1]
  857. ___
  858. #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
  859. &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
  860. $code.=<<___;
  861. @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
  862. vmov @x[4], @x[0]
  863. vmov @x[6], @x[2]
  864. vmov @x[5], @x[1]
  865. vmov @x[7], @x[3]
  866. ___
  867. &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
  868. &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
  869. }
  870. $code.=<<___;
  871. .type _bsaes_key_convert,%function
  872. .align 4
  873. _bsaes_key_convert:
  874. adr $const,.
  875. vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
  876. #if defined(__thumb2__) || defined(__APPLE__)
  877. adr $const,.LM0
  878. #else
  879. sub $const,$const,#_bsaes_key_convert-.LM0
  880. #endif
  881. vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
  882. vmov.i8 @XMM[8], #0x01 @ bit masks
  883. vmov.i8 @XMM[9], #0x02
  884. vmov.i8 @XMM[10], #0x04
  885. vmov.i8 @XMM[11], #0x08
  886. vmov.i8 @XMM[12], #0x10
  887. vmov.i8 @XMM[13], #0x20
  888. vldmia $const, {@XMM[14]} @ .LM0
  889. #ifdef __ARMEL__
  890. vrev32.8 @XMM[7], @XMM[7]
  891. vrev32.8 @XMM[15], @XMM[15]
  892. #endif
  893. sub $rounds,$rounds,#1
  894. vstmia $out!, {@XMM[7]} @ save round 0 key
  895. b .Lkey_loop
  896. .align 4
  897. .Lkey_loop:
  898. vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
  899. vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
  900. vmov.i8 @XMM[6], #0x40
  901. vmov.i8 @XMM[15], #0x80
  902. vtst.8 @XMM[0], @XMM[7], @XMM[8]
  903. vtst.8 @XMM[1], @XMM[7], @XMM[9]
  904. vtst.8 @XMM[2], @XMM[7], @XMM[10]
  905. vtst.8 @XMM[3], @XMM[7], @XMM[11]
  906. vtst.8 @XMM[4], @XMM[7], @XMM[12]
  907. vtst.8 @XMM[5], @XMM[7], @XMM[13]
  908. vtst.8 @XMM[6], @XMM[7], @XMM[6]
  909. vtst.8 @XMM[7], @XMM[7], @XMM[15]
  910. vld1.8 {@XMM[15]}, [$inp]! @ load next round key
  911. vmvn @XMM[0], @XMM[0] @ "pnot"
  912. vmvn @XMM[1], @XMM[1]
  913. vmvn @XMM[5], @XMM[5]
  914. vmvn @XMM[6], @XMM[6]
  915. #ifdef __ARMEL__
  916. vrev32.8 @XMM[15], @XMM[15]
  917. #endif
  918. subs $rounds,$rounds,#1
  919. vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
  920. bne .Lkey_loop
  921. vmov.i8 @XMM[7],#0x63 @ compose .L63
  922. @ don't save last round key
  923. bx lr
  924. .size _bsaes_key_convert,.-_bsaes_key_convert
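@ The bit-sliced schedule written above occupies eight 128-bit vectors,
@ i.e. 128 bytes, per round (hence the callers' "lsl#7" when carving it
@ out on the stack); vtst against the 0x01..0x80 masks is what spreads
@ each key byte across the eight bit planes, and the 0x63 left in q7 lets
@ the callers fold the S-box affine constant into one round key (see the
@ "fix up ... round key" lines).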
  925. ___
  926. }
  927. if (0) { # following four functions are an unsupported interface,
  928. # used for benchmarking...
  929. $code.=<<___;
  930. .globl bsaes_enc_key_convert
  931. .type bsaes_enc_key_convert,%function
  932. .align 4
  933. bsaes_enc_key_convert:
  934. stmdb sp!,{r4-r6,lr}
  935. vstmdb sp!,{d8-d15} @ ABI specification says so
  936. ldr r5,[$inp,#240] @ pass rounds
  937. mov r4,$inp @ pass key
  938. mov r12,$out @ pass key schedule
  939. bl _bsaes_key_convert
  940. veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
  941. vstmia r12, {@XMM[7]} @ save last round key
  942. vldmia sp!,{d8-d15}
  943. ldmia sp!,{r4-r6,pc}
  944. .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
  945. .globl bsaes_encrypt_128
  946. .type bsaes_encrypt_128,%function
  947. .align 4
  948. bsaes_encrypt_128:
  949. stmdb sp!,{r4-r6,lr}
  950. vstmdb sp!,{d8-d15} @ ABI specification says so
  951. .Lenc128_loop:
  952. vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
  953. vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
  954. mov r4,$key @ pass the key
  955. vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
  956. mov r5,#10 @ pass rounds
  957. vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
  958. bl _bsaes_encrypt8
  959. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  960. vst1.8 {@XMM[4]}, [$out]!
  961. vst1.8 {@XMM[6]}, [$out]!
  962. vst1.8 {@XMM[3]}, [$out]!
  963. vst1.8 {@XMM[7]}, [$out]!
  964. vst1.8 {@XMM[2]}, [$out]!
  965. subs $len,$len,#0x80
  966. vst1.8 {@XMM[5]}, [$out]!
  967. bhi .Lenc128_loop
  968. vldmia sp!,{d8-d15}
  969. ldmia sp!,{r4-r6,pc}
  970. .size bsaes_encrypt_128,.-bsaes_encrypt_128
  971. .globl bsaes_dec_key_convert
  972. .type bsaes_dec_key_convert,%function
  973. .align 4
  974. bsaes_dec_key_convert:
  975. stmdb sp!,{r4-r6,lr}
  976. vstmdb sp!,{d8-d15} @ ABI specification says so
  977. ldr r5,[$inp,#240] @ pass rounds
  978. mov r4,$inp @ pass key
  979. mov r12,$out @ pass key schedule
  980. bl _bsaes_key_convert
  981. vldmia $out, {@XMM[6]}
  982. vstmia r12, {@XMM[15]} @ save last round key
  983. veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
  984. vstmia $out, {@XMM[7]}
  985. vldmia sp!,{d8-d15}
  986. ldmia sp!,{r4-r6,pc}
  987. .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
  988. .globl bsaes_decrypt_128
  989. .type bsaes_decrypt_128,%function
  990. .align 4
  991. bsaes_decrypt_128:
  992. stmdb sp!,{r4-r6,lr}
  993. vstmdb sp!,{d8-d15} @ ABI specification says so
  994. .Ldec128_loop:
  995. vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
  996. vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
  997. mov r4,$key @ pass the key
  998. vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
  999. mov r5,#10 @ pass rounds
  1000. vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
  1001. bl _bsaes_decrypt8
  1002. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1003. vst1.8 {@XMM[6]}, [$out]!
  1004. vst1.8 {@XMM[4]}, [$out]!
  1005. vst1.8 {@XMM[2]}, [$out]!
  1006. vst1.8 {@XMM[7]}, [$out]!
  1007. vst1.8 {@XMM[3]}, [$out]!
  1008. subs $len,$len,#0x80
  1009. vst1.8 {@XMM[5]}, [$out]!
  1010. bhi .Ldec128_loop
  1011. vldmia sp!,{d8-d15}
  1012. ldmia sp!,{r4-r6,pc}
  1013. .size bsaes_decrypt_128,.-bsaes_decrypt_128
  1014. ___
  1015. }
  1016. {
  1017. my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
  1018. my ($keysched)=("sp");
  1019. $code.=<<___;
  1020. .extern AES_cbc_encrypt
  1021. .extern AES_decrypt
  1022. .global bsaes_cbc_encrypt
  1023. .type bsaes_cbc_encrypt,%function
  1024. .align 5
  1025. bsaes_cbc_encrypt:
  1026. #ifndef __KERNEL__
  1027. cmp $len, #128
  1028. #ifndef __thumb__
  1029. blo AES_cbc_encrypt
  1030. #else
  1031. bhs 1f
  1032. b AES_cbc_encrypt
  1033. 1:
  1034. #endif
  1035. #endif
  1036. @ it is up to the caller to make sure we are called with enc == 0
  1037. mov ip, sp
  1038. stmdb sp!, {r4-r10, lr}
  1039. VFP_ABI_PUSH
  1040. ldr $ivp, [ip] @ IV is 1st arg on the stack
  1041. mov $len, $len, lsr#4 @ len in 16 byte blocks
  1042. sub sp, #0x10 @ scratch space to carry over the IV
  1043. mov $fp, sp @ save sp
  1044. ldr $rounds, [$key, #240] @ get # of rounds
  1045. #ifndef BSAES_ASM_EXTENDED_KEY
  1046. @ allocate the key schedule on the stack
  1047. sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
  1048. add r12, #`128-32` @ size of bit-sliced key schedule
  1049. @ populate the key schedule
  1050. mov r4, $key @ pass key
  1051. mov r5, $rounds @ pass # of rounds
  1052. mov sp, r12 @ sp is $keysched
  1053. bl _bsaes_key_convert
  1054. vldmia $keysched, {@XMM[6]}
  1055. vstmia r12, {@XMM[15]} @ save last round key
  1056. veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
  1057. vstmia $keysched, {@XMM[7]}
  1058. #else
  1059. ldr r12, [$key, #244]
  1060. eors r12, #1
  1061. beq 0f
  1062. @ populate the key schedule
  1063. str r12, [$key, #244]
  1064. mov r4, $key @ pass key
  1065. mov r5, $rounds @ pass # of rounds
  1066. add r12, $key, #248 @ pass key schedule
  1067. bl _bsaes_key_convert
  1068. add r4, $key, #248
  1069. vldmia r4, {@XMM[6]}
  1070. vstmia r12, {@XMM[15]} @ save last round key
  1071. veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
  1072. vstmia r4, {@XMM[7]}
  1073. .align 2
  1074. 0:
  1075. #endif
  1076. vld1.8 {@XMM[15]}, [$ivp] @ load IV
  1077. b .Lcbc_dec_loop
  1078. .align 4
  1079. .Lcbc_dec_loop:
  1080. subs $len, $len, #0x8
  1081. bmi .Lcbc_dec_loop_finish
  1082. vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
  1083. vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
  1084. #ifndef BSAES_ASM_EXTENDED_KEY
  1085. mov r4, $keysched @ pass the key
  1086. #else
  1087. add r4, $key, #248
  1088. #endif
  1089. vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
  1090. mov r5, $rounds
  1091. vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
  1092. sub $inp, $inp, #0x60
  1093. vstmia $fp, {@XMM[15]} @ put aside IV
  1094. bl _bsaes_decrypt8
  1095. vldmia $fp, {@XMM[14]} @ reload IV
  1096. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1097. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1098. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1099. veor @XMM[1], @XMM[1], @XMM[8]
  1100. veor @XMM[6], @XMM[6], @XMM[9]
  1101. vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
  1102. veor @XMM[4], @XMM[4], @XMM[10]
  1103. veor @XMM[2], @XMM[2], @XMM[11]
  1104. vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
  1105. veor @XMM[7], @XMM[7], @XMM[12]
  1106. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1107. veor @XMM[3], @XMM[3], @XMM[13]
  1108. vst1.8 {@XMM[6]}, [$out]!
  1109. veor @XMM[5], @XMM[5], @XMM[14]
  1110. vst1.8 {@XMM[4]}, [$out]!
  1111. vst1.8 {@XMM[2]}, [$out]!
  1112. vst1.8 {@XMM[7]}, [$out]!
  1113. vst1.8 {@XMM[3]}, [$out]!
  1114. vst1.8 {@XMM[5]}, [$out]!
  1115. b .Lcbc_dec_loop
  1116. .Lcbc_dec_loop_finish:
  1117. adds $len, $len, #8
  1118. beq .Lcbc_dec_done
  1119. vld1.8 {@XMM[0]}, [$inp]! @ load input
  1120. cmp $len, #2
  1121. blo .Lcbc_dec_one
  1122. vld1.8 {@XMM[1]}, [$inp]!
  1123. #ifndef BSAES_ASM_EXTENDED_KEY
  1124. mov r4, $keysched @ pass the key
  1125. #else
  1126. add r4, $key, #248
  1127. #endif
  1128. mov r5, $rounds
  1129. vstmia $fp, {@XMM[15]} @ put aside IV
  1130. beq .Lcbc_dec_two
  1131. vld1.8 {@XMM[2]}, [$inp]!
  1132. cmp $len, #4
  1133. blo .Lcbc_dec_three
  1134. vld1.8 {@XMM[3]}, [$inp]!
  1135. beq .Lcbc_dec_four
  1136. vld1.8 {@XMM[4]}, [$inp]!
  1137. cmp $len, #6
  1138. blo .Lcbc_dec_five
  1139. vld1.8 {@XMM[5]}, [$inp]!
  1140. beq .Lcbc_dec_six
  1141. vld1.8 {@XMM[6]}, [$inp]!
  1142. sub $inp, $inp, #0x70
  1143. bl _bsaes_decrypt8
  1144. vldmia $fp, {@XMM[14]} @ reload IV
  1145. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1146. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1147. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1148. veor @XMM[1], @XMM[1], @XMM[8]
  1149. veor @XMM[6], @XMM[6], @XMM[9]
  1150. vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
  1151. veor @XMM[4], @XMM[4], @XMM[10]
  1152. veor @XMM[2], @XMM[2], @XMM[11]
  1153. vld1.8 {@XMM[15]}, [$inp]!
  1154. veor @XMM[7], @XMM[7], @XMM[12]
  1155. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1156. veor @XMM[3], @XMM[3], @XMM[13]
  1157. vst1.8 {@XMM[6]}, [$out]!
  1158. vst1.8 {@XMM[4]}, [$out]!
  1159. vst1.8 {@XMM[2]}, [$out]!
  1160. vst1.8 {@XMM[7]}, [$out]!
  1161. vst1.8 {@XMM[3]}, [$out]!
  1162. b .Lcbc_dec_done
  1163. .align 4
  1164. .Lcbc_dec_six:
  1165. sub $inp, $inp, #0x60
  1166. bl _bsaes_decrypt8
  1167. vldmia $fp,{@XMM[14]} @ reload IV
  1168. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1169. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1170. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1171. veor @XMM[1], @XMM[1], @XMM[8]
  1172. veor @XMM[6], @XMM[6], @XMM[9]
  1173. vld1.8 {@XMM[12]}, [$inp]!
  1174. veor @XMM[4], @XMM[4], @XMM[10]
  1175. veor @XMM[2], @XMM[2], @XMM[11]
  1176. vld1.8 {@XMM[15]}, [$inp]!
  1177. veor @XMM[7], @XMM[7], @XMM[12]
  1178. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1179. vst1.8 {@XMM[6]}, [$out]!
  1180. vst1.8 {@XMM[4]}, [$out]!
  1181. vst1.8 {@XMM[2]}, [$out]!
  1182. vst1.8 {@XMM[7]}, [$out]!
  1183. b .Lcbc_dec_done
  1184. .align 4
  1185. .Lcbc_dec_five:
  1186. sub $inp, $inp, #0x50
  1187. bl _bsaes_decrypt8
  1188. vldmia $fp, {@XMM[14]} @ reload IV
  1189. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1190. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1191. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1192. veor @XMM[1], @XMM[1], @XMM[8]
  1193. veor @XMM[6], @XMM[6], @XMM[9]
  1194. vld1.8 {@XMM[15]}, [$inp]!
  1195. veor @XMM[4], @XMM[4], @XMM[10]
  1196. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1197. veor @XMM[2], @XMM[2], @XMM[11]
  1198. vst1.8 {@XMM[6]}, [$out]!
  1199. vst1.8 {@XMM[4]}, [$out]!
  1200. vst1.8 {@XMM[2]}, [$out]!
  1201. b .Lcbc_dec_done
  1202. .align 4
  1203. .Lcbc_dec_four:
  1204. sub $inp, $inp, #0x40
  1205. bl _bsaes_decrypt8
  1206. vldmia $fp, {@XMM[14]} @ reload IV
  1207. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1208. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1209. vld1.8 {@XMM[10]}, [$inp]!
  1210. veor @XMM[1], @XMM[1], @XMM[8]
  1211. veor @XMM[6], @XMM[6], @XMM[9]
  1212. vld1.8 {@XMM[15]}, [$inp]!
  1213. veor @XMM[4], @XMM[4], @XMM[10]
  1214. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1215. vst1.8 {@XMM[6]}, [$out]!
  1216. vst1.8 {@XMM[4]}, [$out]!
  1217. b .Lcbc_dec_done
  1218. .align 4
  1219. .Lcbc_dec_three:
  1220. sub $inp, $inp, #0x30
  1221. bl _bsaes_decrypt8
  1222. vldmia $fp, {@XMM[14]} @ reload IV
  1223. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1224. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1225. vld1.8 {@XMM[15]}, [$inp]!
  1226. veor @XMM[1], @XMM[1], @XMM[8]
  1227. veor @XMM[6], @XMM[6], @XMM[9]
  1228. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1229. vst1.8 {@XMM[6]}, [$out]!
  1230. b .Lcbc_dec_done
  1231. .align 4
  1232. .Lcbc_dec_two:
  1233. sub $inp, $inp, #0x20
  1234. bl _bsaes_decrypt8
  1235. vldmia $fp, {@XMM[14]} @ reload IV
  1236. vld1.8 {@XMM[8]}, [$inp]! @ reload input
  1237. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1238. vld1.8 {@XMM[15]}, [$inp]! @ reload input
  1239. veor @XMM[1], @XMM[1], @XMM[8]
  1240. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1241. b .Lcbc_dec_done
  1242. .align 4
  1243. .Lcbc_dec_one:
  1244. sub $inp, $inp, #0x10
  1245. mov $rounds, $out @ save original out pointer
  1246. mov $out, $fp @ use the iv scratch space as out buffer
  1247. mov r2, $key
  1248. vmov @XMM[4],@XMM[15] @ just in case ensure that IV
  1249. vmov @XMM[5],@XMM[0] @ and input are preserved
  1250. bl AES_decrypt
  1251. vld1.8 {@XMM[0]}, [$fp] @ load result
  1252. veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
  1253. vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
  1254. vst1.8 {@XMM[0]}, [$rounds] @ write output
  1255. .Lcbc_dec_done:
  1256. #ifndef BSAES_ASM_EXTENDED_KEY
  1257. vmov.i32 q0, #0
  1258. vmov.i32 q1, #0
  1259. .Lcbc_dec_bzero: @ wipe key schedule [if any]
  1260. vstmia $keysched!, {q0-q1}
  1261. cmp $keysched, $fp
  1262. bne .Lcbc_dec_bzero
  1263. #endif
  1264. mov sp, $fp
  1265. add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
  1266. vst1.8 {@XMM[15]}, [$ivp] @ return IV
  1267. VFP_ABI_POP
  1268. ldmia sp!, {r4-r10, pc}
  1269. .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
  1270. ___
  1271. }
  1272. {
  1273. my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
  1274. my $const = "r6"; # shared with _bsaes_encrypt8_alt
  1275. my $keysched = "sp";
  1276. $code.=<<___;
  1277. .extern AES_encrypt
  1278. .global bsaes_ctr32_encrypt_blocks
  1279. .type bsaes_ctr32_encrypt_blocks,%function
  1280. .align 5
  1281. bsaes_ctr32_encrypt_blocks:
  1282. cmp $len, #8 @ use plain AES for
  1283. blo .Lctr_enc_short @ small sizes
  1284. mov ip, sp
  1285. stmdb sp!, {r4-r10, lr}
  1286. VFP_ABI_PUSH
  1287. ldr $ctr, [ip] @ ctr is 1st arg on the stack
  1288. sub sp, sp, #0x10 @ scratch space to carry over the ctr
  1289. mov $fp, sp @ save sp
  1290. ldr $rounds, [$key, #240] @ get # of rounds
  1291. #ifndef BSAES_ASM_EXTENDED_KEY
  1292. @ allocate the key schedule on the stack
  1293. sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
  1294. add r12, #`128-32` @ size of bit-sliced key schedule
  1295. @ populate the key schedule
  1296. mov r4, $key @ pass key
  1297. mov r5, $rounds @ pass # of rounds
  1298. mov sp, r12 @ sp is $keysched
  1299. bl _bsaes_key_convert
  1300. veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
  1301. vstmia r12, {@XMM[7]} @ save last round key
  1302. vld1.8 {@XMM[0]}, [$ctr] @ load counter
  1303. #ifdef __APPLE__
  1304. mov $ctr, #:lower16:(.LREVM0SR-.LM0)
  1305. add $ctr, $const, $ctr
  1306. #else
  1307. add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
  1308. #endif
  1309. vldmia $keysched, {@XMM[4]} @ load round0 key
  1310. #else
  1311. ldr r12, [$key, #244]
  1312. eors r12, #1
  1313. beq 0f
  1314. @ populate the key schedule
  1315. str r12, [$key, #244]
  1316. mov r4, $key @ pass key
  1317. mov r5, $rounds @ pass # of rounds
  1318. add r12, $key, #248 @ pass key schedule
  1319. bl _bsaes_key_convert
  1320. veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
  1321. vstmia r12, {@XMM[7]} @ save last round key
  1322. .align 2
  1323. 0: add r12, $key, #248
  1324. vld1.8 {@XMM[0]}, [$ctr] @ load counter
  1325. adrl $ctr, .LREVM0SR @ borrow $ctr
  1326. vldmia r12, {@XMM[4]} @ load round0 key
  1327. sub sp, #0x10 @ place for adjusted round0 key
  1328. #endif
  1329. vmov.i32 @XMM[8],#1 @ compose 1<<96
  1330. veor @XMM[9],@XMM[9],@XMM[9]
  1331. vrev32.8 @XMM[0],@XMM[0]
  1332. vext.8 @XMM[8],@XMM[9],@XMM[8],#4
  1333. vrev32.8 @XMM[4],@XMM[4]
  1334. vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
  1335. vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
  1336. b .Lctr_enc_loop
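@ The counter block was byte-reversed with vrev32.8 above so that its
@ big-endian 32-bit counter word can be bumped with plain vadd.u32; the
@ "1<<96"/"2<<96"/"3<<96" vectors are the per-block increments, and the
@ .LREVM0SR constant handed to _bsaes_encrypt8_alt below restores the
@ byte order while reordering the input for the bit-sliced rounds.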
  1337. .align 4
  1338. .Lctr_enc_loop:
  1339. vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
  1340. vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
  1341. vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
  1342. vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
  1343. vadd.u32 @XMM[4], @XMM[1], @XMM[10]
  1344. vadd.u32 @XMM[5], @XMM[2], @XMM[10]
  1345. vadd.u32 @XMM[6], @XMM[3], @XMM[10]
  1346. vadd.u32 @XMM[7], @XMM[4], @XMM[10]
  1347. vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter
  1348. @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
  1349. @ to flip byte order in 32-bit counter
  1350. vldmia $keysched, {@XMM[9]} @ load round0 key
  1351. #ifndef BSAES_ASM_EXTENDED_KEY
  1352. add r4, $keysched, #0x10 @ pass next round key
  1353. #else
  1354. add r4, $key, #`248+16`
  1355. #endif
  1356. vldmia $ctr, {@XMM[8]} @ .LREVM0SR
  1357. mov r5, $rounds @ pass rounds
  1358. vstmia $fp, {@XMM[10]} @ save next counter
  1359. #ifdef __APPLE__
  1360. mov $const, #:lower16:(.LREVM0SR-.LSR)
  1361. sub $const, $ctr, $const
  1362. #else
  1363. sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
  1364. #endif
  1365. bl _bsaes_encrypt8_alt
  1366. subs $len, $len, #8
  1367. blo .Lctr_enc_loop_done
  1368. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
  1369. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1370. veor @XMM[0], @XMM[8]
  1371. veor @XMM[1], @XMM[9]
  1372. vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
  1373. veor @XMM[4], @XMM[10]
  1374. veor @XMM[6], @XMM[11]
  1375. vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
  1376. veor @XMM[3], @XMM[12]
  1377. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1378. veor @XMM[7], @XMM[13]
  1379. veor @XMM[2], @XMM[14]
  1380. vst1.8 {@XMM[4]}, [$out]!
  1381. veor @XMM[5], @XMM[15]
  1382. vst1.8 {@XMM[6]}, [$out]!
  1383. vmov.i32 @XMM[8], #1 @ compose 1<<96
  1384. vst1.8 {@XMM[3]}, [$out]!
  1385. veor @XMM[9], @XMM[9], @XMM[9]
  1386. vst1.8 {@XMM[7]}, [$out]!
  1387. vext.8 @XMM[8], @XMM[9], @XMM[8], #4
  1388. vst1.8 {@XMM[2]}, [$out]!
  1389. vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
  1390. vst1.8 {@XMM[5]}, [$out]!
  1391. vldmia $fp, {@XMM[0]} @ load counter
  1392. bne .Lctr_enc_loop
  1393. b .Lctr_enc_done
  1394. .align 4
  1395. .Lctr_enc_loop_done:
  1396. add $len, $len, #8
  1397. vld1.8 {@XMM[8]}, [$inp]! @ load input
  1398. veor @XMM[0], @XMM[8]
  1399. vst1.8 {@XMM[0]}, [$out]! @ write output
  1400. cmp $len, #2
  1401. blo .Lctr_enc_done
  1402. vld1.8 {@XMM[9]}, [$inp]!
  1403. veor @XMM[1], @XMM[9]
  1404. vst1.8 {@XMM[1]}, [$out]!
  1405. beq .Lctr_enc_done
  1406. vld1.8 {@XMM[10]}, [$inp]!
  1407. veor @XMM[4], @XMM[10]
  1408. vst1.8 {@XMM[4]}, [$out]!
  1409. cmp $len, #4
  1410. blo .Lctr_enc_done
  1411. vld1.8 {@XMM[11]}, [$inp]!
  1412. veor @XMM[6], @XMM[11]
  1413. vst1.8 {@XMM[6]}, [$out]!
  1414. beq .Lctr_enc_done
  1415. vld1.8 {@XMM[12]}, [$inp]!
  1416. veor @XMM[3], @XMM[12]
  1417. vst1.8 {@XMM[3]}, [$out]!
  1418. cmp $len, #6
  1419. blo .Lctr_enc_done
  1420. vld1.8 {@XMM[13]}, [$inp]!
  1421. veor @XMM[7], @XMM[13]
  1422. vst1.8 {@XMM[7]}, [$out]!
  1423. beq .Lctr_enc_done
  1424. vld1.8 {@XMM[14]}, [$inp]
  1425. veor @XMM[2], @XMM[14]
  1426. vst1.8 {@XMM[2]}, [$out]!
  1427. .Lctr_enc_done:
  1428. vmov.i32 q0, #0
  1429. vmov.i32 q1, #0
  1430. #ifndef BSAES_ASM_EXTENDED_KEY
  1431. .Lctr_enc_bzero: @ wipe key schedule [if any]
  1432. vstmia $keysched!, {q0-q1}
  1433. cmp $keysched, $fp
  1434. bne .Lctr_enc_bzero
  1435. #else
  1436. vstmia $keysched, {q0-q1}
  1437. #endif
  1438. mov sp, $fp
  1439. add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
  1440. VFP_ABI_POP
  1441. ldmia sp!, {r4-r10, pc} @ return
  1442. .align 4
  1443. .Lctr_enc_short:
  1444. ldr ip, [sp] @ ctr pointer is passed on stack
  1445. stmdb sp!, {r4-r8, lr}
  1446. mov r4, $inp @ copy arguments
  1447. mov r5, $out
  1448. mov r6, $len
  1449. mov r7, $key
  1450. ldr r8, [ip, #12] @ load counter LSW
  1451. vld1.8 {@XMM[1]}, [ip] @ load whole counter value
  1452. #ifdef __ARMEL__
  1453. rev r8, r8
  1454. #endif
  1455. sub sp, sp, #0x10
  1456. vst1.8 {@XMM[1]}, [sp] @ copy counter value
  1457. sub sp, sp, #0x10
  1458. .Lctr_enc_short_loop:
  1459. add r0, sp, #0x10 @ input counter value
  1460. mov r1, sp @ output on the stack
  1461. mov r2, r7 @ key
  1462. bl AES_encrypt
  1463. vld1.8 {@XMM[0]}, [r4]! @ load input
  1464. vld1.8 {@XMM[1]}, [sp] @ load encrypted counter
  1465. add r8, r8, #1
  1466. #ifdef __ARMEL__
  1467. rev r0, r8
  1468. str r0, [sp, #0x1c] @ next counter value
  1469. #else
  1470. str r8, [sp, #0x1c] @ next counter value
  1471. #endif
  1472. veor @XMM[0],@XMM[0],@XMM[1]
  1473. vst1.8 {@XMM[0]}, [r5]! @ store output
  1474. subs r6, r6, #1
  1475. bne .Lctr_enc_short_loop
  1476. vmov.i32 q0, #0
  1477. vmov.i32 q1, #0
  1478. vstmia sp!, {q0-q1}
  1479. ldmia sp!, {r4-r8, pc}
  1480. .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
  1481. ___
  1482. }
  1483. {
  1484. ######################################################################
  1485. # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
  1486. # const AES_KEY *key1, const AES_KEY *key2,
  1487. # const unsigned char iv[16]);
  1488. #
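# A minimal calling sketch (illustrative only; key lengths and buffer names
# below are placeholders, not part of this file):
#
#	AES_KEY data_key, tweak_key;
#	AES_set_encrypt_key(k1, 128, &data_key);	/* key1: data key  */
#	AES_set_encrypt_key(k2, 128, &tweak_key);	/* key2: tweak key */
#	bsaes_xts_encrypt(inp, out, len, &data_key, &tweak_key, iv);
#
# len is expected to be at least one full block (16 bytes); a trailing
# partial block is handled with ciphertext stealing (.Lxts_enc_steal below).
# For bsaes_xts_decrypt, key1 is prepared with AES_set_decrypt_key, while
# key2 stays an encryption key because the tweak is always AES-encrypted.
#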
my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
my $const="r6";		# returned by _bsaes_key_convert
my $twmask=@XMM[5];
my @T=@XMM[6..7];
$code.=<<___;
.globl	bsaes_xts_encrypt
.type	bsaes_xts_encrypt,%function
.align	4
bsaes_xts_encrypt:
	mov	ip, sp
	stmdb	sp!, {r4-r10, lr}		@ 0x20
	VFP_ABI_PUSH
	mov	r6, sp				@ future $fp
	mov	$inp, r0
	mov	$out, r1
	mov	$len, r2
	mov	$key, r3
	sub	r0, sp, #0x10			@ 0x10
	bic	r0, #0xf			@ align at 16 bytes
	mov	sp, r0
#ifdef	XTS_CHAIN_TWEAK
	ldr	r0, [ip]			@ pointer to input tweak
#else
	@ generate initial tweak
	ldr	r0, [ip, #4]			@ iv[]
	mov	r1, sp
	ldr	r2, [ip, #0]			@ key2
	bl	AES_encrypt
	mov	r0,sp				@ pointer to initial tweak
#endif
	ldr	$rounds, [$key, #240]		@ get # of rounds
	mov	$fp, r6
#ifndef	BSAES_ASM_EXTENDED_KEY
	@ allocate the key schedule on the stack
	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
	@ add	r12, #`128-32`			@ size of bit-sliced key schedule
	sub	r12, #`32+16`			@ place for tweak[9]
	@ populate the key schedule
	mov	r4, $key			@ pass key
	mov	r5, $rounds			@ pass # of rounds
	mov	sp, r12
	add	r12, #0x90			@ pass key schedule
	bl	_bsaes_key_convert
	veor	@XMM[7], @XMM[7], @XMM[15]	@ fix up last round key
	vstmia	r12, {@XMM[7]}			@ save last round key
#else
	ldr	r12, [$key, #244]
	eors	r12, #1
	beq	0f
	str	r12, [$key, #244]
	mov	r4, $key			@ pass key
	mov	r5, $rounds			@ pass # of rounds
	add	r12, $key, #248			@ pass key schedule
	bl	_bsaes_key_convert
	veor	@XMM[7], @XMM[7], @XMM[15]	@ fix up last round key
	vstmia	r12, {@XMM[7]}
.align	2
0:	sub	sp, #0x90			@ place for tweak[9]
#endif
	vld1.8	{@XMM[8]}, [r0]			@ initial tweak
	adr	$magic, .Lxts_magic
	subs	$len, #0x80
	blo	.Lxts_enc_short
	b	.Lxts_enc_loop
.align	4
.Lxts_enc_loop:
	vldmia	$magic, {$twmask}	@ load XTS magic
	vshr.s64	@T[0], @XMM[8], #63
	mov	r0, sp
	vand	@T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
	vst1.64		{@XMM[$i-1]}, [r0,:128]!
	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	vshr.s64	@T[1], @XMM[$i], #63
	veor		@XMM[$i], @XMM[$i], @T[0]
	vand		@T[1], @T[1], $twmask
___
@T=reverse(@T);
$code.=<<___ if ($i>=10);
	vld1.8		{@XMM[$i-10]}, [$inp]!
___
$code.=<<___ if ($i>=11);
	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
	vadd.u64	@XMM[8], @XMM[15], @XMM[15]
	vst1.64		{@XMM[15]}, [r0,:128]!
	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	veor		@XMM[8], @XMM[8], @T[0]
	vst1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	vld1.8		{@XMM[6]-@XMM[7]}, [$inp]!
	veor		@XMM[5], @XMM[5], @XMM[13]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[6], @XMM[6], @XMM[14]
	mov	r5, $rounds			@ pass rounds
	veor		@XMM[7], @XMM[7], @XMM[15]
	mov	r0, sp
	bl	_bsaes_encrypt8
	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[4], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	veor		@XMM[9], @XMM[6], @XMM[11]
	vld1.64		{@XMM[14]-@XMM[15]}, [r0,:128]!
	veor		@XMM[10], @XMM[3], @XMM[12]
	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
	veor		@XMM[11], @XMM[7], @XMM[13]
	veor		@XMM[12], @XMM[2], @XMM[14]
	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
	veor		@XMM[13], @XMM[5], @XMM[15]
	vst1.8		{@XMM[12]-@XMM[13]}, [$out]!
	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	subs	$len, #0x80
	bpl	.Lxts_enc_loop
.Lxts_enc_short:
	adds	$len, #0x70
	bmi	.Lxts_enc_done
	vldmia	$magic, {$twmask}	@ load XTS magic
	vshr.s64	@T[0], @XMM[8], #63
	mov	r0, sp
	vand	@T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
	vst1.64		{@XMM[$i-1]}, [r0,:128]!
	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	vshr.s64	@T[1], @XMM[$i], #63
	veor		@XMM[$i], @XMM[$i], @T[0]
	vand		@T[1], @T[1], $twmask
___
@T=reverse(@T);
$code.=<<___ if ($i>=10);
	vld1.8		{@XMM[$i-10]}, [$inp]!
	subs	$len, #0x10
	bmi	.Lxts_enc_`$i-9`
___
$code.=<<___ if ($i>=11);
	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
	sub	$len, #0x10
	vst1.64		{@XMM[15]}, [r0,:128]		@ next round tweak
	vld1.8		{@XMM[6]}, [$inp]!
	veor		@XMM[5], @XMM[5], @XMM[13]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[6], @XMM[6], @XMM[14]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp
	bl	_bsaes_encrypt8
	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[4], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	veor		@XMM[9], @XMM[6], @XMM[11]
	vld1.64		{@XMM[14]}, [r0,:128]!
	veor		@XMM[10], @XMM[3], @XMM[12]
	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
	veor		@XMM[11], @XMM[7], @XMM[13]
	veor		@XMM[12], @XMM[2], @XMM[14]
	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
	vst1.8		{@XMM[12]}, [$out]!
	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_enc_done
.align	4
.Lxts_enc_6:
	veor		@XMM[4], @XMM[4], @XMM[12]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[5], @XMM[5], @XMM[13]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp
	bl	_bsaes_encrypt8
	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[4], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	veor		@XMM[9], @XMM[6], @XMM[11]
	veor		@XMM[10], @XMM[3], @XMM[12]
	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
	veor		@XMM[11], @XMM[7], @XMM[13]
	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_enc_done
@ put this in range for both ARM and Thumb mode adr instructions
.align	5
.Lxts_magic:
	.quad	1, 0x87
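	@ {1, 0x87}: 1 carries the bit-63 overflow of the low half into bit 64,
	@ while 0x87 (x^7+x^2+x+1) folds the bit-127 overflow back into the low
	@ half, i.e. this mask pair implements XTS tweak doubling in GF(2^128)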
.align	5
.Lxts_enc_5:
	veor		@XMM[3], @XMM[3], @XMM[11]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[4], @XMM[4], @XMM[12]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp
	bl	_bsaes_encrypt8
	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64		{@XMM[12]}, [r0,:128]!
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[4], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	veor		@XMM[9], @XMM[6], @XMM[11]
	veor		@XMM[10], @XMM[3], @XMM[12]
	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
	vst1.8		{@XMM[10]}, [$out]!
	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_enc_done
.align	4
.Lxts_enc_4:
	veor		@XMM[2], @XMM[2], @XMM[10]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[3], @XMM[3], @XMM[11]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp
	bl	_bsaes_encrypt8
	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[4], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	veor		@XMM[9], @XMM[6], @XMM[11]
	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_enc_done
.align	4
.Lxts_enc_3:
	veor		@XMM[1], @XMM[1], @XMM[9]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[2], @XMM[2], @XMM[10]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp
	bl	_bsaes_encrypt8
	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
	vld1.64		{@XMM[10]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[4], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	vst1.8		{@XMM[8]}, [$out]!
	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_enc_done
.align	4
.Lxts_enc_2:
	veor		@XMM[0], @XMM[0], @XMM[8]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[1], @XMM[1], @XMM[9]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp
	bl	_bsaes_encrypt8
	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_enc_done
.align	4
.Lxts_enc_1:
	mov	r0, sp
	veor	@XMM[0], @XMM[0], @XMM[8]
	mov	r1, sp
	vst1.8	{@XMM[0]}, [sp,:128]
	mov	r2, $key
	mov	r4, $fp				@ preserve fp
	bl	AES_encrypt
	vld1.8	{@XMM[0]}, [sp,:128]
	veor	@XMM[0], @XMM[0], @XMM[8]
	vst1.8	{@XMM[0]}, [$out]!
	mov	$fp, r4
	vmov	@XMM[8], @XMM[9]		@ next round tweak
.Lxts_enc_done:
#ifndef	XTS_CHAIN_TWEAK
	adds	$len, #0x10
	beq	.Lxts_enc_ret
	sub	r6, $out, #0x10
.Lxts_enc_steal:
	ldrb	r0, [$inp], #1
	ldrb	r1, [$out, #-0x10]
	strb	r0, [$out, #-0x10]
	strb	r1, [$out], #1
	subs	$len, #1
	bhi	.Lxts_enc_steal
	vld1.8	{@XMM[0]}, [r6]
	mov	r0, sp
	veor	@XMM[0], @XMM[0], @XMM[8]
	mov	r1, sp
	vst1.8	{@XMM[0]}, [sp,:128]
	mov	r2, $key
	mov	r4, $fp				@ preserve fp
	bl	AES_encrypt
	vld1.8	{@XMM[0]}, [sp,:128]
	veor	@XMM[0], @XMM[0], @XMM[8]
	vst1.8	{@XMM[0]}, [r6]
	mov	$fp, r4
#endif
.Lxts_enc_ret:
	bic	r0, $fp, #0xf
	vmov.i32	q0, #0
	vmov.i32	q1, #0
#ifdef	XTS_CHAIN_TWEAK
	ldr	r1, [$fp, #0x20+VFP_ABI_FRAME]	@ chain tweak
#endif
.Lxts_enc_bzero:			@ wipe key schedule [if any]
	vstmia		sp!, {q0-q1}
	cmp		sp, r0
	bne		.Lxts_enc_bzero
	mov	sp, $fp
#ifdef	XTS_CHAIN_TWEAK
	vst1.8	{@XMM[8]}, [r1]
#endif
	VFP_ABI_POP
	ldmia	sp!, {r4-r10, pc}	@ return
.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
.globl	bsaes_xts_decrypt
.type	bsaes_xts_decrypt,%function
.align	4
bsaes_xts_decrypt:
	mov	ip, sp
	stmdb	sp!, {r4-r10, lr}		@ 0x20
	VFP_ABI_PUSH
	mov	r6, sp				@ future $fp
	mov	$inp, r0
	mov	$out, r1
	mov	$len, r2
	mov	$key, r3
	sub	r0, sp, #0x10			@ 0x10
	bic	r0, #0xf			@ align at 16 bytes
	mov	sp, r0
#ifdef	XTS_CHAIN_TWEAK
	ldr	r0, [ip]			@ pointer to input tweak
#else
	@ generate initial tweak
	ldr	r0, [ip, #4]			@ iv[]
	mov	r1, sp
	ldr	r2, [ip, #0]			@ key2
	bl	AES_encrypt
	mov	r0, sp				@ pointer to initial tweak
#endif
	ldr	$rounds, [$key, #240]		@ get # of rounds
	mov	$fp, r6
#ifndef	BSAES_ASM_EXTENDED_KEY
	@ allocate the key schedule on the stack
	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
	@ add	r12, #`128-32`			@ size of bit-sliced key schedule
	sub	r12, #`32+16`			@ place for tweak[9]
	@ populate the key schedule
	mov	r4, $key			@ pass key
	mov	r5, $rounds			@ pass # of rounds
	mov	sp, r12
	add	r12, #0x90			@ pass key schedule
	bl	_bsaes_key_convert
	add	r4, sp, #0x90
	vldmia	r4, {@XMM[6]}
	vstmia	r12,  {@XMM[15]}		@ save last round key
	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
	vstmia	r4, {@XMM[7]}
#else
	ldr	r12, [$key, #244]
	eors	r12, #1
	beq	0f
	str	r12, [$key, #244]
	mov	r4, $key			@ pass key
	mov	r5, $rounds			@ pass # of rounds
	add	r12, $key, #248			@ pass key schedule
	bl	_bsaes_key_convert
	add	r4, $key, #248
	vldmia	r4, {@XMM[6]}
	vstmia	r12,  {@XMM[15]}		@ save last round key
	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
	vstmia	r4, {@XMM[7]}
.align	2
0:	sub	sp, #0x90			@ place for tweak[9]
#endif
	vld1.8	{@XMM[8]}, [r0]			@ initial tweak
	adr	$magic, .Lxts_magic
#ifndef	XTS_CHAIN_TWEAK
	tst	$len, #0xf			@ if not multiple of 16
	it	ne				@ Thumb2 thing, sanity check in ARM
	subne	$len, #0x10			@ subtract another 16 bytes
#endif
	subs	$len, #0x80
	blo	.Lxts_dec_short
	b	.Lxts_dec_loop
.align	4
.Lxts_dec_loop:
	vldmia	$magic, {$twmask}	@ load XTS magic
	vshr.s64	@T[0], @XMM[8], #63
	mov	r0, sp
	vand	@T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
	vst1.64		{@XMM[$i-1]}, [r0,:128]!
	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	vshr.s64	@T[1], @XMM[$i], #63
	veor		@XMM[$i], @XMM[$i], @T[0]
	vand		@T[1], @T[1], $twmask
___
@T=reverse(@T);
$code.=<<___ if ($i>=10);
	vld1.8		{@XMM[$i-10]}, [$inp]!
___
$code.=<<___ if ($i>=11);
	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
	vadd.u64	@XMM[8], @XMM[15], @XMM[15]
	vst1.64		{@XMM[15]}, [r0,:128]!
	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	veor		@XMM[8], @XMM[8], @T[0]
	vst1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	vld1.8		{@XMM[6]-@XMM[7]}, [$inp]!
	veor		@XMM[5], @XMM[5], @XMM[13]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[6], @XMM[6], @XMM[14]
	mov	r5, $rounds			@ pass rounds
	veor		@XMM[7], @XMM[7], @XMM[15]
	mov	r0, sp
	bl	_bsaes_decrypt8
	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[6], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	veor		@XMM[9], @XMM[4], @XMM[11]
	vld1.64		{@XMM[14]-@XMM[15]}, [r0,:128]!
	veor		@XMM[10], @XMM[2], @XMM[12]
	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
	veor		@XMM[11], @XMM[7], @XMM[13]
	veor		@XMM[12], @XMM[3], @XMM[14]
	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
	veor		@XMM[13], @XMM[5], @XMM[15]
	vst1.8		{@XMM[12]-@XMM[13]}, [$out]!
	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	subs	$len, #0x80
	bpl	.Lxts_dec_loop
.Lxts_dec_short:
	adds	$len, #0x70
	bmi	.Lxts_dec_done
	vldmia	$magic, {$twmask}	@ load XTS magic
	vshr.s64	@T[0], @XMM[8], #63
	mov	r0, sp
	vand	@T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
	vst1.64		{@XMM[$i-1]}, [r0,:128]!
	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	vshr.s64	@T[1], @XMM[$i], #63
	veor		@XMM[$i], @XMM[$i], @T[0]
	vand		@T[1], @T[1], $twmask
___
@T=reverse(@T);
$code.=<<___ if ($i>=10);
	vld1.8		{@XMM[$i-10]}, [$inp]!
	subs	$len, #0x10
	bmi	.Lxts_dec_`$i-9`
___
$code.=<<___ if ($i>=11);
	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
	sub	$len, #0x10
	vst1.64		{@XMM[15]}, [r0,:128]		@ next round tweak
	vld1.8		{@XMM[6]}, [$inp]!
	veor		@XMM[5], @XMM[5], @XMM[13]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[6], @XMM[6], @XMM[14]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp
	bl	_bsaes_decrypt8
	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[6], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	veor		@XMM[9], @XMM[4], @XMM[11]
	vld1.64		{@XMM[14]}, [r0,:128]!
	veor		@XMM[10], @XMM[2], @XMM[12]
	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
	veor		@XMM[11], @XMM[7], @XMM[13]
	veor		@XMM[12], @XMM[3], @XMM[14]
	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
	vst1.8		{@XMM[12]}, [$out]!
	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_dec_done
.align	4
.Lxts_dec_6:
	vst1.64		{@XMM[14]}, [r0,:128]		@ next round tweak
	veor		@XMM[4], @XMM[4], @XMM[12]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[5], @XMM[5], @XMM[13]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp
	bl	_bsaes_decrypt8
	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[6], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	veor		@XMM[9], @XMM[4], @XMM[11]
	veor		@XMM[10], @XMM[2], @XMM[12]
	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
	veor		@XMM[11], @XMM[7], @XMM[13]
	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_dec_done
.align	4
.Lxts_dec_5:
	veor		@XMM[3], @XMM[3], @XMM[11]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[4], @XMM[4], @XMM[12]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp
	bl	_bsaes_decrypt8
	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64		{@XMM[12]}, [r0,:128]!
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[6], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	veor		@XMM[9], @XMM[4], @XMM[11]
	veor		@XMM[10], @XMM[2], @XMM[12]
	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
	vst1.8		{@XMM[10]}, [$out]!
	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_dec_done
.align	4
.Lxts_dec_4:
	veor		@XMM[2], @XMM[2], @XMM[10]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[3], @XMM[3], @XMM[11]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp
	bl	_bsaes_decrypt8
	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[6], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	veor		@XMM[9], @XMM[4], @XMM[11]
	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_dec_done
.align	4
.Lxts_dec_3:
	veor		@XMM[1], @XMM[1], @XMM[9]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[2], @XMM[2], @XMM[10]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp
	bl	_bsaes_decrypt8
	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
	vld1.64		{@XMM[10]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[6], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	vst1.8		{@XMM[8]}, [$out]!
	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_dec_done
.align	4
.Lxts_dec_2:
	veor		@XMM[0], @XMM[0], @XMM[8]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[1], @XMM[1], @XMM[9]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp
	bl	_bsaes_decrypt8
	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_dec_done
.align	4
.Lxts_dec_1:
	mov	r0, sp
	veor	@XMM[0], @XMM[0], @XMM[8]
	mov	r1, sp
	vst1.8	{@XMM[0]}, [sp,:128]
	mov	r5, $magic			@ preserve magic
	mov	r2, $key
	mov	r4, $fp				@ preserve fp
	bl	AES_decrypt
	vld1.8	{@XMM[0]}, [sp,:128]
	veor	@XMM[0], @XMM[0], @XMM[8]
	vst1.8	{@XMM[0]}, [$out]!
	mov	$fp, r4
	mov	$magic, r5
	vmov	@XMM[8], @XMM[9]		@ next round tweak
.Lxts_dec_done:
#ifndef	XTS_CHAIN_TWEAK
	adds	$len, #0x10
	beq	.Lxts_dec_ret
	@ calculate one round of extra tweak for the stolen ciphertext
	vldmia	$magic, {$twmask}
	vshr.s64	@XMM[6], @XMM[8], #63
	vand		@XMM[6], @XMM[6], $twmask
	vadd.u64	@XMM[9], @XMM[8], @XMM[8]
	vswp		`&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
	veor		@XMM[9], @XMM[9], @XMM[6]
	@ perform the final decryption with the last tweak value
	vld1.8	{@XMM[0]}, [$inp]!
	mov	r0, sp
	veor	@XMM[0], @XMM[0], @XMM[9]
	mov	r1, sp
	vst1.8	{@XMM[0]}, [sp,:128]
	mov	r2, $key
	mov	r4, $fp				@ preserve fp
	bl	AES_decrypt
	vld1.8	{@XMM[0]}, [sp,:128]
	veor	@XMM[0], @XMM[0], @XMM[9]
	vst1.8	{@XMM[0]}, [$out]
	mov	r6, $out
.Lxts_dec_steal:
	ldrb	r1, [$out]
	ldrb	r0, [$inp], #1
	strb	r1, [$out, #0x10]
	strb	r0, [$out], #1
	subs	$len, #1
	bhi	.Lxts_dec_steal
	vld1.8	{@XMM[0]}, [r6]
	mov	r0, sp
	veor	@XMM[0], @XMM[8]
	mov	r1, sp
	vst1.8	{@XMM[0]}, [sp,:128]
	mov	r2, $key
	bl	AES_decrypt
	vld1.8	{@XMM[0]}, [sp,:128]
	veor	@XMM[0], @XMM[0], @XMM[8]
	vst1.8	{@XMM[0]}, [r6]
	mov	$fp, r4
#endif
.Lxts_dec_ret:
	bic	r0, $fp, #0xf
	vmov.i32	q0, #0
	vmov.i32	q1, #0
#ifdef	XTS_CHAIN_TWEAK
	ldr	r1, [$fp, #0x20+VFP_ABI_FRAME]	@ chain tweak
#endif
.Lxts_dec_bzero:			@ wipe key schedule [if any]
	vstmia		sp!, {q0-q1}
	cmp		sp, r0
	bne		.Lxts_dec_bzero
	mov	sp, $fp
#ifdef	XTS_CHAIN_TWEAK
	vst1.8	{@XMM[8]}, [r1]
#endif
	VFP_ABI_POP
	ldmia	sp!, {r4-r10, pc}	@ return
.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
$code.=<<___;
#endif
___
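# post-processing: evaluate the `...` arithmetic embedded in the assembly
# (e.g. `128-32`), replay this script's own leading comment block with '#'
# rewritten as the assembler comment character '@', then emit the code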
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

print $code;

close STDOUT;