bsaes-armv7.pl

  1. #! /usr/bin/env perl
  2. # Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. #
  14. # Specific modes and adaptation for Linux kernel by Ard Biesheuvel
  15. # of Linaro. Permission to use under GPL terms is granted.
  16. # ====================================================================
  17. # Bit-sliced AES for ARM NEON
  18. #
  19. # February 2012.
  20. #
  21. # This implementation is a direct adaptation of the bsaes-x86_64 module
  22. # for ARM NEON, except that this module is endian-neutral [in the sense
  23. # that it can be compiled for either endianness] courtesy of vld1.8's
  24. # neutrality. The initial version doesn't implement an interface to
  25. # OpenSSL, only low-level primitives and unsupported entry points, just
  26. # enough to collect performance results, which for the Cortex-A8 core are:
  27. #
  28. # encrypt 19.5 cycles per byte processed with 128-bit key
  29. # decrypt 22.1 cycles per byte processed with 128-bit key
  30. # key conv. 440 cycles per 128-bit key/0.18 of 8x block
  31. #
  32. # Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts one in 19.7,
  33. # which is [much] worse than anticipated (for further details see
  34. # http://www.openssl.org/~appro/Snapdragon-S4.html).
  35. #
  36. # Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
  37. # manages in 20.0 cycles].
  38. #
  39. # When comparing to x86_64 results, keep in mind that the NEON unit is
  40. # [mostly] single-issue and thus can't [fully] benefit from
  41. # instruction-level parallelism. And when comparing to aes-armv4
  42. # results, keep in mind the key schedule conversion overhead (see
  43. # bsaes-x86_64.pl for further details)...
  44. #
  45. # <appro@openssl.org>
  46. # April-August 2013
  47. # Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard.
  48. # $output is the last argument if it looks like a file (it has an extension)
  49. # $flavour is the first argument if it doesn't look like a file
  50. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  51. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  52. if ($flavour && $flavour ne "void") {
  53. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  54. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  55. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  56. die "can't locate arm-xlate.pl";
  57. open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
  58. or die "can't call $xlate: $!";
  59. } else {
  60. $output and open STDOUT,">$output";
  61. }
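# Illustrative invocation (an assumption about typical perlasm usage rather
# than anything stated in this file): something along the lines of
#	perl bsaes-armv7.pl linux32 bsaes-armv7.S
# where the flavour is interpreted by arm-xlate.pl and the trailing argument
# names the generated assembly file; a flavour of "void" skips the translator
# and writes the output directly.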
  62. my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
  63. my @XMM=map("q$_",(0..15));
  64. {
  65. my ($key,$rounds,$const)=("r4","r5","r6");
  66. sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
  67. sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
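# A NEON quad register aliases a pair of double registers, qN = d(2N):d(2N+1),
# so for example &Dlo("q5") returns "d10" and &Dhi("q5") returns "d11"; the two
# helpers above rely on that aliasing to address the halves of a q register.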
  68. sub Sbox {
  69. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  70. # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
  71. my @b=@_[0..7];
  72. my @t=@_[8..11];
  73. my @s=@_[12..15];
  74. &InBasisChange (@b);
  75. &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
  76. &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
  77. }
  78. sub InBasisChange {
  79. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  80. # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
  81. my @b=@_[0..7];
  82. $code.=<<___;
  83. veor @b[2], @b[2], @b[1]
  84. veor @b[5], @b[5], @b[6]
  85. veor @b[3], @b[3], @b[0]
  86. veor @b[6], @b[6], @b[2]
  87. veor @b[5], @b[5], @b[0]
  88. veor @b[6], @b[6], @b[3]
  89. veor @b[3], @b[3], @b[7]
  90. veor @b[7], @b[7], @b[5]
  91. veor @b[3], @b[3], @b[4]
  92. veor @b[4], @b[4], @b[5]
  93. veor @b[2], @b[2], @b[7]
  94. veor @b[3], @b[3], @b[1]
  95. veor @b[1], @b[1], @b[5]
  96. ___
  97. }
  98. sub OutBasisChange {
  99. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  100. # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
  101. my @b=@_[0..7];
  102. $code.=<<___;
  103. veor @b[0], @b[0], @b[6]
  104. veor @b[1], @b[1], @b[4]
  105. veor @b[4], @b[4], @b[6]
  106. veor @b[2], @b[2], @b[0]
  107. veor @b[6], @b[6], @b[1]
  108. veor @b[1], @b[1], @b[5]
  109. veor @b[5], @b[5], @b[3]
  110. veor @b[3], @b[3], @b[7]
  111. veor @b[7], @b[7], @b[5]
  112. veor @b[2], @b[2], @b[5]
  113. veor @b[4], @b[4], @b[7]
  114. ___
  115. }
  116. sub InvSbox {
  117. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  118. # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
  119. my @b=@_[0..7];
  120. my @t=@_[8..11];
  121. my @s=@_[12..15];
  122. &InvInBasisChange (@b);
  123. &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
  124. &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
  125. }
  126. sub InvInBasisChange { # OutBasisChange in reverse (with twist)
  127. my @b=@_[5,1,2,6,3,7,0,4];
  128. $code.=<<___
  129. veor @b[1], @b[1], @b[7]
  130. veor @b[4], @b[4], @b[7]
  131. veor @b[7], @b[7], @b[5]
  132. veor @b[1], @b[1], @b[3]
  133. veor @b[2], @b[2], @b[5]
  134. veor @b[3], @b[3], @b[7]
  135. veor @b[6], @b[6], @b[1]
  136. veor @b[2], @b[2], @b[0]
  137. veor @b[5], @b[5], @b[3]
  138. veor @b[4], @b[4], @b[6]
  139. veor @b[0], @b[0], @b[6]
  140. veor @b[1], @b[1], @b[4]
  141. ___
  142. }
  143. sub InvOutBasisChange { # InBasisChange in reverse
  144. my @b=@_[2,5,7,3,6,1,0,4];
  145. $code.=<<___;
  146. veor @b[1], @b[1], @b[5]
  147. veor @b[2], @b[2], @b[7]
  148. veor @b[3], @b[3], @b[1]
  149. veor @b[4], @b[4], @b[5]
  150. veor @b[7], @b[7], @b[5]
  151. veor @b[3], @b[3], @b[4]
  152. veor @b[5], @b[5], @b[0]
  153. veor @b[3], @b[3], @b[7]
  154. veor @b[6], @b[6], @b[2]
  155. veor @b[2], @b[2], @b[1]
  156. veor @b[6], @b[6], @b[3]
  157. veor @b[3], @b[3], @b[0]
  158. veor @b[5], @b[5], @b[6]
  159. ___
  160. }
  161. sub Mul_GF4 {
  162. #;*************************************************************
  163. #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
  164. #;*************************************************************
  165. my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
  166. $code.=<<___;
  167. veor $t0, $y0, $y1
  168. vand $t0, $t0, $x0
  169. veor $x0, $x0, $x1
  170. vand $t1, $x1, $y0
  171. vand $x0, $x0, $y1
  172. veor $x1, $t1, $t0
  173. veor $x0, $x0, $t1
  174. ___
  175. }
  176. sub Mul_GF4_N { # not used, see next subroutine
  177. # multiply and scale by N
  178. my ($x0,$x1,$y0,$y1,$t0)=@_;
  179. $code.=<<___;
  180. veor $t0, $y0, $y1
  181. vand $t0, $t0, $x0
  182. veor $x0, $x0, $x1
  183. vand $x1, $x1, $y0
  184. vand $x0, $x0, $y1
  185. veor $x1, $x1, $x0
  186. veor $x0, $x0, $t0
  187. ___
  188. }
  189. sub Mul_GF4_N_GF4 {
  190. # interleaved Mul_GF4_N and Mul_GF4
  191. my ($x0,$x1,$y0,$y1,$t0,
  192. $x2,$x3,$y2,$y3,$t1)=@_;
  193. $code.=<<___;
  194. veor $t0, $y0, $y1
  195. veor $t1, $y2, $y3
  196. vand $t0, $t0, $x0
  197. vand $t1, $t1, $x2
  198. veor $x0, $x0, $x1
  199. veor $x2, $x2, $x3
  200. vand $x1, $x1, $y0
  201. vand $x3, $x3, $y2
  202. vand $x0, $x0, $y1
  203. vand $x2, $x2, $y3
  204. veor $x1, $x1, $x0
  205. veor $x2, $x2, $x3
  206. veor $x0, $x0, $t0
  207. veor $x3, $x3, $t1
  208. ___
  209. }
  210. sub Mul_GF16_2 {
  211. my @x=@_[0..7];
  212. my @y=@_[8..11];
  213. my @t=@_[12..15];
  214. $code.=<<___;
  215. veor @t[0], @x[0], @x[2]
  216. veor @t[1], @x[1], @x[3]
  217. ___
  218. &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
  219. $code.=<<___;
  220. veor @y[0], @y[0], @y[2]
  221. veor @y[1], @y[1], @y[3]
  222. ___
  223. Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
  224. @x[2], @x[3], @y[2], @y[3], @t[2]);
  225. $code.=<<___;
  226. veor @x[0], @x[0], @t[0]
  227. veor @x[2], @x[2], @t[0]
  228. veor @x[1], @x[1], @t[1]
  229. veor @x[3], @x[3], @t[1]
  230. veor @t[0], @x[4], @x[6]
  231. veor @t[1], @x[5], @x[7]
  232. ___
  233. &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
  234. @x[6], @x[7], @y[2], @y[3], @t[2]);
  235. $code.=<<___;
  236. veor @y[0], @y[0], @y[2]
  237. veor @y[1], @y[1], @y[3]
  238. ___
  239. &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
  240. $code.=<<___;
  241. veor @x[4], @x[4], @t[0]
  242. veor @x[6], @x[6], @t[0]
  243. veor @x[5], @x[5], @t[1]
  244. veor @x[7], @x[7], @t[1]
  245. ___
  246. }
  247. sub Inv_GF256 {
  248. #;********************************************************************
  249. #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
  250. #;********************************************************************
  251. my @x=@_[0..7];
  252. my @t=@_[8..11];
  253. my @s=@_[12..15];
  254. # direct optimizations from hardware
  255. $code.=<<___;
  256. veor @t[3], @x[4], @x[6]
  257. veor @t[2], @x[5], @x[7]
  258. veor @t[1], @x[1], @x[3]
  259. veor @s[1], @x[7], @x[6]
  260. vmov @t[0], @t[2]
  261. veor @s[0], @x[0], @x[2]
  262. vorr @t[2], @t[2], @t[1]
  263. veor @s[3], @t[3], @t[0]
  264. vand @s[2], @t[3], @s[0]
  265. vorr @t[3], @t[3], @s[0]
  266. veor @s[0], @s[0], @t[1]
  267. vand @t[0], @t[0], @t[1]
  268. veor @t[1], @x[3], @x[2]
  269. vand @s[3], @s[3], @s[0]
  270. vand @s[1], @s[1], @t[1]
  271. veor @t[1], @x[4], @x[5]
  272. veor @s[0], @x[1], @x[0]
  273. veor @t[3], @t[3], @s[1]
  274. veor @t[2], @t[2], @s[1]
  275. vand @s[1], @t[1], @s[0]
  276. vorr @t[1], @t[1], @s[0]
  277. veor @t[3], @t[3], @s[3]
  278. veor @t[0], @t[0], @s[1]
  279. veor @t[2], @t[2], @s[2]
  280. veor @t[1], @t[1], @s[3]
  281. veor @t[0], @t[0], @s[2]
  282. vand @s[0], @x[7], @x[3]
  283. veor @t[1], @t[1], @s[2]
  284. vand @s[1], @x[6], @x[2]
  285. vand @s[2], @x[5], @x[1]
  286. vorr @s[3], @x[4], @x[0]
  287. veor @t[3], @t[3], @s[0]
  288. veor @t[1], @t[1], @s[2]
  289. veor @t[0], @t[0], @s[3]
  290. veor @t[2], @t[2], @s[1]
  291. @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
  292. @ new smaller inversion
  293. vand @s[2], @t[3], @t[1]
  294. vmov @s[0], @t[0]
  295. veor @s[1], @t[2], @s[2]
  296. veor @s[3], @t[0], @s[2]
  297. veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
  298. vbsl @s[1], @t[1], @t[0]
  299. vbsl @s[3], @t[3], @t[2]
  300. veor @t[3], @t[3], @t[2]
  301. vbsl @s[0], @s[1], @s[2]
  302. vbsl @t[0], @s[2], @s[1]
  303. vand @s[2], @s[0], @s[3]
  304. veor @t[1], @t[1], @t[0]
  305. veor @s[2], @s[2], @t[3]
  306. ___
  307. # output in s3, s2, s1, t1
  308. # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
  309. # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
  310. &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
  311. ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
  312. }
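# Note: Inv_GF256 above inverts all bytes in parallel by treating GF(2^8) as a
# tower of extensions over GF(2^4) and GF(2^2) (hence the Mul_GF4/Mul_GF16_2
# building blocks), in the style of the Käsper-Schwabe bit-sliced AES that the
# parent bsaes-x86_64 module is based on.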
  313. # AES linear components
  314. sub ShiftRows {
  315. my @x=@_[0..7];
  316. my @t=@_[8..11];
  317. my $mask=pop;
  318. $code.=<<___;
  319. vldmia $key!, {@t[0]-@t[3]}
  320. veor @t[0], @t[0], @x[0]
  321. veor @t[1], @t[1], @x[1]
  322. vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
  323. vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
  324. vldmia $key!, {@t[0]}
  325. veor @t[2], @t[2], @x[2]
  326. vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
  327. vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
  328. vldmia $key!, {@t[1]}
  329. veor @t[3], @t[3], @x[3]
  330. vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
  331. vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
  332. vldmia $key!, {@t[2]}
  333. vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
  334. vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
  335. vldmia $key!, {@t[3]}
  336. veor @t[0], @t[0], @x[4]
  337. veor @t[1], @t[1], @x[5]
  338. vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
  339. vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
  340. veor @t[2], @t[2], @x[6]
  341. vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
  342. vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
  343. veor @t[3], @t[3], @x[7]
  344. vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
  345. vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
  346. vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
  347. vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
  348. ___
  349. }
  350. sub MixColumns {
  351. # modified to emit output in order suitable for feeding back to aesenc[last]
  352. my @x=@_[0..7];
  353. my @t=@_[8..15];
  354. my $inv=@_[16]; # optional
  355. $code.=<<___;
  356. vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
  357. vext.8 @t[1], @x[1], @x[1], #12
  358. veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
  359. vext.8 @t[2], @x[2], @x[2], #12
  360. veor @x[1], @x[1], @t[1]
  361. vext.8 @t[3], @x[3], @x[3], #12
  362. veor @x[2], @x[2], @t[2]
  363. vext.8 @t[4], @x[4], @x[4], #12
  364. veor @x[3], @x[3], @t[3]
  365. vext.8 @t[5], @x[5], @x[5], #12
  366. veor @x[4], @x[4], @t[4]
  367. vext.8 @t[6], @x[6], @x[6], #12
  368. veor @x[5], @x[5], @t[5]
  369. vext.8 @t[7], @x[7], @x[7], #12
  370. veor @x[6], @x[6], @t[6]
  371. veor @t[1], @t[1], @x[0]
  372. veor @x[7], @x[7], @t[7]
  373. vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
  374. veor @t[2], @t[2], @x[1]
  375. veor @t[0], @t[0], @x[7]
  376. veor @t[1], @t[1], @x[7]
  377. vext.8 @x[1], @x[1], @x[1], #8
  378. veor @t[5], @t[5], @x[4]
  379. veor @x[0], @x[0], @t[0]
  380. veor @t[6], @t[6], @x[5]
  381. veor @x[1], @x[1], @t[1]
  382. vext.8 @t[0], @x[4], @x[4], #8
  383. veor @t[4], @t[4], @x[3]
  384. vext.8 @t[1], @x[5], @x[5], #8
  385. veor @t[7], @t[7], @x[6]
  386. vext.8 @x[4], @x[3], @x[3], #8
  387. veor @t[3], @t[3], @x[2]
  388. vext.8 @x[5], @x[7], @x[7], #8
  389. veor @t[4], @t[4], @x[7]
  390. vext.8 @x[3], @x[6], @x[6], #8
  391. veor @t[3], @t[3], @x[7]
  392. vext.8 @x[6], @x[2], @x[2], #8
  393. veor @x[7], @t[1], @t[5]
  394. ___
  395. $code.=<<___ if (!$inv);
  396. veor @x[2], @t[0], @t[4]
  397. veor @x[4], @x[4], @t[3]
  398. veor @x[5], @x[5], @t[7]
  399. veor @x[3], @x[3], @t[6]
  400. @ vmov @x[2], @t[0]
  401. veor @x[6], @x[6], @t[2]
  402. @ vmov @x[7], @t[1]
  403. ___
  404. $code.=<<___ if ($inv);
  405. veor @t[3], @t[3], @x[4]
  406. veor @x[5], @x[5], @t[7]
  407. veor @x[2], @x[3], @t[6]
  408. veor @x[3], @t[0], @t[4]
  409. veor @x[4], @x[6], @t[2]
  410. vmov @x[6], @t[3]
  411. @ vmov @x[7], @t[1]
  412. ___
  413. }
  414. sub InvMixColumns_orig {
  415. my @x=@_[0..7];
  416. my @t=@_[8..15];
  417. $code.=<<___;
  418. @ multiplication by 0x0e
  419. vext.8 @t[7], @x[7], @x[7], #12
  420. vmov @t[2], @x[2]
  421. veor @x[2], @x[2], @x[5] @ 2 5
  422. veor @x[7], @x[7], @x[5] @ 7 5
  423. vext.8 @t[0], @x[0], @x[0], #12
  424. vmov @t[5], @x[5]
  425. veor @x[5], @x[5], @x[0] @ 5 0 [1]
  426. veor @x[0], @x[0], @x[1] @ 0 1
  427. vext.8 @t[1], @x[1], @x[1], #12
  428. veor @x[1], @x[1], @x[2] @ 1 25
  429. veor @x[0], @x[0], @x[6] @ 01 6 [2]
  430. vext.8 @t[3], @x[3], @x[3], #12
  431. veor @x[1], @x[1], @x[3] @ 125 3 [4]
  432. veor @x[2], @x[2], @x[0] @ 25 016 [3]
  433. veor @x[3], @x[3], @x[7] @ 3 75
  434. veor @x[7], @x[7], @x[6] @ 75 6 [0]
  435. vext.8 @t[6], @x[6], @x[6], #12
  436. vmov @t[4], @x[4]
  437. veor @x[6], @x[6], @x[4] @ 6 4
  438. veor @x[4], @x[4], @x[3] @ 4 375 [6]
  439. veor @x[3], @x[3], @x[7] @ 375 756=36
  440. veor @x[6], @x[6], @t[5] @ 64 5 [7]
  441. veor @x[3], @x[3], @t[2] @ 36 2
  442. vext.8 @t[5], @t[5], @t[5], #12
  443. veor @x[3], @x[3], @t[4] @ 362 4 [5]
  444. ___
  445. my @y = @x[7,5,0,2,1,3,4,6];
  446. $code.=<<___;
  447. @ multiplication by 0x0b
  448. veor @y[1], @y[1], @y[0]
  449. veor @y[0], @y[0], @t[0]
  450. vext.8 @t[2], @t[2], @t[2], #12
  451. veor @y[1], @y[1], @t[1]
  452. veor @y[0], @y[0], @t[5]
  453. vext.8 @t[4], @t[4], @t[4], #12
  454. veor @y[1], @y[1], @t[6]
  455. veor @y[0], @y[0], @t[7]
  456. veor @t[7], @t[7], @t[6] @ clobber t[7]
  457. veor @y[3], @y[3], @t[0]
  458. veor @y[1], @y[1], @y[0]
  459. vext.8 @t[0], @t[0], @t[0], #12
  460. veor @y[2], @y[2], @t[1]
  461. veor @y[4], @y[4], @t[1]
  462. vext.8 @t[1], @t[1], @t[1], #12
  463. veor @y[2], @y[2], @t[2]
  464. veor @y[3], @y[3], @t[2]
  465. veor @y[5], @y[5], @t[2]
  466. veor @y[2], @y[2], @t[7]
  467. vext.8 @t[2], @t[2], @t[2], #12
  468. veor @y[3], @y[3], @t[3]
  469. veor @y[6], @y[6], @t[3]
  470. veor @y[4], @y[4], @t[3]
  471. veor @y[7], @y[7], @t[4]
  472. vext.8 @t[3], @t[3], @t[3], #12
  473. veor @y[5], @y[5], @t[4]
  474. veor @y[7], @y[7], @t[7]
  475. veor @t[7], @t[7], @t[5] @ clobber t[7] even more
  476. veor @y[3], @y[3], @t[5]
  477. veor @y[4], @y[4], @t[4]
  478. veor @y[5], @y[5], @t[7]
  479. vext.8 @t[4], @t[4], @t[4], #12
  480. veor @y[6], @y[6], @t[7]
  481. veor @y[4], @y[4], @t[7]
  482. veor @t[7], @t[7], @t[5]
  483. vext.8 @t[5], @t[5], @t[5], #12
  484. @ multiplication by 0x0d
  485. veor @y[4], @y[4], @y[7]
  486. veor @t[7], @t[7], @t[6] @ restore t[7]
  487. veor @y[7], @y[7], @t[4]
  488. vext.8 @t[6], @t[6], @t[6], #12
  489. veor @y[2], @y[2], @t[0]
  490. veor @y[7], @y[7], @t[5]
  491. vext.8 @t[7], @t[7], @t[7], #12
  492. veor @y[2], @y[2], @t[2]
  493. veor @y[3], @y[3], @y[1]
  494. veor @y[1], @y[1], @t[1]
  495. veor @y[0], @y[0], @t[0]
  496. veor @y[3], @y[3], @t[0]
  497. veor @y[1], @y[1], @t[5]
  498. veor @y[0], @y[0], @t[5]
  499. vext.8 @t[0], @t[0], @t[0], #12
  500. veor @y[1], @y[1], @t[7]
  501. veor @y[0], @y[0], @t[6]
  502. veor @y[3], @y[3], @y[1]
  503. veor @y[4], @y[4], @t[1]
  504. vext.8 @t[1], @t[1], @t[1], #12
  505. veor @y[7], @y[7], @t[7]
  506. veor @y[4], @y[4], @t[2]
  507. veor @y[5], @y[5], @t[2]
  508. veor @y[2], @y[2], @t[6]
  509. veor @t[6], @t[6], @t[3] @ clobber t[6]
  510. vext.8 @t[2], @t[2], @t[2], #12
  511. veor @y[4], @y[4], @y[7]
  512. veor @y[3], @y[3], @t[6]
  513. veor @y[6], @y[6], @t[6]
  514. veor @y[5], @y[5], @t[5]
  515. vext.8 @t[5], @t[5], @t[5], #12
  516. veor @y[6], @y[6], @t[4]
  517. vext.8 @t[4], @t[4], @t[4], #12
  518. veor @y[5], @y[5], @t[6]
  519. veor @y[6], @y[6], @t[7]
  520. vext.8 @t[7], @t[7], @t[7], #12
  521. veor @t[6], @t[6], @t[3] @ restore t[6]
  522. vext.8 @t[3], @t[3], @t[3], #12
  523. @ multiplication by 0x09
  524. veor @y[4], @y[4], @y[1]
  525. veor @t[1], @t[1], @y[1] @ t[1]=y[1]
  526. veor @t[0], @t[0], @t[5] @ clobber t[0]
  527. vext.8 @t[6], @t[6], @t[6], #12
  528. veor @t[1], @t[1], @t[5]
  529. veor @y[3], @y[3], @t[0]
  530. veor @t[0], @t[0], @y[0] @ t[0]=y[0]
  531. veor @t[1], @t[1], @t[6]
  532. veor @t[6], @t[6], @t[7] @ clobber t[6]
  533. veor @y[4], @y[4], @t[1]
  534. veor @y[7], @y[7], @t[4]
  535. veor @y[6], @y[6], @t[3]
  536. veor @y[5], @y[5], @t[2]
  537. veor @t[4], @t[4], @y[4] @ t[4]=y[4]
  538. veor @t[3], @t[3], @y[3] @ t[3]=y[3]
  539. veor @t[5], @t[5], @y[5] @ t[5]=y[5]
  540. veor @t[2], @t[2], @y[2] @ t[2]=y[2]
  541. veor @t[3], @t[3], @t[7]
  542. veor @XMM[5], @t[5], @t[6]
  543. veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
  544. veor @XMM[2], @t[2], @t[6]
  545. veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
  546. vmov @XMM[0], @t[0]
  547. vmov @XMM[1], @t[1]
  548. @ vmov @XMM[2], @t[2]
  549. vmov @XMM[3], @t[3]
  550. vmov @XMM[4], @t[4]
  551. @ vmov @XMM[5], @t[5]
  552. @ vmov @XMM[6], @t[6]
  553. @ vmov @XMM[7], @t[7]
  554. ___
  555. }
  556. sub InvMixColumns {
  557. my @x=@_[0..7];
  558. my @t=@_[8..15];
  559. # Thanks to Jussi Kivilinna for providing pointer to
  560. #
  561. # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
  562. # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
  563. # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
  564. # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
  565. $code.=<<___;
  566. @ multiplication by 0x05-0x00-0x04-0x00
  567. vext.8 @t[0], @x[0], @x[0], #8
  568. vext.8 @t[6], @x[6], @x[6], #8
  569. vext.8 @t[7], @x[7], @x[7], #8
  570. veor @t[0], @t[0], @x[0]
  571. vext.8 @t[1], @x[1], @x[1], #8
  572. veor @t[6], @t[6], @x[6]
  573. vext.8 @t[2], @x[2], @x[2], #8
  574. veor @t[7], @t[7], @x[7]
  575. vext.8 @t[3], @x[3], @x[3], #8
  576. veor @t[1], @t[1], @x[1]
  577. vext.8 @t[4], @x[4], @x[4], #8
  578. veor @t[2], @t[2], @x[2]
  579. vext.8 @t[5], @x[5], @x[5], #8
  580. veor @t[3], @t[3], @x[3]
  581. veor @t[4], @t[4], @x[4]
  582. veor @t[5], @t[5], @x[5]
  583. veor @x[0], @x[0], @t[6]
  584. veor @x[1], @x[1], @t[6]
  585. veor @x[2], @x[2], @t[0]
  586. veor @x[4], @x[4], @t[2]
  587. veor @x[3], @x[3], @t[1]
  588. veor @x[1], @x[1], @t[7]
  589. veor @x[2], @x[2], @t[7]
  590. veor @x[4], @x[4], @t[6]
  591. veor @x[5], @x[5], @t[3]
  592. veor @x[3], @x[3], @t[6]
  593. veor @x[6], @x[6], @t[4]
  594. veor @x[4], @x[4], @t[7]
  595. veor @x[5], @x[5], @t[7]
  596. veor @x[7], @x[7], @t[5]
  597. ___
  598. &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
  599. }
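# The factorisation relied on above can be checked independently: over GF(2^8)
# modulo the AES polynomial 0x11b, the 0e-0b-0d-09 circulant equals the
# 02-03-01-01 MixColumns circulant multiplied by the 05-00-04-00 circulant.
# The two helpers below are an illustrative sketch of that check only; their
# names are inventions local to this commentary and nothing in the generator
# calls them.
sub _gf256_mul {		# GF(2^8) multiplication modulo x^8+x^4+x^3+x+1
	my ($a,$b)=@_;
	my $r=0;
	for (1..8) {
		$r ^= $a if ($b & 1);
		$a = ($a<<1) ^ (($a & 0x80) ? 0x11b : 0);
		$b >>= 1;
	}
	return $r & 0xff;
}
sub _check_invmix_factorisation {	# returns 1 iff the identity holds
	my @mc  = (0x02,0x03,0x01,0x01);	# MixColumns circulant
	my @aux = (0x05,0x00,0x04,0x00);	# auxiliary 05-00-04-00 circulant
	my @inv = (0x0e,0x0b,0x0d,0x09);	# InvMixColumns circulant
	for my $i (0..3) {
		for my $j (0..3) {
			my $acc=0;
			$acc ^= _gf256_mul($mc[($_-$i)&3], $aux[($j-$_)&3]) for (0..3);
			return 0 if ($acc != $inv[($j-$i)&3]);
		}
	}
	return 1;
}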
  600. sub swapmove {
  601. my ($a,$b,$n,$mask,$t)=@_;
  602. $code.=<<___;
  603. vshr.u64 $t, $b, #$n
  604. veor $t, $t, $a
  605. vand $t, $t, $mask
  606. veor $a, $a, $t
  607. vshl.u64 $t, $t, #$n
  608. veor $b, $b, $t
  609. ___
  610. }
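# Scalar model of the swapmove primitive above, included for illustration only
# (it mirrors the vshr/veor/vand/veor/vshl/veor sequence one-to-one and is
# never called by the generator): it exchanges the bits of $a selected by
# $mask with the bits of $b that sit $n positions higher.
sub _swapmove_model {
	my ($a,$b,$n,$mask)=@_;
	my $t = (($b >> $n) ^ $a) & $mask;
	$a ^= $t;
	$b ^= $t << $n;
	return ($a,$b);
}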
  611. sub swapmove2x {
  612. my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
  613. $code.=<<___;
  614. vshr.u64 $t0, $b0, #$n
  615. vshr.u64 $t1, $b1, #$n
  616. veor $t0, $t0, $a0
  617. veor $t1, $t1, $a1
  618. vand $t0, $t0, $mask
  619. vand $t1, $t1, $mask
  620. veor $a0, $a0, $t0
  621. vshl.u64 $t0, $t0, #$n
  622. veor $a1, $a1, $t1
  623. vshl.u64 $t1, $t1, #$n
  624. veor $b0, $b0, $t0
  625. veor $b1, $b1, $t1
  626. ___
  627. }
  628. sub bitslice {
  629. my @x=reverse(@_[0..7]);
  630. my ($t0,$t1,$t2,$t3)=@_[8..11];
  631. $code.=<<___;
  632. vmov.i8 $t0,#0x55 @ compose .LBS0
  633. vmov.i8 $t1,#0x33 @ compose .LBS1
  634. ___
  635. &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
  636. &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
  637. $code.=<<___;
  638. vmov.i8 $t0,#0x0f @ compose .LBS2
  639. ___
  640. &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
  641. &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
  642. &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
  643. &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
  644. }
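# The three swapmove2x passes above (strides 1, 2 and 4 with masks 0x55, 0x33
# and 0x0f) amount to an 8x8 bit-matrix transpose across the eight q registers:
# they convert between the normal byte-oriented layout and the bit-sliced
# layout (one bit position per register) that the S-box circuits operate on.
# The same routine is used in both directions, at the entry and exit of the
# 8-block encrypt/decrypt cores below.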
  645. $code.=<<___;
  646. #ifndef __KERNEL__
  647. # include "arm_arch.h"
  648. # define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
  649. # define VFP_ABI_POP vldmia sp!,{d8-d15}
  650. # define VFP_ABI_FRAME 0x40
  651. #else
  652. # define VFP_ABI_PUSH
  653. # define VFP_ABI_POP
  654. # define VFP_ABI_FRAME 0
  655. # define BSAES_ASM_EXTENDED_KEY
  656. # define XTS_CHAIN_TWEAK
  657. # define __ARM_ARCH__ __LINUX_ARM_ARCH__
  658. # define __ARM_MAX_ARCH__ 7
  659. #endif
  660. #ifdef __thumb__
  661. # define adrl adr
  662. #endif
  663. #if __ARM_MAX_ARCH__>=7
  664. .arch armv7-a
  665. .fpu neon
  666. .syntax unified @ ARMv7-capable assembler is expected to handle this
  667. #if defined(__thumb2__) && !defined(__APPLE__)
  668. .thumb
  669. #else
  670. .code 32
  671. # undef __thumb2__
  672. #endif
  673. .text
  674. .type _bsaes_decrypt8,%function
  675. .align 4
  676. _bsaes_decrypt8:
  677. adr $const,.
  678. vldmia $key!, {@XMM[9]} @ round 0 key
  679. #if defined(__thumb2__) || defined(__APPLE__)
  680. adr $const,.LM0ISR
  681. #else
  682. add $const,$const,#.LM0ISR-_bsaes_decrypt8
  683. #endif
  684. vldmia $const!, {@XMM[8]} @ .LM0ISR
  685. veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
  686. veor @XMM[11], @XMM[1], @XMM[9]
  687. vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
  688. vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
  689. veor @XMM[12], @XMM[2], @XMM[9]
  690. vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
  691. vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
  692. veor @XMM[13], @XMM[3], @XMM[9]
  693. vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
  694. vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
  695. veor @XMM[14], @XMM[4], @XMM[9]
  696. vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
  697. vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
  698. veor @XMM[15], @XMM[5], @XMM[9]
  699. vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
  700. vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
  701. veor @XMM[10], @XMM[6], @XMM[9]
  702. vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
  703. vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
  704. veor @XMM[11], @XMM[7], @XMM[9]
  705. vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
  706. vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
  707. vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
  708. vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
  709. ___
  710. &bitslice (@XMM[0..7, 8..11]);
  711. $code.=<<___;
  712. sub $rounds,$rounds,#1
  713. b .Ldec_sbox
  714. .align 4
  715. .Ldec_loop:
  716. ___
  717. &ShiftRows (@XMM[0..7, 8..12]);
  718. $code.=".Ldec_sbox:\n";
  719. &InvSbox (@XMM[0..7, 8..15]);
  720. $code.=<<___;
  721. subs $rounds,$rounds,#1
  722. bcc .Ldec_done
  723. ___
  724. &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
  725. $code.=<<___;
  726. vldmia $const, {@XMM[12]} @ .LISR
  727. ite eq @ Thumb2 thing, sanity check in ARM
  728. addeq $const,$const,#0x10
  729. bne .Ldec_loop
  730. vldmia $const, {@XMM[12]} @ .LISRM0
  731. b .Ldec_loop
  732. .align 4
  733. .Ldec_done:
  734. ___
  735. &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
  736. $code.=<<___;
  737. vldmia $key, {@XMM[8]} @ last round key
  738. veor @XMM[6], @XMM[6], @XMM[8]
  739. veor @XMM[4], @XMM[4], @XMM[8]
  740. veor @XMM[2], @XMM[2], @XMM[8]
  741. veor @XMM[7], @XMM[7], @XMM[8]
  742. veor @XMM[3], @XMM[3], @XMM[8]
  743. veor @XMM[5], @XMM[5], @XMM[8]
  744. veor @XMM[0], @XMM[0], @XMM[8]
  745. veor @XMM[1], @XMM[1], @XMM[8]
  746. bx lr
  747. .size _bsaes_decrypt8,.-_bsaes_decrypt8
  748. .type _bsaes_const,%object
  749. .align 6
  750. _bsaes_const:
  751. .LM0ISR: @ InvShiftRows constants
  752. .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
  753. .LISR:
  754. .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
  755. .LISRM0:
  756. .quad 0x01040b0e0205080f, 0x0306090c00070a0d
  757. .LM0SR: @ ShiftRows constants
  758. .quad 0x0a0e02060f03070b, 0x0004080c05090d01
  759. .LSR:
  760. .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
  761. .LSRM0:
  762. .quad 0x0304090e00050a0f, 0x01060b0c0207080d
  763. .LM0:
  764. .quad 0x02060a0e03070b0f, 0x0004080c0105090d
  765. .LREVM0SR:
  766. .quad 0x090d01050c000408, 0x03070b0f060a0e02
  767. .asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
  768. .align 6
  769. .size _bsaes_const,.-_bsaes_const
  770. .type _bsaes_encrypt8,%function
  771. .align 4
  772. _bsaes_encrypt8:
  773. adr $const,.
  774. vldmia $key!, {@XMM[9]} @ round 0 key
  775. #if defined(__thumb2__) || defined(__APPLE__)
  776. adr $const,.LM0SR
  777. #else
  778. sub $const,$const,#_bsaes_encrypt8-.LM0SR
  779. #endif
  780. vldmia $const!, {@XMM[8]} @ .LM0SR
  781. _bsaes_encrypt8_alt:
  782. veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
  783. veor @XMM[11], @XMM[1], @XMM[9]
  784. vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
  785. vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
  786. veor @XMM[12], @XMM[2], @XMM[9]
  787. vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
  788. vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
  789. veor @XMM[13], @XMM[3], @XMM[9]
  790. vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
  791. vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
  792. veor @XMM[14], @XMM[4], @XMM[9]
  793. vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
  794. vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
  795. veor @XMM[15], @XMM[5], @XMM[9]
  796. vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
  797. vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
  798. veor @XMM[10], @XMM[6], @XMM[9]
  799. vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
  800. vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
  801. veor @XMM[11], @XMM[7], @XMM[9]
  802. vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
  803. vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
  804. vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
  805. vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
  806. _bsaes_encrypt8_bitslice:
  807. ___
  808. &bitslice (@XMM[0..7, 8..11]);
  809. $code.=<<___;
  810. sub $rounds,$rounds,#1
  811. b .Lenc_sbox
  812. .align 4
  813. .Lenc_loop:
  814. ___
  815. &ShiftRows (@XMM[0..7, 8..12]);
  816. $code.=".Lenc_sbox:\n";
  817. &Sbox (@XMM[0..7, 8..15]);
  818. $code.=<<___;
  819. subs $rounds,$rounds,#1
  820. bcc .Lenc_done
  821. ___
  822. &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
  823. $code.=<<___;
  824. vldmia $const, {@XMM[12]} @ .LSR
  825. ite eq @ Thumb2 thing, sanity check in ARM
  826. addeq $const,$const,#0x10
  827. bne .Lenc_loop
  828. vldmia $const, {@XMM[12]} @ .LSRM0
  829. b .Lenc_loop
  830. .align 4
  831. .Lenc_done:
  832. ___
  833. # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
  834. &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
  835. $code.=<<___;
  836. vldmia $key, {@XMM[8]} @ last round key
  837. veor @XMM[4], @XMM[4], @XMM[8]
  838. veor @XMM[6], @XMM[6], @XMM[8]
  839. veor @XMM[3], @XMM[3], @XMM[8]
  840. veor @XMM[7], @XMM[7], @XMM[8]
  841. veor @XMM[2], @XMM[2], @XMM[8]
  842. veor @XMM[5], @XMM[5], @XMM[8]
  843. veor @XMM[0], @XMM[0], @XMM[8]
  844. veor @XMM[1], @XMM[1], @XMM[8]
  845. bx lr
  846. .size _bsaes_encrypt8,.-_bsaes_encrypt8
  847. ___
  848. }
  849. {
  850. my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
  851. sub bitslice_key {
  852. my @x=reverse(@_[0..7]);
  853. my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
  854. &swapmove (@x[0,1],1,$bs0,$t2,$t3);
  855. $code.=<<___;
  856. @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
  857. vmov @x[2], @x[0]
  858. vmov @x[3], @x[1]
  859. ___
  860. #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
  861. &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
  862. $code.=<<___;
  863. @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
  864. vmov @x[4], @x[0]
  865. vmov @x[6], @x[2]
  866. vmov @x[5], @x[1]
  867. vmov @x[7], @x[3]
  868. ___
  869. &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
  870. &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
  871. }
  872. $code.=<<___;
  873. .type _bsaes_key_convert,%function
  874. .align 4
  875. _bsaes_key_convert:
  876. adr $const,.
  877. vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
  878. #if defined(__thumb2__) || defined(__APPLE__)
  879. adr $const,.LM0
  880. #else
  881. sub $const,$const,#_bsaes_key_convert-.LM0
  882. #endif
  883. vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
  884. vmov.i8 @XMM[8], #0x01 @ bit masks
  885. vmov.i8 @XMM[9], #0x02
  886. vmov.i8 @XMM[10], #0x04
  887. vmov.i8 @XMM[11], #0x08
  888. vmov.i8 @XMM[12], #0x10
  889. vmov.i8 @XMM[13], #0x20
  890. vldmia $const, {@XMM[14]} @ .LM0
  891. #ifdef __ARMEL__
  892. vrev32.8 @XMM[7], @XMM[7]
  893. vrev32.8 @XMM[15], @XMM[15]
  894. #endif
  895. sub $rounds,$rounds,#1
  896. vstmia $out!, {@XMM[7]} @ save round 0 key
  897. b .Lkey_loop
  898. .align 4
  899. .Lkey_loop:
  900. vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
  901. vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
  902. vmov.i8 @XMM[6], #0x40
  903. vmov.i8 @XMM[15], #0x80
  904. vtst.8 @XMM[0], @XMM[7], @XMM[8]
  905. vtst.8 @XMM[1], @XMM[7], @XMM[9]
  906. vtst.8 @XMM[2], @XMM[7], @XMM[10]
  907. vtst.8 @XMM[3], @XMM[7], @XMM[11]
  908. vtst.8 @XMM[4], @XMM[7], @XMM[12]
  909. vtst.8 @XMM[5], @XMM[7], @XMM[13]
  910. vtst.8 @XMM[6], @XMM[7], @XMM[6]
  911. vtst.8 @XMM[7], @XMM[7], @XMM[15]
  912. vld1.8 {@XMM[15]}, [$inp]! @ load next round key
  913. vmvn @XMM[0], @XMM[0] @ "pnot"
  914. vmvn @XMM[1], @XMM[1]
  915. vmvn @XMM[5], @XMM[5]
  916. vmvn @XMM[6], @XMM[6]
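@ (inverting bit planes 0, 1, 5 and 6 is equivalent to xoring every key byte
@  with 0x63, whose set bits are exactly 0, 1, 5 and 6; in other words the
@  S-box affine constant, composed as .L63 below, is folded into the round keys)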
  917. #ifdef __ARMEL__
  918. vrev32.8 @XMM[15], @XMM[15]
  919. #endif
  920. subs $rounds,$rounds,#1
  921. vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
  922. bne .Lkey_loop
  923. vmov.i8 @XMM[7],#0x63 @ compose .L63
  924. @ don't save last round key
  925. bx lr
  926. .size _bsaes_key_convert,.-_bsaes_key_convert
  927. ___
  928. }
  929. if (0) { # the following four functions are an unsupported interface
  930. # used for benchmarking...
  931. $code.=<<___;
  932. .globl bsaes_enc_key_convert
  933. .type bsaes_enc_key_convert,%function
  934. .align 4
  935. bsaes_enc_key_convert:
  936. stmdb sp!,{r4-r6,lr}
  937. vstmdb sp!,{d8-d15} @ ABI specification says so
  938. ldr r5,[$inp,#240] @ pass rounds
  939. mov r4,$inp @ pass key
  940. mov r12,$out @ pass key schedule
  941. bl _bsaes_key_convert
  942. veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
  943. vstmia r12, {@XMM[7]} @ save last round key
  944. vldmia sp!,{d8-d15}
  945. ldmia sp!,{r4-r6,pc}
  946. .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
  947. .globl bsaes_encrypt_128
  948. .type bsaes_encrypt_128,%function
  949. .align 4
  950. bsaes_encrypt_128:
  951. stmdb sp!,{r4-r6,lr}
  952. vstmdb sp!,{d8-d15} @ ABI specification says so
  953. .Lenc128_loop:
  954. vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
  955. vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
  956. mov r4,$key @ pass the key
  957. vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
  958. mov r5,#10 @ pass rounds
  959. vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
  960. bl _bsaes_encrypt8
  961. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  962. vst1.8 {@XMM[4]}, [$out]!
  963. vst1.8 {@XMM[6]}, [$out]!
  964. vst1.8 {@XMM[3]}, [$out]!
  965. vst1.8 {@XMM[7]}, [$out]!
  966. vst1.8 {@XMM[2]}, [$out]!
  967. subs $len,$len,#0x80
  968. vst1.8 {@XMM[5]}, [$out]!
  969. bhi .Lenc128_loop
  970. vldmia sp!,{d8-d15}
  971. ldmia sp!,{r4-r6,pc}
  972. .size bsaes_encrypt_128,.-bsaes_encrypt_128
  973. .globl bsaes_dec_key_convert
  974. .type bsaes_dec_key_convert,%function
  975. .align 4
  976. bsaes_dec_key_convert:
  977. stmdb sp!,{r4-r6,lr}
  978. vstmdb sp!,{d8-d15} @ ABI specification says so
  979. ldr r5,[$inp,#240] @ pass rounds
  980. mov r4,$inp @ pass key
  981. mov r12,$out @ pass key schedule
  982. bl _bsaes_key_convert
  983. vldmia $out, {@XMM[6]}
  984. vstmia r12, {@XMM[15]} @ save last round key
  985. veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
  986. vstmia $out, {@XMM[7]}
  987. vldmia sp!,{d8-d15}
  988. ldmia sp!,{r4-r6,pc}
  989. .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
  990. .globl bsaes_decrypt_128
  991. .type bsaes_decrypt_128,%function
  992. .align 4
  993. bsaes_decrypt_128:
  994. stmdb sp!,{r4-r6,lr}
  995. vstmdb sp!,{d8-d15} @ ABI specification says so
  996. .Ldec128_loop:
  997. vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
  998. vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
  999. mov r4,$key @ pass the key
  1000. vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
  1001. mov r5,#10 @ pass rounds
  1002. vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
  1003. bl _bsaes_decrypt8
  1004. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1005. vst1.8 {@XMM[6]}, [$out]!
  1006. vst1.8 {@XMM[4]}, [$out]!
  1007. vst1.8 {@XMM[2]}, [$out]!
  1008. vst1.8 {@XMM[7]}, [$out]!
  1009. vst1.8 {@XMM[3]}, [$out]!
  1010. subs $len,$len,#0x80
  1011. vst1.8 {@XMM[5]}, [$out]!
  1012. bhi .Ldec128_loop
  1013. vldmia sp!,{d8-d15}
  1014. ldmia sp!,{r4-r6,pc}
  1015. .size bsaes_decrypt_128,.-bsaes_decrypt_128
  1016. ___
  1017. }
  1018. {
  1019. my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
  1020. my ($keysched)=("sp");
  1021. $code.=<<___;
  1022. .extern AES_cbc_encrypt
  1023. .extern AES_decrypt
  1024. .global bsaes_cbc_encrypt
  1025. .type bsaes_cbc_encrypt,%function
  1026. .align 5
  1027. bsaes_cbc_encrypt:
  1028. #ifndef __KERNEL__
  1029. cmp $len, #128
  1030. #ifndef __thumb__
  1031. blo AES_cbc_encrypt
  1032. #else
  1033. bhs .Lcbc_do_bsaes
  1034. b AES_cbc_encrypt
  1035. .Lcbc_do_bsaes:
  1036. #endif
  1037. #endif
  1038. @ it is up to the caller to make sure we are called with enc == 0
  1039. mov ip, sp
  1040. stmdb sp!, {r4-r10, lr}
  1041. VFP_ABI_PUSH
  1042. ldr $ivp, [ip] @ IV is 1st arg on the stack
  1043. mov $len, $len, lsr#4 @ len in 16 byte blocks
  1044. sub sp, #0x10 @ scratch space to carry over the IV
  1045. mov $fp, sp @ save sp
  1046. ldr $rounds, [$key, #240] @ get # of rounds
  1047. #ifndef BSAES_ASM_EXTENDED_KEY
  1048. @ allocate the key schedule on the stack
  1049. sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
  1050. add r12, #`128-32` @ size of bit-sliced key schedule
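@ (that is rounds*128-96 bytes below sp: a 16-byte round-0 key, rounds-1
@  bit-sliced round keys of 128 bytes each, plus a 16-byte last round key)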
  1051. @ populate the key schedule
  1052. mov r4, $key @ pass key
  1053. mov r5, $rounds @ pass # of rounds
  1054. mov sp, r12 @ sp is $keysched
  1055. bl _bsaes_key_convert
  1056. vldmia $keysched, {@XMM[6]}
  1057. vstmia r12, {@XMM[15]} @ save last round key
  1058. veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
  1059. vstmia $keysched, {@XMM[7]}
  1060. #else
  1061. ldr r12, [$key, #244]
  1062. eors r12, #1
  1063. beq 0f
  1064. @ populate the key schedule
  1065. str r12, [$key, #244]
  1066. mov r4, $key @ pass key
  1067. mov r5, $rounds @ pass # of rounds
  1068. add r12, $key, #248 @ pass key schedule
  1069. bl _bsaes_key_convert
  1070. add r4, $key, #248
  1071. vldmia r4, {@XMM[6]}
  1072. vstmia r12, {@XMM[15]} @ save last round key
  1073. veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
  1074. vstmia r4, {@XMM[7]}
  1075. .align 2
  1076. 0:
  1077. #endif
  1078. vld1.8 {@XMM[15]}, [$ivp] @ load IV
  1079. b .Lcbc_dec_loop
  1080. .align 4
  1081. .Lcbc_dec_loop:
  1082. subs $len, $len, #0x8
  1083. bmi .Lcbc_dec_loop_finish
  1084. vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
  1085. vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
  1086. #ifndef BSAES_ASM_EXTENDED_KEY
  1087. mov r4, $keysched @ pass the key
  1088. #else
  1089. add r4, $key, #248
  1090. #endif
  1091. vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
  1092. mov r5, $rounds
  1093. vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
  1094. sub $inp, $inp, #0x60
  1095. vstmia $fp, {@XMM[15]} @ put aside IV
  1096. bl _bsaes_decrypt8
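@ _bsaes_decrypt8 returns the eight blocks permuted into q0,q1,q6,q4,q2,q7,
@ q3,q5; the loads/stores below are interleaved so as to undo that order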
  1097. vldmia $fp, {@XMM[14]} @ reload IV
  1098. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1099. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1100. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1101. veor @XMM[1], @XMM[1], @XMM[8]
  1102. veor @XMM[6], @XMM[6], @XMM[9]
  1103. vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
  1104. veor @XMM[4], @XMM[4], @XMM[10]
  1105. veor @XMM[2], @XMM[2], @XMM[11]
  1106. vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
  1107. veor @XMM[7], @XMM[7], @XMM[12]
  1108. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1109. veor @XMM[3], @XMM[3], @XMM[13]
  1110. vst1.8 {@XMM[6]}, [$out]!
  1111. veor @XMM[5], @XMM[5], @XMM[14]
  1112. vst1.8 {@XMM[4]}, [$out]!
  1113. vst1.8 {@XMM[2]}, [$out]!
  1114. vst1.8 {@XMM[7]}, [$out]!
  1115. vst1.8 {@XMM[3]}, [$out]!
  1116. vst1.8 {@XMM[5]}, [$out]!
  1117. b .Lcbc_dec_loop
  1118. .Lcbc_dec_loop_finish:
  1119. adds $len, $len, #8
  1120. beq .Lcbc_dec_done
  1121. vld1.8 {@XMM[0]}, [$inp]! @ load input
  1122. cmp $len, #2
  1123. blo .Lcbc_dec_one
  1124. vld1.8 {@XMM[1]}, [$inp]!
  1125. #ifndef BSAES_ASM_EXTENDED_KEY
  1126. mov r4, $keysched @ pass the key
  1127. #else
  1128. add r4, $key, #248
  1129. #endif
  1130. mov r5, $rounds
  1131. vstmia $fp, {@XMM[15]} @ put aside IV
  1132. beq .Lcbc_dec_two
  1133. vld1.8 {@XMM[2]}, [$inp]!
  1134. cmp $len, #4
  1135. blo .Lcbc_dec_three
  1136. vld1.8 {@XMM[3]}, [$inp]!
  1137. beq .Lcbc_dec_four
  1138. vld1.8 {@XMM[4]}, [$inp]!
  1139. cmp $len, #6
  1140. blo .Lcbc_dec_five
  1141. vld1.8 {@XMM[5]}, [$inp]!
  1142. beq .Lcbc_dec_six
  1143. vld1.8 {@XMM[6]}, [$inp]!
  1144. sub $inp, $inp, #0x70
  1145. bl _bsaes_decrypt8
  1146. vldmia $fp, {@XMM[14]} @ reload IV
  1147. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1148. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1149. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1150. veor @XMM[1], @XMM[1], @XMM[8]
  1151. veor @XMM[6], @XMM[6], @XMM[9]
  1152. vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
  1153. veor @XMM[4], @XMM[4], @XMM[10]
  1154. veor @XMM[2], @XMM[2], @XMM[11]
  1155. vld1.8 {@XMM[15]}, [$inp]!
  1156. veor @XMM[7], @XMM[7], @XMM[12]
  1157. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1158. veor @XMM[3], @XMM[3], @XMM[13]
  1159. vst1.8 {@XMM[6]}, [$out]!
  1160. vst1.8 {@XMM[4]}, [$out]!
  1161. vst1.8 {@XMM[2]}, [$out]!
  1162. vst1.8 {@XMM[7]}, [$out]!
  1163. vst1.8 {@XMM[3]}, [$out]!
  1164. b .Lcbc_dec_done
  1165. .align 4
  1166. .Lcbc_dec_six:
  1167. sub $inp, $inp, #0x60
  1168. bl _bsaes_decrypt8
  1169. vldmia $fp,{@XMM[14]} @ reload IV
  1170. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1171. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1172. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1173. veor @XMM[1], @XMM[1], @XMM[8]
  1174. veor @XMM[6], @XMM[6], @XMM[9]
  1175. vld1.8 {@XMM[12]}, [$inp]!
  1176. veor @XMM[4], @XMM[4], @XMM[10]
  1177. veor @XMM[2], @XMM[2], @XMM[11]
  1178. vld1.8 {@XMM[15]}, [$inp]!
  1179. veor @XMM[7], @XMM[7], @XMM[12]
  1180. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1181. vst1.8 {@XMM[6]}, [$out]!
  1182. vst1.8 {@XMM[4]}, [$out]!
  1183. vst1.8 {@XMM[2]}, [$out]!
  1184. vst1.8 {@XMM[7]}, [$out]!
  1185. b .Lcbc_dec_done
  1186. .align 4
  1187. .Lcbc_dec_five:
  1188. sub $inp, $inp, #0x50
  1189. bl _bsaes_decrypt8
  1190. vldmia $fp, {@XMM[14]} @ reload IV
  1191. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1192. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1193. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1194. veor @XMM[1], @XMM[1], @XMM[8]
  1195. veor @XMM[6], @XMM[6], @XMM[9]
  1196. vld1.8 {@XMM[15]}, [$inp]!
  1197. veor @XMM[4], @XMM[4], @XMM[10]
  1198. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1199. veor @XMM[2], @XMM[2], @XMM[11]
  1200. vst1.8 {@XMM[6]}, [$out]!
  1201. vst1.8 {@XMM[4]}, [$out]!
  1202. vst1.8 {@XMM[2]}, [$out]!
  1203. b .Lcbc_dec_done
  1204. .align 4
  1205. .Lcbc_dec_four:
  1206. sub $inp, $inp, #0x40
  1207. bl _bsaes_decrypt8
  1208. vldmia $fp, {@XMM[14]} @ reload IV
  1209. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1210. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1211. vld1.8 {@XMM[10]}, [$inp]!
  1212. veor @XMM[1], @XMM[1], @XMM[8]
  1213. veor @XMM[6], @XMM[6], @XMM[9]
  1214. vld1.8 {@XMM[15]}, [$inp]!
  1215. veor @XMM[4], @XMM[4], @XMM[10]
  1216. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1217. vst1.8 {@XMM[6]}, [$out]!
  1218. vst1.8 {@XMM[4]}, [$out]!
  1219. b .Lcbc_dec_done
  1220. .align 4
  1221. .Lcbc_dec_three:
  1222. sub $inp, $inp, #0x30
  1223. bl _bsaes_decrypt8
  1224. vldmia $fp, {@XMM[14]} @ reload IV
  1225. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1226. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1227. vld1.8 {@XMM[15]}, [$inp]!
  1228. veor @XMM[1], @XMM[1], @XMM[8]
  1229. veor @XMM[6], @XMM[6], @XMM[9]
  1230. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1231. vst1.8 {@XMM[6]}, [$out]!
  1232. b .Lcbc_dec_done
  1233. .align 4
  1234. .Lcbc_dec_two:
  1235. sub $inp, $inp, #0x20
  1236. bl _bsaes_decrypt8
  1237. vldmia $fp, {@XMM[14]} @ reload IV
  1238. vld1.8 {@XMM[8]}, [$inp]! @ reload input
  1239. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1240. vld1.8 {@XMM[15]}, [$inp]! @ reload input
  1241. veor @XMM[1], @XMM[1], @XMM[8]
  1242. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1243. b .Lcbc_dec_done
  1244. .align 4
  1245. .Lcbc_dec_one:
  1246. sub $inp, $inp, #0x10
  1247. mov $rounds, $out @ save original out pointer
  1248. mov $out, $fp @ use the iv scratch space as out buffer
  1249. mov r2, $key
  1250. vmov @XMM[4],@XMM[15] @ just in case ensure that IV
  1251. vmov @XMM[5],@XMM[0] @ and input are preserved
  1252. bl AES_decrypt
  1253. vld1.8 {@XMM[0]}, [$fp] @ load result
  1254. veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
  1255. vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
  1256. vst1.8 {@XMM[0]}, [$rounds] @ write output
  1257. .Lcbc_dec_done:
  1258. #ifndef BSAES_ASM_EXTENDED_KEY
  1259. vmov.i32 q0, #0
  1260. vmov.i32 q1, #0
  1261. .Lcbc_dec_bzero: @ wipe key schedule [if any]
  1262. vstmia $keysched!, {q0-q1}
  1263. cmp $keysched, $fp
  1264. bne .Lcbc_dec_bzero
#endif
mov sp, $fp
add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
vst1.8 {@XMM[15]}, [$ivp] @ return IV
VFP_ABI_POP
ldmia sp!, {r4-r10, pc}
.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
___
}
{
my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
my $const = "r6"; # shared with _bsaes_encrypt8_alt
my $keysched = "sp";
$code.=<<___;
.extern AES_encrypt
.global bsaes_ctr32_encrypt_blocks
.type bsaes_ctr32_encrypt_blocks,%function
.align 5
bsaes_ctr32_encrypt_blocks:
cmp $len, #8 @ use plain AES for
blo .Lctr_enc_short @ small sizes
mov ip, sp
stmdb sp!, {r4-r10, lr}
VFP_ABI_PUSH
ldr $ctr, [ip] @ ctr is 1st arg on the stack
sub sp, sp, #0x10 @ scratch space to carry over the ctr
mov $fp, sp @ save sp
ldr $rounds, [$key, #240] @ get # of rounds
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
add r12, #`128-32` @ size of bit-sliced key schedule
@ populate the key schedule
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
mov sp, r12 @ sp is $keysched
bl _bsaes_key_convert
veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
vstmia r12, {@XMM[7]} @ save last round key
vld1.8 {@XMM[0]}, [$ctr] @ load counter
#ifdef __APPLE__
mov $ctr, #:lower16:(.LREVM0SR-.LM0)
add $ctr, $const, $ctr
#else
add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
#endif
vldmia $keysched, {@XMM[4]} @ load round0 key
#else
ldr r12, [$key, #244]
eors r12, #1
beq 0f
@ populate the key schedule
str r12, [$key, #244]
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
add r12, $key, #248 @ pass key schedule
bl _bsaes_key_convert
veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
vstmia r12, {@XMM[7]} @ save last round key
.align 2
0: add r12, $key, #248
vld1.8 {@XMM[0]}, [$ctr] @ load counter
adrl $ctr, .LREVM0SR @ borrow $ctr
vldmia r12, {@XMM[4]} @ load round0 key
sub sp, #0x10 @ place for adjusted round0 key
#endif
vmov.i32 @XMM[8],#1 @ compose 1<<96
veor @XMM[9],@XMM[9],@XMM[9]
vrev32.8 @XMM[0],@XMM[0]
vext.8 @XMM[8],@XMM[9],@XMM[8],#4
vrev32.8 @XMM[4],@XMM[4]
vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
b .Lctr_enc_loop
.align 4
.Lctr_enc_loop:
vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
vadd.u32 @XMM[4], @XMM[1], @XMM[10]
vadd.u32 @XMM[5], @XMM[2], @XMM[10]
vadd.u32 @XMM[6], @XMM[3], @XMM[10]
vadd.u32 @XMM[7], @XMM[4], @XMM[10]
vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter
@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
@ to flip byte order in 32-bit counter
vldmia $keysched, {@XMM[9]} @ load round0 key
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, $keysched, #0x10 @ pass next round key
#else
add r4, $key, #`248+16`
#endif
vldmia $ctr, {@XMM[8]} @ .LREVM0SR
mov r5, $rounds @ pass rounds
vstmia $fp, {@XMM[10]} @ save next counter
#ifdef __APPLE__
mov $const, #:lower16:(.LREVM0SR-.LSR)
sub $const, $ctr, $const
#else
sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
#endif
bl _bsaes_encrypt8_alt
subs $len, $len, #8
blo .Lctr_enc_loop_done
vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
veor @XMM[0], @XMM[8]
veor @XMM[1], @XMM[9]
vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
veor @XMM[4], @XMM[10]
veor @XMM[6], @XMM[11]
vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
veor @XMM[3], @XMM[12]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
veor @XMM[7], @XMM[13]
veor @XMM[2], @XMM[14]
vst1.8 {@XMM[4]}, [$out]!
veor @XMM[5], @XMM[15]
vst1.8 {@XMM[6]}, [$out]!
vmov.i32 @XMM[8], #1 @ compose 1<<96
vst1.8 {@XMM[3]}, [$out]!
veor @XMM[9], @XMM[9], @XMM[9]
vst1.8 {@XMM[7]}, [$out]!
vext.8 @XMM[8], @XMM[9], @XMM[8], #4
vst1.8 {@XMM[2]}, [$out]!
vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
vst1.8 {@XMM[5]}, [$out]!
vldmia $fp, {@XMM[0]} @ load counter
bne .Lctr_enc_loop
b .Lctr_enc_done
.align 4
.Lctr_enc_loop_done:
add $len, $len, #8
vld1.8 {@XMM[8]}, [$inp]! @ load input
veor @XMM[0], @XMM[8]
vst1.8 {@XMM[0]}, [$out]! @ write output
cmp $len, #2
blo .Lctr_enc_done
vld1.8 {@XMM[9]}, [$inp]!
veor @XMM[1], @XMM[9]
vst1.8 {@XMM[1]}, [$out]!
beq .Lctr_enc_done
vld1.8 {@XMM[10]}, [$inp]!
veor @XMM[4], @XMM[10]
vst1.8 {@XMM[4]}, [$out]!
cmp $len, #4
blo .Lctr_enc_done
vld1.8 {@XMM[11]}, [$inp]!
veor @XMM[6], @XMM[11]
vst1.8 {@XMM[6]}, [$out]!
beq .Lctr_enc_done
vld1.8 {@XMM[12]}, [$inp]!
veor @XMM[3], @XMM[12]
vst1.8 {@XMM[3]}, [$out]!
cmp $len, #6
blo .Lctr_enc_done
vld1.8 {@XMM[13]}, [$inp]!
veor @XMM[7], @XMM[13]
vst1.8 {@XMM[7]}, [$out]!
beq .Lctr_enc_done
vld1.8 {@XMM[14]}, [$inp]
veor @XMM[2], @XMM[14]
vst1.8 {@XMM[2]}, [$out]!
.Lctr_enc_done:
vmov.i32 q0, #0
vmov.i32 q1, #0
#ifndef BSAES_ASM_EXTENDED_KEY
.Lctr_enc_bzero: @ wipe key schedule [if any]
vstmia $keysched!, {q0-q1}
cmp $keysched, $fp
bne .Lctr_enc_bzero
#else
vstmia $keysched, {q0-q1}
#endif
mov sp, $fp
add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
VFP_ABI_POP
ldmia sp!, {r4-r10, pc} @ return
.align 4
.Lctr_enc_short:
ldr ip, [sp] @ ctr pointer is passed on stack
stmdb sp!, {r4-r8, lr}
mov r4, $inp @ copy arguments
mov r5, $out
mov r6, $len
mov r7, $key
ldr r8, [ip, #12] @ load counter LSW
vld1.8 {@XMM[1]}, [ip] @ load whole counter value
#ifdef __ARMEL__
rev r8, r8
#endif
sub sp, sp, #0x10
vst1.8 {@XMM[1]}, [sp] @ copy counter value
sub sp, sp, #0x10
.Lctr_enc_short_loop:
add r0, sp, #0x10 @ input counter value
mov r1, sp @ output on the stack
mov r2, r7 @ key
bl AES_encrypt
vld1.8 {@XMM[0]}, [r4]! @ load input
vld1.8 {@XMM[1]}, [sp] @ load encrypted counter
add r8, r8, #1
#ifdef __ARMEL__
rev r0, r8
str r0, [sp, #0x1c] @ next counter value
#else
str r8, [sp, #0x1c] @ next counter value
#endif
veor @XMM[0],@XMM[0],@XMM[1]
vst1.8 {@XMM[0]}, [r5]! @ store output
subs r6, r6, #1
bne .Lctr_enc_short_loop
vmov.i32 q0, #0
vmov.i32 q1, #0
vstmia sp!, {q0-q1}
ldmia sp!, {r4-r8, pc}
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
}
{
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
# const AES_KEY *key1, const AES_KEY *key2,
# const unsigned char iv[16]);
#
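# A minimal C usage sketch for the encrypt direction (illustration only,
# not part of the generated code; data_key/tweak_key/inp/out/len/iv are
# hypothetical, and the assumption is that both keys are scheduled with
# AES_set_encrypt_key):
#
#   AES_KEY k1, k2;
#   AES_set_encrypt_key(data_key, 128, &k1);   /* key1: data key  */
#   AES_set_encrypt_key(tweak_key, 128, &k2);  /* key2: tweak key */
#   bsaes_xts_encrypt(inp, out, len, &k1, &k2, iv);
#
# XTS needs at least one full 16-byte block, i.e. len >= 16.
#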
my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
my $const="r6"; # returned by _bsaes_key_convert
my $twmask=@XMM[5];
my @T=@XMM[6..7];
$code.=<<___;
.globl bsaes_xts_encrypt
.type bsaes_xts_encrypt,%function
.align 4
bsaes_xts_encrypt:
mov ip, sp
stmdb sp!, {r4-r10, lr} @ 0x20
VFP_ABI_PUSH
mov r6, sp @ future $fp
mov $inp, r0
mov $out, r1
mov $len, r2
mov $key, r3
sub r0, sp, #0x10 @ 0x10
bic r0, #0xf @ align at 16 bytes
mov sp, r0
#ifdef XTS_CHAIN_TWEAK
ldr r0, [ip] @ pointer to input tweak
#else
@ generate initial tweak
ldr r0, [ip, #4] @ iv[]
mov r1, sp
ldr r2, [ip, #0] @ key2
bl AES_encrypt
mov r0,sp @ pointer to initial tweak
#endif
ldr $rounds, [$key, #240] @ get # of rounds
mov $fp, r6
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
@ add r12, #`128-32` @ size of bit-sliced key schedule
sub r12, #`32+16` @ place for tweak[9]
@ populate the key schedule
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
mov sp, r12
add r12, #0x90 @ pass key schedule
bl _bsaes_key_convert
veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
vstmia r12, {@XMM[7]} @ save last round key
#else
ldr r12, [$key, #244]
eors r12, #1
beq 0f
str r12, [$key, #244]
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
add r12, $key, #248 @ pass key schedule
bl _bsaes_key_convert
veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
vstmia r12, {@XMM[7]}
.align 2
0: sub sp, #0x90 @ place for tweak[9]
#endif
vld1.8 {@XMM[8]}, [r0] @ initial tweak
adr $magic, .Lxts_magic
subs $len, #0x80
blo .Lxts_enc_short
b .Lxts_enc_loop
.align 4
.Lxts_enc_loop:
vldmia $magic, {$twmask} @ load XTS magic
vshr.s64 @T[0], @XMM[8], #63
mov r0, sp
vand @T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
vst1.64 {@XMM[$i-1]}, [r0,:128]!
vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
vshr.s64 @T[1], @XMM[$i], #63
veor @XMM[$i], @XMM[$i], @T[0]
vand @T[1], @T[1], $twmask
___
@T=reverse(@T);
$code.=<<___ if ($i>=10);
vld1.8 {@XMM[$i-10]}, [$inp]!
___
$code.=<<___ if ($i>=11);
veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
vadd.u64 @XMM[8], @XMM[15], @XMM[15]
vst1.64 {@XMM[15]}, [r0,:128]!
vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
veor @XMM[8], @XMM[8], @T[0]
vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
veor @XMM[5], @XMM[5], @XMM[13]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[6], @XMM[6], @XMM[14]
mov r5, $rounds @ pass rounds
veor @XMM[7], @XMM[7], @XMM[15]
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[4], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[6], @XMM[11]
vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
veor @XMM[10], @XMM[3], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
veor @XMM[11], @XMM[7], @XMM[13]
veor @XMM[12], @XMM[2], @XMM[14]
vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
veor @XMM[13], @XMM[5], @XMM[15]
vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
subs $len, #0x80
bpl .Lxts_enc_loop
.Lxts_enc_short:
adds $len, #0x70
bmi .Lxts_enc_done
vldmia $magic, {$twmask} @ load XTS magic
vshr.s64 @T[0], @XMM[8], #63
mov r0, sp
vand @T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
vst1.64 {@XMM[$i-1]}, [r0,:128]!
vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
vshr.s64 @T[1], @XMM[$i], #63
veor @XMM[$i], @XMM[$i], @T[0]
vand @T[1], @T[1], $twmask
___
@T=reverse(@T);
$code.=<<___ if ($i>=10);
vld1.8 {@XMM[$i-10]}, [$inp]!
subs $len, #0x10
bmi .Lxts_enc_`$i-9`
___
$code.=<<___ if ($i>=11);
veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
sub $len, #0x10
vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
vld1.8 {@XMM[6]}, [$inp]!
veor @XMM[5], @XMM[5], @XMM[13]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[6], @XMM[6], @XMM[14]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[4], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[6], @XMM[11]
vld1.64 {@XMM[14]}, [r0,:128]!
veor @XMM[10], @XMM[3], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
veor @XMM[11], @XMM[7], @XMM[13]
veor @XMM[12], @XMM[2], @XMM[14]
vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
vst1.8 {@XMM[12]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_enc_done
.align 4
.Lxts_enc_6:
veor @XMM[4], @XMM[4], @XMM[12]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[5], @XMM[5], @XMM[13]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[4], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[6], @XMM[11]
veor @XMM[10], @XMM[3], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
veor @XMM[11], @XMM[7], @XMM[13]
vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_enc_done
@ put this in range for both ARM and Thumb mode adr instructions
.align 5
.Lxts_magic:
.quad 1, 0x87
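@ {1, 0x87} are the per-lane carry values used when doubling the tweak in
@ GF(2^128): 1 propagates the low-half carry into the high half, 0x87
@ (x^7+x^2+x+1) folds the high-half overflow back into the low half.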
.align 5
.Lxts_enc_5:
veor @XMM[3], @XMM[3], @XMM[11]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[4], @XMM[4], @XMM[12]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[4], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[6], @XMM[11]
veor @XMM[10], @XMM[3], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
vst1.8 {@XMM[10]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_enc_done
.align 4
.Lxts_enc_4:
veor @XMM[2], @XMM[2], @XMM[10]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[3], @XMM[3], @XMM[11]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[4], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[6], @XMM[11]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_enc_done
.align 4
.Lxts_enc_3:
veor @XMM[1], @XMM[1], @XMM[9]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[2], @XMM[2], @XMM[10]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
vld1.64 {@XMM[10]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[4], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
vst1.8 {@XMM[8]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_enc_done
.align 4
.Lxts_enc_2:
veor @XMM[0], @XMM[0], @XMM[8]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[1], @XMM[1], @XMM[9]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
veor @XMM[1], @XMM[1], @XMM[ 9]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_enc_done
.align 4
.Lxts_enc_1:
mov r0, sp
veor @XMM[0], @XMM[0], @XMM[8]
mov r1, sp
vst1.8 {@XMM[0]}, [sp,:128]
mov r2, $key
mov r4, $fp @ preserve fp
bl AES_encrypt
vld1.8 {@XMM[0]}, [sp,:128]
veor @XMM[0], @XMM[0], @XMM[8]
vst1.8 {@XMM[0]}, [$out]!
mov $fp, r4
vmov @XMM[8], @XMM[9] @ next round tweak
.Lxts_enc_done:
#ifndef XTS_CHAIN_TWEAK
adds $len, #0x10
beq .Lxts_enc_ret
sub r6, $out, #0x10
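@ ciphertext stealing: swap the plaintext tail into the last full
@ ciphertext block; the displaced ciphertext bytes become the final
@ partial block, and the patched block at [r6] is re-encrypted below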
.Lxts_enc_steal:
ldrb r0, [$inp], #1
ldrb r1, [$out, #-0x10]
strb r0, [$out, #-0x10]
strb r1, [$out], #1
subs $len, #1
bhi .Lxts_enc_steal
vld1.8 {@XMM[0]}, [r6]
mov r0, sp
veor @XMM[0], @XMM[0], @XMM[8]
mov r1, sp
vst1.8 {@XMM[0]}, [sp,:128]
mov r2, $key
mov r4, $fp @ preserve fp
bl AES_encrypt
vld1.8 {@XMM[0]}, [sp,:128]
veor @XMM[0], @XMM[0], @XMM[8]
vst1.8 {@XMM[0]}, [r6]
mov $fp, r4
#endif
.Lxts_enc_ret:
bic r0, $fp, #0xf
vmov.i32 q0, #0
vmov.i32 q1, #0
#ifdef XTS_CHAIN_TWEAK
ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
#endif
.Lxts_enc_bzero: @ wipe key schedule [if any]
vstmia sp!, {q0-q1}
cmp sp, r0
bne .Lxts_enc_bzero
mov sp, $fp
#ifdef XTS_CHAIN_TWEAK
vst1.8 {@XMM[8]}, [r1]
#endif
VFP_ABI_POP
ldmia sp!, {r4-r10, pc} @ return
.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
.globl bsaes_xts_decrypt
.type bsaes_xts_decrypt,%function
.align 4
bsaes_xts_decrypt:
mov ip, sp
stmdb sp!, {r4-r10, lr} @ 0x20
VFP_ABI_PUSH
mov r6, sp @ future $fp
mov $inp, r0
mov $out, r1
mov $len, r2
mov $key, r3
sub r0, sp, #0x10 @ 0x10
bic r0, #0xf @ align at 16 bytes
mov sp, r0
#ifdef XTS_CHAIN_TWEAK
ldr r0, [ip] @ pointer to input tweak
#else
@ generate initial tweak
ldr r0, [ip, #4] @ iv[]
mov r1, sp
ldr r2, [ip, #0] @ key2
bl AES_encrypt
mov r0, sp @ pointer to initial tweak
#endif
ldr $rounds, [$key, #240] @ get # of rounds
mov $fp, r6
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
@ add r12, #`128-32` @ size of bit-sliced key schedule
sub r12, #`32+16` @ place for tweak[9]
@ populate the key schedule
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
mov sp, r12
add r12, #0x90 @ pass key schedule
bl _bsaes_key_convert
add r4, sp, #0x90
vldmia r4, {@XMM[6]}
vstmia r12, {@XMM[15]} @ save last round key
veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
vstmia r4, {@XMM[7]}
#else
ldr r12, [$key, #244]
eors r12, #1
beq 0f
str r12, [$key, #244]
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
add r12, $key, #248 @ pass key schedule
bl _bsaes_key_convert
add r4, $key, #248
vldmia r4, {@XMM[6]}
vstmia r12, {@XMM[15]} @ save last round key
veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
vstmia r4, {@XMM[7]}
.align 2
0: sub sp, #0x90 @ place for tweak[9]
#endif
vld1.8 {@XMM[8]}, [r0] @ initial tweak
adr $magic, .Lxts_magic
#ifndef XTS_CHAIN_TWEAK
tst $len, #0xf @ if not multiple of 16
it ne @ Thumb2 thing, sanity check in ARM
subne $len, #0x10 @ subtract another 16 bytes
#endif
subs $len, #0x80
blo .Lxts_dec_short
b .Lxts_dec_loop
.align 4
.Lxts_dec_loop:
vldmia $magic, {$twmask} @ load XTS magic
vshr.s64 @T[0], @XMM[8], #63
mov r0, sp
vand @T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
vst1.64 {@XMM[$i-1]}, [r0,:128]!
vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
vshr.s64 @T[1], @XMM[$i], #63
veor @XMM[$i], @XMM[$i], @T[0]
vand @T[1], @T[1], $twmask
___
@T=reverse(@T);
$code.=<<___ if ($i>=10);
vld1.8 {@XMM[$i-10]}, [$inp]!
___
$code.=<<___ if ($i>=11);
veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
vadd.u64 @XMM[8], @XMM[15], @XMM[15]
vst1.64 {@XMM[15]}, [r0,:128]!
vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
veor @XMM[8], @XMM[8], @T[0]
vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
veor @XMM[5], @XMM[5], @XMM[13]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[6], @XMM[6], @XMM[14]
mov r5, $rounds @ pass rounds
veor @XMM[7], @XMM[7], @XMM[15]
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[6], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[4], @XMM[11]
vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
veor @XMM[10], @XMM[2], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
veor @XMM[11], @XMM[7], @XMM[13]
veor @XMM[12], @XMM[3], @XMM[14]
vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
veor @XMM[13], @XMM[5], @XMM[15]
vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
subs $len, #0x80
bpl .Lxts_dec_loop
.Lxts_dec_short:
adds $len, #0x70
bmi .Lxts_dec_done
vldmia $magic, {$twmask} @ load XTS magic
vshr.s64 @T[0], @XMM[8], #63
mov r0, sp
vand @T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
vst1.64 {@XMM[$i-1]}, [r0,:128]!
vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
vshr.s64 @T[1], @XMM[$i], #63
veor @XMM[$i], @XMM[$i], @T[0]
vand @T[1], @T[1], $twmask
___
@T=reverse(@T);
$code.=<<___ if ($i>=10);
vld1.8 {@XMM[$i-10]}, [$inp]!
subs $len, #0x10
bmi .Lxts_dec_`$i-9`
___
$code.=<<___ if ($i>=11);
veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
sub $len, #0x10
vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
vld1.8 {@XMM[6]}, [$inp]!
veor @XMM[5], @XMM[5], @XMM[13]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[6], @XMM[6], @XMM[14]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[6], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[4], @XMM[11]
vld1.64 {@XMM[14]}, [r0,:128]!
veor @XMM[10], @XMM[2], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
veor @XMM[11], @XMM[7], @XMM[13]
veor @XMM[12], @XMM[3], @XMM[14]
vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
vst1.8 {@XMM[12]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_6:
vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
veor @XMM[4], @XMM[4], @XMM[12]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[5], @XMM[5], @XMM[13]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[6], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[4], @XMM[11]
veor @XMM[10], @XMM[2], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
veor @XMM[11], @XMM[7], @XMM[13]
vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_5:
veor @XMM[3], @XMM[3], @XMM[11]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[4], @XMM[4], @XMM[12]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[6], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[4], @XMM[11]
veor @XMM[10], @XMM[2], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
vst1.8 {@XMM[10]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_4:
veor @XMM[2], @XMM[2], @XMM[10]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[3], @XMM[3], @XMM[11]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[6], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[4], @XMM[11]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_3:
veor @XMM[1], @XMM[1], @XMM[9]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[2], @XMM[2], @XMM[10]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
vld1.64 {@XMM[10]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[6], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
vst1.8 {@XMM[8]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_2:
veor @XMM[0], @XMM[0], @XMM[8]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[1], @XMM[1], @XMM[9]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
veor @XMM[1], @XMM[1], @XMM[ 9]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_1:
mov r0, sp
veor @XMM[0], @XMM[0], @XMM[8]
mov r1, sp
vst1.8 {@XMM[0]}, [sp,:128]
mov r5, $magic @ preserve magic
mov r2, $key
mov r4, $fp @ preserve fp
bl AES_decrypt
vld1.8 {@XMM[0]}, [sp,:128]
veor @XMM[0], @XMM[0], @XMM[8]
vst1.8 {@XMM[0]}, [$out]!
mov $fp, r4
mov $magic, r5
vmov @XMM[8], @XMM[9] @ next round tweak
.Lxts_dec_done:
#ifndef XTS_CHAIN_TWEAK
adds $len, #0x10
beq .Lxts_dec_ret
@ calculate one round of extra tweak for the stolen ciphertext
vldmia $magic, {$twmask}
vshr.s64 @XMM[6], @XMM[8], #63
vand @XMM[6], @XMM[6], $twmask
vadd.u64 @XMM[9], @XMM[8], @XMM[8]
vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
veor @XMM[9], @XMM[9], @XMM[6]
@ perform the final decryption with the last tweak value
vld1.8 {@XMM[0]}, [$inp]!
mov r0, sp
veor @XMM[0], @XMM[0], @XMM[9]
mov r1, sp
vst1.8 {@XMM[0]}, [sp,:128]
mov r2, $key
mov r4, $fp @ preserve fp
bl AES_decrypt
vld1.8 {@XMM[0]}, [sp,:128]
veor @XMM[0], @XMM[0], @XMM[9]
vst1.8 {@XMM[0]}, [$out]
mov r6, $out
.Lxts_dec_steal:
ldrb r1, [$out]
ldrb r0, [$inp], #1
strb r1, [$out, #0x10]
strb r0, [$out], #1
subs $len, #1
bhi .Lxts_dec_steal
vld1.8 {@XMM[0]}, [r6]
mov r0, sp
veor @XMM[0], @XMM[8]
mov r1, sp
vst1.8 {@XMM[0]}, [sp,:128]
mov r2, $key
bl AES_decrypt
vld1.8 {@XMM[0]}, [sp,:128]
veor @XMM[0], @XMM[0], @XMM[8]
vst1.8 {@XMM[0]}, [r6]
mov $fp, r4
#endif
.Lxts_dec_ret:
bic r0, $fp, #0xf
vmov.i32 q0, #0
vmov.i32 q1, #0
#ifdef XTS_CHAIN_TWEAK
ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
#endif
.Lxts_dec_bzero: @ wipe key schedule [if any]
vstmia sp!, {q0-q1}
cmp sp, r0
bne .Lxts_dec_bzero
mov sp, $fp
#ifdef XTS_CHAIN_TWEAK
vst1.8 {@XMM[8]}, [r1]
#endif
VFP_ABI_POP
ldmia sp!, {r4-r10, pc} @ return
.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
$code.=<<___;
#endif
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
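# Re-emit this file's leading comment block (up to the first line that is
# neither a '#' comment nor blank) with '#' translated to the assembler's
# '@' comment marker, then print the generated code.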
open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/@/ and !/^$/);
print;
}
close SELF;
print $code;
close STDOUT or die "error closing STDOUT: $!";