vpsm4_ex-armv8.pl

  1. #! /usr/bin/env perl
  2. # Copyright 2022-2023 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # This module implements SM4 with ASIMD and AESE on AARCH64
  10. #
  11. # Dec 2022
  12. #
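# The SM4 S-box is evaluated with the AES AESE instruction plus two affine
# transforms (see sbox() and .Lsbox_magic below); everything else uses plain
# ASIMD. Exported entry points: vpsm4_ex_set_encrypt_key,
# vpsm4_ex_set_decrypt_key, vpsm4_ex_encrypt, vpsm4_ex_decrypt,
# vpsm4_ex_ecb_encrypt, vpsm4_ex_cbc_encrypt, vpsm4_ex_ctr32_encrypt_blocks,
# vpsm4_ex_xts_encrypt and vpsm4_ex_xts_encrypt_gb.
#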
  13. # $output is the last argument if it looks like a file (it has an extension)
  14. # $flavour is the first argument if it doesn't look like a file
  15. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  16. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  17. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  18. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  19. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  20. die "can't locate arm-xlate.pl";
  21. open OUT,"| \"$^X\" $xlate $flavour \"$output\""
  22. or die "can't call $xlate: $!";
  23. *STDOUT=*OUT;
  24. $prefix="vpsm4_ex";
  25. my @vtmp=map("v$_",(0..3));
  26. my @qtmp=map("q$_",(0..3));
  27. my @data=map("v$_",(4..7));
  28. my @datax=map("v$_",(8..11));
  29. my ($rk0,$rk1)=("v12","v13");
  30. my ($rka,$rkb)=("v14","v15");
  31. my @vtmpx=map("v$_",(12..15));
  32. my ($vtmp4,$vtmp5)=("v24","v25");
  33. my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31");
  34. my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31");
  35. my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
  36. my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
  37. my ($xtmp1,$xtmp2)=("x8","x9");
  38. my ($ptr,$counter)=("x10","w11");
  39. my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
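# rev32() reverses the bytes within each 32-bit lane only on little-endian
# targets (where __AARCH64EB__ is not defined), since SM4 is specified on
# big-endian words; on big-endian builds it degenerates to a mov (or nothing).
# rev32_armeb() is the mirror image (swap only on big-endian targets) and is
# used by mov_reg_to_vec() and around the XTS tweak arithmetic.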
  40. sub rev32() {
  41. my $dst = shift;
  42. my $src = shift;
  43. if ($src and ("$src" ne "$dst")) {
  44. $code.=<<___;
  45. #ifndef __AARCH64EB__
  46. rev32 $dst.16b,$src.16b
  47. #else
  48. mov $dst.16b,$src.16b
  49. #endif
  50. ___
  51. } else {
  52. $code.=<<___;
  53. #ifndef __AARCH64EB__
  54. rev32 $dst.16b,$dst.16b
  55. #endif
  56. ___
  57. }
  58. }
  59. sub rev32_armeb() {
  60. my $dst = shift;
  61. my $src = shift;
  62. if ($src and ("$src" ne "$dst")) {
  63. $code.=<<___;
  64. #ifdef __AARCH64EB__
  65. rev32 $dst.16b,$src.16b
  66. #else
  67. mov $dst.16b,$src.16b
  68. #endif
  69. ___
  70. } else {
  71. $code.=<<___;
  72. #ifdef __AARCH64EB__
  73. rev32 $dst.16b,$dst.16b
  74. #endif
  75. ___
  76. }
  77. }
  78. sub rbit() {
  79. my $dst = shift;
  80. my $src = shift;
  81. my $std = shift;
  82. if ($src and ("$src" ne "$dst")) {
  83. if ($std eq "_gb") {
  84. $code.=<<___;
  85. rbit $dst.16b,$src.16b
  86. ___
  87. } else {
  88. $code.=<<___;
  89. mov $dst.16b,$src.16b
  90. ___
  91. }
  92. } else {
  93. if ($std eq "_gb") {
  94. $code.=<<___;
  95. rbit $dst.16b,$src.16b
  96. ___
  97. }
  98. }
  99. }
  100. sub transpose() {
  101. my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
  102. $code.=<<___;
  103. zip1 $vt0.4s,$dat0.4s,$dat1.4s
  104. zip2 $vt1.4s,$dat0.4s,$dat1.4s
  105. zip1 $vt2.4s,$dat2.4s,$dat3.4s
  106. zip2 $vt3.4s,$dat2.4s,$dat3.4s
  107. zip1 $dat0.2d,$vt0.2d,$vt2.2d
  108. zip2 $dat1.2d,$vt0.2d,$vt2.2d
  109. zip1 $dat2.2d,$vt1.2d,$vt3.2d
  110. zip2 $dat3.2d,$vt1.2d,$vt3.2d
  111. ___
  112. }
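# transpose() performs a 4x4 transpose of 32-bit words across four vectors
# (zip1/zip2 at .4s granularity, then at .2d), converting between the
# word-sliced layout used by ld4/st4 and the natural one-block-per-register
# layout used by ld1/st1.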
  113. # matrix multiplication Mat*x = (lowerMat*x) ^ (higherMat*x)
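# Each byte of $x is split into its low and high nibble ($ANDMaskV holds
# 0x0f in every byte); a 16-entry tbl lookup applies the 8x8 bit-matrix to
# each nibble (lowerMat for the low nibble, higherMat for the high one) and
# the two partial results are XORed together.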
  114. sub mul_matrix() {
  115. my $x = shift;
  116. my $higherMat = shift;
  117. my $lowerMat = shift;
  118. my $tmp = shift;
  119. $code.=<<___;
  120. ushr $tmp.16b, $x.16b, 4
  121. and $x.16b, $x.16b, $ANDMaskV.16b
  122. tbl $x.16b, {$lowerMat.16b}, $x.16b
  123. tbl $tmp.16b, {$higherMat.16b}, $tmp.16b
  124. eor $x.16b, $x.16b, $tmp.16b
  125. ___
  126. }
  127. # sbox operation for 4 lanes of words
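# The SM4 and AES S-boxes are both built from inversion in GF(2^8), so the
# SM4 S-box can be computed as outAffine(AES_Sbox(inAffine(x))): the tbl with
# $MaskV pre-permutes the bytes (which appears to be the inverse ShiftRows
# order, cancelling the ShiftRows step inside AESE), mul_matrix() with
# TAH/TAL applies the input affine map, AESE with an all-zero round key then
# performs ShiftRows+SubBytes (the zero key makes AddRoundKey a no-op), and
# mul_matrix() with ATAH/ATAL applies the output affine map.  The constants
# live in .Lsbox_magic.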
  129. sub sbox() {
  130. my $dat = shift;
  131. $code.=<<___;
  132. // optimize sbox using AESE instruction
  133. tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b
  134. ___
  135. &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
  136. $code.=<<___;
  137. eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
  138. aese @vtmp[0].16b,@vtmp[1].16b
  139. ___
  140. &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4);
  141. $code.=<<___;
  142. mov $dat.16b,@vtmp[0].16b
  143. // linear transformation
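// L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24);
// each rotate-left by n is built from ushr by (32-n) followed by sli by n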
  144. ushr @vtmp[0].4s,$dat.4s,32-2
  145. ushr @vtmp[1].4s,$dat.4s,32-10
  146. ushr @vtmp[2].4s,$dat.4s,32-18
  147. ushr @vtmp[3].4s,$dat.4s,32-24
  148. sli @vtmp[0].4s,$dat.4s,2
  149. sli @vtmp[1].4s,$dat.4s,10
  150. sli @vtmp[2].4s,$dat.4s,18
  151. sli @vtmp[3].4s,$dat.4s,24
  152. eor $vtmp4.16b,@vtmp[0].16b,$dat.16b
  153. eor $vtmp4.16b,$vtmp4.16b,$vtmp[1].16b
  154. eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b
  155. eor $dat.16b,$dat.16b,$vtmp4.16b
  156. ___
  157. }
  158. # sbox operation for 8 lanes of words
  159. sub sbox_double() {
  160. my $dat = shift;
  161. my $datx = shift;
  162. $code.=<<___;
  163. // optimize sbox using AESE instruction
  164. tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b
  165. tbl @vtmp[1].16b, {$datx.16b}, $MaskV.16b
  166. ___
  167. &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
  168. &mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4);
  169. $code.=<<___;
  170. eor $vtmp5.16b, $vtmp5.16b, $vtmp5.16b
  171. aese @vtmp[0].16b,$vtmp5.16b
  172. aese @vtmp[1].16b,$vtmp5.16b
  173. ___
  174. &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4);
  175. &mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4);
  176. $code.=<<___;
  177. mov $dat.16b,@vtmp[0].16b
  178. mov $datx.16b,@vtmp[1].16b
  179. // linear transformation
  180. ushr @vtmp[0].4s,$dat.4s,32-2
  181. ushr $vtmp5.4s,$datx.4s,32-2
  182. ushr @vtmp[1].4s,$dat.4s,32-10
  183. ushr @vtmp[2].4s,$dat.4s,32-18
  184. ushr @vtmp[3].4s,$dat.4s,32-24
  185. sli @vtmp[0].4s,$dat.4s,2
  186. sli $vtmp5.4s,$datx.4s,2
  187. sli @vtmp[1].4s,$dat.4s,10
  188. sli @vtmp[2].4s,$dat.4s,18
  189. sli @vtmp[3].4s,$dat.4s,24
  190. eor $vtmp4.16b,@vtmp[0].16b,$dat.16b
  191. eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
  192. eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b
  193. eor $dat.16b,$dat.16b,$vtmp4.16b
  194. ushr @vtmp[1].4s,$datx.4s,32-10
  195. ushr @vtmp[2].4s,$datx.4s,32-18
  196. ushr @vtmp[3].4s,$datx.4s,32-24
  197. sli @vtmp[1].4s,$datx.4s,10
  198. sli @vtmp[2].4s,$datx.4s,18
  199. sli @vtmp[3].4s,$datx.4s,24
  200. eor $vtmp4.16b,$vtmp5.16b,$datx.16b
  201. eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
  202. eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
  203. eor $datx.16b,$datx.16b,$vtmp4.16b
  204. ___
  205. }
  206. # sbox operation for one single word
  207. sub sbox_1word () {
  208. my $word = shift;
  209. $code.=<<___;
  210. mov @vtmp[3].s[0],$word
  211. // optimize sbox using AESE instruction
  212. tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b
  213. ___
  214. &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
  215. $code.=<<___;
  216. eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
  217. aese @vtmp[0].16b,@vtmp[1].16b
  218. ___
  219. &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
  220. $code.=<<___;
  221. mov $wtmp0,@vtmp[0].s[0]
  222. eor $word,$wtmp0,$wtmp0,ror #32-2
  223. eor $word,$word,$wtmp0,ror #32-10
  224. eor $word,$word,$wtmp0,ror #32-18
  225. eor $word,$word,$wtmp0,ror #32-24
  226. ___
  227. }
  228. # sm4 for one block of data, in scalar registers word0/word1/word2/word3
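# One call performs four of the 32 SM4 rounds; the callers loop their counter
# eight times for a full encryption.  Each round is
#   X[i+4] = X[i] ^ T(X[i+1] ^ X[i+2] ^ X[i+3] ^ rk[i]),  with T(x) = L(Sbox(x))
# and the final reverse R(X32..X35) = (X35,X34,X33,X32) is done by the word
# swap at the end of encrypt_1blk_norev().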
  229. sub sm4_1blk () {
  230. my $kptr = shift;
  231. $code.=<<___;
  232. ldp $wtmp0,$wtmp1,[$kptr],8
  233. // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
  234. eor $tmpw,$word2,$word3
  235. eor $wtmp2,$wtmp0,$word1
  236. eor $tmpw,$tmpw,$wtmp2
  237. ___
  238. &sbox_1word($tmpw);
  239. $code.=<<___;
  240. eor $word0,$word0,$tmpw
  241. // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
  242. eor $tmpw,$word2,$word3
  243. eor $wtmp2,$word0,$wtmp1
  244. eor $tmpw,$tmpw,$wtmp2
  245. ___
  246. &sbox_1word($tmpw);
  247. $code.=<<___;
  248. ldp $wtmp0,$wtmp1,[$kptr],8
  249. eor $word1,$word1,$tmpw
  250. // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
  251. eor $tmpw,$word0,$word1
  252. eor $wtmp2,$wtmp0,$word3
  253. eor $tmpw,$tmpw,$wtmp2
  254. ___
  255. &sbox_1word($tmpw);
  256. $code.=<<___;
  257. eor $word2,$word2,$tmpw
  258. // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
  259. eor $tmpw,$word0,$word1
  260. eor $wtmp2,$word2,$wtmp1
  261. eor $tmpw,$tmpw,$wtmp2
  262. ___
  263. &sbox_1word($tmpw);
  264. $code.=<<___;
  265. eor $word3,$word3,$tmpw
  266. ___
  267. }
  268. # sm4 for 4 lanes of data, in neon registers data0/data1/data2/data3
  269. sub sm4_4blks () {
  270. my $kptr = shift;
  271. $code.=<<___;
  272. ldp $wtmp0,$wtmp1,[$kptr],8
  273. dup $rk0.4s,$wtmp0
  274. dup $rk1.4s,$wtmp1
  275. // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
  276. eor $rka.16b,@data[2].16b,@data[3].16b
  277. eor $rk0.16b,@data[1].16b,$rk0.16b
  278. eor $rk0.16b,$rka.16b,$rk0.16b
  279. ___
  280. &sbox($rk0);
  281. $code.=<<___;
  282. eor @data[0].16b,@data[0].16b,$rk0.16b
  283. // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
  284. eor $rka.16b,$rka.16b,@data[0].16b
  285. eor $rk1.16b,$rka.16b,$rk1.16b
  286. ___
  287. &sbox($rk1);
  288. $code.=<<___;
  289. ldp $wtmp0,$wtmp1,[$kptr],8
  290. eor @data[1].16b,@data[1].16b,$rk1.16b
  291. dup $rk0.4s,$wtmp0
  292. dup $rk1.4s,$wtmp1
  293. // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
  294. eor $rka.16b,@data[0].16b,@data[1].16b
  295. eor $rk0.16b,@data[3].16b,$rk0.16b
  296. eor $rk0.16b,$rka.16b,$rk0.16b
  297. ___
  298. &sbox($rk0);
  299. $code.=<<___;
  300. eor @data[2].16b,@data[2].16b,$rk0.16b
  301. // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
  302. eor $rka.16b,$rka.16b,@data[2].16b
  303. eor $rk1.16b,$rka.16b,$rk1.16b
  304. ___
  305. &sbox($rk1);
  306. $code.=<<___;
  307. eor @data[3].16b,@data[3].16b,$rk1.16b
  308. ___
  309. }
  310. # sm4 for 8 lanes of data, in neon registers
  311. # data0/data1/data2/data3 datax0/datax1/datax2/datax3
  312. sub sm4_8blks () {
  313. my $kptr = shift;
  314. $code.=<<___;
  315. ldp $wtmp0,$wtmp1,[$kptr],8
  316. // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
  317. dup $rk0.4s,$wtmp0
  318. eor $rka.16b,@data[2].16b,@data[3].16b
  319. eor $rkb.16b,@datax[2].16b,@datax[3].16b
  320. eor @vtmp[0].16b,@data[1].16b,$rk0.16b
  321. eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
  322. eor $rk0.16b,$rka.16b,@vtmp[0].16b
  323. eor $rk1.16b,$rkb.16b,@vtmp[1].16b
  324. ___
  325. &sbox_double($rk0,$rk1);
  326. $code.=<<___;
  327. eor @data[0].16b,@data[0].16b,$rk0.16b
  328. eor @datax[0].16b,@datax[0].16b,$rk1.16b
  329. // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
  330. dup $rk1.4s,$wtmp1
  331. eor $rka.16b,$rka.16b,@data[0].16b
  332. eor $rkb.16b,$rkb.16b,@datax[0].16b
  333. eor $rk0.16b,$rka.16b,$rk1.16b
  334. eor $rk1.16b,$rkb.16b,$rk1.16b
  335. ___
  336. &sbox_double($rk0,$rk1);
  337. $code.=<<___;
  338. ldp $wtmp0,$wtmp1,[$kptr],8
  339. eor @data[1].16b,@data[1].16b,$rk0.16b
  340. eor @datax[1].16b,@datax[1].16b,$rk1.16b
  341. // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
  342. dup $rk0.4s,$wtmp0
  343. eor $rka.16b,@data[0].16b,@data[1].16b
  344. eor $rkb.16b,@datax[0].16b,@datax[1].16b
  345. eor @vtmp[0].16b,@data[3].16b,$rk0.16b
  346. eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
  347. eor $rk0.16b,$rka.16b,@vtmp[0].16b
  348. eor $rk1.16b,$rkb.16b,@vtmp[1].16b
  349. ___
  350. &sbox_double($rk0,$rk1);
  351. $code.=<<___;
  352. eor @data[2].16b,@data[2].16b,$rk0.16b
  353. eor @datax[2].16b,@datax[2].16b,$rk1.16b
  354. // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
  355. dup $rk1.4s,$wtmp1
  356. eor $rka.16b,$rka.16b,@data[2].16b
  357. eor $rkb.16b,$rkb.16b,@datax[2].16b
  358. eor $rk0.16b,$rka.16b,$rk1.16b
  359. eor $rk1.16b,$rkb.16b,$rk1.16b
  360. ___
  361. &sbox_double($rk0,$rk1);
  362. $code.=<<___;
  363. eor @data[3].16b,@data[3].16b,$rk0.16b
  364. eor @datax[3].16b,@datax[3].16b,$rk1.16b
  365. ___
  366. }
  367. sub encrypt_1blk_norev() {
  368. my $dat = shift;
  369. $code.=<<___;
  370. mov $ptr,$rks
  371. mov $counter,#8
  372. mov $word0,$dat.s[0]
  373. mov $word1,$dat.s[1]
  374. mov $word2,$dat.s[2]
  375. mov $word3,$dat.s[3]
  376. 10:
  377. ___
  378. &sm4_1blk($ptr);
  379. $code.=<<___;
  380. subs $counter,$counter,#1
  381. b.ne 10b
  382. mov $dat.s[0],$word3
  383. mov $dat.s[1],$word2
  384. mov $dat.s[2],$word1
  385. mov $dat.s[3],$word0
  386. ___
  387. }
  388. sub encrypt_1blk() {
  389. my $dat = shift;
  390. &encrypt_1blk_norev($dat);
  391. &rev32($dat,$dat);
  392. }
  393. sub encrypt_4blks() {
  394. $code.=<<___;
  395. mov $ptr,$rks
  396. mov $counter,#8
  397. 10:
  398. ___
  399. &sm4_4blks($ptr);
  400. $code.=<<___;
  401. subs $counter,$counter,#1
  402. b.ne 10b
  403. ___
  404. &rev32(@vtmp[3],@data[0]);
  405. &rev32(@vtmp[2],@data[1]);
  406. &rev32(@vtmp[1],@data[2]);
  407. &rev32(@vtmp[0],@data[3]);
  408. }
  409. sub encrypt_8blks() {
  410. $code.=<<___;
  411. mov $ptr,$rks
  412. mov $counter,#8
  413. 10:
  414. ___
  415. &sm4_8blks($ptr);
  416. $code.=<<___;
  417. subs $counter,$counter,#1
  418. b.ne 10b
  419. ___
  420. &rev32(@vtmp[3],@data[0]);
  421. &rev32(@vtmp[2],@data[1]);
  422. &rev32(@vtmp[1],@data[2]);
  423. &rev32(@vtmp[0],@data[3]);
  424. &rev32(@data[3],@datax[0]);
  425. &rev32(@data[2],@datax[1]);
  426. &rev32(@data[1],@datax[2]);
  427. &rev32(@data[0],@datax[3]);
  428. }
  429. sub load_sbox () {
  430. my $data = shift;
  431. $code.=<<___;
  432. ldr $MaskQ, .Lsbox_magic
  433. ldr $TAHMatQ, .Lsbox_magic+16
  434. ldr $TALMatQ, .Lsbox_magic+32
  435. ldr $ATAHMatQ, .Lsbox_magic+48
  436. ldr $ATALMatQ, .Lsbox_magic+64
  437. ldr $ANDMaskQ, .Lsbox_magic+80
  438. ___
  439. }
  440. sub mov_reg_to_vec() {
  441. my $src0 = shift;
  442. my $src1 = shift;
  443. my $desv = shift;
  444. $code.=<<___;
  445. mov $desv.d[0],$src0
  446. mov $desv.d[1],$src1
  447. ___
  448. &rev32_armeb($desv,$desv);
  449. }
  450. sub mov_vec_to_reg() {
  451. my $srcv = shift;
  452. my $des0 = shift;
  453. my $des1 = shift;
  454. $code.=<<___;
  455. mov $des0,$srcv.d[0]
  456. mov $des1,$srcv.d[1]
  457. ___
  458. }
  459. sub compute_tweak() {
  460. my $src0 = shift;
  461. my $src1 = shift;
  462. my $des0 = shift;
  463. my $des1 = shift;
  464. $code.=<<___;
  465. mov $wtmp0,0x87
  466. extr $xtmp2,$src1,$src1,#32
  467. extr $des1,$src1,$src0,#63
  468. and $wtmp1,$wtmp0,$wtmp2,asr#31
  469. eor $des0,$xtmp1,$src0,lsl#1
  470. ___
  471. }
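# compute_tweak() multiplies the 128-bit XTS tweak by x in GF(2^128) with the
# reduction polynomial x^128 + x^7 + x^2 + x + 1 (constant 0x87): the two
# 64-bit halves are shifted left by one bit and, if the top bit of the tweak
# was set, 0x87 is XORed into the low byte.  A minimal pure-Perl sketch of the
# same operation (illustrative only, not used by the generator, assumes
# 64-bit integer support):
#
#   sub xts_double_tweak_ref {                  # hypothetical reference helper
#       my ($lo, $hi) = @_;                     # little-endian 64-bit halves
#       my $carry = ($hi >> 63) & 1;            # bit 127 of the tweak
#       $hi = (($hi << 1) | ($lo >> 63)) & 0xffffffffffffffff;
#       $lo = (($lo << 1) & 0xffffffffffffffff) ^ ($carry ? 0x87 : 0);
#       return ($lo, $hi);
#   }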
  472. sub compute_tweak_vec() {
  473. my $src = shift;
  474. my $des = shift;
  475. my $std = shift;
  476. &rbit(@vtmp[2],$src,$std);
  477. $code.=<<___;
  478. ldr @qtmp[0], .Lxts_magic
  479. shl $des.16b, @vtmp[2].16b, #1
  480. ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
  481. ushr @vtmp[1].16b, @vtmp[1].16b, #7
  482. mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
  483. eor $des.16b, $des.16b, @vtmp[1].16b
  484. ___
  485. &rbit($des,$des,$std);
  486. }
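# The constant pool emitted below:
#   .Lck         - the 32 SM4 key-schedule constants CK[0..31]
#   .Lfk         - the SM4 system parameters FK[0..3]
#   .Lshuffles   - tbl index pattern that rotates the four key words by one
#                  position between key-schedule iterations
#   .Lxts_magic  - the GF(2^128) reduction constant 0x87 used for XTS tweaks
#   .Lsbox_magic - byte-permutation mask ($MaskV), the two pairs of nibble
#                  tables for the input/output affine transforms of the AESE
#                  S-box trick, and the 0x0f nibble mask ($ANDMaskV)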
  487. $code=<<___;
  488. #include "arm_arch.h"
  489. .arch armv8-a+crypto
  490. .text
  491. .type _${prefix}_consts,%object
  492. .align 7
  493. _${prefix}_consts:
  494. .Lck:
  495. .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
  496. .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
  497. .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
  498. .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
  499. .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
  500. .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
  501. .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
  502. .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
  503. .Lfk:
  504. .quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
  505. .Lshuffles:
  506. .quad 0x0B0A090807060504,0x030201000F0E0D0C
  507. .Lxts_magic:
  508. .quad 0x0101010101010187,0x0101010101010101
  509. .Lsbox_magic:
  510. .quad 0x0b0e0104070a0d00,0x0306090c0f020508
  511. .quad 0x62185a2042387a00,0x22581a6002783a40
  512. .quad 0x15df62a89e54e923,0xc10bb67c4a803df7
  513. .quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead
  514. .quad 0x6404462679195b3b,0xe383c1a1fe9edcbc
  515. .quad 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f
  516. .size _${prefix}_consts,.-_${prefix}_consts
  517. ___
  518. {{{
  519. my ($key,$keys,$enc)=("x0","x1","w2");
  520. my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
  521. my ($vkey,$vfk,$vmap)=("v5","v6","v7");
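# _vpsm4_ex_set_key implements the SM4 key schedule
#   rk[i] = K[i+4] = K[i] ^ T'(K[i+1] ^ K[i+2] ^ K[i+3] ^ CK[i])
# where T' applies the S-box followed by L'(B) = B ^ (B <<< 13) ^ (B <<< 23)
# (ror #19 and ror #9 below are rotate-left by 13 and 23 on 32-bit words).
# For a decryption schedule ($enc == 0) the round keys are stored in reverse
# order, starting at offset 124 and stepping backwards.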
  522. $code.=<<___;
  523. .type _${prefix}_set_key,%function
  524. .align 4
  525. _${prefix}_set_key:
  526. AARCH64_VALID_CALL_TARGET
  527. ld1 {$vkey.4s},[$key]
  528. ___
  529. &load_sbox();
  530. &rev32($vkey,$vkey);
  531. $code.=<<___;
  532. adr $pointer,.Lshuffles
  533. ld1 {$vmap.2d},[$pointer]
  534. adr $pointer,.Lfk
  535. ld1 {$vfk.2d},[$pointer]
  536. eor $vkey.16b,$vkey.16b,$vfk.16b
  537. mov $schedules,#32
  538. adr $pointer,.Lck
  539. movi @vtmp[0].16b,#64
  540. cbnz $enc,1f
  541. add $keys,$keys,124
  542. 1:
  543. mov $wtmp,$vkey.s[1]
  544. ldr $roundkey,[$pointer],#4
  545. eor $roundkey,$roundkey,$wtmp
  546. mov $wtmp,$vkey.s[2]
  547. eor $roundkey,$roundkey,$wtmp
  548. mov $wtmp,$vkey.s[3]
  549. eor $roundkey,$roundkey,$wtmp
  550. // optimize sbox using AESE instruction
  551. mov @data[0].s[0],$roundkey
  552. tbl @vtmp[0].16b, {@data[0].16b}, $MaskV.16b
  553. ___
  554. &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
  555. $code.=<<___;
  556. eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
  557. aese @vtmp[0].16b,@vtmp[1].16b
  558. ___
  559. &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
  560. $code.=<<___;
  561. mov $wtmp,@vtmp[0].s[0]
  562. eor $roundkey,$wtmp,$wtmp,ror #19
  563. eor $roundkey,$roundkey,$wtmp,ror #9
  564. mov $wtmp,$vkey.s[0]
  565. eor $roundkey,$roundkey,$wtmp
  566. mov $vkey.s[0],$roundkey
  567. cbz $enc,2f
  568. str $roundkey,[$keys],#4
  569. b 3f
  570. 2:
  571. str $roundkey,[$keys],#-4
  572. 3:
  573. tbl $vkey.16b,{$vkey.16b},$vmap.16b
  574. subs $schedules,$schedules,#1
  575. b.ne 1b
  576. ret
  577. .size _${prefix}_set_key,.-_${prefix}_set_key
  578. ___
  579. }}}
  580. {{{
  581. $code.=<<___;
  582. .type _${prefix}_enc_4blks,%function
  583. .align 4
  584. _${prefix}_enc_4blks:
  585. AARCH64_VALID_CALL_TARGET
  586. ___
  587. &encrypt_4blks();
  588. $code.=<<___;
  589. ret
  590. .size _${prefix}_enc_4blks,.-_${prefix}_enc_4blks
  591. ___
  592. }}}
  593. {{{
  594. $code.=<<___;
  595. .type _${prefix}_enc_8blks,%function
  596. .align 4
  597. _${prefix}_enc_8blks:
  598. AARCH64_VALID_CALL_TARGET
  599. ___
  600. &encrypt_8blks();
  601. $code.=<<___;
  602. ret
  603. .size _${prefix}_enc_8blks,.-_${prefix}_enc_8blks
  604. ___
  605. }}}
  606. {{{
  607. my ($key,$keys)=("x0","x1");
  608. $code.=<<___;
  609. .globl ${prefix}_set_encrypt_key
  610. .type ${prefix}_set_encrypt_key,%function
  611. .align 5
  612. ${prefix}_set_encrypt_key:
  613. AARCH64_SIGN_LINK_REGISTER
  614. stp x29,x30,[sp,#-16]!
  615. mov w2,1
  616. bl _${prefix}_set_key
  617. ldp x29,x30,[sp],#16
  618. AARCH64_VALIDATE_LINK_REGISTER
  619. ret
  620. .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
  621. ___
  622. }}}
  623. {{{
  624. my ($key,$keys)=("x0","x1");
  625. $code.=<<___;
  626. .globl ${prefix}_set_decrypt_key
  627. .type ${prefix}_set_decrypt_key,%function
  628. .align 5
  629. ${prefix}_set_decrypt_key:
  630. AARCH64_SIGN_LINK_REGISTER
  631. stp x29,x30,[sp,#-16]!
  632. mov w2,0
  633. bl _${prefix}_set_key
  634. ldp x29,x30,[sp],#16
  635. AARCH64_VALIDATE_LINK_REGISTER
  636. ret
  637. .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
  638. ___
  639. }}}
  640. {{{
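# gen_block() emits ${prefix}_encrypt and ${prefix}_decrypt with identical
# bodies: SM4 decryption is encryption with the round keys applied in reverse
# order, which _${prefix}_set_key already arranges when generating a
# decryption key schedule.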
  641. sub gen_block () {
  642. my $dir = shift;
  643. my ($inp,$outp,$rk)=map("x$_",(0..2));
  644. $code.=<<___;
  645. .globl ${prefix}_${dir}crypt
  646. .type ${prefix}_${dir}crypt,%function
  647. .align 5
  648. ${prefix}_${dir}crypt:
  649. AARCH64_VALID_CALL_TARGET
  650. ld1 {@data[0].4s},[$inp]
  651. ___
  652. &load_sbox();
  653. &rev32(@data[0],@data[0]);
  654. $code.=<<___;
  655. mov $rks,$rk
  656. ___
  657. &encrypt_1blk(@data[0]);
  658. $code.=<<___;
  659. st1 {@data[0].4s},[$outp]
  660. ret
  661. .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
  662. ___
  663. }
  664. &gen_block("en");
  665. &gen_block("de");
  666. }}}
  667. {{{
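# ECB: eight blocks are processed at a time using the word-sliced ld4/st4
# layout, falling back to four, and the remaining 1-3 blocks are handled via
# a single-block path or lane-wise (ld4 {...}.s[n]) loads.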
  668. $code.=<<___;
  669. .globl ${prefix}_ecb_encrypt
  670. .type ${prefix}_ecb_encrypt,%function
  671. .align 5
  672. ${prefix}_ecb_encrypt:
  673. AARCH64_SIGN_LINK_REGISTER
  674. // convert length into blocks
  675. lsr x2,x2,4
  676. stp d8,d9,[sp,#-80]!
  677. stp d10,d11,[sp,#16]
  678. stp d12,d13,[sp,#32]
  679. stp d14,d15,[sp,#48]
  680. stp x29,x30,[sp,#64]
  681. ___
  682. &load_sbox();
  683. $code.=<<___;
  684. .Lecb_8_blocks_process:
  685. cmp $blocks,#8
  686. b.lt .Lecb_4_blocks_process
  687. ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
  688. ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
  689. ___
  690. &rev32(@data[0],@data[0]);
  691. &rev32(@data[1],@data[1]);
  692. &rev32(@data[2],@data[2]);
  693. &rev32(@data[3],@data[3]);
  694. &rev32(@datax[0],@datax[0]);
  695. &rev32(@datax[1],@datax[1]);
  696. &rev32(@datax[2],@datax[2]);
  697. &rev32(@datax[3],@datax[3]);
  698. $code.=<<___;
  699. bl _${prefix}_enc_8blks
  700. st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
  701. st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
  702. subs $blocks,$blocks,#8
  703. b.gt .Lecb_8_blocks_process
  704. b 100f
  705. .Lecb_4_blocks_process:
  706. cmp $blocks,#4
  707. b.lt 1f
  708. ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
  709. ___
  710. &rev32(@data[0],@data[0]);
  711. &rev32(@data[1],@data[1]);
  712. &rev32(@data[2],@data[2]);
  713. &rev32(@data[3],@data[3]);
  714. $code.=<<___;
  715. bl _${prefix}_enc_4blks
  716. st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
  717. sub $blocks,$blocks,#4
  718. 1:
  719. // process last block
  720. cmp $blocks,#1
  721. b.lt 100f
  722. b.gt 1f
  723. ld1 {@data[0].4s},[$inp]
  724. ___
  725. &rev32(@data[0],@data[0]);
  726. &encrypt_1blk(@data[0]);
  727. $code.=<<___;
  728. st1 {@data[0].4s},[$outp]
  729. b 100f
  730. 1: // process last 2 blocks
  731. ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
  732. ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
  733. cmp $blocks,#2
  734. b.gt 1f
  735. ___
  736. &rev32(@data[0],@data[0]);
  737. &rev32(@data[1],@data[1]);
  738. &rev32(@data[2],@data[2]);
  739. &rev32(@data[3],@data[3]);
  740. $code.=<<___;
  741. bl _${prefix}_enc_4blks
  742. st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
  743. st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
  744. b 100f
  745. 1: // process last 3 blocks
  746. ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
  747. ___
  748. &rev32(@data[0],@data[0]);
  749. &rev32(@data[1],@data[1]);
  750. &rev32(@data[2],@data[2]);
  751. &rev32(@data[3],@data[3]);
  752. $code.=<<___;
  753. bl _${prefix}_enc_4blks
  754. st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
  755. st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
  756. st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
  757. 100:
  758. ldp d10,d11,[sp,#16]
  759. ldp d12,d13,[sp,#32]
  760. ldp d14,d15,[sp,#48]
  761. ldp x29,x30,[sp,#64]
  762. ldp d8,d9,[sp],#80
  763. AARCH64_VALIDATE_LINK_REGISTER
  764. ret
  765. .size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
  766. ___
  767. }}}
  768. {{{
  769. my ($len,$ivp,$enc)=("x2","x4","w5");
  770. my $ivec0=("v3");
  771. my $ivec1=("v15");
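# CBC encryption is inherently serial (each block is chained through the
# previous ciphertext), so blocks are encrypted one at a time even on the
# four-block load path; decryption has no such dependency and uses the
# parallel 8-/4-block paths, keeping the last ciphertext block as the next IV.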
  772. $code.=<<___;
  773. .globl ${prefix}_cbc_encrypt
  774. .type ${prefix}_cbc_encrypt,%function
  775. .align 5
  776. ${prefix}_cbc_encrypt:
  777. AARCH64_VALID_CALL_TARGET
  778. lsr $len,$len,4
  779. ___
  780. &load_sbox();
  781. $code.=<<___;
  782. cbz $enc,.Ldec
  783. ld1 {$ivec0.4s},[$ivp]
  784. .Lcbc_4_blocks_enc:
  785. cmp $blocks,#4
  786. b.lt 1f
  787. ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
  788. eor @data[0].16b,@data[0].16b,$ivec0.16b
  789. ___
  790. &rev32(@data[1],@data[1]);
  791. &rev32(@data[0],@data[0]);
  792. &rev32(@data[2],@data[2]);
  793. &rev32(@data[3],@data[3]);
  794. &encrypt_1blk_norev(@data[0]);
  795. $code.=<<___;
  796. eor @data[1].16b,@data[1].16b,@data[0].16b
  797. ___
  798. &encrypt_1blk_norev(@data[1]);
  799. &rev32(@data[0],@data[0]);
  800. $code.=<<___;
  801. eor @data[2].16b,@data[2].16b,@data[1].16b
  802. ___
  803. &encrypt_1blk_norev(@data[2]);
  804. &rev32(@data[1],@data[1]);
  805. $code.=<<___;
  806. eor @data[3].16b,@data[3].16b,@data[2].16b
  807. ___
  808. &encrypt_1blk_norev(@data[3]);
  809. &rev32(@data[2],@data[2]);
  810. &rev32(@data[3],@data[3]);
  811. $code.=<<___;
  812. orr $ivec0.16b,@data[3].16b,@data[3].16b
  813. st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
  814. subs $blocks,$blocks,#4
  815. b.ne .Lcbc_4_blocks_enc
  816. b 2f
  817. 1:
  818. subs $blocks,$blocks,#1
  819. b.lt 2f
  820. ld1 {@data[0].4s},[$inp],#16
  821. eor $ivec0.16b,$ivec0.16b,@data[0].16b
  822. ___
  823. &rev32($ivec0,$ivec0);
  824. &encrypt_1blk($ivec0);
  825. $code.=<<___;
  826. st1 {$ivec0.4s},[$outp],#16
  827. b 1b
  828. 2:
  829. // save back IV
  830. st1 {$ivec0.4s},[$ivp]
  831. ret
  832. .Ldec:
  833. // decryption mode starts
  834. AARCH64_SIGN_LINK_REGISTER
  835. stp d8,d9,[sp,#-80]!
  836. stp d10,d11,[sp,#16]
  837. stp d12,d13,[sp,#32]
  838. stp d14,d15,[sp,#48]
  839. stp x29,x30,[sp,#64]
  840. .Lcbc_8_blocks_dec:
  841. cmp $blocks,#8
  842. b.lt 1f
  843. ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
  844. add $ptr,$inp,#64
  845. ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
  846. ___
  847. &rev32(@data[0],@data[0]);
  848. &rev32(@data[1],@data[1]);
  849. &rev32(@data[2],@data[2]);
  850. &rev32(@data[3],$data[3]);
  851. &rev32(@datax[0],@datax[0]);
  852. &rev32(@datax[1],@datax[1]);
  853. &rev32(@datax[2],@datax[2]);
  854. &rev32(@datax[3],$datax[3]);
  855. $code.=<<___;
  856. bl _${prefix}_enc_8blks
  857. ___
  858. &transpose(@vtmp,@datax);
  859. &transpose(@data,@datax);
  860. $code.=<<___;
  861. ld1 {$ivec1.4s},[$ivp]
  862. ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
  863. // note ivec1 and vtmpx[3] are reusing the same register
  864. // care needs to be taken to avoid conflict
  865. eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
  866. ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
  867. eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
  868. eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
  869. eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
  870. // save back IV
  871. st1 {$vtmpx[3].4s}, [$ivp]
  872. eor @data[0].16b,@data[0].16b,$datax[3].16b
  873. eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
  874. eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
  875. eor @data[3].16b,$data[3].16b,@vtmpx[2].16b
  876. st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
  877. st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
  878. subs $blocks,$blocks,#8
  879. b.gt .Lcbc_8_blocks_dec
  880. b.eq 100f
  881. 1:
  882. ld1 {$ivec1.4s},[$ivp]
  883. .Lcbc_4_blocks_dec:
  884. cmp $blocks,#4
  885. b.lt 1f
  886. ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
  887. ___
  888. &rev32(@data[0],@data[0]);
  889. &rev32(@data[1],@data[1]);
  890. &rev32(@data[2],@data[2]);
  891. &rev32(@data[3],$data[3]);
  892. $code.=<<___;
  893. bl _${prefix}_enc_4blks
  894. ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
  895. ___
  896. &transpose(@vtmp,@datax);
  897. $code.=<<___;
  898. eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
  899. eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
  900. orr $ivec1.16b,@data[3].16b,@data[3].16b
  901. eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
  902. eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b
  903. st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
  904. subs $blocks,$blocks,#4
  905. b.gt .Lcbc_4_blocks_dec
  906. // save back IV
  907. st1 {@data[3].4s}, [$ivp]
  908. b 100f
  909. 1: // last block
  910. subs $blocks,$blocks,#1
  911. b.lt 100f
  912. b.gt 1f
  913. ld1 {@data[0].4s},[$inp],#16
  914. // save back IV
  915. st1 {$data[0].4s}, [$ivp]
  916. ___
  917. &rev32(@datax[0],@data[0]);
  918. &encrypt_1blk(@datax[0]);
  919. $code.=<<___;
  920. eor @datax[0].16b,@datax[0].16b,$ivec1.16b
  921. st1 {@datax[0].4s},[$outp],#16
  922. b 100f
  923. 1: // last two blocks
  924. ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
  925. add $ptr,$inp,#16
  926. ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
  927. subs $blocks,$blocks,1
  928. b.gt 1f
  929. ___
  930. &rev32(@data[0],@data[0]);
  931. &rev32(@data[1],@data[1]);
  932. &rev32(@data[2],@data[2]);
  933. &rev32(@data[3],@data[3]);
  934. $code.=<<___;
  935. bl _${prefix}_enc_4blks
  936. ld1 {@data[0].4s,@data[1].4s},[$inp],#32
  937. ___
  938. &transpose(@vtmp,@datax);
  939. $code.=<<___;
  940. eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
  941. eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
  942. st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
  943. // save back IV
  944. st1 {@data[1].4s}, [$ivp]
  945. b 100f
  946. 1: // last 3 blocks
  947. ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
  948. ___
  949. &rev32(@data[0],@data[0]);
  950. &rev32(@data[1],@data[1]);
  951. &rev32(@data[2],@data[2]);
  952. &rev32(@data[3],@data[3]);
  953. $code.=<<___;
  954. bl _${prefix}_enc_4blks
  955. ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
  956. ___
  957. &transpose(@vtmp,@datax);
  958. $code.=<<___;
  959. eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
  960. eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
  961. eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
  962. st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
  963. // save back IV
  964. st1 {@data[2].4s}, [$ivp]
  965. 100:
  966. ldp d10,d11,[sp,#16]
  967. ldp d12,d13,[sp,#32]
  968. ldp d14,d15,[sp,#48]
  969. ldp x29,x30,[sp,#64]
  970. ldp d8,d9,[sp],#80
  971. AARCH64_VALIDATE_LINK_REGISTER
  972. ret
  973. .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
  974. ___
  975. }}}
  976. {{{
  977. my ($ivp)=("x4");
  978. my ($ctr)=("w5");
  979. my $ivec=("v3");
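# CTR32: only the last 32-bit word of the counter block ($ivec.s[3]) is
# incremented between blocks (hence the ctr32 name); the first three words
# stay fixed in word0..word2.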
  980. $code.=<<___;
  981. .globl ${prefix}_ctr32_encrypt_blocks
  982. .type ${prefix}_ctr32_encrypt_blocks,%function
  983. .align 5
  984. ${prefix}_ctr32_encrypt_blocks:
  985. AARCH64_VALID_CALL_TARGET
  986. ld1 {$ivec.4s},[$ivp]
  987. ___
  988. &rev32($ivec,$ivec);
  989. &load_sbox();
  990. $code.=<<___;
  991. cmp $blocks,#1
  992. b.ne 1f
  993. // fast path for a single block without the
  994. // context-saving overhead
  995. ___
  996. &encrypt_1blk($ivec);
  997. $code.=<<___;
  998. ld1 {@data[0].4s},[$inp]
  999. eor @data[0].16b,@data[0].16b,$ivec.16b
  1000. st1 {@data[0].4s},[$outp]
  1001. ret
  1002. 1:
  1003. AARCH64_SIGN_LINK_REGISTER
  1004. stp d8,d9,[sp,#-80]!
  1005. stp d10,d11,[sp,#16]
  1006. stp d12,d13,[sp,#32]
  1007. stp d14,d15,[sp,#48]
  1008. stp x29,x30,[sp,#64]
  1009. mov $word0,$ivec.s[0]
  1010. mov $word1,$ivec.s[1]
  1011. mov $word2,$ivec.s[2]
  1012. mov $ctr,$ivec.s[3]
  1013. .Lctr32_4_blocks_process:
  1014. cmp $blocks,#4
  1015. b.lt 1f
  1016. dup @data[0].4s,$word0
  1017. dup @data[1].4s,$word1
  1018. dup @data[2].4s,$word2
  1019. mov @data[3].s[0],$ctr
  1020. add $ctr,$ctr,#1
  1021. mov $data[3].s[1],$ctr
  1022. add $ctr,$ctr,#1
  1023. mov @data[3].s[2],$ctr
  1024. add $ctr,$ctr,#1
  1025. mov @data[3].s[3],$ctr
  1026. add $ctr,$ctr,#1
  1027. cmp $blocks,#8
  1028. b.ge .Lctr32_8_blocks_process
  1029. bl _${prefix}_enc_4blks
  1030. ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
  1031. eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
  1032. eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
  1033. eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
  1034. eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
  1035. st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
  1036. subs $blocks,$blocks,#4
  1037. b.ne .Lctr32_4_blocks_process
  1038. b 100f
  1039. .Lctr32_8_blocks_process:
  1040. dup @datax[0].4s,$word0
  1041. dup @datax[1].4s,$word1
  1042. dup @datax[2].4s,$word2
  1043. mov @datax[3].s[0],$ctr
  1044. add $ctr,$ctr,#1
  1045. mov $datax[3].s[1],$ctr
  1046. add $ctr,$ctr,#1
  1047. mov @datax[3].s[2],$ctr
  1048. add $ctr,$ctr,#1
  1049. mov @datax[3].s[3],$ctr
  1050. add $ctr,$ctr,#1
  1051. bl _${prefix}_enc_8blks
  1052. ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
  1053. ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
  1054. eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
  1055. eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
  1056. eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
  1057. eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
  1058. eor @data[0].16b,@data[0].16b,@datax[0].16b
  1059. eor @data[1].16b,@data[1].16b,@datax[1].16b
  1060. eor @data[2].16b,@data[2].16b,@datax[2].16b
  1061. eor @data[3].16b,@data[3].16b,@datax[3].16b
  1062. st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
  1063. st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
  1064. subs $blocks,$blocks,#8
  1065. b.ne .Lctr32_4_blocks_process
  1066. b 100f
  1067. 1: // last block processing
  1068. subs $blocks,$blocks,#1
  1069. b.lt 100f
  1070. b.gt 1f
  1071. mov $ivec.s[0],$word0
  1072. mov $ivec.s[1],$word1
  1073. mov $ivec.s[2],$word2
  1074. mov $ivec.s[3],$ctr
  1075. ___
  1076. &encrypt_1blk($ivec);
  1077. $code.=<<___;
  1078. ld1 {@data[0].4s},[$inp]
  1079. eor @data[0].16b,@data[0].16b,$ivec.16b
  1080. st1 {@data[0].4s},[$outp]
  1081. b 100f
  1082. 1: // last 2 blocks processing
  1083. dup @data[0].4s,$word0
  1084. dup @data[1].4s,$word1
  1085. dup @data[2].4s,$word2
  1086. mov @data[3].s[0],$ctr
  1087. add $ctr,$ctr,#1
  1088. mov @data[3].s[1],$ctr
  1089. subs $blocks,$blocks,#1
  1090. b.ne 1f
  1091. bl _${prefix}_enc_4blks
  1092. ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
  1093. ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
  1094. eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
  1095. eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
  1096. eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
  1097. eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
  1098. st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
  1099. st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
  1100. b 100f
  1101. 1: // last 3 blocks processing
  1102. add $ctr,$ctr,#1
  1103. mov @data[3].s[2],$ctr
  1104. bl _${prefix}_enc_4blks
  1105. ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
  1106. ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
  1107. ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
  1108. eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
  1109. eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
  1110. eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
  1111. eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
  1112. st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
  1113. st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
  1114. st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
  1115. 100:
  1116. ldp d10,d11,[sp,#16]
  1117. ldp d12,d13,[sp,#32]
  1118. ldp d14,d15,[sp,#48]
  1119. ldp x29,x30,[sp,#64]
  1120. ldp d8,d9,[sp],#80
  1121. AARCH64_VALIDATE_LINK_REGISTER
  1122. ret
  1123. .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
  1124. ___
  1125. }}}
  1126. {{{
  1127. my ($blocks,$len)=("x2","x2");
  1128. my $ivp=("x5");
  1129. my @twx=map("x$_",(12..27));
  1130. my ($rks1,$rks2)=("x26","x27");
  1131. my $lastBlk=("x26");
  1132. my $enc=("w28");
  1133. my $remain=("x29");
  1134. my @tweak=map("v$_",(16..23));
  1135. my $lastTweak=("v25");
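# XTS: x3/x4 carry two key schedules ($rks1 for the data, $rks2 for the
# tweak); the initial tweak is first encrypted with $rks2.  The "_gb" variant
# bit-reverses (rbit) the tweak around the GF(2^128) multiplication, and a
# trailing partial block is handled with ciphertext stealing in the
# .loop${std} byte swap below.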
  1136. sub gen_xts_cipher() {
  1137. my $std = shift;
  1138. $code.=<<___;
  1139. .globl ${prefix}_xts_encrypt${std}
  1140. .type ${prefix}_xts_encrypt${std},%function
  1141. .align 5
  1142. ${prefix}_xts_encrypt${std}:
  1143. AARCH64_SIGN_LINK_REGISTER
  1144. stp x15, x16, [sp, #-0x10]!
  1145. stp x17, x18, [sp, #-0x10]!
  1146. stp x19, x20, [sp, #-0x10]!
  1147. stp x21, x22, [sp, #-0x10]!
  1148. stp x23, x24, [sp, #-0x10]!
  1149. stp x25, x26, [sp, #-0x10]!
  1150. stp x27, x28, [sp, #-0x10]!
  1151. stp x29, x30, [sp, #-0x10]!
  1152. stp d8, d9, [sp, #-0x10]!
  1153. stp d10, d11, [sp, #-0x10]!
  1154. stp d12, d13, [sp, #-0x10]!
  1155. stp d14, d15, [sp, #-0x10]!
  1156. mov $rks1,x3
  1157. mov $rks2,x4
  1158. mov $enc,w6
  1159. ld1 {@tweak[0].4s}, [$ivp]
  1160. mov $rks,$rks2
  1161. ___
  1162. &load_sbox();
  1163. &rev32(@tweak[0],@tweak[0]);
  1164. &encrypt_1blk(@tweak[0]);
  1165. $code.=<<___;
  1166. mov $rks,$rks1
  1167. and $remain,$len,#0x0F
  1168. // convert length into blocks
  1169. lsr $blocks,$len,4
  1170. cmp $blocks,#1
  1171. b.lt .return${std}
  1172. cmp $remain,0
  1173. // If the encryption/decryption length is a multiple of 16,
  1174. // all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
  1175. b.eq .xts_encrypt_blocks${std}
  1176. // If the encryption/decryption length is not a multiple of 16,
  1177. // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
  1178. // and the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
  1179. subs $blocks,$blocks,#1
  1180. b.eq .only_2blks_tweak${std}
  1181. .xts_encrypt_blocks${std}:
  1182. ___
  1183. &rbit(@tweak[0],@tweak[0],$std);
  1184. &rev32_armeb(@tweak[0],@tweak[0]);
  1185. &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
  1186. &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
  1187. &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
  1188. &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
  1189. &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
  1190. &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
  1191. &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
  1192. &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
  1193. $code.=<<___;
  1194. .Lxts_8_blocks_process${std}:
  1195. cmp $blocks,#8
  1196. ___
  1197. &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
  1198. &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
  1199. &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
  1200. &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
  1201. &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
  1202. &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
  1203. &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
  1204. &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
  1205. &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]);
  1206. &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
  1207. &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]);
  1208. &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
  1209. &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]);
  1210. &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
  1211. &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]);
  1212. &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
  1213. $code.=<<___;
  1214. b.lt .Lxts_4_blocks_process${std}
  1215. ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
  1216. ___
  1217. &rbit(@tweak[0],@tweak[0],$std);
  1218. &rbit(@tweak[1],@tweak[1],$std);
  1219. &rbit(@tweak[2],@tweak[2],$std);
  1220. &rbit(@tweak[3],@tweak[3],$std);
  1221. $code.=<<___;
  1222. eor @data[0].16b, @data[0].16b, @tweak[0].16b
  1223. eor @data[1].16b, @data[1].16b, @tweak[1].16b
  1224. eor @data[2].16b, @data[2].16b, @tweak[2].16b
  1225. eor @data[3].16b, @data[3].16b, @tweak[3].16b
  1226. ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
  1227. ___
  1228. &rbit(@tweak[4],@tweak[4],$std);
  1229. &rbit(@tweak[5],@tweak[5],$std);
  1230. &rbit(@tweak[6],@tweak[6],$std);
  1231. &rbit(@tweak[7],@tweak[7],$std);
  1232. $code.=<<___;
  1233. eor @datax[0].16b, @datax[0].16b, @tweak[4].16b
  1234. eor @datax[1].16b, @datax[1].16b, @tweak[5].16b
  1235. eor @datax[2].16b, @datax[2].16b, @tweak[6].16b
  1236. eor @datax[3].16b, @datax[3].16b, @tweak[7].16b
  1237. ___
  1238. &rev32(@data[0],@data[0]);
  1239. &rev32(@data[1],@data[1]);
  1240. &rev32(@data[2],@data[2]);
  1241. &rev32(@data[3],@data[3]);
  1242. &rev32(@datax[0],@datax[0]);
  1243. &rev32(@datax[1],@datax[1]);
  1244. &rev32(@datax[2],@datax[2]);
  1245. &rev32(@datax[3],@datax[3]);
  1246. &transpose(@data,@vtmp);
  1247. &transpose(@datax,@vtmp);
  1248. $code.=<<___;
  1249. bl _${prefix}_enc_8blks
  1250. ___
  1251. &transpose(@vtmp,@datax);
  1252. &transpose(@data,@datax);
  1253. $code.=<<___;
  1254. eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
  1255. eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
  1256. eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
  1257. eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
  1258. eor @data[0].16b, @data[0].16b, @tweak[4].16b
  1259. eor @data[1].16b, @data[1].16b, @tweak[5].16b
  1260. eor @data[2].16b, @data[2].16b, @tweak[6].16b
  1261. eor @data[3].16b, @data[3].16b, @tweak[7].16b
  1262. // save the last tweak
  1263. mov $lastTweak.16b,@tweak[7].16b
  1264. st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
  1265. st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
  1266. subs $blocks,$blocks,#8
  1267. b.gt .Lxts_8_blocks_process${std}
  1268. b 100f
  1269. .Lxts_4_blocks_process${std}:
  1270. cmp $blocks,#4
  1271. b.lt 1f
  1272. ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
  1273. ___
  1274. &rbit(@tweak[0],@tweak[0],$std);
  1275. &rbit(@tweak[1],@tweak[1],$std);
  1276. &rbit(@tweak[2],@tweak[2],$std);
  1277. &rbit(@tweak[3],@tweak[3],$std);
  1278. $code.=<<___;
  1279. eor @data[0].16b, @data[0].16b, @tweak[0].16b
  1280. eor @data[1].16b, @data[1].16b, @tweak[1].16b
  1281. eor @data[2].16b, @data[2].16b, @tweak[2].16b
  1282. eor @data[3].16b, @data[3].16b, @tweak[3].16b
  1283. ___
  1284. &rev32(@data[0],@data[0]);
  1285. &rev32(@data[1],@data[1]);
  1286. &rev32(@data[2],@data[2]);
  1287. &rev32(@data[3],@data[3]);
  1288. &transpose(@data,@vtmp);
  1289. $code.=<<___;
  1290. bl _${prefix}_enc_4blks
  1291. ___
  1292. &transpose(@vtmp,@data);
  1293. $code.=<<___;
  1294. eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
  1295. eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
  1296. eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
  1297. eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
  1298. st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
  1299. sub $blocks,$blocks,#4
  1300. mov @tweak[0].16b,@tweak[4].16b
  1301. mov @tweak[1].16b,@tweak[5].16b
  1302. mov @tweak[2].16b,@tweak[6].16b
  1303. // save the last tweak
  1304. mov $lastTweak.16b,@tweak[3].16b
  1305. 1:
  1306. // process last block
  1307. cmp $blocks,#1
  1308. b.lt 100f
  1309. b.gt 1f
  1310. ld1 {@data[0].4s},[$inp],#16
  1311. ___
  1312. &rbit(@tweak[0],@tweak[0],$std);
  1313. $code.=<<___;
  1314. eor @data[0].16b, @data[0].16b, @tweak[0].16b
  1315. ___
  1316. &rev32(@data[0],@data[0]);
  1317. &encrypt_1blk(@data[0]);
  1318. $code.=<<___;
  1319. eor @data[0].16b, @data[0].16b, @tweak[0].16b
  1320. st1 {@data[0].4s},[$outp],#16
  1321. // save the last tweak
  1322. mov $lastTweak.16b,@tweak[0].16b
  1323. b 100f
  1324. 1: // process last 2 blocks
  1325. cmp $blocks,#2
  1326. b.gt 1f
  1327. ld1 {@data[0].4s,@data[1].4s},[$inp],#32
  1328. ___
  1329. &rbit(@tweak[0],@tweak[0],$std);
  1330. &rbit(@tweak[1],@tweak[1],$std);
  1331. $code.=<<___;
  1332. eor @data[0].16b, @data[0].16b, @tweak[0].16b
  1333. eor @data[1].16b, @data[1].16b, @tweak[1].16b
  1334. ___
  1335. &rev32(@data[0],@data[0]);
  1336. &rev32(@data[1],@data[1]);
  1337. &transpose(@data,@vtmp);
  1338. $code.=<<___;
  1339. bl _${prefix}_enc_4blks
  1340. ___
  1341. &transpose(@vtmp,@data);
  1342. $code.=<<___;
  1343. eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
  1344. eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
  1345. st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
  1346. // save the last tweak
  1347. mov $lastTweak.16b,@tweak[1].16b
  1348. b 100f
  1349. 1: // process last 3 blocks
  1350. ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
  1351. ___
  1352. &rbit(@tweak[0],@tweak[0],$std);
  1353. &rbit(@tweak[1],@tweak[1],$std);
  1354. &rbit(@tweak[2],@tweak[2],$std);
  1355. $code.=<<___;
  1356. eor @data[0].16b, @data[0].16b, @tweak[0].16b
  1357. eor @data[1].16b, @data[1].16b, @tweak[1].16b
  1358. eor @data[2].16b, @data[2].16b, @tweak[2].16b
  1359. ___
  1360. &rev32(@data[0],@data[0]);
  1361. &rev32(@data[1],@data[1]);
  1362. &rev32(@data[2],@data[2]);
  1363. &transpose(@data,@vtmp);
  1364. $code.=<<___;
  1365. bl _${prefix}_enc_4blks
  1366. ___
  1367. &transpose(@vtmp,@data);
  1368. $code.=<<___;
  1369. eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
  1370. eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
  1371. eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
  1372. st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
  1373. // save the last tweak
  1374. mov $lastTweak.16b,@tweak[2].16b
  1375. 100:
  1376. cmp $remain,0
  1377. b.eq .return${std}
  1378. // This branch calculates the last two tweaks
  1379. // when the encryption/decryption length is larger than 32
  1380. .last_2blks_tweak${std}:
  1381. ___
  1382. &rev32_armeb($lastTweak,$lastTweak);
  1383. &compute_tweak_vec($lastTweak,@tweak[1],$std);
  1384. &compute_tweak_vec(@tweak[1],@tweak[2],$std);
  1385. $code.=<<___;
  1386. b .check_dec${std}
  1387. // This branch calculates the last two tweaks
  1388. // when the encryption/decryption length is exactly 32 and only two tweaks are needed
  1389. .only_2blks_tweak${std}:
  1390. mov @tweak[1].16b,@tweak[0].16b
  1391. ___
  1392. &rev32_armeb(@tweak[1],@tweak[1]);
  1393. &compute_tweak_vec(@tweak[1],@tweak[2]);
  1394. $code.=<<___;
  1395. b .check_dec${std}
  1396. // Determine whether encryption or decryption is required.
  1397. // The last two tweaks need to be swapped for decryption.
  1398. .check_dec${std}:
  1399. // encryption:1 decryption:0
  1400. cmp $enc,1
  1401. b.eq .process_last_2blks${std}
  1402. mov @vtmp[0].16b,@tweak[1].16b
  1403. mov @tweak[1].16b,@tweak[2].16b
  1404. mov @tweak[2].16b,@vtmp[0].16b
  1405. .process_last_2blks${std}:
  1406. ___
  1407. &rev32_armeb(@tweak[1],@tweak[1]);
  1408. &rev32_armeb(@tweak[2],@tweak[2]);
  1409. $code.=<<___;
  1410. ld1 {@data[0].4s},[$inp],#16
  1411. eor @data[0].16b, @data[0].16b, @tweak[1].16b
  1412. ___
  1413. &rev32(@data[0],@data[0]);
  1414. &encrypt_1blk(@data[0]);
  1415. $code.=<<___;
  1416. eor @data[0].16b, @data[0].16b, @tweak[1].16b
  1417. st1 {@data[0].4s},[$outp],#16
  1418. sub $lastBlk,$outp,16
  1419. .loop${std}:
  1420. subs $remain,$remain,1
  1421. ldrb $wtmp0,[$lastBlk,$remain]
  1422. ldrb $wtmp1,[$inp,$remain]
  1423. strb $wtmp1,[$lastBlk,$remain]
  1424. strb $wtmp0,[$outp,$remain]
  1425. b.gt .loop${std}
  1426. ld1 {@data[0].4s}, [$lastBlk]
  1427. eor @data[0].16b, @data[0].16b, @tweak[2].16b
  1428. ___
  1429. &rev32(@data[0],@data[0]);
  1430. &encrypt_1blk(@data[0]);
  1431. $code.=<<___;
  1432. eor @data[0].16b, @data[0].16b, @tweak[2].16b
  1433. st1 {@data[0].4s}, [$lastBlk]
  1434. .return${std}:
  1435. ldp d14, d15, [sp], #0x10
  1436. ldp d12, d13, [sp], #0x10
  1437. ldp d10, d11, [sp], #0x10
  1438. ldp d8, d9, [sp], #0x10
  1439. ldp x29, x30, [sp], #0x10
  1440. ldp x27, x28, [sp], #0x10
  1441. ldp x25, x26, [sp], #0x10
  1442. ldp x23, x24, [sp], #0x10
  1443. ldp x21, x22, [sp], #0x10
  1444. ldp x19, x20, [sp], #0x10
  1445. ldp x17, x18, [sp], #0x10
  1446. ldp x15, x16, [sp], #0x10
  1447. AARCH64_VALIDATE_LINK_REGISTER
  1448. ret
  1449. .size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
  1450. ___
  1451. } # end of gen_xts_cipher
  1452. &gen_xts_cipher("_gb");
  1453. &gen_xts_cipher("");
  1454. }}}
  1455. ########################################
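# The loop below copies this file's leading comment block into the generated
# assembly (turning '#' into '//'), then prints $code line by line, evaluating
# any backtick-quoted Perl expressions embedded in it.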
  1456. open SELF,$0;
  1457. while(<SELF>) {
  1458. next if (/^#!/);
  1459. last if (!s/^#/\/\// and !/^$/);
  1460. print;
  1461. }
  1462. close SELF;
  1463. foreach(split("\n",$code)) {
  1464. s/\`([^\`]*)\`/eval($1)/ge;
  1465. print $_,"\n";
  1466. }
  1467. close STDOUT or die "error closing STDOUT: $!";