vpsm4-armv8.pl 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578
  1. #! /usr/bin/env perl
  2. # Copyright 2020-2024 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # This module implements SM4 with ASIMD on aarch64
  10. #
  11. # Feb 2022
  12. #
  13. # $output is the last argument if it looks like a file (it has an extension)
  14. # $flavour is the first argument if it doesn't look like a file
  15. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  16. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  17. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  18. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  19. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  20. die "can't locate arm-xlate.pl";
  21. open OUT,"| \"$^X\" $xlate $flavour \"$output\""
  22. or die "can't call $xlate: $!";
  23. *STDOUT=*OUT;
  24. $prefix="vpsm4";
  25. my @vtmp=map("v$_",(0..3));
  26. my @qtmp=map("q$_",(0..3));
  27. my @data=map("v$_",(4..7));
  28. my @datax=map("v$_",(8..11));
  29. my ($rk0,$rk1)=("v12","v13");
  30. my ($rka,$rkb)=("v14","v15");
  31. my @vtmpx=map("v$_",(12..15));
  32. my @sbox=map("v$_",(16..31));
  33. my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
  34. my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
  35. my ($xtmp1,$xtmp2)=("x8","x9");
  36. my ($ptr,$counter)=("x10","w11");
  37. my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
  38. sub rev32() {
  39. my $dst = shift;
  40. my $src = shift;
  41. if ($src and ("$src" ne "$dst")) {
  42. $code.=<<___;
  43. #ifndef __AARCH64EB__
  44. rev32 $dst.16b,$src.16b
  45. #else
  46. mov $dst.16b,$src.16b
  47. #endif
  48. ___
  49. } else {
  50. $code.=<<___;
  51. #ifndef __AARCH64EB__
  52. rev32 $dst.16b,$dst.16b
  53. #endif
  54. ___
  55. }
  56. }
  57. sub rev32_armeb() {
  58. my $dst = shift;
  59. my $src = shift;
  60. if ($src and ("$src" ne "$dst")) {
  61. $code.=<<___;
  62. #ifdef __AARCH64EB__
  63. rev32 $dst.16b,$src.16b
  64. #else
  65. mov $dst.16b,$src.16b
  66. #endif
  67. ___
  68. } else {
  69. $code.=<<___;
  70. #ifdef __AARCH64EB__
  71. rev32 $dst.16b,$dst.16b
  72. #endif
  73. ___
  74. }
  75. }
  76. sub rbit() {
  77. my $dst = shift;
  78. my $src = shift;
  79. my $std = shift;
  80. if ($src and ("$src" ne "$dst")) {
  81. if ($std eq "_gb") {
  82. $code.=<<___;
  83. rbit $dst.16b,$src.16b
  84. ___
  85. } else {
  86. $code.=<<___;
  87. mov $dst.16b,$src.16b
  88. ___
  89. }
  90. } else {
  91. if ($std eq "_gb") {
  92. $code.=<<___;
  93. rbit $dst.16b,$src.16b
  94. ___
  95. }
  96. }
  97. }
  98. sub transpose() {
  99. my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
  100. $code.=<<___;
  101. zip1 $vt0.4s,$dat0.4s,$dat1.4s
  102. zip2 $vt1.4s,$dat0.4s,$dat1.4s
  103. zip1 $vt2.4s,$dat2.4s,$dat3.4s
  104. zip2 $vt3.4s,$dat2.4s,$dat3.4s
  105. zip1 $dat0.2d,$vt0.2d,$vt2.2d
  106. zip2 $dat1.2d,$vt0.2d,$vt2.2d
  107. zip1 $dat2.2d,$vt1.2d,$vt3.2d
  108. zip2 $dat3.2d,$vt1.2d,$vt3.2d
  109. ___
  110. }
# sbox operations for 4-lane of words
#
# Applies, in place, the byte substitution followed by the rotate/xor mix to
# the four 32-bit lanes of register $dat.
#
# Substitution: the table held in @sbox[0..15] is consulted as four
# 4-register tbl lookups. The index vector is used unchanged for the first
# lookup and with 64, 128 and 192 subtracted for the other three; the four
# results are then added together. (tbl yields 0 for an index outside its
# table, so only one of the four lookups contributes for any given byte.)
#
# Mix: each ushr #(32-n) / sli #n pair builds a left-rotate by n, and the
# result is  B ^ rol(B,2) ^ rol(B,10) ^ rol(B,18) ^ rol(B,24).
#
# Clobbers @vtmp[0], @vtmp[1] and @vtmp[2].
sub sbox() {
my $dat = shift;
$code.=<<___;
movi @vtmp[0].16b,#64
movi @vtmp[1].16b,#128
movi @vtmp[2].16b,#192
sub @vtmp[0].16b,$dat.16b,@vtmp[0].16b
sub @vtmp[1].16b,$dat.16b,@vtmp[1].16b
sub @vtmp[2].16b,$dat.16b,@vtmp[2].16b
tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
add @vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d
add @vtmp[2].2d,@vtmp[2].2d,$dat.2d
add $dat.2d,@vtmp[0].2d,@vtmp[2].2d
ushr @vtmp[0].4s,$dat.4s,32-2
sli @vtmp[0].4s,$dat.4s,2
ushr @vtmp[2].4s,$dat.4s,32-10
eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
sli @vtmp[2].4s,$dat.4s,10
eor @vtmp[1].16b,@vtmp[2].16b,$vtmp[1].16b
ushr @vtmp[0].4s,$dat.4s,32-18
sli @vtmp[0].4s,$dat.4s,18
ushr @vtmp[2].4s,$dat.4s,32-24
eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
sli @vtmp[2].4s,$dat.4s,24
eor $dat.16b,@vtmp[2].16b,@vtmp[1].16b
___
}
# sbox operation for 8-lane of words
#
# Same transformation as sbox(), applied in place to two registers, $dat and
# $datx.
#
# Substitution: a single constant 64 is kept in @vtmp[3] and subtracted
# successively to obtain the index-64, index-128 and index-192 vectors for
# the four tbl lookups, whose results are added together. This is done for
# $dat first and then for $datx.
#
# Mix: the ushr/sli rotate pairs for $dat (accumulating in @vtmp[1]) and for
# $datx (accumulating in @vtmp[3]) are interleaved; each register ends up as
# B ^ rol(B,2) ^ rol(B,10) ^ rol(B,18) ^ rol(B,24).
#
# Clobbers @vtmp[0..3].
sub sbox_double() {
my $dat = shift;
my $datx = shift;
$code.=<<___;
movi @vtmp[3].16b,#64
sub @vtmp[0].16b,$dat.16b,@vtmp[3].16b
sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
add $dat.2d,@vtmp[2].2d,$dat.2d
add $dat.2d,@vtmp[1].2d,$dat.2d
sub @vtmp[0].16b,$datx.16b,@vtmp[3].16b
sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
tbl $datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b
tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
add $datx.2d,@vtmp[2].2d,$datx.2d
add $datx.2d,@vtmp[1].2d,$datx.2d
ushr @vtmp[0].4s,$dat.4s,32-2
sli @vtmp[0].4s,$dat.4s,2
ushr @vtmp[2].4s,$datx.4s,32-2
eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
sli @vtmp[2].4s,$datx.4s,2
ushr @vtmp[0].4s,$dat.4s,32-10
eor @vtmp[3].16b,@vtmp[2].16b,$datx.16b
sli @vtmp[0].4s,$dat.4s,10
ushr @vtmp[2].4s,$datx.4s,32-10
eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
sli @vtmp[2].4s,$datx.4s,10
ushr @vtmp[0].4s,$dat.4s,32-18
eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
sli @vtmp[0].4s,$dat.4s,18
ushr @vtmp[2].4s,$datx.4s,32-18
eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
sli @vtmp[2].4s,$datx.4s,18
ushr @vtmp[0].4s,$dat.4s,32-24
eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
sli @vtmp[0].4s,$dat.4s,24
ushr @vtmp[2].4s,$datx.4s,32-24
eor $dat.16b,@vtmp[0].16b,@vtmp[1].16b
sli @vtmp[2].4s,$datx.4s,24
eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
___
}
# sbox operation for one single word
#
# Applies the substitution + mix to one 32-bit general-purpose register,
# $word, in place.
#
# The word is placed in lane 0 of @vtmp[0] (the other lanes keep whatever
# they held; only lane 0 of each lookup result is read back). Four tbl
# lookups on index, index-64, index-128 and index-192 are moved back to
# general-purpose registers and added into $wtmp0. The mix is then done with
# scalar rotates: ror #(32-n) is a left-rotate by n, giving
#   B ^ rol(B,2) ^ rol(B,10) ^ rol(B,18) ^ rol(B,24).
#
# Clobbers @vtmp[0..3], $wtmp0 and $wtmp2.
sub sbox_1word () {
my $word = shift;
$code.=<<___;
movi @vtmp[1].16b,#64
movi @vtmp[2].16b,#128
movi @vtmp[3].16b,#192
mov @vtmp[0].s[0],$word
sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
sub @vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b
sub @vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b
tbl @vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b
tbl @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b
tbl @vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b
tbl @vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b
mov $word,@vtmp[0].s[0]
mov $wtmp0,@vtmp[1].s[0]
mov $wtmp2,@vtmp[2].s[0]
add $wtmp0,$word,$wtmp0
mov $word,@vtmp[3].s[0]
add $wtmp0,$wtmp0,$wtmp2
add $wtmp0,$wtmp0,$word
eor $word,$wtmp0,$wtmp0,ror #32-2
eor $word,$word,$wtmp0,ror #32-10
eor $word,$word,$wtmp0,ror #32-18
eor $word,$word,$wtmp0,ror #32-24
___
}
# sm4 for one block of data, in scalar registers word0/word1/word2/word3
#
# Emits four consecutive rounds on the state held in $word0..$word3.
#   $kptr - register pointing at the round keys; two ldp post-increments
#           advance it by 16 bytes in total (four 32-bit keys).
# Each round xors three state words with one round key into $tmpw, runs it
# through sbox_1word() and xors the result into the remaining state word.
# Clobbers $tmpw, $wtmp0, $wtmp1, $wtmp2 and whatever sbox_1word() clobbers.
sub sm4_1blk () {
my $kptr = shift;
$code.=<<___;
ldp $wtmp0,$wtmp1,[$kptr],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor $tmpw,$word2,$word3
eor $wtmp2,$wtmp0,$word1
eor $tmpw,$tmpw,$wtmp2
___
&sbox_1word($tmpw);
$code.=<<___;
eor $word0,$word0,$tmpw
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor $tmpw,$word2,$word3
eor $wtmp2,$word0,$wtmp1
eor $tmpw,$tmpw,$wtmp2
___
&sbox_1word($tmpw);
# The second key pair is loaded only now: sbox_1word() uses $wtmp0 as a
# scratch register, so RK2 could not have been kept there across the call.
$code.=<<___;
ldp $wtmp0,$wtmp1,[$kptr],8
eor $word1,$word1,$tmpw
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor $tmpw,$word0,$word1
eor $wtmp2,$wtmp0,$word3
eor $tmpw,$tmpw,$wtmp2
___
&sbox_1word($tmpw);
$code.=<<___;
eor $word2,$word2,$tmpw
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor $tmpw,$word0,$word1
eor $wtmp2,$word2,$wtmp1
eor $tmpw,$tmpw,$wtmp2
___
&sbox_1word($tmpw);
$code.=<<___;
eor $word3,$word3,$tmpw
___
}
# sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3
#
# Emits four consecutive rounds on the state held in @data[0..3].
#   $kptr - register pointing at the round keys; advanced by 16 bytes.
# Each 32-bit round key is broadcast to all four lanes with dup.
# $rka caches the xor of two state words so the following round only needs
# to fold in the word that was just updated.
# Clobbers $rk0, $rk1, $rka, $wtmp0, $wtmp1 and whatever sbox() clobbers.
sub sm4_4blks () {
my $kptr = shift;
$code.=<<___;
ldp $wtmp0,$wtmp1,[$kptr],8
dup $rk0.4s,$wtmp0
dup $rk1.4s,$wtmp1
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor $rka.16b,@data[2].16b,@data[3].16b
eor $rk0.16b,@data[1].16b,$rk0.16b
eor $rk0.16b,$rka.16b,$rk0.16b
___
&sbox($rk0);
$code.=<<___;
eor @data[0].16b,@data[0].16b,$rk0.16b
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor $rka.16b,$rka.16b,@data[0].16b
eor $rk1.16b,$rka.16b,$rk1.16b
___
&sbox($rk1);
$code.=<<___;
ldp $wtmp0,$wtmp1,[$kptr],8
eor @data[1].16b,@data[1].16b,$rk1.16b
dup $rk0.4s,$wtmp0
dup $rk1.4s,$wtmp1
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor $rka.16b,@data[0].16b,@data[1].16b
eor $rk0.16b,@data[3].16b,$rk0.16b
eor $rk0.16b,$rka.16b,$rk0.16b
___
&sbox($rk0);
$code.=<<___;
eor @data[2].16b,@data[2].16b,$rk0.16b
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor $rka.16b,$rka.16b,@data[2].16b
eor $rk1.16b,$rka.16b,$rk1.16b
___
&sbox($rk1);
$code.=<<___;
eor @data[3].16b,@data[3].16b,$rk1.16b
___
}
# sm4 for 8 lanes of data, in neon registers
# data0/data1/data2/data3 datax0/datax1/datax2/datax3
#
# Emits four consecutive rounds on two states at once: @data[0..3] and
# @datax[0..3].
#   $kptr - register pointing at the round keys; advanced by 16 bytes.
# Within each round $rk0 carries the sbox input/output for the @data group
# and $rk1 the one for the @datax group; $rka/$rkb cache partial xors of the
# respective group between rounds. The round key for the 2nd and 4th round
# is broadcast from $wtmp1 only after the previous sbox_double() call,
# because $rk1 is in use as a data register until then.
# Clobbers $rk0, $rk1, $rka, $rkb, @vtmp[0..3], $wtmp0 and $wtmp1.
sub sm4_8blks () {
my $kptr = shift;
$code.=<<___;
ldp $wtmp0,$wtmp1,[$kptr],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
dup $rk0.4s,$wtmp0
eor $rka.16b,@data[2].16b,@data[3].16b
eor $rkb.16b,@datax[2].16b,@datax[3].16b
eor @vtmp[0].16b,@data[1].16b,$rk0.16b
eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
eor $rk0.16b,$rka.16b,@vtmp[0].16b
eor $rk1.16b,$rkb.16b,@vtmp[1].16b
___
&sbox_double($rk0,$rk1);
$code.=<<___;
eor @data[0].16b,@data[0].16b,$rk0.16b
eor @datax[0].16b,@datax[0].16b,$rk1.16b
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
dup $rk1.4s,$wtmp1
eor $rka.16b,$rka.16b,@data[0].16b
eor $rkb.16b,$rkb.16b,@datax[0].16b
eor $rk0.16b,$rka.16b,$rk1.16b
eor $rk1.16b,$rkb.16b,$rk1.16b
___
&sbox_double($rk0,$rk1);
$code.=<<___;
ldp $wtmp0,$wtmp1,[$kptr],8
eor @data[1].16b,@data[1].16b,$rk0.16b
eor @datax[1].16b,@datax[1].16b,$rk1.16b
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
dup $rk0.4s,$wtmp0
eor $rka.16b,@data[0].16b,@data[1].16b
eor $rkb.16b,@datax[0].16b,@datax[1].16b
eor @vtmp[0].16b,@data[3].16b,$rk0.16b
eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
eor $rk0.16b,$rka.16b,@vtmp[0].16b
eor $rk1.16b,$rkb.16b,@vtmp[1].16b
___
&sbox_double($rk0,$rk1);
$code.=<<___;
eor @data[2].16b,@data[2].16b,$rk0.16b
eor @datax[2].16b,@datax[2].16b,$rk1.16b
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
dup $rk1.4s,$wtmp1
eor $rka.16b,$rka.16b,@data[2].16b
eor $rkb.16b,$rkb.16b,@datax[2].16b
eor $rk0.16b,$rka.16b,$rk1.16b
eor $rk1.16b,$rkb.16b,$rk1.16b
___
&sbox_double($rk0,$rk1);
$code.=<<___;
eor @data[3].16b,@data[3].16b,$rk0.16b
eor @datax[3].16b,@datax[3].16b,$rk1.16b
___
}
  361. sub encrypt_1blk_norev() {
  362. my $dat = shift;
  363. $code.=<<___;
  364. mov $ptr,$rks
  365. mov $counter,#8
  366. mov $word0,$dat.s[0]
  367. mov $word1,$dat.s[1]
  368. mov $word2,$dat.s[2]
  369. mov $word3,$dat.s[3]
  370. 10:
  371. ___
  372. &sm4_1blk($ptr);
  373. $code.=<<___;
  374. subs $counter,$counter,#1
  375. b.ne 10b
  376. mov $dat.s[0],$word3
  377. mov $dat.s[1],$word2
  378. mov $dat.s[2],$word1
  379. mov $dat.s[3],$word0
  380. ___
  381. }
  382. sub encrypt_1blk() {
  383. my $dat = shift;
  384. &encrypt_1blk_norev($dat);
  385. &rev32($dat,$dat);
  386. }
  387. sub encrypt_4blks() {
  388. $code.=<<___;
  389. mov $ptr,$rks
  390. mov $counter,#8
  391. 10:
  392. ___
  393. &sm4_4blks($ptr);
  394. $code.=<<___;
  395. subs $counter,$counter,#1
  396. b.ne 10b
  397. ___
  398. &rev32(@vtmp[3],@data[0]);
  399. &rev32(@vtmp[2],@data[1]);
  400. &rev32(@vtmp[1],@data[2]);
  401. &rev32(@vtmp[0],@data[3]);
  402. }
  403. sub encrypt_8blks() {
  404. $code.=<<___;
  405. mov $ptr,$rks
  406. mov $counter,#8
  407. 10:
  408. ___
  409. &sm4_8blks($ptr);
  410. $code.=<<___;
  411. subs $counter,$counter,#1
  412. b.ne 10b
  413. ___
  414. &rev32(@vtmp[3],@data[0]);
  415. &rev32(@vtmp[2],@data[1]);
  416. &rev32(@vtmp[1],@data[2]);
  417. &rev32(@vtmp[0],@data[3]);
  418. &rev32(@data[3],@datax[0]);
  419. &rev32(@data[2],@datax[1]);
  420. &rev32(@data[1],@datax[2]);
  421. &rev32(@data[0],@datax[3]);
  422. }
  423. sub load_sbox () {
  424. my $data = shift;
  425. $code.=<<___;
  426. adr $ptr,.Lsbox
  427. ld1 {@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},[$ptr],#64
  428. ld1 {@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},[$ptr],#64
  429. ld1 {@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},[$ptr],#64
  430. ld1 {@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},[$ptr]
  431. ___
  432. }
  433. sub mov_reg_to_vec() {
  434. my $src0 = shift;
  435. my $src1 = shift;
  436. my $desv = shift;
  437. $code.=<<___;
  438. mov $desv.d[0],$src0
  439. mov $desv.d[1],$src1
  440. ___
  441. &rev32_armeb($desv,$desv);
  442. }
  443. sub mov_vec_to_reg() {
  444. my $srcv = shift;
  445. my $des0 = shift;
  446. my $des1 = shift;
  447. $code.=<<___;
  448. mov $des0,$srcv.d[0]
  449. mov $des1,$srcv.d[1]
  450. ___
  451. }
# Emit code that derives the next 128-bit tweak in general-purpose registers.
#   $src0/$src1 - low/high 64 bits of the current tweak
#   $des0/$des1 - low/high 64 bits of the result
# The 128-bit value is shifted left by one bit; if the bit shifted out (bit
# 63 of $src1) was set, 0x87 is xored into the low byte.
#
# Instruction by instruction:
#   extr $xtmp2  - rotates $src1 by 32 so that its upper half lands in
#                  $wtmp2 (the 32-bit view of $xtmp2)
#   extr $des1   - ($src1 << 1) | ($src0 >> 63)
#   and  $wtmp1  - 0x87 masked with the sign of $wtmp2 (asr #31 gives all
#                  ones or all zeros); $wtmp1 is the 32-bit view of $xtmp1
#   eor  $des0   - ($src0 << 1) ^ $xtmp1
#
# Clobbers $wtmp0, $xtmp1 and $xtmp2.
# NOTE(review): $des1 is written before $src0 is read for the last time, so
# this presumably requires $des1 and $src0 to be different registers -
# confirm against the callers.
sub compute_tweak() {
my $src0 = shift;
my $src1 = shift;
my $des0 = shift;
my $des1 = shift;
$code.=<<___;
mov $wtmp0,0x87
extr $xtmp2,$src1,$src1,#32
extr $des1,$src1,$src0,#63
and $wtmp1,$wtmp0,$wtmp2,asr#31
eor $des0,$xtmp1,$src0,lsl#1
___
}
# Emit code that derives the next tweak entirely in a vector register.
#   $src - vector holding the current tweak
#   $des - vector receiving the next tweak
#   $std - passed through to rbit(): "_gb" bit-reverses every byte on the way
#          in and on the way out, anything else does not
#
# Steps on the working copy in @vtmp[2]:
#   ldr  - loads the .Lxts_magic constant into @qtmp[0], i.e. @vtmp[0]
#   shl  - shifts every byte left by one bit
#   ext #15 + ushr #7 - for each byte lane, fetches the top bit of the
#          preceding byte (lane 0 gets the top bit of byte 15)
#   mul  - multiplies those carry bits by the magic constant, whose byte 0
#          is 0x87 and whose other bytes are 0x01
#   eor  - folds the carries into the shifted value
#
# Clobbers @vtmp[0], @vtmp[1] and @vtmp[2].
sub compute_tweak_vec() {
my $src = shift;
my $des = shift;
my $std = shift;
&rbit(@vtmp[2],$src,$std);
$code.=<<___;
ldr @qtmp[0], .Lxts_magic
shl $des.16b, @vtmp[2].16b, #1
ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
ushr @vtmp[1].16b, @vtmp[1].16b, #7
mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
eor $des.16b, $des.16b, @vtmp[1].16b
___
&rbit($des,$des,$std);
}
# Start of the generated output. This ASSIGNS $code (it does not append), so
# it must remain the first piece of text emitted.
#
# Constant pool, placed in .text under the object symbol _vpsm4_consts:
#   .Lsbox      - 256-byte substitution table; load_sbox() reads it into the
#                 @sbox registers
#   .Lck        - 32 32-bit constants, read one per iteration by the
#                 _vpsm4_set_key loop
#   .Lfk        - 128-bit constant xored into the user key by _vpsm4_set_key
#   .Lshuffles  - tbl index vector (bytes 4..15 followed by 0..3) used by
#                 _vpsm4_set_key to rotate the key vector by one 32-bit word
#   .Lxts_magic - per-byte multipliers used by compute_tweak_vec(): 0x87 in
#                 byte 0 and 0x01 in every other byte
$code=<<___;
#include "arm_arch.h"
.arch armv8-a
.text
.type _vpsm4_consts,%object
.align 7
_vpsm4_consts:
.Lsbox:
.byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
.byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
.byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
.byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
.byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
.byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
.byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
.byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
.byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
.byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
.byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
.byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
.byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
.byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
.byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
.byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
.Lck:
.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
.quad 0x0B0A090807060504,0x030201000F0E0D0C
.Lxts_magic:
.quad 0x0101010101010187,0x0101010101010101
.size _vpsm4_consts,.-_vpsm4_consts
___
{{{
# _vpsm4_set_key: internal key-schedule routine shared by the two exported
# set-key entry points.
#   $key  (x0) - pointer to the 16-byte user key
#   $keys (x1) - pointer to the output array of 32 32-bit round keys
#   $enc  (w2) - non-zero: store round keys in ascending order;
#                zero: start at byte offset 124 and store them in descending
#                order, so the array ends up reversed
my ($key,$keys,$enc)=("x0","x1","w2");
my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
my ($vkey,$vfk,$vmap)=("v5","v6","v7");
$code.=<<___;
.type _vpsm4_set_key,%function
.align 4
_vpsm4_set_key:
AARCH64_VALID_CALL_TARGET
ld1 {$vkey.4s},[$key]
___
&load_sbox();
&rev32($vkey,$vkey);
# Setup: load the .Lshuffles index vector and .Lfk, xor .Lfk into the key,
# then loop 32 times ($schedules) with $pointer walking .Lck.
#
# Per iteration:
#   - xor lanes 1..3 of the key vector with the next .Lck word
#   - substitute the bytes: one tbl followed by three tbx lookups, with 64
#     subtracted from the index before each tbx (tbx leaves the destination
#     byte unchanged when the index is out of range, so the four 64-byte
#     table segments combine into one result)
#   - mix: B ^ ror(B,19) ^ ror(B,9), then xor with lane 0 of the key vector
#   - write the round key back to lane 0 and store it at $keys
#   - rotate the key vector by one word with tbl and the .Lshuffles indices
$code.=<<___;
adr $pointer,.Lshuffles
ld1 {$vmap.2d},[$pointer]
adr $pointer,.Lfk
ld1 {$vfk.2d},[$pointer]
eor $vkey.16b,$vkey.16b,$vfk.16b
mov $schedules,#32
adr $pointer,.Lck
movi @vtmp[0].16b,#64
cbnz $enc,1f
add $keys,$keys,124
1:
mov $wtmp,$vkey.s[1]
ldr $roundkey,[$pointer],#4
eor $roundkey,$roundkey,$wtmp
mov $wtmp,$vkey.s[2]
eor $roundkey,$roundkey,$wtmp
mov $wtmp,$vkey.s[3]
eor $roundkey,$roundkey,$wtmp
// sbox lookup
mov @data[0].s[0],$roundkey
tbl @vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b
sub @data[0].16b,@data[0].16b,@vtmp[0].16b
tbx @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b
sub @data[0].16b,@data[0].16b,@vtmp[0].16b
tbx @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b
sub @data[0].16b,@data[0].16b,@vtmp[0].16b
tbx @vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b
mov $wtmp,@vtmp[1].s[0]
eor $roundkey,$wtmp,$wtmp,ror #19
eor $roundkey,$roundkey,$wtmp,ror #9
mov $wtmp,$vkey.s[0]
eor $roundkey,$roundkey,$wtmp
mov $vkey.s[0],$roundkey
cbz $enc,2f
str $roundkey,[$keys],#4
b 3f
2:
str $roundkey,[$keys],#-4
3:
tbl $vkey.16b,{$vkey.16b},$vmap.16b
subs $schedules,$schedules,#1
b.ne 1b
ret
.size _vpsm4_set_key,.-_vpsm4_set_key
___
}}}
{{{
# _vpsm4_enc_4blks: internal subroutine wrapping the code generated by
# encrypt_4blks(). It takes its input in @data[0..3], reads the round keys
# through $rks and leaves the result in @vtmp[0..3].
$code.=<<___;
.type _vpsm4_enc_4blks,%function
.align 4
_vpsm4_enc_4blks:
AARCH64_VALID_CALL_TARGET
___
&encrypt_4blks();
$code.=<<___;
ret
.size _vpsm4_enc_4blks,.-_vpsm4_enc_4blks
___
}}}
{{{
# _vpsm4_enc_8blks: internal subroutine wrapping the code generated by
# encrypt_8blks(). It takes its input in @data[0..3] and @datax[0..3], reads
# the round keys through $rks and leaves the result in @vtmp[0..3] (from the
# @data group) and @data[0..3] (from the @datax group).
$code.=<<___;
.type _vpsm4_enc_8blks,%function
.align 4
_vpsm4_enc_8blks:
AARCH64_VALID_CALL_TARGET
___
&encrypt_8blks();
$code.=<<___;
ret
.size _vpsm4_enc_8blks,.-_vpsm4_enc_8blks
___
}}}
{{{
# ${prefix}_set_encrypt_key: exported entry point. Sets w2 to 1 and calls
# _vpsm4_set_key with x0/x1 passed through unchanged; x29/x30 are saved
# around the call.
# $key/$keys only name the incoming argument registers; they are not
# interpolated into the text below.
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl ${prefix}_set_encrypt_key
.type ${prefix}_set_encrypt_key,%function
.align 5
${prefix}_set_encrypt_key:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
mov w2,1
bl _vpsm4_set_key
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
___
}}}
{{{
# ${prefix}_set_decrypt_key: exported entry point. Sets w2 to 0 and calls
# _vpsm4_set_key with x0/x1 passed through unchanged, which makes the
# routine store the round keys in reverse order; x29/x30 are saved around
# the call.
# $key/$keys only name the incoming argument registers; they are not
# interpolated into the text below.
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl ${prefix}_set_decrypt_key
.type ${prefix}_set_decrypt_key,%function
.align 5
${prefix}_set_decrypt_key:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
mov w2,0
bl _vpsm4_set_key
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
  641. {{{
  642. sub gen_block () {
  643. my $dir = shift;
  644. my ($inp,$outp,$rk)=map("x$_",(0..2));
  645. $code.=<<___;
  646. .globl ${prefix}_${dir}crypt
  647. .type ${prefix}_${dir}crypt,%function
  648. .align 5
  649. ${prefix}_${dir}crypt:
  650. AARCH64_VALID_CALL_TARGET
  651. ld1 {@data[0].4s},[$inp]
  652. ___
  653. &load_sbox();
  654. &rev32(@data[0],@data[0]);
  655. $code.=<<___;
  656. mov $rks,x2
  657. ___
  658. &encrypt_1blk(@data[0]);
  659. $code.=<<___;
  660. st1 {@data[0].4s},[$outp]
  661. ret
  662. .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
  663. ___
  664. }
  665. &gen_block("en");
  666. &gen_block("de");
  667. }}}
{{{
# ${prefix}_ecb_encrypt: exported ECB routine.
#   $inp (x0)  - input pointer
#   $outp (x1) - output pointer
#   x2         - length in bytes on entry; shifted right by 4 to a block
#                count and then used through its 32-bit view $blocks (w2)
#   $rks (x3)  - round keys
# d8-d15 and x29/x30 are saved in an 80-byte stack frame and restored at
# label 100.
# $enc and @dat are declared here but not referenced in this block.
my ($enc) = ("w4");
my @dat=map("v$_",(16..23));
$code.=<<___;
.globl ${prefix}_ecb_encrypt
.type ${prefix}_ecb_encrypt,%function
.align 5
${prefix}_ecb_encrypt:
AARCH64_SIGN_LINK_REGISTER
// convert length into blocks
lsr x2,x2,4
stp d8,d9,[sp,#-80]!
stp d10,d11,[sp,#16]
stp d12,d13,[sp,#32]
stp d14,d15,[sp,#48]
stp x29,x30,[sp,#64]
___
&load_sbox();
# Main loop, 8 blocks per iteration. ld4 de-interleaves the input so that
# register i of each group holds 32-bit word i of four consecutive blocks.
$code.=<<___;
.Lecb_8_blocks_process:
cmp $blocks,#8
b.lt .Lecb_4_blocks_process
ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
&rev32(@datax[0],@datax[0]);
&rev32(@datax[1],@datax[1]);
&rev32(@datax[2],@datax[2]);
&rev32(@datax[3],@datax[3]);
# _vpsm4_enc_8blks returns the first four blocks in @vtmp and the second
# four in @data; st4 re-interleaves them on the way out.
# Fewer than 8 blocks left: one group of 4 if available, then fall into the
# 1/2/3-block tail handling at the numeric label 1.
$code.=<<___;
bl _vpsm4_enc_8blks
st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
subs $blocks,$blocks,#8
b.gt .Lecb_8_blocks_process
b 100f
.Lecb_4_blocks_process:
cmp $blocks,#4
b.lt 1f
ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
$code.=<<___;
bl _vpsm4_enc_4blks
st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
sub $blocks,$blocks,#4
1:
// process last block
cmp $blocks,#1
b.lt 100f
b.gt 1f
ld1 {@data[0].4s},[$inp]
___
# Exactly one block left: use the scalar single-block path.
&rev32(@data[0],@data[0]);
&encrypt_1blk(@data[0]);
# Two or three blocks left: load them one lane at a time with single-lane
# ld4, run the 4-block routine, and store only the lanes that were loaded.
$code.=<<___;
st1 {@data[0].4s},[$outp]
b 100f
1: // process last 2 blocks
ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
cmp $blocks,#2
b.gt 1f
___
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
$code.=<<___;
bl _vpsm4_enc_4blks
st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
b 100f
1: // process last 3 blocks
ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
___
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
$code.=<<___;
bl _vpsm4_enc_4blks
st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
100:
ldp d10,d11,[sp,#16]
ldp d12,d13,[sp,#32]
ldp d14,d15,[sp,#48]
ldp x29,x30,[sp,#64]
ldp d8,d9,[sp],#80
AARCH64_VALIDATE_LINK_REGISTER
ret
.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
  771. {{{
  # CBC mode: everything in this scope emits ${prefix}_cbc_encrypt.
  # Names local to the scope:
  #   $len   (x2)   byte length on entry; shifted right by 4 below so that it
  #                 becomes a count of 16-byte blocks
  #   $ivp   (x4)   pointer to the IV; read at entry, written back before return
  #   $enc   (w5)   direction flag; zero takes the .Ldec path
  #   $ivec0 (v3)   chaining value on the encrypt path
  #   $ivec1 (v15)  chaining value on the decrypt path
  # $inp, $outp, $blocks, $ptr, @data, @datax, @vtmp and @vtmpx are declared
  # earlier in the file. The loops below test $blocks, not $len, so $blocks
  # presumably names the same register as $len - confirm at its declaration.
  772. my ($len,$ivp,$enc)=("x2","x4","w5");
  773. my $ivec0=("v3");
  774. my $ivec1=("v15");
  # Symbol header and entry point; convert the byte length into a block count.
  775. $code.=<<___;
  776. .globl ${prefix}_cbc_encrypt
  777. .type ${prefix}_cbc_encrypt,%function
  778. .align 5
  779. ${prefix}_cbc_encrypt:
  780. AARCH64_VALID_CALL_TARGET
  781. lsr $len,$len,4
  782. ___
  # load_sbox(), rev32(), encrypt_1blk(), encrypt_1blk_norev() and transpose()
  # are helper subs defined outside this scope; they are called only for their
  # effect, presumably appending instructions to $code - confirm there.
  783. &load_sbox();
  # Dispatch on $enc, then the encrypt main loop: four blocks per iteration.
  # The IV is loaded into $ivec0 and XORed into the first block right away.
  784. $code.=<<___;
  785. cbz $enc,.Ldec
  786. ld1 {$ivec0.4s},[$ivp]
  787. .Lcbc_4_blocks_enc:
  788. cmp $blocks,#4
  789. b.lt 1f
  790. ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
  791. eor @data[0].16b,@data[0].16b,$ivec0.16b
  792. ___
  # rev32() all four loaded blocks up front, then encrypt block 0 with the
  # "norev" variant of the single-block helper.
  793. &rev32(@data[1],@data[1]);
  794. &rev32(@data[0],@data[0]);
  795. &rev32(@data[2],@data[2]);
  796. &rev32(@data[3],@data[3]);
  797. &encrypt_1blk_norev(@data[0]);
  # Chain: block 1 ^= encrypted block 0. This XOR reads @data[0] before the
  # rev32() a few lines below is applied to it.
  798. $code.=<<___;
  799. eor @data[1].16b,@data[1].16b,@data[0].16b
  800. ___
  # Encrypt block 1; only now rev32() block 0, whose chaining use is finished.
  801. &encrypt_1blk_norev(@data[1]);
  802. &rev32(@data[0],@data[0]);
  # Chain: block 2 ^= encrypted block 1.
  803. $code.=<<___;
  804. eor @data[2].16b,@data[2].16b,@data[1].16b
  805. ___
  806. &encrypt_1blk_norev(@data[2]);
  807. &rev32(@data[1],@data[1]);
  # Chain: block 3 ^= encrypted block 2.
  808. $code.=<<___;
  809. eor @data[3].16b,@data[3].16b,@data[2].16b
  810. ___
  811. &encrypt_1blk_norev(@data[3]);
  812. &rev32(@data[2],@data[2]);
  813. &rev32(@data[3],@data[3]);
  # Keep the last output block as the next chaining value (orr with itself is
  # a register copy), store the four blocks and loop while blocks remain.
  # Label 1 is the tail loop: one block per pass, chained through $ivec0
  # itself, until the subs result goes negative.
  814. $code.=<<___;
  815. orr $ivec0.16b,@data[3].16b,@data[3].16b
  816. st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
  817. subs $blocks,$blocks,#4
  818. b.ne .Lcbc_4_blocks_enc
  819. b 2f
  820. 1:
  821. subs $blocks,$blocks,#1
  822. b.lt 2f
  823. ld1 {@data[0].4s},[$inp],#16
  824. eor $ivec0.16b,$ivec0.16b,@data[0].16b
  825. ___
  826. &rev32($ivec0,$ivec0);
  827. &encrypt_1blk($ivec0);
  # Store the tail block and loop. Label 2 writes the final chaining value
  # back to the IV buffer and returns; no stack frame was set up on this path.
  #
  # .Ldec starts the decrypt path: d8-d15 and x29/x30 are saved in an 80-byte
  # frame, then the eight-block loop begins. The two ld4 instructions
  # de-interleave the input by 32-bit word across four registers each. $inp is
  # NOT advanced here; the second half is addressed through $ptr = $inp + 64.
  828. $code.=<<___;
  829. st1 {$ivec0.4s},[$outp],#16
  830. b 1b
  831. 2:
  832. // save back IV
  833. st1 {$ivec0.4s},[$ivp]
  834. ret
  835. .Ldec:
  836. // decryption mode starts
  837. AARCH64_SIGN_LINK_REGISTER
  838. stp d8,d9,[sp,#-80]!
  839. stp d10,d11,[sp,#16]
  840. stp d12,d13,[sp,#32]
  841. stp d14,d15,[sp,#48]
  842. stp x29,x30,[sp,#64]
  843. .Lcbc_8_blocks_dec:
  844. cmp $blocks,#8
  845. b.lt 1f
  846. ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
  847. add $ptr,$inp,#64
  848. ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
  849. ___
  # rev32() on all eight registers. The mixed @data[3] / $data[3] spellings
  # (one-element slice vs. element) evaluate to the same register name.
  850. &rev32(@data[0],@data[0]);
  851. &rev32(@data[1],@data[1]);
  852. &rev32(@data[2],@data[2]);
  853. &rev32(@data[3],$data[3]);
  854. &rev32(@datax[0],@datax[0]);
  855. &rev32(@datax[1],@datax[1]);
  856. &rev32(@datax[2],@datax[2]);
  857. &rev32(@datax[3],$datax[3]);
  # Call the eight-block cipher routine, which is defined outside this scope.
  # The label is spelled with a literal "vpsm4" here rather than ${prefix}.
  858. $code.=<<___;
  859. bl _vpsm4_enc_8blks
  860. ___
  # The code that follows consumes @vtmp as blocks 0-3 and @data as blocks 4-7.
  # @datax is passed as transpose()'s second list - presumably scratch; confirm.
  861. &transpose(@vtmp,@datax);
  862. &transpose(@data,@datax);
  # Undo the chaining: every output block is XORed with the preceding input
  # block, re-read from memory with two post-incrementing ld1 loads (this is
  # what advances $inp by 128 bytes). Block 0 uses the IV instead.
  # Ordering matters: the IV in $ivec1 must be consumed before the second ld1,
  # because that load overwrites the shared register (see the note emitted
  # below). The last input block, now in @vtmpx[3], is stored as the new IV.
  #
  # Label 1 (fewer than eight blocks left) reloads the chaining value from the
  # IV buffer, then the four-block loop starts with another ld4 that leaves
  # $inp unchanged.
  863. $code.=<<___;
  864. ld1 {$ivec1.4s},[$ivp]
  865. ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
  866. // note ivec1 and vtmpx[3] are reusing the same register
  867. // care needs to be taken to avoid conflict
  868. eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
  869. ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
  870. eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
  871. eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
  872. eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
  873. // save back IV
  874. st1 {$vtmpx[3].4s}, [$ivp]
  875. eor @data[0].16b,@data[0].16b,$datax[3].16b
  876. eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
  877. eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
  878. eor @data[3].16b,$data[3].16b,@vtmpx[2].16b
  879. st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
  880. st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
  881. subs $blocks,$blocks,#8
  882. b.gt .Lcbc_8_blocks_dec
  883. b.eq 100f
  884. 1:
  885. ld1 {$ivec1.4s},[$ivp]
  886. .Lcbc_4_blocks_dec:
  887. cmp $blocks,#4
  888. b.lt 1f
  889. ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
  890. ___
  891. &rev32(@data[0],@data[0]);
  892. &rev32(@data[1],@data[1]);
  893. &rev32(@data[2],@data[2]);
  894. &rev32(@data[3],$data[3]);
  # Run the four-block routine, then re-read the same four input blocks with
  # a plain ld1 (advancing $inp) so they can serve as chaining values.
  895. $code.=<<___;
  896. bl _vpsm4_enc_4blks
  897. ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
  898. ___
  899. &transpose(@vtmp,@datax);
  # Chaining XORs. $ivec1 is replaced by the last input block only after its
  # old value has been used for block 0. When the loop ends on an exact
  # multiple of four, the IV is written back from @data[3].
  # Label 1 handles 0-3 leftover blocks; with exactly one left, the input
  # block is stored as the new IV before it is processed.
  900. $code.=<<___;
  901. eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
  902. eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
  903. orr $ivec1.16b,@data[3].16b,@data[3].16b
  904. eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
  905. eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b
  906. st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
  907. subs $blocks,$blocks,#4
  908. b.gt .Lcbc_4_blocks_dec
  909. // save back IV
  910. st1 {@data[3].4s}, [$ivp]
  911. b 100f
  912. 1: // last block
  913. subs $blocks,$blocks,#1
  914. b.lt 100f
  915. b.gt 1f
  916. ld1 {@data[0].4s},[$inp],#16
  917. // save back IV
  918. st1 {$data[0].4s}, [$ivp]
  919. ___
  # Work on a copy in @datax[0]; @data[0] keeps the unmodified input block.
  920. &rev32(@datax[0],@data[0]);
  921. &encrypt_1blk(@datax[0]);
  # Finish the one-block tail. The next label 1 (two or three blocks left)
  # fills the registers lane by lane with single-lane ld4 loads: lane 0 from
  # $inp, lane 1 from $inp+16. The post-increment leaves $ptr at $inp+32 for
  # a possible third block. $blocks is decremented a second time; a result
  # still above zero means three blocks remain.
  922. $code.=<<___;
  923. eor @datax[0].16b,@datax[0].16b,$ivec1.16b
  924. st1 {@datax[0].4s},[$outp],#16
  925. b 100f
  926. 1: // last two blocks
  927. ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
  928. add $ptr,$inp,#16
  929. ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
  930. subs $blocks,$blocks,1
  931. b.gt 1f
  932. ___
  933. &rev32(@data[0],@data[0]);
  934. &rev32(@data[1],@data[1]);
  935. &rev32(@data[2],@data[2]);
  936. &rev32(@data[3],@data[3]);
  # Two blocks: the four-block routine runs with only lanes 0 and 1 loaded
  # above; only the first two result blocks are stored afterwards.
  937. $code.=<<___;
  938. bl _vpsm4_enc_4blks
  939. ld1 {@data[0].4s,@data[1].4s},[$inp],#32
  940. ___
  941. &transpose(@vtmp,@datax);
  # Chain, store two blocks, save the second input block as the new IV.
  # The following label 1 adds the third block as lane 2, read through $ptr.
  942. $code.=<<___;
  943. eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
  944. eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
  945. st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
  946. // save back IV
  947. st1 {@data[1].4s}, [$ivp]
  948. b 100f
  949. 1: // last 3 blocks
  950. ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
  951. ___
  952. &rev32(@data[0],@data[0]);
  953. &rev32(@data[1],@data[1]);
  954. &rev32(@data[2],@data[2]);
  955. &rev32(@data[3],@data[3]);
  956. $code.=<<___;
  957. bl _vpsm4_enc_4blks
  958. ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
  959. ___
  960. &transpose(@vtmp,@datax);
  # Chain, store three blocks, save the third input block as the new IV.
  # Label 100 is the shared decrypt epilogue: it restores everything pushed at
  # .Ldec, in an order that lets the last ldp also pop the 80-byte frame.
  961. $code.=<<___;
  962. eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
  963. eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
  964. eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
  965. st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
  966. // save back IV
  967. st1 {@data[2].4s}, [$ivp]
  968. 100:
  969. ldp d10,d11,[sp,#16]
  970. ldp d12,d13,[sp,#32]
  971. ldp d14,d15,[sp,#48]
  972. ldp x29,x30,[sp,#64]
  973. ldp d8,d9,[sp],#80
  974. AARCH64_VALIDATE_LINK_REGISTER
  975. ret
  976. .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
  977. ___
  978. }}}
  979. {{{
  # CTR mode with a 32-bit counter: this scope emits
  # ${prefix}_ctr32_encrypt_blocks.
  #   $ivp  (x4)  pointer to the 16-byte counter block (only read in this scope;
  #               nothing is stored back through it)
  #   $ctr  (w5)  running value of the counter block's fourth 32-bit word
  #   $ivec (v3)  the counter block as a vector
  # $word0..$word2, $blocks, $inp, $outp, @data, @datax, @vtmp and @vtmpx are
  # declared earlier in the file.
  980. my ($ivp)=("x4");
  981. my ($ctr)=("w5");
  982. my $ivec=("v3");
  # Symbol header, entry point, and load of the counter block.
  983. $code.=<<___;
  984. .globl ${prefix}_ctr32_encrypt_blocks
  985. .type ${prefix}_ctr32_encrypt_blocks,%function
  986. .align 5
  987. ${prefix}_ctr32_encrypt_blocks:
  988. AARCH64_VALID_CALL_TARGET
  989. ld1 {$ivec.4s},[$ivp]
  990. ___
  # rev32() the counter block once, before any path uses it. Both helpers are
  # defined outside this scope.
  991. &rev32($ivec,$ivec);
  992. &load_sbox();
  # Exactly one block takes the fast path below; any other count jumps to
  # label 1, which sets up a stack frame first.
  993. $code.=<<___;
  994. cmp $blocks,#1
  995. b.ne 1f
  996. // fast processing for one single block without
  997. // context saving overhead
  998. ___
  # Encrypt the counter block in place; it becomes the key stream block.
  999. &encrypt_1blk($ivec);
  # Fast path: XOR key stream into the single input block, store, return.
  #
  # Label 1 (general path): save d8-d15 and x29/x30 in an 80-byte frame, then
  # split the counter block into four scalars. $word0..$word2 stay constant;
  # $ctr is a w register, so each "add #1" wraps modulo 2^32.
  #
  # .Lctr32_4_blocks_process builds four counter blocks in word-sliced form:
  # dup broadcasts each constant word to all four lanes of @data[0..2], and
  # the lanes of @data[3] receive ctr, ctr+1, ctr+2, ctr+3. With eight or more
  # blocks left, .Lctr32_8_blocks_process builds a second group the same way
  # in @datax and calls the eight-block routine instead.
  #
  # The input is loaded with ld4, which de-interleaves it into the same
  # word-sliced layout, so the XOR happens in that layout and st4
  # re-interleaves on store. Key stream is read from @vtmp (and, on the
  # eight-block path, @data) after the call.
  #
  # The second label 1 is the tail. After "subs #1": negative means nothing is
  # left, zero means one block (rebuild the counter block in $ivec from the
  # scalars), positive means two or three blocks (handled further down).
  1000. $code.=<<___;
  1001. ld1 {@data[0].4s},[$inp]
  1002. eor @data[0].16b,@data[0].16b,$ivec.16b
  1003. st1 {@data[0].4s},[$outp]
  1004. ret
  1005. 1:
  1006. AARCH64_SIGN_LINK_REGISTER
  1007. stp d8,d9,[sp,#-80]!
  1008. stp d10,d11,[sp,#16]
  1009. stp d12,d13,[sp,#32]
  1010. stp d14,d15,[sp,#48]
  1011. stp x29,x30,[sp,#64]
  1012. mov $word0,$ivec.s[0]
  1013. mov $word1,$ivec.s[1]
  1014. mov $word2,$ivec.s[2]
  1015. mov $ctr,$ivec.s[3]
  1016. .Lctr32_4_blocks_process:
  1017. cmp $blocks,#4
  1018. b.lt 1f
  1019. dup @data[0].4s,$word0
  1020. dup @data[1].4s,$word1
  1021. dup @data[2].4s,$word2
  1022. mov @data[3].s[0],$ctr
  1023. add $ctr,$ctr,#1
  1024. mov $data[3].s[1],$ctr
  1025. add $ctr,$ctr,#1
  1026. mov @data[3].s[2],$ctr
  1027. add $ctr,$ctr,#1
  1028. mov @data[3].s[3],$ctr
  1029. add $ctr,$ctr,#1
  1030. cmp $blocks,#8
  1031. b.ge .Lctr32_8_blocks_process
  1032. bl _vpsm4_enc_4blks
  1033. ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
  1034. eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
  1035. eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
  1036. eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
  1037. eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
  1038. st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
  1039. subs $blocks,$blocks,#4
  1040. b.ne .Lctr32_4_blocks_process
  1041. b 100f
  1042. .Lctr32_8_blocks_process:
  1043. dup @datax[0].4s,$word0
  1044. dup @datax[1].4s,$word1
  1045. dup @datax[2].4s,$word2
  1046. mov @datax[3].s[0],$ctr
  1047. add $ctr,$ctr,#1
  1048. mov $datax[3].s[1],$ctr
  1049. add $ctr,$ctr,#1
  1050. mov @datax[3].s[2],$ctr
  1051. add $ctr,$ctr,#1
  1052. mov @datax[3].s[3],$ctr
  1053. add $ctr,$ctr,#1
  1054. bl _vpsm4_enc_8blks
  1055. ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
  1056. ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
  1057. eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
  1058. eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
  1059. eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
  1060. eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
  1061. eor @data[0].16b,@data[0].16b,@datax[0].16b
  1062. eor @data[1].16b,@data[1].16b,@datax[1].16b
  1063. eor @data[2].16b,@data[2].16b,@datax[2].16b
  1064. eor @data[3].16b,@data[3].16b,@datax[3].16b
  1065. st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
  1066. st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
  1067. subs $blocks,$blocks,#8
  1068. b.ne .Lctr32_4_blocks_process
  1069. b 100f
  1070. 1: // last block processing
  1071. subs $blocks,$blocks,#1
  1072. b.lt 100f
  1073. b.gt 1f
  1074. mov $ivec.s[0],$word0
  1075. mov $ivec.s[1],$word1
  1076. mov $ivec.s[2],$word2
  1077. mov $ivec.s[3],$ctr
  1078. ___
  # One leftover block: encrypt the rebuilt counter block in place.
  1079. &encrypt_1blk($ivec);
  # XOR and store the single leftover block.
  #
  # Two or three leftover blocks: the counter lanes 0 and 1 are filled first,
  # then $blocks is decremented again; a non-zero result means a third block,
  # whose counter goes into lane 2. The four-block routine is called either
  # way, and only the populated lanes are loaded, XORed and stored, using
  # single-lane ld4/st4 so that exactly 16 bytes per block are touched.
  #
  # Label 100 is the shared epilogue for every path that built the frame.
  1080. $code.=<<___;
  1081. ld1 {@data[0].4s},[$inp]
  1082. eor @data[0].16b,@data[0].16b,$ivec.16b
  1083. st1 {@data[0].4s},[$outp]
  1084. b 100f
  1085. 1: // last 2 blocks processing
  1086. dup @data[0].4s,$word0
  1087. dup @data[1].4s,$word1
  1088. dup @data[2].4s,$word2
  1089. mov @data[3].s[0],$ctr
  1090. add $ctr,$ctr,#1
  1091. mov @data[3].s[1],$ctr
  1092. subs $blocks,$blocks,#1
  1093. b.ne 1f
  1094. bl _vpsm4_enc_4blks
  1095. ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
  1096. ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
  1097. eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
  1098. eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
  1099. eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
  1100. eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
  1101. st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
  1102. st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
  1103. b 100f
  1104. 1: // last 3 blocks processing
  1105. add $ctr,$ctr,#1
  1106. mov @data[3].s[2],$ctr
  1107. bl _vpsm4_enc_4blks
  1108. ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
  1109. ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
  1110. ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
  1111. eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
  1112. eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
  1113. eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
  1114. eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
  1115. st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
  1116. st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
  1117. st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
  1118. 100:
  1119. ldp d10,d11,[sp,#16]
  1120. ldp d12,d13,[sp,#32]
  1121. ldp d14,d15,[sp,#48]
  1122. ldp x29,x30,[sp,#64]
  1123. ldp d8,d9,[sp],#80
  1124. AARCH64_VALIDATE_LINK_REGISTER
  1125. ret
  1126. .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
  1127. ___
  1128. }}}
  1129. {{{
  # XTS mode. gen_xts_cipher() below emits one complete
  # ${prefix}_xts_encrypt${std} routine per call; it is called twice at the end
  # of this scope, once with $std = "_gb" and once with $std = "".
  #
  # Names local to the scope (several share a register on purpose):
  #   $blocks/$len (x2)      byte length on entry, block count after the lsr
  #   $ivp (x5)              tweak buffer: initial tweak is read from it, and
  #                          the most recently used tweak is parked in it
  #   @twx (x12..x27)        eight 128-bit tweaks as pairs of 64-bit registers
  #   $rks1/$rks2 (x26/x27)  copies of x3/x4; they overlap @twx[14]/@twx[15]
  #                          and are consumed before the tweak chain is built
  #   $lastBlk (x26)         overlaps @twx[14]; used only in the final tail
  #   $enc (w28)             copy of w6; compared against 1 at .check_dec
  #   $remain (x29)          length modulo 16
  #   @tweak                 the same vector registers as @datax
  # $rks, $inp, $outp, $wtmp0, $wtmp1, @data, @datax, @vtmp and @vtmpx are
  # declared earlier in the file.
  #
  # NOTE(review): @twx includes x18, which AAPCS64 designates as the platform
  # register. Confirm that every target this file is built for permits using
  # it as a general-purpose register.
  1130. my ($blocks,$len)=("x2","x2");
  1131. my $ivp=("x5");
  1132. my @twx=map("x$_",(12..27));
  1133. my ($rks1,$rks2)=("x26","x27");
  1134. my $lastBlk=("x26");
  1135. my $enc=("w28");
  1136. my $remain=("x29");
  1137. my @tweak=@datax;
  # gen_xts_cipher($std): $std is a suffix appended to the function name and
  # to every non-numeric local label, so both variants can live in one object
  # file; it is also forwarded to rbit() and compute_tweak_vec(). The sub is
  # declared with an empty prototype but is always invoked with a leading '&',
  # which bypasses prototype checking, so passing an argument works.
  1138. sub gen_xts_cipher() {
  1139. my $std = shift;
  # Prologue: twelve register pairs are pushed, 16 bytes each (192 bytes in
  # total). x3/x4/w6 are copied into registers that survive the body, the
  # initial tweak is loaded, and $rks is pointed at the x4 key schedule for
  # the tweak encryption that follows.
  1140. $code.=<<___;
  1141. .globl ${prefix}_xts_encrypt${std}
  1142. .type ${prefix}_xts_encrypt${std},%function
  1143. .align 5
  1144. ${prefix}_xts_encrypt${std}:
  1145. AARCH64_SIGN_LINK_REGISTER
  1146. stp x15, x16, [sp, #-0x10]!
  1147. stp x17, x18, [sp, #-0x10]!
  1148. stp x19, x20, [sp, #-0x10]!
  1149. stp x21, x22, [sp, #-0x10]!
  1150. stp x23, x24, [sp, #-0x10]!
  1151. stp x25, x26, [sp, #-0x10]!
  1152. stp x27, x28, [sp, #-0x10]!
  1153. stp x29, x30, [sp, #-0x10]!
  1154. stp d8, d9, [sp, #-0x10]!
  1155. stp d10, d11, [sp, #-0x10]!
  1156. stp d12, d13, [sp, #-0x10]!
  1157. stp d14, d15, [sp, #-0x10]!
  1158. mov $rks1,x3
  1159. mov $rks2,x4
  1160. mov $enc,w6
  1161. ld1 {@tweak[0].4s}, [$ivp]
  1162. mov $rks,$rks2
  1163. ___
  # Encrypt the initial tweak in place with the single-block helper (helpers
  # are defined outside this scope).
  1164. &load_sbox();
  1165. &rev32(@tweak[0],@tweak[0]);
  1166. &encrypt_1blk(@tweak[0]);
  # Switch $rks to the x3 key schedule for the data blocks, then split the
  # length: $remain = len & 15, $blocks = len >> 4.
  #   - no full block at all         -> .return (nothing is processed)
  #   - no partial tail              -> every block goes through the bulk code
  #   - partial tail present         -> one full block is held back for the
  #                                     tail code at label 100 and beyond
  # The emitted comments speak of lengths "larger than 32" / "equal to 32";
  # what the code actually tests is whether exactly one full block precedes
  # the partial tail (16 < len < 32), in which case it branches straight to
  # .only_2blks_tweak.
  1167. $code.=<<___;
  1168. mov $rks,$rks1
  1169. and $remain,$len,#0x0F
  1170. // convert length into blocks
  1171. lsr $blocks,$len,4
  1172. cmp $blocks,#1
  1173. b.lt .return${std}
  1174. cmp $remain,0
  1175. // If the encryption/decryption Length is N times of 16,
  1176. // the all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
  1177. b.eq .xts_encrypt_blocks${std}
  1178. // If the encryption/decryption length is not N times of 16,
  1179. // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
  1180. // the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
  1181. subs $blocks,$blocks,#1
  1182. b.eq .only_2blks_tweak${std}
  1183. .xts_encrypt_blocks${std}:
  1184. ___
  # Move the first tweak into @twx[0..1] and derive seven more, each from its
  # predecessor, filling @twx[2..15]. This assumes compute_tweak() takes
  # (src_lo, src_hi, dst_lo, dst_hi) - confirm at its definition. The last
  # call overwrites x26/x27, which is safe because $rks1/$rks2 have already
  # been consumed above.
  1185. &rbit(@tweak[0],@tweak[0],$std);
  1186. &rev32_armeb(@tweak[0],@tweak[0]);
  1187. &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
  1188. &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
  1189. &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
  1190. &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
  1191. &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
  1192. &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
  1193. &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
  1194. &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
  # Eight-block loop head; fewer than eight blocks falls to the 4-block code.
  1195. $code.=<<___;
  1196. .Lxts_8_blocks_process${std}:
  1197. cmp $blocks,#8
  1198. b.lt .Lxts_4_blocks_process${std}
  1199. ___
  # Materialise the eight tweaks as vectors: 0-3 in @vtmp, 4-7 in @vtmpx.
  1200. &mov_reg_to_vec(@twx[0],@twx[1],@vtmp[0]);
  1201. &mov_reg_to_vec(@twx[2],@twx[3],@vtmp[1]);
  1202. &mov_reg_to_vec(@twx[4],@twx[5],@vtmp[2]);
  1203. &mov_reg_to_vec(@twx[6],@twx[7],@vtmp[3]);
  1204. &mov_reg_to_vec(@twx[8],@twx[9],@vtmpx[0]);
  1205. &mov_reg_to_vec(@twx[10],@twx[11],@vtmpx[1]);
  1206. &mov_reg_to_vec(@twx[12],@twx[13],@vtmpx[2]);
  1207. &mov_reg_to_vec(@twx[14],@twx[15],@vtmpx[3]);
  # Load input blocks 0-3.
  1208. $code.=<<___;
  1209. ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
  1210. ___
  # rbit() on tweaks 0-3 before they are XORed into the input.
  1211. &rbit(@vtmp[0],@vtmp[0],$std);
  1212. &rbit(@vtmp[1],@vtmp[1],$std);
  1213. &rbit(@vtmp[2],@vtmp[2],$std);
  1214. &rbit(@vtmp[3],@vtmp[3],$std);
  # Input whitening for blocks 0-3, then load blocks 4-7. The load goes into
  # @datax, i.e. the @tweak registers, whose previous content is dead here.
  1215. $code.=<<___;
  1216. eor @data[0].16b, @data[0].16b, @vtmp[0].16b
  1217. eor @data[1].16b, @data[1].16b, @vtmp[1].16b
  1218. eor @data[2].16b, @data[2].16b, @vtmp[2].16b
  1219. eor @data[3].16b, @data[3].16b, @vtmp[3].16b
  1220. ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
  1221. ___
  1222. &rbit(@vtmpx[0],@vtmpx[0],$std);
  1223. &rbit(@vtmpx[1],@vtmpx[1],$std);
  1224. &rbit(@vtmpx[2],@vtmpx[2],$std);
  1225. &rbit(@vtmpx[3],@vtmpx[3],$std);
  # Input whitening for blocks 4-7.
  1226. $code.=<<___;
  1227. eor @datax[0].16b, @datax[0].16b, @vtmpx[0].16b
  1228. eor @datax[1].16b, @datax[1].16b, @vtmpx[1].16b
  1229. eor @datax[2].16b, @datax[2].16b, @vtmpx[2].16b
  1230. eor @datax[3].16b, @datax[3].16b, @vtmpx[3].16b
  1231. ___
  # rev32() and transpose() both groups before the call; @vtmp is passed as
  # transpose()'s second list, so the tweak vectors in it do not survive.
  1232. &rev32(@data[0],@data[0]);
  1233. &rev32(@data[1],@data[1]);
  1234. &rev32(@data[2],@data[2]);
  1235. &rev32(@data[3],@data[3]);
  1236. &rev32(@datax[0],@datax[0]);
  1237. &rev32(@datax[1],@datax[1]);
  1238. &rev32(@datax[2],@datax[2]);
  1239. &rev32(@datax[3],@datax[3]);
  1240. &transpose(@data,@vtmp);
  1241. &transpose(@datax,@vtmp);
  # Eight-block cipher routine, defined outside this scope.
  1242. $code.=<<___;
  1243. bl _${prefix}_enc_8blks
  1244. ___
  # Results are consumed from @vtmp (blocks 0-3) and @data (blocks 4-7).
  # The tweaks are then rebuilt from @twx for the output whitening, interleaved
  # with computing the NEXT eight tweaks: each register pair is copied out to a
  # vector immediately before compute_tweak() overwrites it. The next chain
  # starts from the current last tweak in @twx[14..15].
  #
  # NOTE(review): the vectors rebuilt here are not passed through rbit(),
  # whereas the ones used for the input whitening above were, and the
  # four-block path below reuses its rbit()-processed vectors for both XORs.
  # If rbit() is not a no-op for the "_gb" variant, this path whitens input
  # and output with different values and also parks a differently encoded
  # tweak at $ivp. Confirm against rbit() and mov_reg_to_vec().
  1245. &transpose(@vtmp,@datax);
  1246. &transpose(@data,@datax);
  1247. &mov_reg_to_vec(@twx[0],@twx[1],@vtmpx[0]);
  1248. &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
  1249. &mov_reg_to_vec(@twx[2],@twx[3],@vtmpx[1]);
  1250. &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
  1251. &mov_reg_to_vec(@twx[4],@twx[5],@vtmpx[2]);
  1252. &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
  1253. &mov_reg_to_vec(@twx[6],@twx[7],@vtmpx[3]);
  1254. &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
  1255. &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
  1256. &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
  1257. &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
  1258. &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
  1259. &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
  1260. &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
  1261. &mov_reg_to_vec(@twx[14],@twx[15],@tweak[3]);
  1262. &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
  # Output whitening, park the eighth tweak at $ivp (the tail code reloads it
  # from there), store 128 bytes, and loop. The loop head re-checks the count,
  # so 1-7 remaining blocks end up in the four-block code.
  1263. $code.=<<___;
  1264. eor @vtmp[0].16b, @vtmp[0].16b, @vtmpx[0].16b
  1265. eor @vtmp[1].16b, @vtmp[1].16b, @vtmpx[1].16b
  1266. eor @vtmp[2].16b, @vtmp[2].16b, @vtmpx[2].16b
  1267. eor @vtmp[3].16b, @vtmp[3].16b, @vtmpx[3].16b
  1268. eor @data[0].16b, @data[0].16b, @tweak[0].16b
  1269. eor @data[1].16b, @data[1].16b, @tweak[1].16b
  1270. eor @data[2].16b, @data[2].16b, @tweak[2].16b
  1271. eor @data[3].16b, @data[3].16b, @tweak[3].16b
  1272. // save the last tweak
  1273. st1 {@tweak[3].4s},[$ivp]
  1274. st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
  1275. st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
  1276. subs $blocks,$blocks,#8
  1277. b.gt .Lxts_8_blocks_process${std}
  1278. b 100f
  1279. .Lxts_4_blocks_process${std}:
  1280. ___
  # Tweaks 0-3 are materialised BEFORE the count check, so the 1-3 block tails
  # behind label 1 find them in @tweak[0..2] even when this path is skipped.
  1281. &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
  1282. &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
  1283. &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
  1284. &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
  1285. $code.=<<___;
  1286. cmp $blocks,#4
  1287. b.lt 1f
  1288. ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
  1289. ___
  # rbit() in place: the same vectors serve both whitening XORs on this path.
  1290. &rbit(@tweak[0],@tweak[0],$std);
  1291. &rbit(@tweak[1],@tweak[1],$std);
  1292. &rbit(@tweak[2],@tweak[2],$std);
  1293. &rbit(@tweak[3],@tweak[3],$std);
  # Input whitening.
  1294. $code.=<<___;
  1295. eor @data[0].16b, @data[0].16b, @tweak[0].16b
  1296. eor @data[1].16b, @data[1].16b, @tweak[1].16b
  1297. eor @data[2].16b, @data[2].16b, @tweak[2].16b
  1298. eor @data[3].16b, @data[3].16b, @tweak[3].16b
  1299. ___
  1300. &rev32(@data[0],@data[0]);
  1301. &rev32(@data[1],@data[1]);
  1302. &rev32(@data[2],@data[2]);
  1303. &rev32(@data[3],@data[3]);
  1304. &transpose(@data,@vtmp);
  1305. $code.=<<___;
  1306. bl _${prefix}_enc_4blks
  1307. ___
  1308. &transpose(@vtmp,@data);
  # Output whitening and store; the block count is reduced with a plain sub
  # (no flags needed, label 1 does its own cmp).
  1309. $code.=<<___;
  1310. eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
  1311. eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
  1312. eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
  1313. eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
  1314. st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
  1315. sub $blocks,$blocks,#4
  1316. ___
  # Tweaks 4-6 replace 0-2 for up to three trailing blocks. @tweak[3] is left
  # alone because it is about to be parked at $ivp as the last tweak used.
  1317. &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
  1318. &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
  1319. &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
  # Park tweak 3. Label 1: dispatch on 0 / 1 / more remaining blocks.
  1320. $code.=<<___;
  1321. // save the last tweak
  1322. st1 {@tweak[3].4s},[$ivp]
  1323. 1:
  1324. // process last block
  1325. cmp $blocks,#1
  1326. b.lt 100f
  1327. b.gt 1f
  1328. ld1 {@data[0].4s},[$inp],#16
  1329. ___
  # --- exactly one remaining full block ---
  1330. &rbit(@tweak[0],@tweak[0],$std);
  1331. $code.=<<___;
  1332. eor @data[0].16b, @data[0].16b, @tweak[0].16b
  1333. ___
  1334. &rev32(@data[0],@data[0]);
  1335. &encrypt_1blk(@data[0]);
  # Output whitening, store, park this tweak. Next label 1: two blocks unless
  # the count is above two.
  1336. $code.=<<___;
  1337. eor @data[0].16b, @data[0].16b, @tweak[0].16b
  1338. st1 {@data[0].4s},[$outp],#16
  1339. // save the last tweak
  1340. st1 {@tweak[0].4s},[$ivp]
  1341. b 100f
  1342. 1: // process last 2 blocks
  1343. cmp $blocks,#2
  1344. b.gt 1f
  1345. ld1 {@data[0].4s,@data[1].4s},[$inp],#32
  1346. ___
  # --- exactly two remaining full blocks ---
  1347. &rbit(@tweak[0],@tweak[0],$std);
  1348. &rbit(@tweak[1],@tweak[1],$std);
  1349. $code.=<<___;
  1350. eor @data[0].16b, @data[0].16b, @tweak[0].16b
  1351. eor @data[1].16b, @data[1].16b, @tweak[1].16b
  1352. ___
  # Only @data[0..1] were loaded; the four-block routine is still used and
  # only the first two results are kept.
  1353. &rev32(@data[0],@data[0]);
  1354. &rev32(@data[1],@data[1]);
  1355. &transpose(@data,@vtmp);
  1356. $code.=<<___;
  1357. bl _${prefix}_enc_4blks
  1358. ___
  1359. &transpose(@vtmp,@data);
  1360. $code.=<<___;
  1361. eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
  1362. eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
  1363. st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
  1364. // save the last tweak
  1365. st1 {@tweak[1].4s},[$ivp]
  1366. b 100f
  1367. 1: // process last 3 blocks
  1368. ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
  1369. ___
  # --- exactly three remaining full blocks ---
  1370. &rbit(@tweak[0],@tweak[0],$std);
  1371. &rbit(@tweak[1],@tweak[1],$std);
  1372. &rbit(@tweak[2],@tweak[2],$std);
  1373. $code.=<<___;
  1374. eor @data[0].16b, @data[0].16b, @tweak[0].16b
  1375. eor @data[1].16b, @data[1].16b, @tweak[1].16b
  1376. eor @data[2].16b, @data[2].16b, @tweak[2].16b
  1377. ___
  1378. &rev32(@data[0],@data[0]);
  1379. &rev32(@data[1],@data[1]);
  1380. &rev32(@data[2],@data[2]);
  1381. &transpose(@data,@vtmp);
  1382. $code.=<<___;
  1383. bl _${prefix}_enc_4blks
  1384. ___
  1385. &transpose(@vtmp,@data);
  # Whitening, store, park the third tweak. Label 100 is where every bulk path
  # converges: with no partial tail the routine returns, otherwise the last
  # parked tweak is reloaded from $ivp to derive the two tail tweaks.
  1386. $code.=<<___;
  1387. eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
  1388. eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
  1389. eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
  1390. st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
  1391. // save the last tweak
  1392. st1 {@tweak[2].4s},[$ivp]
  1393. 100:
  1394. cmp $remain,0
  1395. b.eq .return${std}
  1396. // This branch calculates the last two tweaks,
  1397. // while the encryption/decryption length is larger than 32
  1398. .last_2blks_tweak${std}:
  1399. ld1 {@tweak[0].4s},[$ivp]
  1400. ___
  # @tweak[1] and @tweak[2] become the two tweaks that follow the reloaded
  # one (assuming compute_tweak_vec() takes (src, dst, std) - confirm).
  1401. &rev32_armeb(@tweak[0],@tweak[0]);
  1402. &compute_tweak_vec(@tweak[0],@tweak[1],$std);
  1403. &compute_tweak_vec(@tweak[1],@tweak[2],$std);
  # .only_2blks_tweak is entered directly from the top when no bulk block was
  # processed: the freshly encrypted initial tweak itself is the first tail
  # tweak, so it is copied into @tweak[1].
  1404. $code.=<<___;
  1405. b .check_dec${std}
  1406. // This branch calculates the last two tweaks,
  1407. // while the encryption/decryption length is equal to 32, who only need two tweaks
  1408. .only_2blks_tweak${std}:
  1409. mov @tweak[1].16b,@tweak[0].16b
  1410. ___
  1411. &rev32_armeb(@tweak[1],@tweak[1]);
  1412. &compute_tweak_vec(@tweak[1],@tweak[2],$std);
  # For anything other than $enc == 1 the two tail tweaks are swapped, using
  # @vtmp[0] as the temporary.
  1413. $code.=<<___;
  1414. b .check_dec${std}
  1415. // Determine whether encryption or decryption is required.
  1416. // The last two tweaks need to be swapped for decryption.
  1417. .check_dec${std}:
  1418. // encryption:1 decryption:0
  1419. cmp $enc,1
  1420. b.eq .process_last_2blks${std}
  1421. mov @vtmp[0].16B,@tweak[1].16b
  1422. mov @tweak[1].16B,@tweak[2].16b
  1423. mov @tweak[2].16B,@vtmp[0].16b
  1424. .process_last_2blks${std}:
  1425. ___
  1426. &rev32_armeb(@tweak[1],@tweak[1]);
  1427. &rev32_armeb(@tweak[2],@tweak[2]);
  # Last full block: whiten with @tweak[1] ...
  1428. $code.=<<___;
  1429. ld1 {@data[0].4s},[$inp],#16
  1430. eor @data[0].16b, @data[0].16b, @tweak[1].16b
  1431. ___
  1432. &rev32(@data[0],@data[0]);
  1433. &encrypt_1blk(@data[0]);
  # ... store it, and remember where it went ($lastBlk = $outp - 16).
  # The .loop then performs the byte exchange for the partial tail, walking
  # $remain from (remain - 1) down to 0:
  #   - the byte just produced at $lastBlk[i] is moved to $outp[i], which
  #     becomes the short final output block;
  #   - the trailing input byte $inp[i] takes its place in $lastBlk[i].
  # The ldrb/strb instructions do not touch the flags, so b.gt still tests the
  # subs at the top of the loop. The patched block at $lastBlk is then
  # reloaded and whitened with @tweak[2].
  1434. $code.=<<___;
  1435. eor @data[0].16b, @data[0].16b, @tweak[1].16b
  1436. st1 {@data[0].4s},[$outp],#16
  1437. sub $lastBlk,$outp,16
  1438. .loop${std}:
  1439. subs $remain,$remain,1
  1440. ldrb $wtmp0,[$lastBlk,$remain]
  1441. ldrb $wtmp1,[$inp,$remain]
  1442. strb $wtmp1,[$lastBlk,$remain]
  1443. strb $wtmp0,[$outp,$remain]
  1444. b.gt .loop${std}
  1445. ld1 {@data[0].4s}, [$lastBlk]
  1446. eor @data[0].16b, @data[0].16b, @tweak[2].16b
  1447. ___
  1448. &rev32(@data[0],@data[0]);
  1449. &encrypt_1blk(@data[0]);
  # Final whitening; the result overwrites the block at $lastBlk in place.
  # .return pops the twelve pairs in the reverse order of the prologue.
  1450. $code.=<<___;
  1451. eor @data[0].16b, @data[0].16b, @tweak[2].16b
  1452. st1 {@data[0].4s}, [$lastBlk]
  1453. .return${std}:
  1454. ldp d14, d15, [sp], #0x10
  1455. ldp d12, d13, [sp], #0x10
  1456. ldp d10, d11, [sp], #0x10
  1457. ldp d8, d9, [sp], #0x10
  1458. ldp x29, x30, [sp], #0x10
  1459. ldp x27, x28, [sp], #0x10
  1460. ldp x25, x26, [sp], #0x10
  1461. ldp x23, x24, [sp], #0x10
  1462. ldp x21, x22, [sp], #0x10
  1463. ldp x19, x20, [sp], #0x10
  1464. ldp x17, x18, [sp], #0x10
  1465. ldp x15, x16, [sp], #0x10
  1466. AARCH64_VALIDATE_LINK_REGISTER
  1467. ret
  1468. .size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
  1469. ___
  1470. } # end of gen_xts_cipher
  # Emit both variants: ${prefix}_xts_encrypt_gb and ${prefix}_xts_encrypt.
  1471. &gen_xts_cipher("_gb");
  1472. &gen_xts_cipher("");
  1473. }}}
  1474. ########################################
  # Output stage, part 1: echo this script's own leading comment block.
  # Lines are read from the script file itself; a "#!" line is skipped, a
  # leading '#' is rewritten to "//", and blank lines pass through unchanged.
  # The first line that is neither a '#' comment nor blank ends the copy.
  open SELF,$0;
  while(defined(my $hdr = <SELF>)) {
  	next if ($hdr =~ /^#!/);
  	last unless ($hdr =~ s/^#/\/\// or $hdr =~ /^$/);
  	print $hdr;
  }
  close SELF;

  # Output stage, part 2: print the accumulated $code line by line. Any text
  # between a pair of backticks is replaced by the result of eval()-ing it as
  # Perl before the line is printed.
  my @asm_lines = split(/\n/,$code);
  for (@asm_lines) {
  	s/\`([^\`]*)\`/eval($1)/ge;
  	print $_,"\n";
  }

  # Fail loudly if the output could not be flushed.
  close STDOUT or die "error closing STDOUT: $!";