#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for the ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big- and
# little-endian cases, and it likewise supports both 32- and 64-bit
# modes of operation. The latter is achieved by limiting the number of
# utilized registers to 16, which implies additional NEON load and
# integer instructions. This has no effect on the mighty Apple A7,
# where results are literally equal to the theoretical estimates based
# on AES instruction latencies and issue rates. On Cortex-A53, an
# in-order execution core, this costs up to 10-15%, which is partially
# compensated by a dedicated code path for the 128-bit CBC encrypt
# case. On Cortex-A57 the performance of parallelizable modes seems to
# be limited by the sheer amount of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc	CBC dec	CTR
# Apple A7	2.39	1.20	1.20
# Cortex-A53	1.32	1.29	1.46
# Cortex-A57(*)	1.95	0.85	0.93
# Denver	1.96	0.86	0.80
#
# (*)	original 3.64/1.34/1.32 results were for the r0p0 revision and
#	are still the same even for the updated module.

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
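
# The module is normally driven through OpenSSL's perlasm framework: the
# first argument selects the flavour and the second names the output file,
# both of which are simply forwarded to arm-xlate.pl above. A stand-alone
# invocation would therefore look roughly like this (flavour names are the
# usual perlasm ones; adjust to the target at hand):
#
#	perl aesv8-armx.pl linux64 aesv8-armx.S		# AArch64 output
#	perl aesv8-armx.pl linux32 aesv8-armx.S		# AArch32/NEON output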
$prefix="aes_v8";

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=".arch armv7-a\n.fpu neon\n.code 32\n" if ($flavour !~ /64/);
		#^^^^^^ this is done to simplify adoption by not depending
		#	on the latest binutils.

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON uses mostly 32-bit mnemonics, integer code mostly 64-bit ones.
# The goal is to maintain both 32- and 64-bit code within a single
# module and to transliterate the common code to either flavour with
# regex voodoo.
#
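# For example, a line written in the shared 32-bit-style syntax such as
#
#	vld1.32	{q8-q9},[x3]
#
# is transliterated by the loops at the bottom of this file into roughly
# "ld1 {v16.4s-v17.4s},[x3]" for the 64-bit flavour (note the q8->v16
# register renumbering), while the 32-bit flavour keeps the NEON mnemonic
# and merely rewrites x3 to r3.
#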
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
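
# For the 32-bit flavour the temporaries are kept in q0-q3 and q8-q10,
# skipping q4-q7: those alias d8-d15, which the AAPCS marks as
# callee-saved, so staying out of them presumably lets the key-setup
# code avoid saving and restoring the VFP register file.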
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192
	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32
	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50
	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr x29,[sp],#16" if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___ if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
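
# For reference, the C prototypes that the two key-setup entry points
# above are commonly glued to (an informal sketch, not part of the
# generated output) are roughly:
#
#	int aes_v8_set_encrypt_key(const unsigned char *user_key,
#	                           const int bits, AES_KEY *key);
#	int aes_v8_set_decrypt_key(const unsigned char *user_key,
#	                           const int bits, AES_KEY *key);
#
# Both return 0 on success, -1 for NULL pointers and -2 for an
# unsupported bit length, matching the checks at .Lenc_key.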
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
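
# The single-block routines generated by gen_block() above mirror the
# classic AES_encrypt/AES_decrypt shape; informally (hypothetical
# prototypes, for orientation only):
#
#	void aes_v8_encrypt(const unsigned char *in, unsigned char *out,
#	                    const AES_KEY *key);
#	void aes_v8_decrypt(const unsigned char *in, unsigned char *out,
#	                    const AES_KEY *key);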
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule
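
# ${prefix}_cbc_encrypt below follows the AES_cbc_encrypt calling
# convention; informally (a hypothetical prototype for orientation):
#
#	void aes_v8_cbc_encrypt(const unsigned char *in, unsigned char *out,
#	                        size_t length, const AES_KEY *key,
#	                        unsigned char ivec[16], const int enc);
#
# length is in bytes and is rounded down to a multiple of 16 below.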
$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	vld1.32	{q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128
	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
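
# The decrypt path below interleaves three blocks per iteration so that
# the aesd/aesimc latency of one block is hidden behind work on the
# other two; CBC encryption, being inherently serial, cannot be
# interleaved the same way.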
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$ivec,$rndlast
	subs	$len,$len,#0x30
	veor	$tmp1,$in0,$rndlast
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	add	$inp,$inp,x6		// $inp is adjusted in such a way that
					// at exit from the loop $dat1-$dat2
					// are loaded with the last "words"
	vorr	$ivec,$in2,$in2
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule
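
# ${prefix}_ctr32_encrypt_blocks is meant to be called from the CTR128
# glue; informally (a hypothetical prototype for orientation):
#
#	void aes_v8_ctr32_encrypt_blocks(const unsigned char *in,
#	                                 unsigned char *out, size_t blocks,
#	                                 const AES_KEY *key,
#	                                 const unsigned char ivec[16]);
#
# Note that the length argument counts 16-byte blocks, not bytes, and
# that only the low 32 bits of the counter are incremented.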
$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r10,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldr	r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr	$rounds,[$key,#240]

	ldr	$ctr, [$ivp, #12]
	vld1.32	{$dat0},[$ivp]
	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#4
	mov	$step,#16
	cmp	$len,#2
	add	$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
	cclr	$step,lo
#ifndef __ARMEB__
	rev	$ctr, $ctr
#endif
	vorr	$dat1,$dat0,$dat0
	add	$tctr1, $ctr, #1
	vorr	$dat2,$dat0,$dat0
	add	$ctr, $ctr, #2
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${dat1}[3],$tctr1
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	sub	$len,$len,#3		// bias
	vmov.32	${dat2}[3],$tctr2
	b	.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ctr32

	aese	$dat0,q8
	aesmc	$tmp0,$dat0
	aese	$dat1,q8
	aesmc	$tmp1,$dat1
	vld1.8	{$in0},[$inp],#16
	vorr	$dat0,$ivec,$ivec
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	vorr	$dat1,$ivec,$ivec
	aese	$tmp0,q9
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q9
	aesmc	$tmp1,$tmp1
	vld1.8	{$in2},[$inp],#16
	mov	$key_,$key
	aese	$dat2,q9
	aesmc	$tmp2,$dat2
	vorr	$dat2,$ivec,$ivec
	add	$tctr0,$ctr,#1
	aese	$tmp0,q12
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q12
	aesmc	$tmp1,$tmp1
	veor	$in0,$in0,$rndlast
	add	$tctr1,$ctr,#2
	aese	$tmp2,q12
	aesmc	$tmp2,$tmp2
	veor	$in1,$in1,$rndlast
	add	$ctr,$ctr,#3
	aese	$tmp0,q13
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q13
	aesmc	$tmp1,$tmp1
	veor	$in2,$in2,$rndlast
	rev	$tctr0,$tctr0
	aese	$tmp2,q13
	aesmc	$tmp2,$tmp2
	vmov.32	${dat0}[3], $tctr0
	rev	$tctr1,$tctr1
	aese	$tmp0,q14
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q14
	aesmc	$tmp1,$tmp1
	vmov.32	${dat1}[3], $tctr1
	rev	$tctr2,$ctr
	aese	$tmp2,q14
	aesmc	$tmp2,$tmp2
	vmov.32	${dat2}[3], $tctr2
	subs	$len,$len,#3
	aese	$tmp0,q15
	aese	$tmp1,q15
	aese	$tmp2,q15

	veor	$in0,$in0,$tmp0
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8	{$in0},[$out],#16
	veor	$in1,$in1,$tmp1
	mov	$cnt,$rounds
	vst1.8	{$in1},[$out],#16
	veor	$in2,$in2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$in2},[$out],#16
	b.hs	.Loop3x_ctr32

	adds	$len,$len,#3
	b.eq	.Lctr32_done
	cmp	$len,#1
	mov	$step,#16
	cclr	$step,eq

.Lctr32_tail:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.32	{q9},[$key_],#16
	b.gt	.Lctr32_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.8	{$in0},[$inp],$step
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp]
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	veor	$in0,$in0,$rndlast
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	veor	$in1,$in1,$rndlast
	aese	$dat0,q15
	aese	$dat1,q15

	cmp	$len,#1
	veor	$in0,$in0,$dat0
	veor	$in1,$in1,$dat1
	vst1.8	{$in0},[$out],#16
	b.eq	.Lctr32_done
	vst1.8	{$in1},[$out]

.Lctr32_done:
___
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r10,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}

$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };
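
    # unaes() would encode the crypto instructions as raw .inst words for
    # toolchains that do not understand aese/aesmc & co.; the substitution
    # that calls it is commented out in the loop below, since the 64-bit
    # flavour already advertises ".arch armv8-a+crypto".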
    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;					# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					  |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, hence
	    # the explicit byte order below. The correct solution would be
	    # the .inst directive, but older assemblers don't implement
	    # it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
		"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }
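
    # unvtbl() splits a q-register vtbl into two d-register ones because
    # the 32-bit VTBL instruction only writes a 64-bit destination; the
    # analogous helpers below do the same lane arithmetic for vdup.32
    # and vmov.32.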
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT;