2
0

aesv8-armx.pl 81 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2014-2023 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # This module implements support for ARMv8 AES instructions. The
  17. # module is endian-agnostic in sense that it supports both big- and
  18. # little-endian cases. As does it support both 32- and 64-bit modes
  19. # of operation. Latter is achieved by limiting amount of utilized
  20. # registers to 16, which implies additional NEON load and integer
  21. # instructions. This has no effect on mighty Apple A7, where results
  22. # are literally equal to the theoretical estimates based on AES
  23. # instruction latencies and issue rates. On Cortex-A53, an in-order
  24. # execution core, this costs up to 10-15%, which is partially
  25. # compensated by implementing dedicated code path for 128-bit
  26. # CBC encrypt case. On Cortex-A57 parallelizable mode performance
  27. # seems to be limited by sheer amount of NEON instructions...
  28. #
  29. # April 2019
  30. #
  31. # Key to performance of parallelize-able modes is round instruction
  32. # interleaving. But which factor to use? There is optimal one for
  33. # each combination of instruction latency and issue rate, beyond
  34. # which increasing interleave factor doesn't pay off. While on cons
  35. # side we have code size increase and resource waste on platforms for
  36. # which interleave factor is too high. In other words you want it to
  37. # be just right. So far interleave factor of 3x was serving well all
  38. # platforms. But for ThunderX2 optimal interleave factor was measured
  39. # to be 5x...
  40. #
  41. # Performance in cycles per byte processed with 128-bit key:
  42. #
  43. # CBC enc CBC dec CTR
  44. # Apple A7 2.39 1.20 1.20
  45. # Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46
  46. # Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93
  47. # Cortex-A72 1.33 0.85/0.88 0.92/0.96
  48. # Denver 1.96 0.65/0.86 0.76/0.80
  49. # Mongoose 1.33 1.23/1.20 1.30/1.20
  50. # Kryo 1.26 0.87/0.94 1.00/1.00
  51. # ThunderX2 5.95 1.25 1.30
  52. #
  53. # (*) original 3.64/1.34/1.32 results were for r0p0 revision
  54. # and are still same even for updated module;
  55. # (**) numbers after slash are for 32-bit code, which is 3x-
  56. # interleaved;
  57. # $output is the last argument if it looks like a file (it has an extension)
  58. # $flavour is the first argument if it doesn't look like a file
  59. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  60. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  61. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  62. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  63. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  64. die "can't locate arm-xlate.pl";
  65. open OUT,"| \"$^X\" $xlate $flavour \"$output\""
  66. or die "can't call $xlate: $!";
  67. *STDOUT=*OUT;
  68. $prefix="aes_v8";
  69. $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
  70. $code=<<___;
  71. #include "arm_arch.h"
  72. #if __ARM_MAX_ARCH__>=7
  73. ___
  74. $code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
  75. $code.=<<___ if ($flavour !~ /64/);
  76. .arch armv7-a // don't confuse not-so-latest binutils with argv8 :-)
  77. .fpu neon
  78. #ifdef __thumb2__
  79. .syntax unified
  80. .thumb
  81. # define INST(a,b,c,d) $_byte c,d|0xc,a,b
  82. #else
  83. .code 32
  84. # define INST(a,b,c,d) $_byte a,b,c,d
  85. #endif
  86. .text
  87. ___
  88. # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
  89. # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
  90. # maintain both 32- and 64-bit codes within single module and
  91. # transliterate common code to either flavour with regex voodoo.
  92. #
  93. {{{
# Generates ${prefix}_set_encrypt_key and ${prefix}_set_decrypt_key.
#
# Integer registers: $inp = input key pointer, $bits = key length in bits
# (reused as a loop counter once the key size has been examined), $out =
# key schedule pointer, $ptr = scratch pointer that also carries the return
# value, $rounds = round count written behind the round keys.
  94. my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
# NEON registers: q0-q6 for flavours matching /64/, q0-q3 plus q8-q10
# otherwise.
  95. my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
  96. $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
# Constant table (.Lrcon) followed by the set_encrypt_key entry point.
# .Lenc_key is a local label for the same address; set_decrypt_key below
# calls it with "bl".
  97. $code.=<<___;
  98. .align 5
  99. .Lrcon:
  100. .long 0x01,0x01,0x01,0x01
  101. .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
  102. .long 0x1b,0x1b,0x1b,0x1b
  103. .globl ${prefix}_set_encrypt_key
  104. .type ${prefix}_set_encrypt_key,%function
  105. .align 5
  106. ${prefix}_set_encrypt_key:
  107. .Lenc_key:
  108. ___
# Flavours matching /64/ only: emit AARCH64_VALID_CALL_TARGET and push
# x29/x30 as a frame record.
  109. $code.=<<___ if ($flavour =~ /64/);
  110. AARCH64_VALID_CALL_TARGET
  111. // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
  112. stp x29,x30,[sp,#-16]!
  113. add x29,sp,#0
  114. ___
# Common body.  The emitted code returns -1 in x0 when $inp or $out is
# zero, -2 when $bits is below 128, above 256 or not a multiple of 64, and
# 0 on success.  It dispatches on the comparison of $bits with 192 to
# .Loop128, .L192 or .L256; every path ends at .Ldone with $out advanced to
# byte offset 240 of the schedule, where the round count (10, 12 or 14) is
# stored.
# The backtick-quoted line near the end is Perl source, not assembly.
# NOTE(review): presumably it is evaluated by a post-processing pass that
# lies outside this chunk -- confirm.
# The same heredoc also opens set_decrypt_key (directives and label only).
  115. $code.=<<___;
  116. mov $ptr,#-1
  117. cmp $inp,#0
  118. b.eq .Lenc_key_abort
  119. cmp $out,#0
  120. b.eq .Lenc_key_abort
  121. mov $ptr,#-2
  122. cmp $bits,#128
  123. b.lt .Lenc_key_abort
  124. cmp $bits,#256
  125. b.gt .Lenc_key_abort
  126. tst $bits,#0x3f
  127. b.ne .Lenc_key_abort
  128. adr $ptr,.Lrcon
  129. cmp $bits,#192
  130. veor $zero,$zero,$zero
  131. vld1.8 {$in0},[$inp],#16
  132. mov $bits,#8 // reuse $bits
  133. vld1.32 {$rcon,$mask},[$ptr],#32
  134. b.lt .Loop128
  135. b.eq .L192
  136. b .L256
  137. .align 4
  138. .Loop128:
  139. vtbl.8 $key,{$in0},$mask
  140. vext.8 $tmp,$zero,$in0,#12
  141. vst1.32 {$in0},[$out],#16
  142. aese $key,$zero
  143. subs $bits,$bits,#1
  144. veor $in0,$in0,$tmp
  145. vext.8 $tmp,$zero,$tmp,#12
  146. veor $in0,$in0,$tmp
  147. vext.8 $tmp,$zero,$tmp,#12
  148. veor $key,$key,$rcon
  149. veor $in0,$in0,$tmp
  150. vshl.u8 $rcon,$rcon,#1
  151. veor $in0,$in0,$key
  152. b.ne .Loop128
  153. vld1.32 {$rcon},[$ptr]
  154. vtbl.8 $key,{$in0},$mask
  155. vext.8 $tmp,$zero,$in0,#12
  156. vst1.32 {$in0},[$out],#16
  157. aese $key,$zero
  158. veor $in0,$in0,$tmp
  159. vext.8 $tmp,$zero,$tmp,#12
  160. veor $in0,$in0,$tmp
  161. vext.8 $tmp,$zero,$tmp,#12
  162. veor $key,$key,$rcon
  163. veor $in0,$in0,$tmp
  164. vshl.u8 $rcon,$rcon,#1
  165. veor $in0,$in0,$key
  166. vtbl.8 $key,{$in0},$mask
  167. vext.8 $tmp,$zero,$in0,#12
  168. vst1.32 {$in0},[$out],#16
  169. aese $key,$zero
  170. veor $in0,$in0,$tmp
  171. vext.8 $tmp,$zero,$tmp,#12
  172. veor $in0,$in0,$tmp
  173. vext.8 $tmp,$zero,$tmp,#12
  174. veor $key,$key,$rcon
  175. veor $in0,$in0,$tmp
  176. veor $in0,$in0,$key
  177. vst1.32 {$in0},[$out]
  178. add $out,$out,#0x50
  179. mov $rounds,#10
  180. b .Ldone
  181. .align 4
  182. .L192:
  183. vld1.8 {$in1},[$inp],#8
  184. vmov.i8 $key,#8 // borrow $key
  185. vst1.32 {$in0},[$out],#16
  186. vsub.i8 $mask,$mask,$key // adjust the mask
  187. .Loop192:
  188. vtbl.8 $key,{$in1},$mask
  189. vext.8 $tmp,$zero,$in0,#12
  190. #ifdef __ARMEB__
  191. vst1.32 {$in1},[$out],#16
  192. sub $out,$out,#8
  193. #else
  194. vst1.32 {$in1},[$out],#8
  195. #endif
  196. aese $key,$zero
  197. subs $bits,$bits,#1
  198. veor $in0,$in0,$tmp
  199. vext.8 $tmp,$zero,$tmp,#12
  200. veor $in0,$in0,$tmp
  201. vext.8 $tmp,$zero,$tmp,#12
  202. veor $in0,$in0,$tmp
  203. vdup.32 $tmp,${in0}[3]
  204. veor $tmp,$tmp,$in1
  205. veor $key,$key,$rcon
  206. vext.8 $in1,$zero,$in1,#12
  207. vshl.u8 $rcon,$rcon,#1
  208. veor $in1,$in1,$tmp
  209. veor $in0,$in0,$key
  210. veor $in1,$in1,$key
  211. vst1.32 {$in0},[$out],#16
  212. b.ne .Loop192
  213. mov $rounds,#12
  214. add $out,$out,#0x20
  215. b .Ldone
  216. .align 4
  217. .L256:
  218. vld1.8 {$in1},[$inp]
  219. mov $bits,#7
  220. mov $rounds,#14
  221. vst1.32 {$in0},[$out],#16
  222. .Loop256:
  223. vtbl.8 $key,{$in1},$mask
  224. vext.8 $tmp,$zero,$in0,#12
  225. vst1.32 {$in1},[$out],#16
  226. aese $key,$zero
  227. subs $bits,$bits,#1
  228. veor $in0,$in0,$tmp
  229. vext.8 $tmp,$zero,$tmp,#12
  230. veor $in0,$in0,$tmp
  231. vext.8 $tmp,$zero,$tmp,#12
  232. veor $key,$key,$rcon
  233. veor $in0,$in0,$tmp
  234. vshl.u8 $rcon,$rcon,#1
  235. veor $in0,$in0,$key
  236. vst1.32 {$in0},[$out],#16
  237. b.eq .Ldone
  238. vdup.32 $key,${in0}[3] // just splat
  239. vext.8 $tmp,$zero,$in1,#12
  240. aese $key,$zero
  241. veor $in1,$in1,$tmp
  242. vext.8 $tmp,$zero,$tmp,#12
  243. veor $in1,$in1,$tmp
  244. vext.8 $tmp,$zero,$tmp,#12
  245. veor $in1,$in1,$tmp
  246. veor $in1,$in1,$key
  247. b .Loop256
  248. .Ldone:
  249. str $rounds,[$out]
  250. mov $ptr,#0
  251. .Lenc_key_abort:
  252. mov x0,$ptr // return value
  253. `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
  254. ret
  255. .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
  256. .globl ${prefix}_set_decrypt_key
  257. .type ${prefix}_set_decrypt_key,%function
  258. .align 5
  259. ${prefix}_set_decrypt_key:
  260. ___
# set_decrypt_key prologue for flavours matching /64/: emits
# AARCH64_SIGN_LINK_REGISTER and pushes x29/x30.
  261. $code.=<<___ if ($flavour =~ /64/);
  262. AARCH64_SIGN_LINK_REGISTER
  263. stp x29,x30,[sp,#-16]!
  264. add x29,sp,#0
  265. ___
# set_decrypt_key prologue for all other flavours: pushes r4 and lr.
  266. $code.=<<___ if ($flavour !~ /64/);
  267. stmdb sp!,{r4,lr}
  268. ___
# Body: builds the encryption schedule through .Lenc_key and branches to
# .Ldec_key_abort when that returned non-zero.  Otherwise the round keys
# are reversed in place: the first and last keys are swapped unmodified,
# every other key passes through aesimc on its way to the mirrored slot,
# and the middle key is transformed where it stands.  x12 is the round
# count left in $rounds (w12) by the encrypt-key code.
  269. $code.=<<___;
  270. bl .Lenc_key
  271. cmp x0,#0
  272. b.ne .Ldec_key_abort
  273. sub $out,$out,#240 // restore original $out
  274. mov x4,#-16
  275. add $inp,$out,x12,lsl#4 // end of key schedule
  276. vld1.32 {v0.16b},[$out]
  277. vld1.32 {v1.16b},[$inp]
  278. vst1.32 {v0.16b},[$inp],x4
  279. vst1.32 {v1.16b},[$out],#16
  280. .Loop_imc:
  281. vld1.32 {v0.16b},[$out]
  282. vld1.32 {v1.16b},[$inp]
  283. aesimc v0.16b,v0.16b
  284. aesimc v1.16b,v1.16b
  285. vst1.32 {v0.16b},[$inp],x4
  286. vst1.32 {v1.16b},[$out],#16
  287. cmp $inp,$out
  288. b.hi .Loop_imc
  289. vld1.32 {v0.16b},[$out]
  290. aesimc v0.16b,v0.16b
  291. vst1.32 {v0.16b},[$inp]
  292. eor x0,x0,x0 // return value
  293. .Ldec_key_abort:
  294. ___
# Epilogue for flavours not matching /64/: pops r4 and returns by loading
# pc.
  295. $code.=<<___ if ($flavour !~ /64/);
  296. ldmia sp!,{r4,pc}
  297. ___
# Epilogue for flavours matching /64/: pops x29/x30, emits
# AARCH64_VALIDATE_LINK_REGISTER and returns.
  298. $code.=<<___ if ($flavour =~ /64/);
  299. ldp x29,x30,[sp],#16
  300. AARCH64_VALIDATE_LINK_REGISTER
  301. ret
  302. ___
  303. $code.=<<___;
  304. .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
  305. ___
  306. }}}
  307. {{{
  308. sub gen_block () {
  309. my $dir = shift;
  310. my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
  311. my ($inp,$out,$key)=map("x$_",(0..2));
  312. my $rounds="w3";
  313. my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
  314. $code.=<<___;
  315. .globl ${prefix}_${dir}crypt
  316. .type ${prefix}_${dir}crypt,%function
  317. .align 5
  318. ${prefix}_${dir}crypt:
  319. ___
  320. $code.=<<___ if ($flavour =~ /64/);
  321. AARCH64_VALID_CALL_TARGET
  322. ___
  323. $code.=<<___;
  324. ldr $rounds,[$key,#240]
  325. vld1.32 {$rndkey0},[$key],#16
  326. vld1.8 {$inout},[$inp]
  327. sub $rounds,$rounds,#2
  328. vld1.32 {$rndkey1},[$key],#16
  329. .Loop_${dir}c:
  330. aes$e $inout,$rndkey0
  331. aes$mc $inout,$inout
  332. vld1.32 {$rndkey0},[$key],#16
  333. subs $rounds,$rounds,#2
  334. aes$e $inout,$rndkey1
  335. aes$mc $inout,$inout
  336. vld1.32 {$rndkey1},[$key],#16
  337. b.gt .Loop_${dir}c
  338. aes$e $inout,$rndkey0
  339. aes$mc $inout,$inout
  340. vld1.32 {$rndkey0},[$key]
  341. aes$e $inout,$rndkey1
  342. veor $inout,$inout,$rndkey0
  343. vst1.8 {$inout},[$out]
  344. ret
  345. .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
  346. ___
  347. }
  348. &gen_block("en");
  349. &gen_block("de");
  350. }}}
  351. # Performance in cycles per byte.
  352. # Processed with AES-ECB different key size.
  353. # It shows the value before and after optimization as below:
  354. # (before/after):
  355. #
  356. # AES-128-ECB AES-192-ECB AES-256-ECB
  357. # Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10
  358. # Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14
  359. # Optimization is implemented by loop unrolling and interleaving.
  360. # Commonly, we choose the unrolling factor as 5; if the input
  361. # data size is smaller than 5 blocks, but not smaller than 3 blocks,
  362. # we choose 3 as the unrolling factor.
  363. # If the input data size dsize >= 5*16 bytes, then take 5 blocks
  364. # as one iteration, every loop the left size lsize -= 5*16.
  365. # If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
  366. # every loop lsize -=3*16.
  367. # If lsize < 3*16 bytes, treat them as the tail, interleave the
  368. # two blocks AES instructions.
  369. # There is one special case, if the original input data size dsize
  370. # = 16 bytes, we will treat it separately to improve the
  371. # performance: one independent code block without LR, FP load and
  372. # store, just looks like what the original ECB implementation does.
  373. {{{
  374. my ($inp,$out,$len,$key)=map("x$_",(0..3));
  375. my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
  376. my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
  377. my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
  378. ### q7 last round key
  379. ### q10-q15 q7 Last 7 round keys
  380. ### q8-q9 preloaded round keys except last 7 keys for big size
  381. ### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte
  382. {
  383. my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
  384. my ($dat3,$in3,$tmp3); # used only in 64-bit mode
  385. my ($dat4,$in4,$tmp4);
  386. if ($flavour =~ /64/) {
  387. ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
  388. }
  389. $code.=<<___;
  390. .globl ${prefix}_ecb_encrypt
  391. .type ${prefix}_ecb_encrypt,%function
  392. .align 5
  393. ${prefix}_ecb_encrypt:
  394. ___
  395. $code.=<<___ if ($flavour =~ /64/);
  396. AARCH64_VALID_CALL_TARGET
  397. subs $len,$len,#16
  398. // Original input data size bigger than 16, jump to big size processing.
  399. b.ne .Lecb_big_size
  400. vld1.8 {$dat0},[$inp]
  401. cmp $enc,#0 // en- or decrypting?
  402. ldr $rounds,[$key,#240]
  403. vld1.32 {q5-q6},[$key],#32 // load key schedule...
  404. b.eq .Lecb_small_dec
  405. aese $dat0,q5
  406. aesmc $dat0,$dat0
  407. vld1.32 {q8-q9},[$key],#32 // load key schedule...
  408. aese $dat0,q6
  409. aesmc $dat0,$dat0
  410. subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-ecb processing
  411. b.eq .Lecb_128_enc
  412. .Lecb_round_loop:
  413. aese $dat0,q8
  414. aesmc $dat0,$dat0
  415. vld1.32 {q8},[$key],#16 // load key schedule...
  416. aese $dat0,q9
  417. aesmc $dat0,$dat0
  418. vld1.32 {q9},[$key],#16 // load key schedule...
  419. subs $rounds,$rounds,#2 // bias
  420. b.gt .Lecb_round_loop
  421. .Lecb_128_enc:
  422. vld1.32 {q10-q11},[$key],#32 // load key schedule...
  423. aese $dat0,q8
  424. aesmc $dat0,$dat0
  425. aese $dat0,q9
  426. aesmc $dat0,$dat0
  427. vld1.32 {q12-q13},[$key],#32 // load key schedule...
  428. aese $dat0,q10
  429. aesmc $dat0,$dat0
  430. aese $dat0,q11
  431. aesmc $dat0,$dat0
  432. vld1.32 {q14-q15},[$key],#32 // load key schedule...
  433. aese $dat0,q12
  434. aesmc $dat0,$dat0
  435. aese $dat0,q13
  436. aesmc $dat0,$dat0
  437. vld1.32 {$rndlast},[$key]
  438. aese $dat0,q14
  439. aesmc $dat0,$dat0
  440. aese $dat0,q15
  441. veor $dat0,$dat0,$rndlast
  442. vst1.8 {$dat0},[$out]
  443. b .Lecb_Final_abort
  444. .Lecb_small_dec:
  445. aesd $dat0,q5
  446. aesimc $dat0,$dat0
  447. vld1.32 {q8-q9},[$key],#32 // load key schedule...
  448. aesd $dat0,q6
  449. aesimc $dat0,$dat0
  450. subs $rounds,$rounds,#10 // bias
  451. b.eq .Lecb_128_dec
  452. .Lecb_dec_round_loop:
  453. aesd $dat0,q8
  454. aesimc $dat0,$dat0
  455. vld1.32 {q8},[$key],#16 // load key schedule...
  456. aesd $dat0,q9
  457. aesimc $dat0,$dat0
  458. vld1.32 {q9},[$key],#16 // load key schedule...
  459. subs $rounds,$rounds,#2 // bias
  460. b.gt .Lecb_dec_round_loop
  461. .Lecb_128_dec:
  462. vld1.32 {q10-q11},[$key],#32 // load key schedule...
  463. aesd $dat0,q8
  464. aesimc $dat0,$dat0
  465. aesd $dat0,q9
  466. aesimc $dat0,$dat0
  467. vld1.32 {q12-q13},[$key],#32 // load key schedule...
  468. aesd $dat0,q10
  469. aesimc $dat0,$dat0
  470. aesd $dat0,q11
  471. aesimc $dat0,$dat0
  472. vld1.32 {q14-q15},[$key],#32 // load key schedule...
  473. aesd $dat0,q12
  474. aesimc $dat0,$dat0
  475. aesd $dat0,q13
  476. aesimc $dat0,$dat0
  477. vld1.32 {$rndlast},[$key]
  478. aesd $dat0,q14
  479. aesimc $dat0,$dat0
  480. aesd $dat0,q15
  481. veor $dat0,$dat0,$rndlast
  482. vst1.8 {$dat0},[$out]
  483. b .Lecb_Final_abort
  484. .Lecb_big_size:
  485. ___
  486. $code.=<<___ if ($flavour =~ /64/);
  487. stp x29,x30,[sp,#-16]!
  488. add x29,sp,#0
  489. ___
  490. $code.=<<___ if ($flavour !~ /64/);
  491. mov ip,sp
  492. stmdb sp!,{r4-r8,lr}
  493. vstmdb sp!,{d8-d15} @ ABI specification says so
  494. ldmia ip,{r4-r5} @ load remaining args
  495. subs $len,$len,#16
  496. ___
  497. $code.=<<___;
  498. mov $step,#16
  499. b.lo .Lecb_done
  500. cclr $step,eq
  501. cmp $enc,#0 // en- or decrypting?
  502. ldr $rounds,[$key,#240]
  503. and $len,$len,#-16
  504. vld1.8 {$dat},[$inp],$step
  505. vld1.32 {q8-q9},[$key] // load key schedule...
  506. sub $rounds,$rounds,#6
  507. add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
  508. sub $rounds,$rounds,#2
  509. vld1.32 {q10-q11},[$key_],#32
  510. vld1.32 {q12-q13},[$key_],#32
  511. vld1.32 {q14-q15},[$key_],#32
  512. vld1.32 {$rndlast},[$key_]
  513. add $key_,$key,#32
  514. mov $cnt,$rounds
  515. b.eq .Lecb_dec
  516. vld1.8 {$dat1},[$inp],#16
  517. subs $len,$len,#32 // bias
  518. add $cnt,$rounds,#2
  519. vorr $in1,$dat1,$dat1
  520. vorr $dat2,$dat1,$dat1
  521. vorr $dat1,$dat,$dat
  522. b.lo .Lecb_enc_tail
  523. vorr $dat1,$in1,$in1
  524. vld1.8 {$dat2},[$inp],#16
  525. ___
  526. $code.=<<___ if ($flavour =~ /64/);
  527. cmp $len,#32
  528. b.lo .Loop3x_ecb_enc
  529. vld1.8 {$dat3},[$inp],#16
  530. vld1.8 {$dat4},[$inp],#16
  531. sub $len,$len,#32 // bias
  532. mov $cnt,$rounds
  533. .Loop5x_ecb_enc:
  534. aese $dat0,q8
  535. aesmc $dat0,$dat0
  536. aese $dat1,q8
  537. aesmc $dat1,$dat1
  538. aese $dat2,q8
  539. aesmc $dat2,$dat2
  540. aese $dat3,q8
  541. aesmc $dat3,$dat3
  542. aese $dat4,q8
  543. aesmc $dat4,$dat4
  544. vld1.32 {q8},[$key_],#16
  545. subs $cnt,$cnt,#2
  546. aese $dat0,q9
  547. aesmc $dat0,$dat0
  548. aese $dat1,q9
  549. aesmc $dat1,$dat1
  550. aese $dat2,q9
  551. aesmc $dat2,$dat2
  552. aese $dat3,q9
  553. aesmc $dat3,$dat3
  554. aese $dat4,q9
  555. aesmc $dat4,$dat4
  556. vld1.32 {q9},[$key_],#16
  557. b.gt .Loop5x_ecb_enc
  558. aese $dat0,q8
  559. aesmc $dat0,$dat0
  560. aese $dat1,q8
  561. aesmc $dat1,$dat1
  562. aese $dat2,q8
  563. aesmc $dat2,$dat2
  564. aese $dat3,q8
  565. aesmc $dat3,$dat3
  566. aese $dat4,q8
  567. aesmc $dat4,$dat4
  568. cmp $len,#0x40 // because .Lecb_enc_tail4x
  569. sub $len,$len,#0x50
  570. aese $dat0,q9
  571. aesmc $dat0,$dat0
  572. aese $dat1,q9
  573. aesmc $dat1,$dat1
  574. aese $dat2,q9
  575. aesmc $dat2,$dat2
  576. aese $dat3,q9
  577. aesmc $dat3,$dat3
  578. aese $dat4,q9
  579. aesmc $dat4,$dat4
  580. csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
  581. mov $key_,$key
  582. aese $dat0,q10
  583. aesmc $dat0,$dat0
  584. aese $dat1,q10
  585. aesmc $dat1,$dat1
  586. aese $dat2,q10
  587. aesmc $dat2,$dat2
  588. aese $dat3,q10
  589. aesmc $dat3,$dat3
  590. aese $dat4,q10
  591. aesmc $dat4,$dat4
  592. add $inp,$inp,x6 // $inp is adjusted in such way that
  593. // at exit from the loop $dat1-$dat4
  594. // are loaded with last "words"
  595. add x6,$len,#0x60 // because .Lecb_enc_tail4x
  596. aese $dat0,q11
  597. aesmc $dat0,$dat0
  598. aese $dat1,q11
  599. aesmc $dat1,$dat1
  600. aese $dat2,q11
  601. aesmc $dat2,$dat2
  602. aese $dat3,q11
  603. aesmc $dat3,$dat3
  604. aese $dat4,q11
  605. aesmc $dat4,$dat4
  606. aese $dat0,q12
  607. aesmc $dat0,$dat0
  608. aese $dat1,q12
  609. aesmc $dat1,$dat1
  610. aese $dat2,q12
  611. aesmc $dat2,$dat2
  612. aese $dat3,q12
  613. aesmc $dat3,$dat3
  614. aese $dat4,q12
  615. aesmc $dat4,$dat4
  616. aese $dat0,q13
  617. aesmc $dat0,$dat0
  618. aese $dat1,q13
  619. aesmc $dat1,$dat1
  620. aese $dat2,q13
  621. aesmc $dat2,$dat2
  622. aese $dat3,q13
  623. aesmc $dat3,$dat3
  624. aese $dat4,q13
  625. aesmc $dat4,$dat4
  626. aese $dat0,q14
  627. aesmc $dat0,$dat0
  628. aese $dat1,q14
  629. aesmc $dat1,$dat1
  630. aese $dat2,q14
  631. aesmc $dat2,$dat2
  632. aese $dat3,q14
  633. aesmc $dat3,$dat3
  634. aese $dat4,q14
  635. aesmc $dat4,$dat4
  636. aese $dat0,q15
  637. vld1.8 {$in0},[$inp],#16
  638. aese $dat1,q15
  639. vld1.8 {$in1},[$inp],#16
  640. aese $dat2,q15
  641. vld1.8 {$in2},[$inp],#16
  642. aese $dat3,q15
  643. vld1.8 {$in3},[$inp],#16
  644. aese $dat4,q15
  645. vld1.8 {$in4},[$inp],#16
  646. cbz x6,.Lecb_enc_tail4x
  647. vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
  648. veor $tmp0,$rndlast,$dat0
  649. vorr $dat0,$in0,$in0
  650. veor $tmp1,$rndlast,$dat1
  651. vorr $dat1,$in1,$in1
  652. veor $tmp2,$rndlast,$dat2
  653. vorr $dat2,$in2,$in2
  654. veor $tmp3,$rndlast,$dat3
  655. vorr $dat3,$in3,$in3
  656. veor $tmp4,$rndlast,$dat4
  657. vst1.8 {$tmp0},[$out],#16
  658. vorr $dat4,$in4,$in4
  659. vst1.8 {$tmp1},[$out],#16
  660. mov $cnt,$rounds
  661. vst1.8 {$tmp2},[$out],#16
  662. vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
  663. vst1.8 {$tmp3},[$out],#16
  664. vst1.8 {$tmp4},[$out],#16
  665. b.hs .Loop5x_ecb_enc
  666. add $len,$len,#0x50
  667. cbz $len,.Lecb_done
  668. add $cnt,$rounds,#2
  669. subs $len,$len,#0x30
  670. vorr $dat0,$in2,$in2
  671. vorr $dat1,$in3,$in3
  672. vorr $dat2,$in4,$in4
  673. b.lo .Lecb_enc_tail
  674. b .Loop3x_ecb_enc
  675. .align 4
  676. .Lecb_enc_tail4x:
  677. veor $tmp1,$rndlast,$dat1
  678. veor $tmp2,$rndlast,$dat2
  679. veor $tmp3,$rndlast,$dat3
  680. veor $tmp4,$rndlast,$dat4
  681. vst1.8 {$tmp1},[$out],#16
  682. vst1.8 {$tmp2},[$out],#16
  683. vst1.8 {$tmp3},[$out],#16
  684. vst1.8 {$tmp4},[$out],#16
  685. b .Lecb_done
  686. .align 4
  687. ___
  688. $code.=<<___;
  689. .Loop3x_ecb_enc:
  690. aese $dat0,q8
  691. aesmc $dat0,$dat0
  692. aese $dat1,q8
  693. aesmc $dat1,$dat1
  694. aese $dat2,q8
  695. aesmc $dat2,$dat2
  696. vld1.32 {q8},[$key_],#16
  697. subs $cnt,$cnt,#2
  698. aese $dat0,q9
  699. aesmc $dat0,$dat0
  700. aese $dat1,q9
  701. aesmc $dat1,$dat1
  702. aese $dat2,q9
  703. aesmc $dat2,$dat2
  704. vld1.32 {q9},[$key_],#16
  705. b.gt .Loop3x_ecb_enc
  706. aese $dat0,q8
  707. aesmc $dat0,$dat0
  708. aese $dat1,q8
  709. aesmc $dat1,$dat1
  710. aese $dat2,q8
  711. aesmc $dat2,$dat2
  712. subs $len,$len,#0x30
  713. mov.lo x6,$len // x6, $cnt, is zero at this point
  714. aese $dat0,q9
  715. aesmc $dat0,$dat0
  716. aese $dat1,q9
  717. aesmc $dat1,$dat1
  718. aese $dat2,q9
  719. aesmc $dat2,$dat2
  720. add $inp,$inp,x6 // $inp is adjusted in such way that
  721. // at exit from the loop $dat1-$dat2
  722. // are loaded with last "words"
  723. mov $key_,$key
  724. aese $dat0,q12
  725. aesmc $dat0,$dat0
  726. aese $dat1,q12
  727. aesmc $dat1,$dat1
  728. aese $dat2,q12
  729. aesmc $dat2,$dat2
  730. vld1.8 {$in0},[$inp],#16
  731. aese $dat0,q13
  732. aesmc $dat0,$dat0
  733. aese $dat1,q13
  734. aesmc $dat1,$dat1
  735. aese $dat2,q13
  736. aesmc $dat2,$dat2
  737. vld1.8 {$in1},[$inp],#16
  738. aese $dat0,q14
  739. aesmc $dat0,$dat0
  740. aese $dat1,q14
  741. aesmc $dat1,$dat1
  742. aese $dat2,q14
  743. aesmc $dat2,$dat2
  744. vld1.8 {$in2},[$inp],#16
  745. aese $dat0,q15
  746. aese $dat1,q15
  747. aese $dat2,q15
  748. vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
  749. add $cnt,$rounds,#2
  750. veor $tmp0,$rndlast,$dat0
  751. veor $tmp1,$rndlast,$dat1
  752. veor $dat2,$dat2,$rndlast
  753. vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
  754. vst1.8 {$tmp0},[$out],#16
  755. vorr $dat0,$in0,$in0
  756. vst1.8 {$tmp1},[$out],#16
  757. vorr $dat1,$in1,$in1
  758. vst1.8 {$dat2},[$out],#16
  759. vorr $dat2,$in2,$in2
  760. b.hs .Loop3x_ecb_enc
  761. cmn $len,#0x30
  762. b.eq .Lecb_done
  763. nop
  764. .Lecb_enc_tail:
  765. aese $dat1,q8
  766. aesmc $dat1,$dat1
  767. aese $dat2,q8
  768. aesmc $dat2,$dat2
  769. vld1.32 {q8},[$key_],#16
  770. subs $cnt,$cnt,#2
  771. aese $dat1,q9
  772. aesmc $dat1,$dat1
  773. aese $dat2,q9
  774. aesmc $dat2,$dat2
  775. vld1.32 {q9},[$key_],#16
  776. b.gt .Lecb_enc_tail
  777. aese $dat1,q8
  778. aesmc $dat1,$dat1
  779. aese $dat2,q8
  780. aesmc $dat2,$dat2
  781. aese $dat1,q9
  782. aesmc $dat1,$dat1
  783. aese $dat2,q9
  784. aesmc $dat2,$dat2
  785. aese $dat1,q12
  786. aesmc $dat1,$dat1
  787. aese $dat2,q12
  788. aesmc $dat2,$dat2
  789. cmn $len,#0x20
  790. aese $dat1,q13
  791. aesmc $dat1,$dat1
  792. aese $dat2,q13
  793. aesmc $dat2,$dat2
  794. aese $dat1,q14
  795. aesmc $dat1,$dat1
  796. aese $dat2,q14
  797. aesmc $dat2,$dat2
  798. aese $dat1,q15
  799. aese $dat2,q15
  800. b.eq .Lecb_enc_one
  801. veor $tmp1,$rndlast,$dat1
  802. veor $tmp2,$rndlast,$dat2
  803. vst1.8 {$tmp1},[$out],#16
  804. vst1.8 {$tmp2},[$out],#16
  805. b .Lecb_done
  806. .Lecb_enc_one:
  807. veor $tmp1,$rndlast,$dat2
  808. vst1.8 {$tmp1},[$out],#16
  809. b .Lecb_done
  810. ___
  811. $code.=<<___;
  812. .align 5
  813. .Lecb_dec:
  814. vld1.8 {$dat1},[$inp],#16
  815. subs $len,$len,#32 // bias
  816. add $cnt,$rounds,#2
  817. vorr $in1,$dat1,$dat1
  818. vorr $dat2,$dat1,$dat1
  819. vorr $dat1,$dat,$dat
  820. b.lo .Lecb_dec_tail
  821. vorr $dat1,$in1,$in1
  822. vld1.8 {$dat2},[$inp],#16
  823. ___
  824. $code.=<<___ if ($flavour =~ /64/);
  825. cmp $len,#32
  826. b.lo .Loop3x_ecb_dec
  827. vld1.8 {$dat3},[$inp],#16
  828. vld1.8 {$dat4},[$inp],#16
  829. sub $len,$len,#32 // bias
  830. mov $cnt,$rounds
  831. .Loop5x_ecb_dec:
  832. aesd $dat0,q8
  833. aesimc $dat0,$dat0
  834. aesd $dat1,q8
  835. aesimc $dat1,$dat1
  836. aesd $dat2,q8
  837. aesimc $dat2,$dat2
  838. aesd $dat3,q8
  839. aesimc $dat3,$dat3
  840. aesd $dat4,q8
  841. aesimc $dat4,$dat4
  842. vld1.32 {q8},[$key_],#16
  843. subs $cnt,$cnt,#2
  844. aesd $dat0,q9
  845. aesimc $dat0,$dat0
  846. aesd $dat1,q9
  847. aesimc $dat1,$dat1
  848. aesd $dat2,q9
  849. aesimc $dat2,$dat2
  850. aesd $dat3,q9
  851. aesimc $dat3,$dat3
  852. aesd $dat4,q9
  853. aesimc $dat4,$dat4
  854. vld1.32 {q9},[$key_],#16
  855. b.gt .Loop5x_ecb_dec
  856. aesd $dat0,q8
  857. aesimc $dat0,$dat0
  858. aesd $dat1,q8
  859. aesimc $dat1,$dat1
  860. aesd $dat2,q8
  861. aesimc $dat2,$dat2
  862. aesd $dat3,q8
  863. aesimc $dat3,$dat3
  864. aesd $dat4,q8
  865. aesimc $dat4,$dat4
  866. cmp $len,#0x40 // because .Lecb_tail4x
  867. sub $len,$len,#0x50
  868. aesd $dat0,q9
  869. aesimc $dat0,$dat0
  870. aesd $dat1,q9
  871. aesimc $dat1,$dat1
  872. aesd $dat2,q9
  873. aesimc $dat2,$dat2
  874. aesd $dat3,q9
  875. aesimc $dat3,$dat3
  876. aesd $dat4,q9
  877. aesimc $dat4,$dat4
  878. csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
  879. mov $key_,$key
  880. aesd $dat0,q10
  881. aesimc $dat0,$dat0
  882. aesd $dat1,q10
  883. aesimc $dat1,$dat1
  884. aesd $dat2,q10
  885. aesimc $dat2,$dat2
  886. aesd $dat3,q10
  887. aesimc $dat3,$dat3
  888. aesd $dat4,q10
  889. aesimc $dat4,$dat4
  890. add $inp,$inp,x6 // $inp is adjusted in such way that
  891. // at exit from the loop $dat1-$dat4
  892. // are loaded with last "words"
  893. add x6,$len,#0x60 // because .Lecb_tail4x
  894. aesd $dat0,q11
  895. aesimc $dat0,$dat0
  896. aesd $dat1,q11
  897. aesimc $dat1,$dat1
  898. aesd $dat2,q11
  899. aesimc $dat2,$dat2
  900. aesd $dat3,q11
  901. aesimc $dat3,$dat3
  902. aesd $dat4,q11
  903. aesimc $dat4,$dat4
  904. aesd $dat0,q12
  905. aesimc $dat0,$dat0
  906. aesd $dat1,q12
  907. aesimc $dat1,$dat1
  908. aesd $dat2,q12
  909. aesimc $dat2,$dat2
  910. aesd $dat3,q12
  911. aesimc $dat3,$dat3
  912. aesd $dat4,q12
  913. aesimc $dat4,$dat4
  914. aesd $dat0,q13
  915. aesimc $dat0,$dat0
  916. aesd $dat1,q13
  917. aesimc $dat1,$dat1
  918. aesd $dat2,q13
  919. aesimc $dat2,$dat2
  920. aesd $dat3,q13
  921. aesimc $dat3,$dat3
  922. aesd $dat4,q13
  923. aesimc $dat4,$dat4
  924. aesd $dat0,q14
  925. aesimc $dat0,$dat0
  926. aesd $dat1,q14
  927. aesimc $dat1,$dat1
  928. aesd $dat2,q14
  929. aesimc $dat2,$dat2
  930. aesd $dat3,q14
  931. aesimc $dat3,$dat3
  932. aesd $dat4,q14
  933. aesimc $dat4,$dat4
  934. aesd $dat0,q15
  935. vld1.8 {$in0},[$inp],#16
  936. aesd $dat1,q15
  937. vld1.8 {$in1},[$inp],#16
  938. aesd $dat2,q15
  939. vld1.8 {$in2},[$inp],#16
  940. aesd $dat3,q15
  941. vld1.8 {$in3},[$inp],#16
  942. aesd $dat4,q15
  943. vld1.8 {$in4},[$inp],#16
  944. cbz x6,.Lecb_tail4x
  945. vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
  946. veor $tmp0,$rndlast,$dat0
  947. vorr $dat0,$in0,$in0
  948. veor $tmp1,$rndlast,$dat1
  949. vorr $dat1,$in1,$in1
  950. veor $tmp2,$rndlast,$dat2
  951. vorr $dat2,$in2,$in2
  952. veor $tmp3,$rndlast,$dat3
  953. vorr $dat3,$in3,$in3
  954. veor $tmp4,$rndlast,$dat4
  955. vst1.8 {$tmp0},[$out],#16
  956. vorr $dat4,$in4,$in4
  957. vst1.8 {$tmp1},[$out],#16
  958. mov $cnt,$rounds
  959. vst1.8 {$tmp2},[$out],#16
  960. vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
  961. vst1.8 {$tmp3},[$out],#16
  962. vst1.8 {$tmp4},[$out],#16
  963. b.hs .Loop5x_ecb_dec
  964. add $len,$len,#0x50
  965. cbz $len,.Lecb_done
  966. add $cnt,$rounds,#2
  967. subs $len,$len,#0x30
  968. vorr $dat0,$in2,$in2
  969. vorr $dat1,$in3,$in3
  970. vorr $dat2,$in4,$in4
  971. b.lo .Lecb_dec_tail
  972. b .Loop3x_ecb_dec
  973. .align 4
  974. .Lecb_tail4x:
  975. veor $tmp1,$rndlast,$dat1
  976. veor $tmp2,$rndlast,$dat2
  977. veor $tmp3,$rndlast,$dat3
  978. veor $tmp4,$rndlast,$dat4
  979. vst1.8 {$tmp1},[$out],#16
  980. vst1.8 {$tmp2},[$out],#16
  981. vst1.8 {$tmp3},[$out],#16
  982. vst1.8 {$tmp4},[$out],#16
  983. b .Lecb_done
  984. .align 4
  985. ___
  986. $code.=<<___;
  987. .Loop3x_ecb_dec:
  988. aesd $dat0,q8
  989. aesimc $dat0,$dat0
  990. aesd $dat1,q8
  991. aesimc $dat1,$dat1
  992. aesd $dat2,q8
  993. aesimc $dat2,$dat2
  994. vld1.32 {q8},[$key_],#16
  995. subs $cnt,$cnt,#2
  996. aesd $dat0,q9
  997. aesimc $dat0,$dat0
  998. aesd $dat1,q9
  999. aesimc $dat1,$dat1
  1000. aesd $dat2,q9
  1001. aesimc $dat2,$dat2
  1002. vld1.32 {q9},[$key_],#16
  1003. b.gt .Loop3x_ecb_dec
  1004. aesd $dat0,q8
  1005. aesimc $dat0,$dat0
  1006. aesd $dat1,q8
  1007. aesimc $dat1,$dat1
  1008. aesd $dat2,q8
  1009. aesimc $dat2,$dat2
  1010. subs $len,$len,#0x30
  1011. mov.lo x6,$len // x6, $cnt, is zero at this point
  1012. aesd $dat0,q9
  1013. aesimc $dat0,$dat0
  1014. aesd $dat1,q9
  1015. aesimc $dat1,$dat1
  1016. aesd $dat2,q9
  1017. aesimc $dat2,$dat2
  1018. add $inp,$inp,x6 // $inp is adjusted in such way that
  1019. // at exit from the loop $dat1-$dat2
  1020. // are loaded with last "words"
  1021. mov $key_,$key
  1022. aesd $dat0,q12
  1023. aesimc $dat0,$dat0
  1024. aesd $dat1,q12
  1025. aesimc $dat1,$dat1
  1026. aesd $dat2,q12
  1027. aesimc $dat2,$dat2
  1028. vld1.8 {$in0},[$inp],#16
  1029. aesd $dat0,q13
  1030. aesimc $dat0,$dat0
  1031. aesd $dat1,q13
  1032. aesimc $dat1,$dat1
  1033. aesd $dat2,q13
  1034. aesimc $dat2,$dat2
  1035. vld1.8 {$in1},[$inp],#16
  1036. aesd $dat0,q14
  1037. aesimc $dat0,$dat0
  1038. aesd $dat1,q14
  1039. aesimc $dat1,$dat1
  1040. aesd $dat2,q14
  1041. aesimc $dat2,$dat2
  1042. vld1.8 {$in2},[$inp],#16
  1043. aesd $dat0,q15
  1044. aesd $dat1,q15
  1045. aesd $dat2,q15
  1046. vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
  1047. add $cnt,$rounds,#2
  1048. veor $tmp0,$rndlast,$dat0
  1049. veor $tmp1,$rndlast,$dat1
  1050. veor $dat2,$dat2,$rndlast
  1051. vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
  1052. vst1.8 {$tmp0},[$out],#16
  1053. vorr $dat0,$in0,$in0
  1054. vst1.8 {$tmp1},[$out],#16
  1055. vorr $dat1,$in1,$in1
  1056. vst1.8 {$dat2},[$out],#16
  1057. vorr $dat2,$in2,$in2
  1058. b.hs .Loop3x_ecb_dec
  1059. cmn $len,#0x30
  1060. b.eq .Lecb_done
  1061. nop
  1062. .Lecb_dec_tail:
  1063. aesd $dat1,q8
  1064. aesimc $dat1,$dat1
  1065. aesd $dat2,q8
  1066. aesimc $dat2,$dat2
  1067. vld1.32 {q8},[$key_],#16
  1068. subs $cnt,$cnt,#2
  1069. aesd $dat1,q9
  1070. aesimc $dat1,$dat1
  1071. aesd $dat2,q9
  1072. aesimc $dat2,$dat2
  1073. vld1.32 {q9},[$key_],#16
  1074. b.gt .Lecb_dec_tail
  1075. aesd $dat1,q8
  1076. aesimc $dat1,$dat1
  1077. aesd $dat2,q8
  1078. aesimc $dat2,$dat2
  1079. aesd $dat1,q9
  1080. aesimc $dat1,$dat1
  1081. aesd $dat2,q9
  1082. aesimc $dat2,$dat2
  1083. aesd $dat1,q12
  1084. aesimc $dat1,$dat1
  1085. aesd $dat2,q12
  1086. aesimc $dat2,$dat2
  1087. cmn $len,#0x20
  1088. aesd $dat1,q13
  1089. aesimc $dat1,$dat1
  1090. aesd $dat2,q13
  1091. aesimc $dat2,$dat2
  1092. aesd $dat1,q14
  1093. aesimc $dat1,$dat1
  1094. aesd $dat2,q14
  1095. aesimc $dat2,$dat2
  1096. aesd $dat1,q15
  1097. aesd $dat2,q15
  1098. b.eq .Lecb_dec_one
  1099. veor $tmp1,$rndlast,$dat1
  1100. veor $tmp2,$rndlast,$dat2
  1101. vst1.8 {$tmp1},[$out],#16
  1102. vst1.8 {$tmp2},[$out],#16
  1103. b .Lecb_done
  1104. .Lecb_dec_one:
  1105. veor $tmp1,$rndlast,$dat2
  1106. vst1.8 {$tmp1},[$out],#16
  1107. .Lecb_done:
  1108. ___
  1109. }
  1110. $code.=<<___ if ($flavour !~ /64/);
  1111. vldmia sp!,{d8-d15}
  1112. ldmia sp!,{r4-r8,pc}
  1113. ___
  1114. $code.=<<___ if ($flavour =~ /64/);
  1115. ldr x29,[sp],#16
  1116. ___
  1117. $code.=<<___ if ($flavour =~ /64/);
  1118. .Lecb_Final_abort:
  1119. ret
  1120. ___
  1121. $code.=<<___;
  1122. .size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
  1123. ___
  1124. }}}
  1125. {{{
  # Register map for the CBC routine emitted below.
  # The five leading arguments are named x0-x4 and the en/decrypt flag $enc
  # is w5; the 32-bit prologue fetches the last two with "ldmia ip,{r4-r5}".
  # NOTE(review): the x/w/q names are presumably rewritten for 32-bit
  # flavours by post-processing outside this block - confirm there.
  1126. my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
  # $rounds reuses w5 ($enc): "cmp $enc,#0" is issued right before the
  # round count is loaded over it, and the flags are consumed later by
  # "b.eq .Lcbc_dec". $cnt is the per-iteration round counter, $key_ a
  # moving pointer into the key schedule, $step the input post-increment
  # (16, or cleared by "cclr" when no further block follows).
  1127. my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
  # Data, saved-input, scratch, chaining-value and last-round-key vectors
  # are q0-q7 in that order.
  1128. my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
  # $dat/$tmp are plain aliases of $dat0/$tmp0. $rndzero_n_last is $tmp1
  # and is set to q8 XOR $rndlast on the encrypt path.
  1129. my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
  # Fixed pointers to round keys 4..7, used by the encrypt path that is not
  # .Lcbc_enc128. $key4 (x6) shares its register number with $cnt (w6),
  # $key5 (x12) is the same register as $step1, and $key7 is $key itself,
  # so "add $key7,$key,#16*7" overwrites the original key pointer.
  1130. my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
  1131. ### q8-q15 preloaded key schedule
  1132. $code.=<<___;
  1133. .globl ${prefix}_cbc_encrypt
  1134. .type ${prefix}_cbc_encrypt,%function
  1135. .align 5
  1136. ${prefix}_cbc_encrypt:
  1137. ___
  1138. $code.=<<___ if ($flavour =~ /64/);
  1139. AARCH64_VALID_CALL_TARGET
  1140. // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
  1141. stp x29,x30,[sp,#-16]!
  1142. add x29,sp,#0
  1143. ___
  1144. $code.=<<___ if ($flavour !~ /64/);
  1145. mov ip,sp
  1146. stmdb sp!,{r4-r8,lr}
  1147. vstmdb sp!,{d8-d15} @ ABI specification says so
  1148. ldmia ip,{r4-r5} @ load remaining args
  1149. ___
  1150. $code.=<<___;
  1151. subs $len,$len,#16
  1152. mov $step,#16
  1153. b.lo .Lcbc_abort
  1154. cclr $step,eq
  1155. cmp $enc,#0 // en- or decrypting?
  1156. ldr $rounds,[$key,#240]
  1157. and $len,$len,#-16
  1158. vld1.8 {$ivec},[$ivp]
  1159. vld1.8 {$dat},[$inp],$step
  1160. vld1.32 {q8-q9},[$key] // load key schedule...
  1161. sub $rounds,$rounds,#6
  1162. add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
  1163. sub $rounds,$rounds,#2
  1164. vld1.32 {q10-q11},[$key_],#32
  1165. vld1.32 {q12-q13},[$key_],#32
  1166. vld1.32 {q14-q15},[$key_],#32
  1167. vld1.32 {$rndlast},[$key_]
  1168. add $key_,$key,#32
  1169. mov $cnt,$rounds
  1170. b.eq .Lcbc_dec
  1171. cmp $rounds,#2
  1172. veor $dat,$dat,$ivec
  1173. veor $rndzero_n_last,q8,$rndlast
  1174. b.eq .Lcbc_enc128
  1175. vld1.32 {$in0-$in1},[$key_]
  1176. add $key_,$key,#16
  1177. add $key4,$key,#16*4
  1178. add $key5,$key,#16*5
  1179. aese $dat,q8
  1180. aesmc $dat,$dat
  1181. add $key6,$key,#16*6
  1182. add $key7,$key,#16*7
  1183. b .Lenter_cbc_enc
  1184. .align 4
  1185. .Loop_cbc_enc:
  1186. aese $dat,q8
  1187. aesmc $dat,$dat
  1188. vst1.8 {$ivec},[$out],#16
  1189. .Lenter_cbc_enc:
  1190. aese $dat,q9
  1191. aesmc $dat,$dat
  1192. aese $dat,$in0
  1193. aesmc $dat,$dat
  1194. vld1.32 {q8},[$key4]
  1195. cmp $rounds,#4
  1196. aese $dat,$in1
  1197. aesmc $dat,$dat
  1198. vld1.32 {q9},[$key5]
  1199. b.eq .Lcbc_enc192
  1200. aese $dat,q8
  1201. aesmc $dat,$dat
  1202. vld1.32 {q8},[$key6]
  1203. aese $dat,q9
  1204. aesmc $dat,$dat
  1205. vld1.32 {q9},[$key7]
  1206. nop
  1207. .Lcbc_enc192:
  1208. aese $dat,q8
  1209. aesmc $dat,$dat
  1210. subs $len,$len,#16
  1211. aese $dat,q9
  1212. aesmc $dat,$dat
  1213. cclr $step,eq
  1214. aese $dat,q10
  1215. aesmc $dat,$dat
  1216. aese $dat,q11
  1217. aesmc $dat,$dat
  1218. vld1.8 {q8},[$inp],$step
  1219. aese $dat,q12
  1220. aesmc $dat,$dat
  1221. veor q8,q8,$rndzero_n_last
  1222. aese $dat,q13
  1223. aesmc $dat,$dat
  1224. vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
  1225. aese $dat,q14
  1226. aesmc $dat,$dat
  1227. aese $dat,q15
  1228. veor $ivec,$dat,$rndlast
  1229. b.hs .Loop_cbc_enc
  1230. vst1.8 {$ivec},[$out],#16
  1231. b .Lcbc_done
  1232. .align 5
  1233. .Lcbc_enc128:
  1234. vld1.32 {$in0-$in1},[$key_]
  1235. aese $dat,q8
  1236. aesmc $dat,$dat
  1237. b .Lenter_cbc_enc128
  1238. .Loop_cbc_enc128:
  1239. aese $dat,q8
  1240. aesmc $dat,$dat
  1241. vst1.8 {$ivec},[$out],#16
  1242. .Lenter_cbc_enc128:
  1243. aese $dat,q9
  1244. aesmc $dat,$dat
  1245. subs $len,$len,#16
  1246. aese $dat,$in0
  1247. aesmc $dat,$dat
  1248. cclr $step,eq
  1249. aese $dat,$in1
  1250. aesmc $dat,$dat
  1251. aese $dat,q10
  1252. aesmc $dat,$dat
  1253. aese $dat,q11
  1254. aesmc $dat,$dat
  1255. vld1.8 {q8},[$inp],$step
  1256. aese $dat,q12
  1257. aesmc $dat,$dat
  1258. aese $dat,q13
  1259. aesmc $dat,$dat
  1260. aese $dat,q14
  1261. aesmc $dat,$dat
  1262. veor q8,q8,$rndzero_n_last
  1263. aese $dat,q15
  1264. veor $ivec,$dat,$rndlast
  1265. b.hs .Loop_cbc_enc128
  1266. vst1.8 {$ivec},[$out],#16
  1267. b .Lcbc_done
  1268. ___
  1269. {
  # Extra vector registers for the interleaved CBC decrypt loops.
  # Default mapping: third data/input lane in q10/q11. The 3x loop and the
  # tail only reference q8, q9 and q12-q15 as round keys, so q10/q11 are
  # free for data there. $tmp2 is q9: it is written only after the last
  # aesd that uses q9 in an iteration, and q9 is re-loaded with a round key
  # ("re-pre-load rndkey[1]") after $tmp2 has been consumed.
  1270. my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
  1271. my ($dat3,$in3,$tmp3); # used only in 64-bit mode
  # Fifth lane; left undefined unless the 64-bit remap below runs.
  1272. my ($dat4,$in4,$tmp4);
  # 64-bit flavours move lanes 2-4 and their temporaries to q16-q23, which
  # keeps q10/q11 available as round keys in the 5x loop. $tmp2 is not in
  # this list and therefore stays q9.
  1273. if ($flavour =~ /64/) {
  1274. ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
  1275. }
  1276. $code.=<<___;
  1277. .align 5
  1278. .Lcbc_dec:
  1279. vld1.8 {$dat2},[$inp],#16
  1280. subs $len,$len,#32 // bias
  1281. add $cnt,$rounds,#2
  1282. vorr $in1,$dat,$dat
  1283. vorr $dat1,$dat,$dat
  1284. vorr $in2,$dat2,$dat2
  1285. b.lo .Lcbc_dec_tail
  1286. vorr $dat1,$dat2,$dat2
  1287. vld1.8 {$dat2},[$inp],#16
  1288. vorr $in0,$dat,$dat
  1289. vorr $in1,$dat1,$dat1
  1290. vorr $in2,$dat2,$dat2
  1291. ___
  1292. $code.=<<___ if ($flavour =~ /64/);
  1293. cmp $len,#32
  1294. b.lo .Loop3x_cbc_dec
  1295. vld1.8 {$dat3},[$inp],#16
  1296. vld1.8 {$dat4},[$inp],#16
  1297. sub $len,$len,#32 // bias
  1298. mov $cnt,$rounds
  1299. vorr $in3,$dat3,$dat3
  1300. vorr $in4,$dat4,$dat4
  1301. .Loop5x_cbc_dec:
  1302. aesd $dat0,q8
  1303. aesimc $dat0,$dat0
  1304. aesd $dat1,q8
  1305. aesimc $dat1,$dat1
  1306. aesd $dat2,q8
  1307. aesimc $dat2,$dat2
  1308. aesd $dat3,q8
  1309. aesimc $dat3,$dat3
  1310. aesd $dat4,q8
  1311. aesimc $dat4,$dat4
  1312. vld1.32 {q8},[$key_],#16
  1313. subs $cnt,$cnt,#2
  1314. aesd $dat0,q9
  1315. aesimc $dat0,$dat0
  1316. aesd $dat1,q9
  1317. aesimc $dat1,$dat1
  1318. aesd $dat2,q9
  1319. aesimc $dat2,$dat2
  1320. aesd $dat3,q9
  1321. aesimc $dat3,$dat3
  1322. aesd $dat4,q9
  1323. aesimc $dat4,$dat4
  1324. vld1.32 {q9},[$key_],#16
  1325. b.gt .Loop5x_cbc_dec
  1326. aesd $dat0,q8
  1327. aesimc $dat0,$dat0
  1328. aesd $dat1,q8
  1329. aesimc $dat1,$dat1
  1330. aesd $dat2,q8
  1331. aesimc $dat2,$dat2
  1332. aesd $dat3,q8
  1333. aesimc $dat3,$dat3
  1334. aesd $dat4,q8
  1335. aesimc $dat4,$dat4
  1336. cmp $len,#0x40 // because .Lcbc_tail4x
  1337. sub $len,$len,#0x50
  1338. aesd $dat0,q9
  1339. aesimc $dat0,$dat0
  1340. aesd $dat1,q9
  1341. aesimc $dat1,$dat1
  1342. aesd $dat2,q9
  1343. aesimc $dat2,$dat2
  1344. aesd $dat3,q9
  1345. aesimc $dat3,$dat3
  1346. aesd $dat4,q9
  1347. aesimc $dat4,$dat4
  1348. csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
  1349. mov $key_,$key
  1350. aesd $dat0,q10
  1351. aesimc $dat0,$dat0
  1352. aesd $dat1,q10
  1353. aesimc $dat1,$dat1
  1354. aesd $dat2,q10
  1355. aesimc $dat2,$dat2
  1356. aesd $dat3,q10
  1357. aesimc $dat3,$dat3
  1358. aesd $dat4,q10
  1359. aesimc $dat4,$dat4
  1360. add $inp,$inp,x6 // $inp is adjusted in such way that
  1361. // at exit from the loop $dat1-$dat4
  1362. // are loaded with last "words"
  1363. add x6,$len,#0x60 // because .Lcbc_tail4x
  1364. aesd $dat0,q11
  1365. aesimc $dat0,$dat0
  1366. aesd $dat1,q11
  1367. aesimc $dat1,$dat1
  1368. aesd $dat2,q11
  1369. aesimc $dat2,$dat2
  1370. aesd $dat3,q11
  1371. aesimc $dat3,$dat3
  1372. aesd $dat4,q11
  1373. aesimc $dat4,$dat4
  1374. aesd $dat0,q12
  1375. aesimc $dat0,$dat0
  1376. aesd $dat1,q12
  1377. aesimc $dat1,$dat1
  1378. aesd $dat2,q12
  1379. aesimc $dat2,$dat2
  1380. aesd $dat3,q12
  1381. aesimc $dat3,$dat3
  1382. aesd $dat4,q12
  1383. aesimc $dat4,$dat4
  1384. aesd $dat0,q13
  1385. aesimc $dat0,$dat0
  1386. aesd $dat1,q13
  1387. aesimc $dat1,$dat1
  1388. aesd $dat2,q13
  1389. aesimc $dat2,$dat2
  1390. aesd $dat3,q13
  1391. aesimc $dat3,$dat3
  1392. aesd $dat4,q13
  1393. aesimc $dat4,$dat4
  1394. aesd $dat0,q14
  1395. aesimc $dat0,$dat0
  1396. aesd $dat1,q14
  1397. aesimc $dat1,$dat1
  1398. aesd $dat2,q14
  1399. aesimc $dat2,$dat2
  1400. aesd $dat3,q14
  1401. aesimc $dat3,$dat3
  1402. aesd $dat4,q14
  1403. aesimc $dat4,$dat4
  1404. veor $tmp0,$ivec,$rndlast
  1405. aesd $dat0,q15
  1406. veor $tmp1,$in0,$rndlast
  1407. vld1.8 {$in0},[$inp],#16
  1408. aesd $dat1,q15
  1409. veor $tmp2,$in1,$rndlast
  1410. vld1.8 {$in1},[$inp],#16
  1411. aesd $dat2,q15
  1412. veor $tmp3,$in2,$rndlast
  1413. vld1.8 {$in2},[$inp],#16
  1414. aesd $dat3,q15
  1415. veor $tmp4,$in3,$rndlast
  1416. vld1.8 {$in3},[$inp],#16
  1417. aesd $dat4,q15
  1418. vorr $ivec,$in4,$in4
  1419. vld1.8 {$in4},[$inp],#16
  1420. cbz x6,.Lcbc_tail4x
  1421. vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
  1422. veor $tmp0,$tmp0,$dat0
  1423. vorr $dat0,$in0,$in0
  1424. veor $tmp1,$tmp1,$dat1
  1425. vorr $dat1,$in1,$in1
  1426. veor $tmp2,$tmp2,$dat2
  1427. vorr $dat2,$in2,$in2
  1428. veor $tmp3,$tmp3,$dat3
  1429. vorr $dat3,$in3,$in3
  1430. veor $tmp4,$tmp4,$dat4
  1431. vst1.8 {$tmp0},[$out],#16
  1432. vorr $dat4,$in4,$in4
  1433. vst1.8 {$tmp1},[$out],#16
  1434. mov $cnt,$rounds
  1435. vst1.8 {$tmp2},[$out],#16
  1436. vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
  1437. vst1.8 {$tmp3},[$out],#16
  1438. vst1.8 {$tmp4},[$out],#16
  1439. b.hs .Loop5x_cbc_dec
  1440. add $len,$len,#0x50
  1441. cbz $len,.Lcbc_done
  1442. add $cnt,$rounds,#2
  1443. subs $len,$len,#0x30
  1444. vorr $dat0,$in2,$in2
  1445. vorr $in0,$in2,$in2
  1446. vorr $dat1,$in3,$in3
  1447. vorr $in1,$in3,$in3
  1448. vorr $dat2,$in4,$in4
  1449. vorr $in2,$in4,$in4
  1450. b.lo .Lcbc_dec_tail
  1451. b .Loop3x_cbc_dec
  1452. .align 4
  1453. .Lcbc_tail4x:
  1454. veor $tmp1,$tmp0,$dat1
  1455. veor $tmp2,$tmp2,$dat2
  1456. veor $tmp3,$tmp3,$dat3
  1457. veor $tmp4,$tmp4,$dat4
  1458. vst1.8 {$tmp1},[$out],#16
  1459. vst1.8 {$tmp2},[$out],#16
  1460. vst1.8 {$tmp3},[$out],#16
  1461. vst1.8 {$tmp4},[$out],#16
  1462. b .Lcbc_done
  1463. .align 4
  1464. ___
  1465. $code.=<<___;
  1466. .Loop3x_cbc_dec:
  1467. aesd $dat0,q8
  1468. aesimc $dat0,$dat0
  1469. aesd $dat1,q8
  1470. aesimc $dat1,$dat1
  1471. aesd $dat2,q8
  1472. aesimc $dat2,$dat2
  1473. vld1.32 {q8},[$key_],#16
  1474. subs $cnt,$cnt,#2
  1475. aesd $dat0,q9
  1476. aesimc $dat0,$dat0
  1477. aesd $dat1,q9
  1478. aesimc $dat1,$dat1
  1479. aesd $dat2,q9
  1480. aesimc $dat2,$dat2
  1481. vld1.32 {q9},[$key_],#16
  1482. b.gt .Loop3x_cbc_dec
  1483. aesd $dat0,q8
  1484. aesimc $dat0,$dat0
  1485. aesd $dat1,q8
  1486. aesimc $dat1,$dat1
  1487. aesd $dat2,q8
  1488. aesimc $dat2,$dat2
  1489. veor $tmp0,$ivec,$rndlast
  1490. subs $len,$len,#0x30
  1491. veor $tmp1,$in0,$rndlast
  1492. mov.lo x6,$len // x6, $cnt, is zero at this point
  1493. aesd $dat0,q9
  1494. aesimc $dat0,$dat0
  1495. aesd $dat1,q9
  1496. aesimc $dat1,$dat1
  1497. aesd $dat2,q9
  1498. aesimc $dat2,$dat2
  1499. veor $tmp2,$in1,$rndlast
  1500. add $inp,$inp,x6 // $inp is adjusted in such way that
  1501. // at exit from the loop $dat1-$dat2
  1502. // are loaded with last "words"
  1503. vorr $ivec,$in2,$in2
  1504. mov $key_,$key
  1505. aesd $dat0,q12
  1506. aesimc $dat0,$dat0
  1507. aesd $dat1,q12
  1508. aesimc $dat1,$dat1
  1509. aesd $dat2,q12
  1510. aesimc $dat2,$dat2
  1511. vld1.8 {$in0},[$inp],#16
  1512. aesd $dat0,q13
  1513. aesimc $dat0,$dat0
  1514. aesd $dat1,q13
  1515. aesimc $dat1,$dat1
  1516. aesd $dat2,q13
  1517. aesimc $dat2,$dat2
  1518. vld1.8 {$in1},[$inp],#16
  1519. aesd $dat0,q14
  1520. aesimc $dat0,$dat0
  1521. aesd $dat1,q14
  1522. aesimc $dat1,$dat1
  1523. aesd $dat2,q14
  1524. aesimc $dat2,$dat2
  1525. vld1.8 {$in2},[$inp],#16
  1526. aesd $dat0,q15
  1527. aesd $dat1,q15
  1528. aesd $dat2,q15
  1529. vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
  1530. add $cnt,$rounds,#2
  1531. veor $tmp0,$tmp0,$dat0
  1532. veor $tmp1,$tmp1,$dat1
  1533. veor $dat2,$dat2,$tmp2
  1534. vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
  1535. vst1.8 {$tmp0},[$out],#16
  1536. vorr $dat0,$in0,$in0
  1537. vst1.8 {$tmp1},[$out],#16
  1538. vorr $dat1,$in1,$in1
  1539. vst1.8 {$dat2},[$out],#16
  1540. vorr $dat2,$in2,$in2
  1541. b.hs .Loop3x_cbc_dec
  1542. cmn $len,#0x30
  1543. b.eq .Lcbc_done
  1544. nop
  1545. .Lcbc_dec_tail:
  1546. aesd $dat1,q8
  1547. aesimc $dat1,$dat1
  1548. aesd $dat2,q8
  1549. aesimc $dat2,$dat2
  1550. vld1.32 {q8},[$key_],#16
  1551. subs $cnt,$cnt,#2
  1552. aesd $dat1,q9
  1553. aesimc $dat1,$dat1
  1554. aesd $dat2,q9
  1555. aesimc $dat2,$dat2
  1556. vld1.32 {q9},[$key_],#16
  1557. b.gt .Lcbc_dec_tail
  1558. aesd $dat1,q8
  1559. aesimc $dat1,$dat1
  1560. aesd $dat2,q8
  1561. aesimc $dat2,$dat2
  1562. aesd $dat1,q9
  1563. aesimc $dat1,$dat1
  1564. aesd $dat2,q9
  1565. aesimc $dat2,$dat2
  1566. aesd $dat1,q12
  1567. aesimc $dat1,$dat1
  1568. aesd $dat2,q12
  1569. aesimc $dat2,$dat2
  1570. cmn $len,#0x20
  1571. aesd $dat1,q13
  1572. aesimc $dat1,$dat1
  1573. aesd $dat2,q13
  1574. aesimc $dat2,$dat2
  1575. veor $tmp1,$ivec,$rndlast
  1576. aesd $dat1,q14
  1577. aesimc $dat1,$dat1
  1578. aesd $dat2,q14
  1579. aesimc $dat2,$dat2
  1580. veor $tmp2,$in1,$rndlast
  1581. aesd $dat1,q15
  1582. aesd $dat2,q15
  1583. b.eq .Lcbc_dec_one
  1584. veor $tmp1,$tmp1,$dat1
  1585. veor $tmp2,$tmp2,$dat2
  1586. vorr $ivec,$in2,$in2
  1587. vst1.8 {$tmp1},[$out],#16
  1588. vst1.8 {$tmp2},[$out],#16
  1589. b .Lcbc_done
  1590. .Lcbc_dec_one:
  1591. veor $tmp1,$tmp1,$dat2
  1592. vorr $ivec,$in2,$in2
  1593. vst1.8 {$tmp1},[$out],#16
  1594. .Lcbc_done:
  1595. vst1.8 {$ivec},[$ivp]
  1596. .Lcbc_abort:
  1597. ___
  1598. }
  1599. $code.=<<___ if ($flavour !~ /64/);
  1600. vldmia sp!,{d8-d15}
  1601. ldmia sp!,{r4-r8,pc}
  1602. ___
  1603. $code.=<<___ if ($flavour =~ /64/);
  1604. ldr x29,[sp],#16
  1605. ret
  1606. ___
  1607. $code.=<<___;
  1608. .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
  1609. ___
  1610. }}}
  1611. {{{
  # Register map for the CTR32 routine emitted below.
  # Arguments are named x0-x4. $len is compared against and decremented by
  # small block counts (2, 3, 5) in the code below, i.e. it counts 16-byte
  # blocks rather than bytes.
  1612. my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
  # $rounds is loaded from [$key,#240]; $cnt is the per-iteration round
  # counter; $key_ is a moving pointer into the key schedule.
  1613. my ($rounds,$cnt,$key_)=("w5","w6","x7");
  # $ctr (w8) is the 32-bit counter word read from [$ivp,#12] and
  # byte-reversed unless __ARMEB__ is defined; $tctr0-$tctr2 (w9, w10, w12)
  # hold per-lane counter values before they are inserted into lane [3].
  1614. my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
  1615. my $step="x12"; # aliases with $tctr2
  # q0-q7: two data lanes, two input lanes, two scratch vectors, the counter
  # block template ($ivec) and the last round key.
  1616. my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
  # Third lane in q10/q11 with $tmp2 in q9; there is no flavour-dependent
  # remap of these three in this routine.
  1617. my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
  1618. # used only in 64-bit mode...
  # The list yields eight names but only the first four are assigned
  # (q16-q19); the remaining elements are discarded.
  1619. my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
  # Plain aliases of $dat0/$tmp0.
  1620. my ($dat,$tmp)=($dat0,$tmp0);
  1621. ### q8-q15 preloaded key schedule
  1622. $code.=<<___;
  1623. .globl ${prefix}_ctr32_encrypt_blocks
  1624. .type ${prefix}_ctr32_encrypt_blocks,%function
  1625. .align 5
  1626. ${prefix}_ctr32_encrypt_blocks:
  1627. ___
  1628. $code.=<<___ if ($flavour =~ /64/);
  1629. AARCH64_VALID_CALL_TARGET
  1630. // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
  1631. stp x29,x30,[sp,#-16]!
  1632. add x29,sp,#0
  1633. ___
  1634. $code.=<<___ if ($flavour !~ /64/);
  1635. mov ip,sp
  1636. stmdb sp!,{r4-r10,lr}
  1637. vstmdb sp!,{d8-d15} @ ABI specification says so
  1638. ldr r4, [ip] @ load remaining arg
  1639. ___
  1640. $code.=<<___;
  1641. ldr $rounds,[$key,#240]
  1642. ldr $ctr, [$ivp, #12]
  1643. #ifdef __ARMEB__
  1644. vld1.8 {$dat0},[$ivp]
  1645. #else
  1646. vld1.32 {$dat0},[$ivp]
  1647. #endif
  1648. vld1.32 {q8-q9},[$key] // load key schedule...
  1649. sub $rounds,$rounds,#4
  1650. mov $step,#16
  1651. cmp $len,#2
  1652. add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
  1653. sub $rounds,$rounds,#2
  1654. vld1.32 {q12-q13},[$key_],#32
  1655. vld1.32 {q14-q15},[$key_],#32
  1656. vld1.32 {$rndlast},[$key_]
  1657. add $key_,$key,#32
  1658. mov $cnt,$rounds
  1659. cclr $step,lo
  1660. #ifndef __ARMEB__
  1661. rev $ctr, $ctr
  1662. #endif
  1663. ___
  1664. $code.=<<___ if ($flavour =~ /64/);
  1665. vorr $dat1,$dat0,$dat0
  1666. add $tctr1, $ctr, #1
  1667. vorr $dat2,$dat0,$dat0
  1668. add $ctr, $ctr, #2
  1669. vorr $ivec,$dat0,$dat0
  1670. rev $tctr1, $tctr1
  1671. vmov.32 ${dat1}[3],$tctr1
  1672. b.ls .Lctr32_tail
  1673. rev $tctr2, $ctr
  1674. sub $len,$len,#3 // bias
  1675. vmov.32 ${dat2}[3],$tctr2
  1676. ___
  1677. $code.=<<___ if ($flavour !~ /64/);
  1678. add $tctr1, $ctr, #1
  1679. vorr $ivec,$dat0,$dat0
  1680. rev $tctr1, $tctr1
  1681. vmov.32 ${ivec}[3],$tctr1
  1682. add $ctr, $ctr, #2
  1683. vorr $dat1,$ivec,$ivec
  1684. b.ls .Lctr32_tail
  1685. rev $tctr2, $ctr
  1686. vmov.32 ${ivec}[3],$tctr2
  1687. sub $len,$len,#3 // bias
  1688. vorr $dat2,$ivec,$ivec
  1689. ___
  1690. $code.=<<___ if ($flavour =~ /64/);
  1691. cmp $len,#32
  1692. b.lo .Loop3x_ctr32
  1693. add w13,$ctr,#1
  1694. add w14,$ctr,#2
  1695. vorr $dat3,$dat0,$dat0
  1696. rev w13,w13
  1697. vorr $dat4,$dat0,$dat0
  1698. rev w14,w14
  1699. vmov.32 ${dat3}[3],w13
  1700. sub $len,$len,#2 // bias
  1701. vmov.32 ${dat4}[3],w14
  1702. add $ctr,$ctr,#2
  1703. b .Loop5x_ctr32
  1704. .align 4
  1705. .Loop5x_ctr32:
  1706. aese $dat0,q8
  1707. aesmc $dat0,$dat0
  1708. aese $dat1,q8
  1709. aesmc $dat1,$dat1
  1710. aese $dat2,q8
  1711. aesmc $dat2,$dat2
  1712. aese $dat3,q8
  1713. aesmc $dat3,$dat3
  1714. aese $dat4,q8
  1715. aesmc $dat4,$dat4
  1716. vld1.32 {q8},[$key_],#16
  1717. subs $cnt,$cnt,#2
  1718. aese $dat0,q9
  1719. aesmc $dat0,$dat0
  1720. aese $dat1,q9
  1721. aesmc $dat1,$dat1
  1722. aese $dat2,q9
  1723. aesmc $dat2,$dat2
  1724. aese $dat3,q9
  1725. aesmc $dat3,$dat3
  1726. aese $dat4,q9
  1727. aesmc $dat4,$dat4
  1728. vld1.32 {q9},[$key_],#16
  1729. b.gt .Loop5x_ctr32
  1730. mov $key_,$key
  1731. aese $dat0,q8
  1732. aesmc $dat0,$dat0
  1733. aese $dat1,q8
  1734. aesmc $dat1,$dat1
  1735. aese $dat2,q8
  1736. aesmc $dat2,$dat2
  1737. aese $dat3,q8
  1738. aesmc $dat3,$dat3
  1739. aese $dat4,q8
  1740. aesmc $dat4,$dat4
  1741. vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
  1742. aese $dat0,q9
  1743. aesmc $dat0,$dat0
  1744. aese $dat1,q9
  1745. aesmc $dat1,$dat1
  1746. aese $dat2,q9
  1747. aesmc $dat2,$dat2
  1748. aese $dat3,q9
  1749. aesmc $dat3,$dat3
  1750. aese $dat4,q9
  1751. aesmc $dat4,$dat4
  1752. vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
  1753. aese $dat0,q12
  1754. aesmc $dat0,$dat0
  1755. add $tctr0,$ctr,#1
  1756. add $tctr1,$ctr,#2
  1757. aese $dat1,q12
  1758. aesmc $dat1,$dat1
  1759. add $tctr2,$ctr,#3
  1760. add w13,$ctr,#4
  1761. aese $dat2,q12
  1762. aesmc $dat2,$dat2
  1763. add w14,$ctr,#5
  1764. rev $tctr0,$tctr0
  1765. aese $dat3,q12
  1766. aesmc $dat3,$dat3
  1767. rev $tctr1,$tctr1
  1768. rev $tctr2,$tctr2
  1769. aese $dat4,q12
  1770. aesmc $dat4,$dat4
  1771. rev w13,w13
  1772. rev w14,w14
  1773. aese $dat0,q13
  1774. aesmc $dat0,$dat0
  1775. aese $dat1,q13
  1776. aesmc $dat1,$dat1
  1777. aese $dat2,q13
  1778. aesmc $dat2,$dat2
  1779. aese $dat3,q13
  1780. aesmc $dat3,$dat3
  1781. aese $dat4,q13
  1782. aesmc $dat4,$dat4
  1783. aese $dat0,q14
  1784. aesmc $dat0,$dat0
  1785. vld1.8 {$in0},[$inp],#16
  1786. aese $dat1,q14
  1787. aesmc $dat1,$dat1
  1788. vld1.8 {$in1},[$inp],#16
  1789. aese $dat2,q14
  1790. aesmc $dat2,$dat2
  1791. vld1.8 {$in2},[$inp],#16
  1792. aese $dat3,q14
  1793. aesmc $dat3,$dat3
  1794. vld1.8 {$in3},[$inp],#16
  1795. aese $dat4,q14
  1796. aesmc $dat4,$dat4
  1797. vld1.8 {$in4},[$inp],#16
  1798. aese $dat0,q15
  1799. veor $in0,$in0,$rndlast
  1800. aese $dat1,q15
  1801. veor $in1,$in1,$rndlast
  1802. aese $dat2,q15
  1803. veor $in2,$in2,$rndlast
  1804. aese $dat3,q15
  1805. veor $in3,$in3,$rndlast
  1806. aese $dat4,q15
  1807. veor $in4,$in4,$rndlast
  1808. veor $in0,$in0,$dat0
  1809. vorr $dat0,$ivec,$ivec
  1810. veor $in1,$in1,$dat1
  1811. vorr $dat1,$ivec,$ivec
  1812. veor $in2,$in2,$dat2
  1813. vorr $dat2,$ivec,$ivec
  1814. veor $in3,$in3,$dat3
  1815. vorr $dat3,$ivec,$ivec
  1816. veor $in4,$in4,$dat4
  1817. vorr $dat4,$ivec,$ivec
  1818. vst1.8 {$in0},[$out],#16
  1819. vmov.32 ${dat0}[3],$tctr0
  1820. vst1.8 {$in1},[$out],#16
  1821. vmov.32 ${dat1}[3],$tctr1
  1822. vst1.8 {$in2},[$out],#16
  1823. vmov.32 ${dat2}[3],$tctr2
  1824. vst1.8 {$in3},[$out],#16
  1825. vmov.32 ${dat3}[3],w13
  1826. vst1.8 {$in4},[$out],#16
  1827. vmov.32 ${dat4}[3],w14
  1828. mov $cnt,$rounds
  1829. cbz $len,.Lctr32_done
  1830. add $ctr,$ctr,#5
  1831. subs $len,$len,#5
  1832. b.hs .Loop5x_ctr32
  1833. add $len,$len,#5
  1834. sub $ctr,$ctr,#5
  1835. cmp $len,#2
  1836. mov $step,#16
  1837. cclr $step,lo
  1838. b.ls .Lctr32_tail
  1839. sub $len,$len,#3 // bias
  1840. add $ctr,$ctr,#3
  1841. ___
  1842. $code.=<<___;
  1843. b .Loop3x_ctr32
  1844. .align 4
  1845. .Loop3x_ctr32:
  1846. aese $dat0,q8
  1847. aesmc $dat0,$dat0
  1848. aese $dat1,q8
  1849. aesmc $dat1,$dat1
  1850. aese $dat2,q8
  1851. aesmc $dat2,$dat2
  1852. vld1.32 {q8},[$key_],#16
  1853. subs $cnt,$cnt,#2
  1854. aese $dat0,q9
  1855. aesmc $dat0,$dat0
  1856. aese $dat1,q9
  1857. aesmc $dat1,$dat1
  1858. aese $dat2,q9
  1859. aesmc $dat2,$dat2
  1860. vld1.32 {q9},[$key_],#16
  1861. b.gt .Loop3x_ctr32
  1862. aese $dat0,q8
  1863. aesmc $tmp0,$dat0
  1864. aese $dat1,q8
  1865. aesmc $tmp1,$dat1
  1866. vld1.8 {$in0},[$inp],#16
  1867. ___
  1868. $code.=<<___ if ($flavour =~ /64/);
  1869. vorr $dat0,$ivec,$ivec
  1870. ___
  1871. $code.=<<___ if ($flavour !~ /64/);
  1872. add $tctr0,$ctr,#1
  1873. ___
  1874. $code.=<<___;
  1875. aese $dat2,q8
  1876. aesmc $dat2,$dat2
  1877. vld1.8 {$in1},[$inp],#16
  1878. ___
  1879. $code.=<<___ if ($flavour =~ /64/);
  1880. vorr $dat1,$ivec,$ivec
  1881. ___
  1882. $code.=<<___ if ($flavour !~ /64/);
  1883. rev $tctr0,$tctr0
  1884. ___
  1885. $code.=<<___;
  1886. aese $tmp0,q9
  1887. aesmc $tmp0,$tmp0
  1888. aese $tmp1,q9
  1889. aesmc $tmp1,$tmp1
  1890. vld1.8 {$in2},[$inp],#16
  1891. mov $key_,$key
  1892. aese $dat2,q9
  1893. aesmc $tmp2,$dat2
  1894. ___
  1895. $code.=<<___ if ($flavour =~ /64/);
  1896. vorr $dat2,$ivec,$ivec
  1897. add $tctr0,$ctr,#1
  1898. ___
  1899. $code.=<<___;
  1900. aese $tmp0,q12
  1901. aesmc $tmp0,$tmp0
  1902. aese $tmp1,q12
  1903. aesmc $tmp1,$tmp1
  1904. veor $in0,$in0,$rndlast
  1905. add $tctr1,$ctr,#2
  1906. aese $tmp2,q12
  1907. aesmc $tmp2,$tmp2
  1908. veor $in1,$in1,$rndlast
  1909. add $ctr,$ctr,#3
  1910. aese $tmp0,q13
  1911. aesmc $tmp0,$tmp0
  1912. aese $tmp1,q13
  1913. aesmc $tmp1,$tmp1
  1914. veor $in2,$in2,$rndlast
  1915. ___
  1916. $code.=<<___ if ($flavour =~ /64/);
  1917. rev $tctr0,$tctr0
  1918. aese $tmp2,q13
  1919. aesmc $tmp2,$tmp2
  1920. vmov.32 ${dat0}[3], $tctr0
  1921. ___
  1922. $code.=<<___ if ($flavour !~ /64/);
  1923. vmov.32 ${ivec}[3], $tctr0
  1924. aese $tmp2,q13
  1925. aesmc $tmp2,$tmp2
  1926. vorr $dat0,$ivec,$ivec
  1927. ___
  1928. $code.=<<___;
  1929. rev $tctr1,$tctr1
  1930. aese $tmp0,q14
  1931. aesmc $tmp0,$tmp0
  1932. ___
  1933. $code.=<<___ if ($flavour !~ /64/);
  1934. vmov.32 ${ivec}[3], $tctr1
  1935. rev $tctr2,$ctr
  1936. ___
  1937. $code.=<<___;
  1938. aese $tmp1,q14
  1939. aesmc $tmp1,$tmp1
  1940. ___
  1941. $code.=<<___ if ($flavour =~ /64/);
  1942. vmov.32 ${dat1}[3], $tctr1
  1943. rev $tctr2,$ctr
  1944. aese $tmp2,q14
  1945. aesmc $tmp2,$tmp2
  1946. vmov.32 ${dat2}[3], $tctr2
  1947. ___
  1948. $code.=<<___ if ($flavour !~ /64/);
  1949. vorr $dat1,$ivec,$ivec
  1950. vmov.32 ${ivec}[3], $tctr2
  1951. aese $tmp2,q14
  1952. aesmc $tmp2,$tmp2
  1953. vorr $dat2,$ivec,$ivec
  1954. ___
  1955. $code.=<<___;
  1956. subs $len,$len,#3
  1957. aese $tmp0,q15
  1958. aese $tmp1,q15
  1959. aese $tmp2,q15
  1960. veor $in0,$in0,$tmp0
  1961. vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
  1962. vst1.8 {$in0},[$out],#16
  1963. veor $in1,$in1,$tmp1
  1964. mov $cnt,$rounds
  1965. vst1.8 {$in1},[$out],#16
  1966. veor $in2,$in2,$tmp2
  1967. vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
  1968. vst1.8 {$in2},[$out],#16
  1969. b.hs .Loop3x_ctr32
  1970. adds $len,$len,#3
  1971. b.eq .Lctr32_done
  1972. cmp $len,#1
  1973. mov $step,#16
  1974. cclr $step,eq
  1975. .Lctr32_tail:
  1976. aese $dat0,q8
  1977. aesmc $dat0,$dat0
  1978. aese $dat1,q8
  1979. aesmc $dat1,$dat1
  1980. vld1.32 {q8},[$key_],#16
  1981. subs $cnt,$cnt,#2
  1982. aese $dat0,q9
  1983. aesmc $dat0,$dat0
  1984. aese $dat1,q9
  1985. aesmc $dat1,$dat1
  1986. vld1.32 {q9},[$key_],#16
  1987. b.gt .Lctr32_tail
  1988. aese $dat0,q8
  1989. aesmc $dat0,$dat0
  1990. aese $dat1,q8
  1991. aesmc $dat1,$dat1
  1992. aese $dat0,q9
  1993. aesmc $dat0,$dat0
  1994. aese $dat1,q9
  1995. aesmc $dat1,$dat1
  1996. vld1.8 {$in0},[$inp],$step
  1997. aese $dat0,q12
  1998. aesmc $dat0,$dat0
  1999. aese $dat1,q12
  2000. aesmc $dat1,$dat1
  2001. vld1.8 {$in1},[$inp]
  2002. aese $dat0,q13
  2003. aesmc $dat0,$dat0
  2004. aese $dat1,q13
  2005. aesmc $dat1,$dat1
  2006. veor $in0,$in0,$rndlast
  2007. aese $dat0,q14
  2008. aesmc $dat0,$dat0
  2009. aese $dat1,q14
  2010. aesmc $dat1,$dat1
  2011. veor $in1,$in1,$rndlast
  2012. aese $dat0,q15
  2013. aese $dat1,q15
  2014. cmp $len,#1
  2015. veor $in0,$in0,$dat0
  2016. veor $in1,$in1,$dat1
  2017. vst1.8 {$in0},[$out],#16
  2018. b.eq .Lctr32_done
  2019. vst1.8 {$in1},[$out]
  2020. .Lctr32_done:
  2021. ___
  2022. $code.=<<___ if ($flavour !~ /64/);
  2023. vldmia sp!,{d8-d15}
  2024. ldmia sp!,{r4-r10,pc}
  2025. ___
  2026. $code.=<<___ if ($flavour =~ /64/);
  2027. ldr x29,[sp],#16
  2028. ret
  2029. ___
  2030. $code.=<<___;
  2031. .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
  2032. ___
  2033. }}}
  2034. # Performance in cycles per byte,
  2035. # measured for AES-XTS with different key sizes.
  2036. # The values before and after optimization are shown below
  2037. # (before/after):
  2038. #
  2039. # AES-128-XTS AES-256-XTS
  2040. # Cortex-A57 3.36/1.09 4.02/1.37
  2041. # Cortex-A72 3.03/1.02 3.28/1.33
  2042. # The optimization is implemented by loop unrolling and interleaving.
  2043. # Normally the unrolling factor is 5; if the input data size is
  2044. # smaller than 5 blocks, but not smaller than 3 blocks, the
  2045. # unrolling factor is 3.
  2046. # If the input data size dsize >= 5*16 bytes, 5 blocks are processed
  2047. # per iteration, and each iteration the remaining size lsize -= 5*16.
  2048. # If lsize < 5*16 bytes, the remainder is treated as the tail. Note: a
  2049. # remainder of 4*16 bytes is handled specially: it is integrated into
  2050. # the 5*16 bytes loop to improve efficiency.
  2051. # There is one special case: if the original input data size dsize
  2052. # = 16 bytes, it is handled separately to improve performance,
  2053. # by one independent code block without the LR, FP load and
  2054. # store.
  2055. # Encryption processes the (length - tailcnt) bytes as described
  2056. # above, then encrypts the composite block as the second-to-last
  2057. # cipher block.
  2058. # Decryption processes the (length - tailcnt - 1) bytes as described
  2059. # above, then decrypts the second-to-last cipher block to get the
  2060. # last plain block (tail), and decrypts the composite block as the
  2061. # second-to-last plain text block.
  2062. {{{
  2063. my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
  2064. my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
  2065. my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
  2066. my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
  2067. my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
  2068. my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
  2069. my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
  2070. my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
  2071. my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
  2072. my ($tmpin)=("v26.16b");
  2073. my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
  2074. # q7 last round key
  2075. # q10-q15, q7 Last 7 round keys
  2076. # q8-q9 preloaded round keys except last 7 keys for big size
  2077. # q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
  2078. my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
  2079. my ($dat3,$in3,$tmp3); # used only in 64-bit mode
  2080. my ($dat4,$in4,$tmp4);
  2081. if ($flavour =~ /64/) {
  2082. ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
  2083. }
  2084. $code.=<<___ if ($flavour =~ /64/);
  2085. .globl ${prefix}_xts_encrypt
  2086. .type ${prefix}_xts_encrypt,%function
  2087. .align 5
  2088. ${prefix}_xts_encrypt:
  2089. ___
  2090. $code.=<<___ if ($flavour =~ /64/);
  2091. AARCH64_VALID_CALL_TARGET
  2092. cmp $len,#16
  2093. // Original input data size bigger than 16, jump to big size processing.
  2094. b.ne .Lxts_enc_big_size
  2095. // Encrypt the iv with key2, as the first XEX iv.
  2096. ldr $rounds,[$key2,#240]
  2097. vld1.32 {$dat},[$key2],#16
  2098. vld1.8 {$iv0},[$ivp]
  2099. sub $rounds,$rounds,#2
  2100. vld1.32 {$dat1},[$key2],#16
  2101. .Loop_enc_iv_enc:
  2102. aese $iv0,$dat
  2103. aesmc $iv0,$iv0
  2104. vld1.32 {$dat},[$key2],#16
  2105. subs $rounds,$rounds,#2
  2106. aese $iv0,$dat1
  2107. aesmc $iv0,$iv0
  2108. vld1.32 {$dat1},[$key2],#16
  2109. b.gt .Loop_enc_iv_enc
  2110. aese $iv0,$dat
  2111. aesmc $iv0,$iv0
  2112. vld1.32 {$dat},[$key2]
  2113. aese $iv0,$dat1
  2114. veor $iv0,$iv0,$dat
  2115. vld1.8 {$dat0},[$inp]
  2116. veor $dat0,$iv0,$dat0
  2117. ldr $rounds,[$key1,#240]
  2118. vld1.32 {q20-q21},[$key1],#32 // load key schedule...
  2119. aese $dat0,q20
  2120. aesmc $dat0,$dat0
  2121. vld1.32 {q8-q9},[$key1],#32 // load key schedule...
  2122. aese $dat0,q21
  2123. aesmc $dat0,$dat0
  2124. subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing
  2125. b.eq .Lxts_128_enc
  2126. .Lxts_enc_round_loop:
  2127. aese $dat0,q8
  2128. aesmc $dat0,$dat0
  2129. vld1.32 {q8},[$key1],#16 // load key schedule...
  2130. aese $dat0,q9
  2131. aesmc $dat0,$dat0
  2132. vld1.32 {q9},[$key1],#16 // load key schedule...
  2133. subs $rounds,$rounds,#2 // bias
  2134. b.gt .Lxts_enc_round_loop
  2135. .Lxts_128_enc:
  2136. vld1.32 {q10-q11},[$key1],#32 // load key schedule...
  2137. aese $dat0,q8
  2138. aesmc $dat0,$dat0
  2139. aese $dat0,q9
  2140. aesmc $dat0,$dat0
  2141. vld1.32 {q12-q13},[$key1],#32 // load key schedule...
  2142. aese $dat0,q10
  2143. aesmc $dat0,$dat0
  2144. aese $dat0,q11
  2145. aesmc $dat0,$dat0
  2146. vld1.32 {q14-q15},[$key1],#32 // load key schedule...
  2147. aese $dat0,q12
  2148. aesmc $dat0,$dat0
  2149. aese $dat0,q13
  2150. aesmc $dat0,$dat0
  2151. vld1.32 {$rndlast},[$key1]
  2152. aese $dat0,q14
  2153. aesmc $dat0,$dat0
  2154. aese $dat0,q15
  2155. veor $dat0,$dat0,$rndlast
  2156. veor $dat0,$dat0,$iv0
  2157. vst1.8 {$dat0},[$out]
  2158. b .Lxts_enc_final_abort
  2159. .align 4
  2160. .Lxts_enc_big_size:
  2161. ___
  2162. $code.=<<___ if ($flavour =~ /64/);
  2163. stp $constnumx,$tmpinp,[sp,#-64]!
  2164. stp $tailcnt,$midnumx,[sp,#48]
  2165. stp $ivd10,$ivd20,[sp,#32]
  2166. stp $ivd30,$ivd40,[sp,#16]
  2167. // tailcnt stores the tail length, i.e. length%16.
  2168. and $tailcnt,$len,#0xf
  2169. and $len,$len,#-16
  2170. subs $len,$len,#16
  2171. mov $step,#16
  2172. b.lo .Lxts_abort
  2173. csel $step,xzr,$step,eq
  2174. // Firstly, encrypt the iv with key2, as the first iv of XEX.
  2175. ldr $rounds,[$key2,#240]
  2176. vld1.32 {$dat},[$key2],#16
  2177. vld1.8 {$iv0},[$ivp]
  2178. sub $rounds,$rounds,#2
  2179. vld1.32 {$dat1},[$key2],#16
  2180. .Loop_iv_enc:
  2181. aese $iv0,$dat
  2182. aesmc $iv0,$iv0
  2183. vld1.32 {$dat},[$key2],#16
  2184. subs $rounds,$rounds,#2
  2185. aese $iv0,$dat1
  2186. aesmc $iv0,$iv0
  2187. vld1.32 {$dat1},[$key2],#16
  2188. b.gt .Loop_iv_enc
  2189. aese $iv0,$dat
  2190. aesmc $iv0,$iv0
  2191. vld1.32 {$dat},[$key2]
  2192. aese $iv0,$dat1
  2193. veor $iv0,$iv0,$dat
  2194. // The iv for second block
  2195. // $ivl- iv(low), $ivh - iv(high)
  2196. // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
  2197. fmov $ivl,$ivd00
  2198. fmov $ivh,$ivd01
  2199. mov $constnum,#0x87
  2200. extr $midnumx,$ivh,$ivh,#32
  2201. extr $ivh,$ivh,$ivl,#63
  2202. and $tmpmw,$constnum,$midnum,asr#31
  2203. eor $ivl,$tmpmx,$ivl,lsl#1
  2204. fmov $ivd10,$ivl
  2205. fmov $ivd11,$ivh
  2206. ldr $rounds0,[$key1,#240] // next starting point
  2207. vld1.8 {$dat},[$inp],$step
  2208. vld1.32 {q8-q9},[$key1] // load key schedule...
  2209. sub $rounds0,$rounds0,#6
  2210. add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
  2211. sub $rounds0,$rounds0,#2
  2212. vld1.32 {q10-q11},[$key_],#32
  2213. vld1.32 {q12-q13},[$key_],#32
  2214. vld1.32 {q14-q15},[$key_],#32
  2215. vld1.32 {$rndlast},[$key_]
  2216. add $key_,$key1,#32
  2217. mov $rounds,$rounds0
  2218. // Encryption
  2219. .Lxts_enc:
  2220. vld1.8 {$dat2},[$inp],#16
  2221. subs $len,$len,#32 // bias
  2222. add $rounds,$rounds0,#2
  2223. vorr $in1,$dat,$dat
  2224. vorr $dat1,$dat,$dat
  2225. vorr $in3,$dat,$dat
  2226. vorr $in2,$dat2,$dat2
  2227. vorr $in4,$dat2,$dat2
  2228. b.lo .Lxts_inner_enc_tail
  2229. veor $dat,$dat,$iv0 // before encryption, xor with iv
  2230. veor $dat2,$dat2,$iv1
  2231. // The iv for third block
  2232. extr $midnumx,$ivh,$ivh,#32
  2233. extr $ivh,$ivh,$ivl,#63
  2234. and $tmpmw,$constnum,$midnum,asr#31
  2235. eor $ivl,$tmpmx,$ivl,lsl#1
  2236. fmov $ivd20,$ivl
  2237. fmov $ivd21,$ivh
  2238. vorr $dat1,$dat2,$dat2
  2239. vld1.8 {$dat2},[$inp],#16
  2240. vorr $in0,$dat,$dat
  2241. vorr $in1,$dat1,$dat1
  2242. veor $in2,$dat2,$iv2 // the third block
  2243. veor $dat2,$dat2,$iv2
  2244. cmp $len,#32
  2245. b.lo .Lxts_outer_enc_tail
  2246. // The iv for fourth block
  2247. extr $midnumx,$ivh,$ivh,#32
  2248. extr $ivh,$ivh,$ivl,#63
  2249. and $tmpmw,$constnum,$midnum,asr#31
  2250. eor $ivl,$tmpmx,$ivl,lsl#1
  2251. fmov $ivd30,$ivl
  2252. fmov $ivd31,$ivh
  2253. vld1.8 {$dat3},[$inp],#16
  2254. // The iv for fifth block
  2255. extr $midnumx,$ivh,$ivh,#32
  2256. extr $ivh,$ivh,$ivl,#63
  2257. and $tmpmw,$constnum,$midnum,asr#31
  2258. eor $ivl,$tmpmx,$ivl,lsl#1
  2259. fmov $ivd40,$ivl
  2260. fmov $ivd41,$ivh
  2261. vld1.8 {$dat4},[$inp],#16
  2262. veor $dat3,$dat3,$iv3 // the fourth block
  2263. veor $dat4,$dat4,$iv4
  2264. sub $len,$len,#32 // bias
  2265. mov $rounds,$rounds0
  2266. b .Loop5x_xts_enc
  2267. .align 4
  2268. .Loop5x_xts_enc:
  2269. aese $dat0,q8
  2270. aesmc $dat0,$dat0
  2271. aese $dat1,q8
  2272. aesmc $dat1,$dat1
  2273. aese $dat2,q8
  2274. aesmc $dat2,$dat2
  2275. aese $dat3,q8
  2276. aesmc $dat3,$dat3
  2277. aese $dat4,q8
  2278. aesmc $dat4,$dat4
  2279. vld1.32 {q8},[$key_],#16
  2280. subs $rounds,$rounds,#2
  2281. aese $dat0,q9
  2282. aesmc $dat0,$dat0
  2283. aese $dat1,q9
  2284. aesmc $dat1,$dat1
  2285. aese $dat2,q9
  2286. aesmc $dat2,$dat2
  2287. aese $dat3,q9
  2288. aesmc $dat3,$dat3
  2289. aese $dat4,q9
  2290. aesmc $dat4,$dat4
  2291. vld1.32 {q9},[$key_],#16
  2292. b.gt .Loop5x_xts_enc
  2293. aese $dat0,q8
  2294. aesmc $dat0,$dat0
  2295. aese $dat1,q8
  2296. aesmc $dat1,$dat1
  2297. aese $dat2,q8
  2298. aesmc $dat2,$dat2
  2299. aese $dat3,q8
  2300. aesmc $dat3,$dat3
  2301. aese $dat4,q8
  2302. aesmc $dat4,$dat4
  2303. subs $len,$len,#0x50 // because .Lxts_enc_tail4x
  2304. aese $dat0,q9
  2305. aesmc $dat0,$dat0
  2306. aese $dat1,q9
  2307. aesmc $dat1,$dat1
  2308. aese $dat2,q9
  2309. aesmc $dat2,$dat2
  2310. aese $dat3,q9
  2311. aesmc $dat3,$dat3
  2312. aese $dat4,q9
  2313. aesmc $dat4,$dat4
  2314. csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
  2315. mov $key_,$key1
  2316. aese $dat0,q10
  2317. aesmc $dat0,$dat0
  2318. aese $dat1,q10
  2319. aesmc $dat1,$dat1
  2320. aese $dat2,q10
  2321. aesmc $dat2,$dat2
  2322. aese $dat3,q10
  2323. aesmc $dat3,$dat3
  2324. aese $dat4,q10
  2325. aesmc $dat4,$dat4
  2326. add $inp,$inp,$xoffset // x0 is adjusted in such way that
  2327. // at exit from the loop v1.16b-v26.16b
  2328. // are loaded with last "words"
  2329. add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x
  2330. aese $dat0,q11
  2331. aesmc $dat0,$dat0
  2332. aese $dat1,q11
  2333. aesmc $dat1,$dat1
  2334. aese $dat2,q11
  2335. aesmc $dat2,$dat2
  2336. aese $dat3,q11
  2337. aesmc $dat3,$dat3
  2338. aese $dat4,q11
  2339. aesmc $dat4,$dat4
  2340. aese $dat0,q12
  2341. aesmc $dat0,$dat0
  2342. aese $dat1,q12
  2343. aesmc $dat1,$dat1
  2344. aese $dat2,q12
  2345. aesmc $dat2,$dat2
  2346. aese $dat3,q12
  2347. aesmc $dat3,$dat3
  2348. aese $dat4,q12
  2349. aesmc $dat4,$dat4
  2350. aese $dat0,q13
  2351. aesmc $dat0,$dat0
  2352. aese $dat1,q13
  2353. aesmc $dat1,$dat1
  2354. aese $dat2,q13
  2355. aesmc $dat2,$dat2
  2356. aese $dat3,q13
  2357. aesmc $dat3,$dat3
  2358. aese $dat4,q13
  2359. aesmc $dat4,$dat4
  2360. aese $dat0,q14
  2361. aesmc $dat0,$dat0
  2362. aese $dat1,q14
  2363. aesmc $dat1,$dat1
  2364. aese $dat2,q14
  2365. aesmc $dat2,$dat2
  2366. aese $dat3,q14
  2367. aesmc $dat3,$dat3
  2368. aese $dat4,q14
  2369. aesmc $dat4,$dat4
  2370. veor $tmp0,$rndlast,$iv0
  2371. aese $dat0,q15
  2372. // The iv for first block of one iteration
  2373. extr $midnumx,$ivh,$ivh,#32
  2374. extr $ivh,$ivh,$ivl,#63
  2375. and $tmpmw,$constnum,$midnum,asr#31
  2376. eor $ivl,$tmpmx,$ivl,lsl#1
  2377. fmov $ivd00,$ivl
  2378. fmov $ivd01,$ivh
  2379. veor $tmp1,$rndlast,$iv1
  2380. vld1.8 {$in0},[$inp],#16
  2381. aese $dat1,q15
  2382. // The iv for second block
  2383. extr $midnumx,$ivh,$ivh,#32
  2384. extr $ivh,$ivh,$ivl,#63
  2385. and $tmpmw,$constnum,$midnum,asr#31
  2386. eor $ivl,$tmpmx,$ivl,lsl#1
  2387. fmov $ivd10,$ivl
  2388. fmov $ivd11,$ivh
  2389. veor $tmp2,$rndlast,$iv2
  2390. vld1.8 {$in1},[$inp],#16
  2391. aese $dat2,q15
  2392. // The iv for third block
  2393. extr $midnumx,$ivh,$ivh,#32
  2394. extr $ivh,$ivh,$ivl,#63
  2395. and $tmpmw,$constnum,$midnum,asr#31
  2396. eor $ivl,$tmpmx,$ivl,lsl#1
  2397. fmov $ivd20,$ivl
  2398. fmov $ivd21,$ivh
  2399. veor $tmp3,$rndlast,$iv3
  2400. vld1.8 {$in2},[$inp],#16
  2401. aese $dat3,q15
  2402. // The iv for fourth block
  2403. extr $midnumx,$ivh,$ivh,#32
  2404. extr $ivh,$ivh,$ivl,#63
  2405. and $tmpmw,$constnum,$midnum,asr#31
  2406. eor $ivl,$tmpmx,$ivl,lsl#1
  2407. fmov $ivd30,$ivl
  2408. fmov $ivd31,$ivh
  2409. veor $tmp4,$rndlast,$iv4
  2410. vld1.8 {$in3},[$inp],#16
  2411. aese $dat4,q15
  2412. // The iv for fifth block
  2413. extr $midnumx,$ivh,$ivh,#32
  2414. extr $ivh,$ivh,$ivl,#63
  2415. and $tmpmw,$constnum,$midnum,asr #31
  2416. eor $ivl,$tmpmx,$ivl,lsl #1
  2417. fmov $ivd40,$ivl
  2418. fmov $ivd41,$ivh
  2419. vld1.8 {$in4},[$inp],#16
  2420. cbz $xoffset,.Lxts_enc_tail4x
  2421. vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
  2422. veor $tmp0,$tmp0,$dat0
  2423. veor $dat0,$in0,$iv0
  2424. veor $tmp1,$tmp1,$dat1
  2425. veor $dat1,$in1,$iv1
  2426. veor $tmp2,$tmp2,$dat2
  2427. veor $dat2,$in2,$iv2
  2428. veor $tmp3,$tmp3,$dat3
  2429. veor $dat3,$in3,$iv3
  2430. veor $tmp4,$tmp4,$dat4
  2431. vst1.8 {$tmp0},[$out],#16
  2432. veor $dat4,$in4,$iv4
  2433. vst1.8 {$tmp1},[$out],#16
  2434. mov $rounds,$rounds0
  2435. vst1.8 {$tmp2},[$out],#16
  2436. vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
  2437. vst1.8 {$tmp3},[$out],#16
  2438. vst1.8 {$tmp4},[$out],#16
  2439. b.hs .Loop5x_xts_enc
  2440. // If 4 blocks are left, reuse the five-block processing.
  2441. cmn $len,#0x10
  2442. b.ne .Loop5x_enc_after
  2443. vorr $iv4,$iv3,$iv3
  2444. vorr $iv3,$iv2,$iv2
  2445. vorr $iv2,$iv1,$iv1
  2446. vorr $iv1,$iv0,$iv0
  2447. fmov $ivl,$ivd40
  2448. fmov $ivh,$ivd41
  2449. veor $dat0,$iv0,$in0
  2450. veor $dat1,$iv1,$in1
  2451. veor $dat2,$in2,$iv2
  2452. veor $dat3,$in3,$iv3
  2453. veor $dat4,$in4,$iv4
  2454. b.eq .Loop5x_xts_enc
  2455. .Loop5x_enc_after:
  2456. add $len,$len,#0x50
  2457. cbz $len,.Lxts_enc_done
  2458. add $rounds,$rounds0,#2
  2459. subs $len,$len,#0x30
  2460. b.lo .Lxts_inner_enc_tail
  2461. veor $dat0,$iv0,$in2
  2462. veor $dat1,$iv1,$in3
  2463. veor $dat2,$in4,$iv2
  2464. b .Lxts_outer_enc_tail
  2465. .align 4
  2466. .Lxts_enc_tail4x:
  2467. add $inp,$inp,#16
  2468. veor $tmp1,$dat1,$tmp1
  2469. vst1.8 {$tmp1},[$out],#16
  2470. veor $tmp2,$dat2,$tmp2
  2471. vst1.8 {$tmp2},[$out],#16
  2472. veor $tmp3,$dat3,$tmp3
  2473. veor $tmp4,$dat4,$tmp4
  2474. vst1.8 {$tmp3-$tmp4},[$out],#32
  2475. b .Lxts_enc_done
  2476. .align 4
  2477. .Lxts_outer_enc_tail:
  2478. aese $dat0,q8
  2479. aesmc $dat0,$dat0
  2480. aese $dat1,q8
  2481. aesmc $dat1,$dat1
  2482. aese $dat2,q8
  2483. aesmc $dat2,$dat2
  2484. vld1.32 {q8},[$key_],#16
  2485. subs $rounds,$rounds,#2
  2486. aese $dat0,q9
  2487. aesmc $dat0,$dat0
  2488. aese $dat1,q9
  2489. aesmc $dat1,$dat1
  2490. aese $dat2,q9
  2491. aesmc $dat2,$dat2
  2492. vld1.32 {q9},[$key_],#16
  2493. b.gt .Lxts_outer_enc_tail
  2494. aese $dat0,q8
  2495. aesmc $dat0,$dat0
  2496. aese $dat1,q8
  2497. aesmc $dat1,$dat1
  2498. aese $dat2,q8
  2499. aesmc $dat2,$dat2
  2500. veor $tmp0,$iv0,$rndlast
  2501. subs $len,$len,#0x30
  2502. // The iv for first block
  2503. fmov $ivl,$ivd20
  2504. fmov $ivh,$ivd21
  2505. //mov $constnum,#0x87
  2506. extr $midnumx,$ivh,$ivh,#32
  2507. extr $ivh,$ivh,$ivl,#63
  2508. and $tmpmw,$constnum,$midnum,asr#31
  2509. eor $ivl,$tmpmx,$ivl,lsl#1
  2510. fmov $ivd00,$ivl
  2511. fmov $ivd01,$ivh
  2512. veor $tmp1,$iv1,$rndlast
  2513. csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point
  2514. aese $dat0,q9
  2515. aesmc $dat0,$dat0
  2516. aese $dat1,q9
  2517. aesmc $dat1,$dat1
  2518. aese $dat2,q9
  2519. aesmc $dat2,$dat2
  2520. veor $tmp2,$iv2,$rndlast
  2521. add $xoffset,$xoffset,#0x20
  2522. add $inp,$inp,$xoffset
  2523. mov $key_,$key1
  2524. aese $dat0,q12
  2525. aesmc $dat0,$dat0
  2526. aese $dat1,q12
  2527. aesmc $dat1,$dat1
  2528. aese $dat2,q12
  2529. aesmc $dat2,$dat2
  2530. aese $dat0,q13
  2531. aesmc $dat0,$dat0
  2532. aese $dat1,q13
  2533. aesmc $dat1,$dat1
  2534. aese $dat2,q13
  2535. aesmc $dat2,$dat2
  2536. aese $dat0,q14
  2537. aesmc $dat0,$dat0
  2538. aese $dat1,q14
  2539. aesmc $dat1,$dat1
  2540. aese $dat2,q14
  2541. aesmc $dat2,$dat2
  2542. aese $dat0,q15
  2543. aese $dat1,q15
  2544. aese $dat2,q15
  2545. vld1.8 {$in2},[$inp],#16
  2546. add $rounds,$rounds0,#2
  2547. vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
  2548. veor $tmp0,$tmp0,$dat0
  2549. veor $tmp1,$tmp1,$dat1
  2550. veor $dat2,$dat2,$tmp2
  2551. vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
  2552. vst1.8 {$tmp0},[$out],#16
  2553. vst1.8 {$tmp1},[$out],#16
  2554. vst1.8 {$dat2},[$out],#16
  2555. cmn $len,#0x30
  2556. b.eq .Lxts_enc_done
  2557. .Lxts_encxor_one:
  2558. vorr $in3,$in1,$in1
  2559. vorr $in4,$in2,$in2
  2560. nop
  2561. .Lxts_inner_enc_tail:
  2562. cmn $len,#0x10
  2563. veor $dat1,$in3,$iv0
  2564. veor $dat2,$in4,$iv1
  2565. b.eq .Lxts_enc_tail_loop
  2566. veor $dat2,$in4,$iv0
  2567. .Lxts_enc_tail_loop:
  2568. aese $dat1,q8
  2569. aesmc $dat1,$dat1
  2570. aese $dat2,q8
  2571. aesmc $dat2,$dat2
  2572. vld1.32 {q8},[$key_],#16
  2573. subs $rounds,$rounds,#2
  2574. aese $dat1,q9
  2575. aesmc $dat1,$dat1
  2576. aese $dat2,q9
  2577. aesmc $dat2,$dat2
  2578. vld1.32 {q9},[$key_],#16
  2579. b.gt .Lxts_enc_tail_loop
  2580. aese $dat1,q8
  2581. aesmc $dat1,$dat1
  2582. aese $dat2,q8
  2583. aesmc $dat2,$dat2
  2584. aese $dat1,q9
  2585. aesmc $dat1,$dat1
  2586. aese $dat2,q9
  2587. aesmc $dat2,$dat2
  2588. aese $dat1,q12
  2589. aesmc $dat1,$dat1
  2590. aese $dat2,q12
  2591. aesmc $dat2,$dat2
  2592. cmn $len,#0x20
  2593. aese $dat1,q13
  2594. aesmc $dat1,$dat1
  2595. aese $dat2,q13
  2596. aesmc $dat2,$dat2
  2597. veor $tmp1,$iv0,$rndlast
  2598. aese $dat1,q14
  2599. aesmc $dat1,$dat1
  2600. aese $dat2,q14
  2601. aesmc $dat2,$dat2
  2602. veor $tmp2,$iv1,$rndlast
  2603. aese $dat1,q15
  2604. aese $dat2,q15
  2605. b.eq .Lxts_enc_one
  2606. veor $tmp1,$tmp1,$dat1
  2607. vst1.8 {$tmp1},[$out],#16
  2608. veor $tmp2,$tmp2,$dat2
  2609. vorr $iv0,$iv1,$iv1
  2610. vst1.8 {$tmp2},[$out],#16
  2611. fmov $ivl,$ivd10
  2612. fmov $ivh,$ivd11
  2613. mov $constnum,#0x87
  2614. extr $midnumx,$ivh,$ivh,#32
  2615. extr $ivh,$ivh,$ivl,#63
  2616. and $tmpmw,$constnum,$midnum,asr #31
  2617. eor $ivl,$tmpmx,$ivl,lsl #1
  2618. fmov $ivd00,$ivl
  2619. fmov $ivd01,$ivh
  2620. b .Lxts_enc_done
  2621. .Lxts_enc_one:
  2622. veor $tmp1,$tmp1,$dat2
  2623. vorr $iv0,$iv0,$iv0
  2624. vst1.8 {$tmp1},[$out],#16
  2625. fmov $ivl,$ivd00
  2626. fmov $ivh,$ivd01
  2627. mov $constnum,#0x87
  2628. extr $midnumx,$ivh,$ivh,#32
  2629. extr $ivh,$ivh,$ivl,#63
  2630. and $tmpmw,$constnum,$midnum,asr #31
  2631. eor $ivl,$tmpmx,$ivl,lsl #1
  2632. fmov $ivd00,$ivl
  2633. fmov $ivd01,$ivh
  2634. b .Lxts_enc_done
  2635. .align 5
  2636. .Lxts_enc_done:
  2637. // Process the tail block with cipher stealing.
  2638. tst $tailcnt,#0xf
  2639. b.eq .Lxts_abort
  2640. mov $tmpinp,$inp
  2641. mov $tmpoutp,$out
  2642. sub $out,$out,#16
  2643. .composite_enc_loop:
  2644. subs $tailcnt,$tailcnt,#1
  2645. ldrb $l2outp,[$out,$tailcnt]
  2646. ldrb $loutp,[$tmpinp,$tailcnt]
  2647. strb $l2outp,[$tmpoutp,$tailcnt]
  2648. strb $loutp,[$out,$tailcnt]
  2649. b.gt .composite_enc_loop
  2650. .Lxts_enc_load_done:
  2651. vld1.8 {$tmpin},[$out]
  2652. veor $tmpin,$tmpin,$iv0
  2653. // Encrypt the composite block to get the second-to-last ciphertext block
  2654. ldr $rounds,[$key1,#240] // load key schedule...
  2655. vld1.32 {$dat},[$key1],#16
  2656. sub $rounds,$rounds,#2
  2657. vld1.32 {$dat1},[$key1],#16 // load key schedule...
  2658. .Loop_final_enc:
  2659. aese $tmpin,$dat0
  2660. aesmc $tmpin,$tmpin
  2661. vld1.32 {$dat0},[$key1],#16
  2662. subs $rounds,$rounds,#2
  2663. aese $tmpin,$dat1
  2664. aesmc $tmpin,$tmpin
  2665. vld1.32 {$dat1},[$key1],#16
  2666. b.gt .Loop_final_enc
  2667. aese $tmpin,$dat0
  2668. aesmc $tmpin,$tmpin
  2669. vld1.32 {$dat0},[$key1]
  2670. aese $tmpin,$dat1
  2671. veor $tmpin,$tmpin,$dat0
  2672. veor $tmpin,$tmpin,$iv0
  2673. vst1.8 {$tmpin},[$out]
  2674. .Lxts_abort:
  2675. ldp $tailcnt,$midnumx,[sp,#48]
  2676. ldp $ivd10,$ivd20,[sp,#32]
  2677. ldp $ivd30,$ivd40,[sp,#16]
  2678. ldp $constnumx,$tmpinp,[sp],#64
  2679. .Lxts_enc_final_abort:
  2680. ret
  2681. .size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt
  2682. ___
  2683. }}}
  2684. {{{
  2685. my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
  2686. my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
  2687. my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
  2688. my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
  2689. my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
  2690. my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
  2691. my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
  2692. my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
  2693. my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
  2694. my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
  2695. # q7: last round key
  2696. # q10-q15, q7: last 7 round keys
  2697. # q8-q9: preloaded round keys (all except the last 7) for the big-size path
  2698. # q20, q21, q8-q9: preloaded round keys (all except the last 7) for the 16-byte-only path
  2699. {
  2700. my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
  2701. my ($dat3,$in3,$tmp3); # used only in 64-bit mode
  2702. my ($dat4,$in4,$tmp4);
  2703. if ($flavour =~ /64/) {
  2704. ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
  2705. }
  2706. $code.=<<___ if ($flavour =~ /64/);
  2707. .globl ${prefix}_xts_decrypt
  2708. .type ${prefix}_xts_decrypt,%function
  2709. .align 5
  2710. ${prefix}_xts_decrypt:
  2711. AARCH64_VALID_CALL_TARGET
  2712. ___
  2713. $code.=<<___ if ($flavour =~ /64/);
  2714. cmp $len,#16
  2715. // Input data size is not exactly 16 bytes, jump to big size processing.
  2716. b.ne .Lxts_dec_big_size
  2717. // Encrypt the iv with key2, as the first XEX iv.
  2718. ldr $rounds,[$key2,#240]
  2719. vld1.32 {$dat},[$key2],#16
  2720. vld1.8 {$iv0},[$ivp]
  2721. sub $rounds,$rounds,#2
  2722. vld1.32 {$dat1},[$key2],#16
  2723. .Loop_dec_small_iv_enc:
  2724. aese $iv0,$dat
  2725. aesmc $iv0,$iv0
  2726. vld1.32 {$dat},[$key2],#16
  2727. subs $rounds,$rounds,#2
  2728. aese $iv0,$dat1
  2729. aesmc $iv0,$iv0
  2730. vld1.32 {$dat1},[$key2],#16
  2731. b.gt .Loop_dec_small_iv_enc
  2732. aese $iv0,$dat
  2733. aesmc $iv0,$iv0
  2734. vld1.32 {$dat},[$key2]
  2735. aese $iv0,$dat1
  2736. veor $iv0,$iv0,$dat
  2737. vld1.8 {$dat0},[$inp]
  2738. veor $dat0,$iv0,$dat0
  2739. ldr $rounds,[$key1,#240]
  2740. vld1.32 {q20-q21},[$key1],#32 // load key schedule...
  2741. aesd $dat0,q20
  2742. aesimc $dat0,$dat0
  2743. vld1.32 {q8-q9},[$key1],#32 // load key schedule...
  2744. aesd $dat0,q21
  2745. aesimc $dat0,$dat0
  2746. subs $rounds,$rounds,#10 // bias
  2747. b.eq .Lxts_128_dec
  2748. .Lxts_dec_round_loop:
  2749. aesd $dat0,q8
  2750. aesimc $dat0,$dat0
  2751. vld1.32 {q8},[$key1],#16 // load key schedule...
  2752. aesd $dat0,q9
  2753. aesimc $dat0,$dat0
  2754. vld1.32 {q9},[$key1],#16 // load key schedule...
  2755. subs $rounds,$rounds,#2 // bias
  2756. b.gt .Lxts_dec_round_loop
  2757. .Lxts_128_dec:
  2758. vld1.32 {q10-q11},[$key1],#32 // load key schedule...
  2759. aesd $dat0,q8
  2760. aesimc $dat0,$dat0
  2761. aesd $dat0,q9
  2762. aesimc $dat0,$dat0
  2763. vld1.32 {q12-q13},[$key1],#32 // load key schedule...
  2764. aesd $dat0,q10
  2765. aesimc $dat0,$dat0
  2766. aesd $dat0,q11
  2767. aesimc $dat0,$dat0
  2768. vld1.32 {q14-q15},[$key1],#32 // load key schedule...
  2769. aesd $dat0,q12
  2770. aesimc $dat0,$dat0
  2771. aesd $dat0,q13
  2772. aesimc $dat0,$dat0
  2773. vld1.32 {$rndlast},[$key1]
  2774. aesd $dat0,q14
  2775. aesimc $dat0,$dat0
  2776. aesd $dat0,q15
  2777. veor $dat0,$dat0,$rndlast
  2778. veor $dat0,$iv0,$dat0
  2779. vst1.8 {$dat0},[$out]
  2780. b .Lxts_dec_final_abort
  2781. .Lxts_dec_big_size:
  2782. ___
  2783. $code.=<<___ if ($flavour =~ /64/);
  2784. stp $constnumx,$tmpinp,[sp,#-64]!
  2785. stp $tailcnt,$midnumx,[sp,#48]
  2786. stp $ivd10,$ivd20,[sp,#32]
  2787. stp $ivd30,$ivd40,[sp,#16]
  2788. and $tailcnt,$len,#0xf
  2789. and $len,$len,#-16
  2790. subs $len,$len,#16
  2791. mov $step,#16
  2792. b.lo .Lxts_dec_abort
  2793. // Encrypt the iv with key2, as the first XEX iv
  2794. ldr $rounds,[$key2,#240]
  2795. vld1.32 {$dat},[$key2],#16
  2796. vld1.8 {$iv0},[$ivp]
  2797. sub $rounds,$rounds,#2
  2798. vld1.32 {$dat1},[$key2],#16
  2799. .Loop_dec_iv_enc:
  2800. aese $iv0,$dat
  2801. aesmc $iv0,$iv0
  2802. vld1.32 {$dat},[$key2],#16
  2803. subs $rounds,$rounds,#2
  2804. aese $iv0,$dat1
  2805. aesmc $iv0,$iv0
  2806. vld1.32 {$dat1},[$key2],#16
  2807. b.gt .Loop_dec_iv_enc
  2808. aese $iv0,$dat
  2809. aesmc $iv0,$iv0
  2810. vld1.32 {$dat},[$key2]
  2811. aese $iv0,$dat1
  2812. veor $iv0,$iv0,$dat
  2813. // The iv for second block
  2814. // $ivl- iv(low), $ivh - iv(high)
  2815. // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
  2816. fmov $ivl,$ivd00
  2817. fmov $ivh,$ivd01
  2818. mov $constnum,#0x87
  2819. extr $midnumx,$ivh,$ivh,#32
  2820. extr $ivh,$ivh,$ivl,#63
  2821. and $tmpmw,$constnum,$midnum,asr #31
  2822. eor $ivl,$tmpmx,$ivl,lsl #1
  2823. fmov $ivd10,$ivl
  2824. fmov $ivd11,$ivh
  2825. ldr $rounds0,[$key1,#240] // load rounds number
  2826. // The iv for third block
  2827. extr $midnumx,$ivh,$ivh,#32
  2828. extr $ivh,$ivh,$ivl,#63
  2829. and $tmpmw,$constnum,$midnum,asr #31
  2830. eor $ivl,$tmpmx,$ivl,lsl #1
  2831. fmov $ivd20,$ivl
  2832. fmov $ivd21,$ivh
  2833. vld1.32 {q8-q9},[$key1] // load key schedule...
  2834. sub $rounds0,$rounds0,#6
  2835. add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
  2836. sub $rounds0,$rounds0,#2
  2837. vld1.32 {q10-q11},[$key_],#32 // load key schedule...
  2838. vld1.32 {q12-q13},[$key_],#32
  2839. vld1.32 {q14-q15},[$key_],#32
  2840. vld1.32 {$rndlast},[$key_]
  2841. // The iv for fourth block
  2842. extr $midnumx,$ivh,$ivh,#32
  2843. extr $ivh,$ivh,$ivl,#63
  2844. and $tmpmw,$constnum,$midnum,asr #31
  2845. eor $ivl,$tmpmx,$ivl,lsl #1
  2846. fmov $ivd30,$ivl
  2847. fmov $ivd31,$ivh
  2848. add $key_,$key1,#32
  2849. mov $rounds,$rounds0
  2850. b .Lxts_dec
  2851. // Decryption
  2852. .align 5
  2853. .Lxts_dec:
  2854. tst $tailcnt,#0xf
  2855. b.eq .Lxts_dec_begin
  2856. subs $len,$len,#16
  2857. csel $step,xzr,$step,eq
  2858. vld1.8 {$dat},[$inp],#16
  2859. b.lo .Lxts_done
  2860. sub $inp,$inp,#16
  2861. .Lxts_dec_begin:
  2862. vld1.8 {$dat},[$inp],$step
  2863. subs $len,$len,#32 // bias
  2864. add $rounds,$rounds0,#2
  2865. vorr $in1,$dat,$dat
  2866. vorr $dat1,$dat,$dat
  2867. vorr $in3,$dat,$dat
  2868. vld1.8 {$dat2},[$inp],#16
  2869. vorr $in2,$dat2,$dat2
  2870. vorr $in4,$dat2,$dat2
  2871. b.lo .Lxts_inner_dec_tail
  2872. veor $dat,$dat,$iv0 // before decryption, xor with iv
  2873. veor $dat2,$dat2,$iv1
  2874. vorr $dat1,$dat2,$dat2
  2875. vld1.8 {$dat2},[$inp],#16
  2876. vorr $in0,$dat,$dat
  2877. vorr $in1,$dat1,$dat1
  2878. veor $in2,$dat2,$iv2 // third block xor with third iv
  2879. veor $dat2,$dat2,$iv2
  2880. cmp $len,#32
  2881. b.lo .Lxts_outer_dec_tail
  2882. vld1.8 {$dat3},[$inp],#16
  2883. // The iv for fifth block
  2884. extr $midnumx,$ivh,$ivh,#32
  2885. extr $ivh,$ivh,$ivl,#63
  2886. and $tmpmw,$constnum,$midnum,asr #31
  2887. eor $ivl,$tmpmx,$ivl,lsl #1
  2888. fmov $ivd40,$ivl
  2889. fmov $ivd41,$ivh
  2890. vld1.8 {$dat4},[$inp],#16
  2891. veor $dat3,$dat3,$iv3 // the fourth block
  2892. veor $dat4,$dat4,$iv4
  2893. sub $len,$len,#32 // bias
  2894. mov $rounds,$rounds0
  2895. b .Loop5x_xts_dec
  2896. .align 4
  2897. .Loop5x_xts_dec:
  2898. aesd $dat0,q8
  2899. aesimc $dat0,$dat0
  2900. aesd $dat1,q8
  2901. aesimc $dat1,$dat1
  2902. aesd $dat2,q8
  2903. aesimc $dat2,$dat2
  2904. aesd $dat3,q8
  2905. aesimc $dat3,$dat3
  2906. aesd $dat4,q8
  2907. aesimc $dat4,$dat4
  2908. vld1.32 {q8},[$key_],#16 // load key schedule...
  2909. subs $rounds,$rounds,#2
  2910. aesd $dat0,q9
  2911. aesimc $dat0,$dat0
  2912. aesd $dat1,q9
  2913. aesimc $dat1,$dat1
  2914. aesd $dat2,q9
  2915. aesimc $dat2,$dat2
  2916. aesd $dat3,q9
  2917. aesimc $dat3,$dat3
  2918. aesd $dat4,q9
  2919. aesimc $dat4,$dat4
  2920. vld1.32 {q9},[$key_],#16 // load key schedule...
  2921. b.gt .Loop5x_xts_dec
  2922. aesd $dat0,q8
  2923. aesimc $dat0,$dat0
  2924. aesd $dat1,q8
  2925. aesimc $dat1,$dat1
  2926. aesd $dat2,q8
  2927. aesimc $dat2,$dat2
  2928. aesd $dat3,q8
  2929. aesimc $dat3,$dat3
  2930. aesd $dat4,q8
  2931. aesimc $dat4,$dat4
  2932. subs $len,$len,#0x50 // because .Lxts_dec_tail4x
  2933. aesd $dat0,q9
  2934. aesimc $dat0,$dat
  2935. aesd $dat1,q9
  2936. aesimc $dat1,$dat1
  2937. aesd $dat2,q9
  2938. aesimc $dat2,$dat2
  2939. aesd $dat3,q9
  2940. aesimc $dat3,$dat3
  2941. aesd $dat4,q9
  2942. aesimc $dat4,$dat4
  2943. csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
  2944. mov $key_,$key1
  2945. aesd $dat0,q10
  2946. aesimc $dat0,$dat0
  2947. aesd $dat1,q10
  2948. aesimc $dat1,$dat1
  2949. aesd $dat2,q10
  2950. aesimc $dat2,$dat2
  2951. aesd $dat3,q10
  2952. aesimc $dat3,$dat3
  2953. aesd $dat4,q10
  2954. aesimc $dat4,$dat4
  2955. add $inp,$inp,$xoffset // x0 is adjusted in such way that
  2956. // at exit from the loop v1.16b-v26.16b
  2957. // are loaded with last "words"
  2958. add $xoffset,$len,#0x60 // because .Lxts_dec_tail4x
  2959. aesd $dat0,q11
  2960. aesimc $dat0,$dat0
  2961. aesd $dat1,q11
  2962. aesimc $dat1,$dat1
  2963. aesd $dat2,q11
  2964. aesimc $dat2,$dat2
  2965. aesd $dat3,q11
  2966. aesimc $dat3,$dat3
  2967. aesd $dat4,q11
  2968. aesimc $dat4,$dat4
  2969. aesd $dat0,q12
  2970. aesimc $dat0,$dat0
  2971. aesd $dat1,q12
  2972. aesimc $dat1,$dat1
  2973. aesd $dat2,q12
  2974. aesimc $dat2,$dat2
  2975. aesd $dat3,q12
  2976. aesimc $dat3,$dat3
  2977. aesd $dat4,q12
  2978. aesimc $dat4,$dat4
  2979. aesd $dat0,q13
  2980. aesimc $dat0,$dat0
  2981. aesd $dat1,q13
  2982. aesimc $dat1,$dat1
  2983. aesd $dat2,q13
  2984. aesimc $dat2,$dat2
  2985. aesd $dat3,q13
  2986. aesimc $dat3,$dat3
  2987. aesd $dat4,q13
  2988. aesimc $dat4,$dat4
  2989. aesd $dat0,q14
  2990. aesimc $dat0,$dat0
  2991. aesd $dat1,q14
  2992. aesimc $dat1,$dat1
  2993. aesd $dat2,q14
  2994. aesimc $dat2,$dat2
  2995. aesd $dat3,q14
  2996. aesimc $dat3,$dat3
  2997. aesd $dat4,q14
  2998. aesimc $dat4,$dat4
  2999. veor $tmp0,$rndlast,$iv0
  3000. aesd $dat0,q15
  3001. // The iv for first block of next iteration.
  3002. extr $midnumx,$ivh,$ivh,#32
  3003. extr $ivh,$ivh,$ivl,#63
  3004. and $tmpmw,$constnum,$midnum,asr #31
  3005. eor $ivl,$tmpmx,$ivl,lsl #1
  3006. fmov $ivd00,$ivl
  3007. fmov $ivd01,$ivh
  3008. veor $tmp1,$rndlast,$iv1
  3009. vld1.8 {$in0},[$inp],#16
  3010. aesd $dat1,q15
  3011. // The iv for second block
  3012. extr $midnumx,$ivh,$ivh,#32
  3013. extr $ivh,$ivh,$ivl,#63
  3014. and $tmpmw,$constnum,$midnum,asr #31
  3015. eor $ivl,$tmpmx,$ivl,lsl #1
  3016. fmov $ivd10,$ivl
  3017. fmov $ivd11,$ivh
  3018. veor $tmp2,$rndlast,$iv2
  3019. vld1.8 {$in1},[$inp],#16
  3020. aesd $dat2,q15
  3021. // The iv for third block
  3022. extr $midnumx,$ivh,$ivh,#32
  3023. extr $ivh,$ivh,$ivl,#63
  3024. and $tmpmw,$constnum,$midnum,asr #31
  3025. eor $ivl,$tmpmx,$ivl,lsl #1
  3026. fmov $ivd20,$ivl
  3027. fmov $ivd21,$ivh
  3028. veor $tmp3,$rndlast,$iv3
  3029. vld1.8 {$in2},[$inp],#16
  3030. aesd $dat3,q15
  3031. // The iv for fourth block
  3032. extr $midnumx,$ivh,$ivh,#32
  3033. extr $ivh,$ivh,$ivl,#63
  3034. and $tmpmw,$constnum,$midnum,asr #31
  3035. eor $ivl,$tmpmx,$ivl,lsl #1
  3036. fmov $ivd30,$ivl
  3037. fmov $ivd31,$ivh
  3038. veor $tmp4,$rndlast,$iv4
  3039. vld1.8 {$in3},[$inp],#16
  3040. aesd $dat4,q15
  3041. // The iv for fifth block
  3042. extr $midnumx,$ivh,$ivh,#32
  3043. extr $ivh,$ivh,$ivl,#63
  3044. and $tmpmw,$constnum,$midnum,asr #31
  3045. eor $ivl,$tmpmx,$ivl,lsl #1
  3046. fmov $ivd40,$ivl
  3047. fmov $ivd41,$ivh
  3048. vld1.8 {$in4},[$inp],#16
  3049. cbz $xoffset,.Lxts_dec_tail4x
  3050. vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
  3051. veor $tmp0,$tmp0,$dat0
  3052. veor $dat0,$in0,$iv0
  3053. veor $tmp1,$tmp1,$dat1
  3054. veor $dat1,$in1,$iv1
  3055. veor $tmp2,$tmp2,$dat2
  3056. veor $dat2,$in2,$iv2
  3057. veor $tmp3,$tmp3,$dat3
  3058. veor $dat3,$in3,$iv3
  3059. veor $tmp4,$tmp4,$dat4
  3060. vst1.8 {$tmp0},[$out],#16
  3061. veor $dat4,$in4,$iv4
  3062. vst1.8 {$tmp1},[$out],#16
  3063. mov $rounds,$rounds0
  3064. vst1.8 {$tmp2},[$out],#16
  3065. vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
  3066. vst1.8 {$tmp3},[$out],#16
  3067. vst1.8 {$tmp4},[$out],#16
  3068. b.hs .Loop5x_xts_dec
  3069. cmn $len,#0x10
  3070. b.ne .Loop5x_dec_after
  3071. // If x2($len) equals -0x10, there are 4 blocks left.
  3072. // After special processing, reuse the five-block processing.
  3073. // It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3.
  3074. vorr $iv4,$iv3,$iv3
  3075. vorr $iv3,$iv2,$iv2
  3076. vorr $iv2,$iv1,$iv1
  3077. vorr $iv1,$iv0,$iv0
  3078. fmov $ivl,$ivd40
  3079. fmov $ivh,$ivd41
  3080. veor $dat0,$iv0,$in0
  3081. veor $dat1,$iv1,$in1
  3082. veor $dat2,$in2,$iv2
  3083. veor $dat3,$in3,$iv3
  3084. veor $dat4,$in4,$iv4
  3085. b.eq .Loop5x_xts_dec
  3086. .Loop5x_dec_after:
  3087. add $len,$len,#0x50
  3088. cbz $len,.Lxts_done
  3089. add $rounds,$rounds0,#2
  3090. subs $len,$len,#0x30
  3091. b.lo .Lxts_inner_dec_tail
  3092. veor $dat0,$iv0,$in2
  3093. veor $dat1,$iv1,$in3
  3094. veor $dat2,$in4,$iv2
  3095. b .Lxts_outer_dec_tail
  3096. .align 4
  3097. .Lxts_dec_tail4x:
  3098. add $inp,$inp,#16
  3099. tst $tailcnt,#0xf
  3100. veor $tmp1,$dat1,$tmp0
  3101. vst1.8 {$tmp1},[$out],#16
  3102. veor $tmp2,$dat2,$tmp2
  3103. vst1.8 {$tmp2},[$out],#16
  3104. veor $tmp3,$dat3,$tmp3
  3105. veor $tmp4,$dat4,$tmp4
  3106. vst1.8 {$tmp3-$tmp4},[$out],#32
  3107. b.eq .Lxts_dec_abort
  3108. vld1.8 {$dat0},[$inp],#16
  3109. b .Lxts_done
  3110. .align 4
  3111. .Lxts_outer_dec_tail:
  3112. aesd $dat0,q8
  3113. aesimc $dat0,$dat0
  3114. aesd $dat1,q8
  3115. aesimc $dat1,$dat1
  3116. aesd $dat2,q8
  3117. aesimc $dat2,$dat2
  3118. vld1.32 {q8},[$key_],#16
  3119. subs $rounds,$rounds,#2
  3120. aesd $dat0,q9
  3121. aesimc $dat0,$dat0
  3122. aesd $dat1,q9
  3123. aesimc $dat1,$dat1
  3124. aesd $dat2,q9
  3125. aesimc $dat2,$dat2
  3126. vld1.32 {q9},[$key_],#16
  3127. b.gt .Lxts_outer_dec_tail
  3128. aesd $dat0,q8
  3129. aesimc $dat0,$dat0
  3130. aesd $dat1,q8
  3131. aesimc $dat1,$dat1
  3132. aesd $dat2,q8
  3133. aesimc $dat2,$dat2
  3134. veor $tmp0,$iv0,$rndlast
  3135. subs $len,$len,#0x30
  3136. // The iv for first block
  3137. fmov $ivl,$ivd20
  3138. fmov $ivh,$ivd21
  3139. mov $constnum,#0x87
  3140. extr $midnumx,$ivh,$ivh,#32
  3141. extr $ivh,$ivh,$ivl,#63
  3142. and $tmpmw,$constnum,$midnum,asr #31
  3143. eor $ivl,$tmpmx,$ivl,lsl #1
  3144. fmov $ivd00,$ivl
  3145. fmov $ivd01,$ivh
  3146. veor $tmp1,$iv1,$rndlast
  3147. csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point
  3148. aesd $dat0,q9
  3149. aesimc $dat0,$dat0
  3150. aesd $dat1,q9
  3151. aesimc $dat1,$dat1
  3152. aesd $dat2,q9
  3153. aesimc $dat2,$dat2
  3154. veor $tmp2,$iv2,$rndlast
  3155. // The iv for second block
  3156. extr $midnumx,$ivh,$ivh,#32
  3157. extr $ivh,$ivh,$ivl,#63
  3158. and $tmpmw,$constnum,$midnum,asr #31
  3159. eor $ivl,$tmpmx,$ivl,lsl #1
  3160. fmov $ivd10,$ivl
  3161. fmov $ivd11,$ivh
  3162. add $xoffset,$xoffset,#0x20
  3163. add $inp,$inp,$xoffset // $inp is adjusted to the last data
  3164. mov $key_,$key1
  3165. // The iv for third block
  3166. extr $midnumx,$ivh,$ivh,#32
  3167. extr $ivh,$ivh,$ivl,#63
  3168. and $tmpmw,$constnum,$midnum,asr #31
  3169. eor $ivl,$tmpmx,$ivl,lsl #1
  3170. fmov $ivd20,$ivl
  3171. fmov $ivd21,$ivh
  3172. aesd $dat0,q12
  3173. aesimc $dat0,$dat0
  3174. aesd $dat1,q12
  3175. aesimc $dat1,$dat1
  3176. aesd $dat2,q12
  3177. aesimc $dat2,$dat2
  3178. aesd $dat0,q13
  3179. aesimc $dat0,$dat0
  3180. aesd $dat1,q13
  3181. aesimc $dat1,$dat1
  3182. aesd $dat2,q13
  3183. aesimc $dat2,$dat2
  3184. aesd $dat0,q14
  3185. aesimc $dat0,$dat0
  3186. aesd $dat1,q14
  3187. aesimc $dat1,$dat1
  3188. aesd $dat2,q14
  3189. aesimc $dat2,$dat2
  3190. vld1.8 {$in2},[$inp],#16
  3191. aesd $dat0,q15
  3192. aesd $dat1,q15
  3193. aesd $dat2,q15
  3194. vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
  3195. add $rounds,$rounds0,#2
  3196. veor $tmp0,$tmp0,$dat0
  3197. veor $tmp1,$tmp1,$dat1
  3198. veor $dat2,$dat2,$tmp2
  3199. vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
  3200. vst1.8 {$tmp0},[$out],#16
  3201. vst1.8 {$tmp1},[$out],#16
  3202. vst1.8 {$dat2},[$out],#16
  3203. cmn $len,#0x30
  3204. add $len,$len,#0x30
  3205. b.eq .Lxts_done
  3206. sub $len,$len,#0x30
  3207. vorr $in3,$in1,$in1
  3208. vorr $in4,$in2,$in2
  3209. nop
  3210. .Lxts_inner_dec_tail:
  3211. // $len == -0x10 means two blocks left.
  3212. cmn $len,#0x10
  3213. veor $dat1,$in3,$iv0
  3214. veor $dat2,$in4,$iv1
  3215. b.eq .Lxts_dec_tail_loop
  3216. veor $dat2,$in4,$iv0
  3217. .Lxts_dec_tail_loop:
  3218. aesd $dat1,q8
  3219. aesimc $dat1,$dat1
  3220. aesd $dat2,q8
  3221. aesimc $dat2,$dat2
  3222. vld1.32 {q8},[$key_],#16
  3223. subs $rounds,$rounds,#2
  3224. aesd $dat1,q9
  3225. aesimc $dat1,$dat1
  3226. aesd $dat2,q9
  3227. aesimc $dat2,$dat2
  3228. vld1.32 {q9},[$key_],#16
  3229. b.gt .Lxts_dec_tail_loop
  3230. aesd $dat1,q8
  3231. aesimc $dat1,$dat1
  3232. aesd $dat2,q8
  3233. aesimc $dat2,$dat2
  3234. aesd $dat1,q9
  3235. aesimc $dat1,$dat1
  3236. aesd $dat2,q9
  3237. aesimc $dat2,$dat2
  3238. aesd $dat1,q12
  3239. aesimc $dat1,$dat1
  3240. aesd $dat2,q12
  3241. aesimc $dat2,$dat2
  3242. cmn $len,#0x20
  3243. aesd $dat1,q13
  3244. aesimc $dat1,$dat1
  3245. aesd $dat2,q13
  3246. aesimc $dat2,$dat2
  3247. veor $tmp1,$iv0,$rndlast
  3248. aesd $dat1,q14
  3249. aesimc $dat1,$dat1
  3250. aesd $dat2,q14
  3251. aesimc $dat2,$dat2
  3252. veor $tmp2,$iv1,$rndlast
  3253. aesd $dat1,q15
  3254. aesd $dat2,q15
  3255. b.eq .Lxts_dec_one
  3256. veor $tmp1,$tmp1,$dat1
  3257. veor $tmp2,$tmp2,$dat2
  3258. vorr $iv0,$iv2,$iv2
  3259. vorr $iv1,$iv3,$iv3
  3260. vst1.8 {$tmp1},[$out],#16
  3261. vst1.8 {$tmp2},[$out],#16
  3262. add $len,$len,#16
  3263. b .Lxts_done
  3264. .Lxts_dec_one:
  3265. veor $tmp1,$tmp1,$dat2
  3266. vorr $iv0,$iv1,$iv1
  3267. vorr $iv1,$iv2,$iv2
  3268. vst1.8 {$tmp1},[$out],#16
  3269. add $len,$len,#32
  3270. .Lxts_done:
  3271. tst $tailcnt,#0xf
  3272. b.eq .Lxts_dec_abort
  3273. // Processing the last two blocks with cipher stealing.
  3274. mov x7,x3
  3275. cbnz x2,.Lxts_dec_1st_done
  3276. vld1.8 {$dat0},[$inp],#16
  3277. // Decrypt the last second block to get the last plain text block
  3278. .Lxts_dec_1st_done:
  3279. eor $tmpin,$dat0,$iv1
  3280. ldr $rounds,[$key1,#240]
  3281. vld1.32 {$dat0},[$key1],#16
  3282. sub $rounds,$rounds,#2
  3283. vld1.32 {$dat1},[$key1],#16
  3284. .Loop_final_2nd_dec:
  3285. aesd $tmpin,$dat0
  3286. aesimc $tmpin,$tmpin
  3287. vld1.32 {$dat0},[$key1],#16 // load key schedule...
  3288. subs $rounds,$rounds,#2
  3289. aesd $tmpin,$dat1
  3290. aesimc $tmpin,$tmpin
  3291. vld1.32 {$dat1},[$key1],#16 // load key schedule...
  3292. b.gt .Loop_final_2nd_dec
  3293. aesd $tmpin,$dat0
  3294. aesimc $tmpin,$tmpin
  3295. vld1.32 {$dat0},[$key1]
  3296. aesd $tmpin,$dat1
  3297. veor $tmpin,$tmpin,$dat0
  3298. veor $tmpin,$tmpin,$iv1
  3299. vst1.8 {$tmpin},[$out]
  3300. mov $tmpinp,$inp
  3301. add $tmpoutp,$out,#16
  3302. // Composite the tailcnt "16 byte not aligned block" into the last second plain blocks
  3303. // to get the last encrypted block.
  3304. .composite_dec_loop:
  3305. subs $tailcnt,$tailcnt,#1
  3306. ldrb $l2outp,[$out,$tailcnt]
  3307. ldrb $loutp,[$tmpinp,$tailcnt]
  3308. strb $l2outp,[$tmpoutp,$tailcnt]
  3309. strb $loutp,[$out,$tailcnt]
  3310. b.gt .composite_dec_loop
  3311. .Lxts_dec_load_done:
  3312. vld1.8 {$tmpin},[$out]
  3313. veor $tmpin,$tmpin,$iv0
  3314. // Decrypt the composite block to get the last second plain text block
  3315. ldr $rounds,[$key_,#240]
  3316. vld1.32 {$dat},[$key_],#16
  3317. sub $rounds,$rounds,#2
  3318. vld1.32 {$dat1},[$key_],#16
  3319. .Loop_final_dec:
  3320. aesd $tmpin,$dat0
  3321. aesimc $tmpin,$tmpin
  3322. vld1.32 {$dat0},[$key_],#16 // load key schedule...
  3323. subs $rounds,$rounds,#2
  3324. aesd $tmpin,$dat1
  3325. aesimc $tmpin,$tmpin
  3326. vld1.32 {$dat1},[$key_],#16 // load key schedule...
  3327. b.gt .Loop_final_dec
  3328. aesd $tmpin,$dat0
  3329. aesimc $tmpin,$tmpin
  3330. vld1.32 {$dat0},[$key_]
  3331. aesd $tmpin,$dat1
  3332. veor $tmpin,$tmpin,$dat0
  3333. veor $tmpin,$tmpin,$iv0
  3334. vst1.8 {$tmpin},[$out]
  3335. .Lxts_dec_abort:
  3336. ldp $tailcnt,$midnumx,[sp,#48]
  3337. ldp $ivd10,$ivd20,[sp,#32]
  3338. ldp $ivd30,$ivd40,[sp,#16]
  3339. ldp $constnumx,$tmpinp,[sp],#64
  3340. .Lxts_dec_final_abort:
  3341. ret
  3342. .size ${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
  3343. ___
  3344. }
  3345. }}}
  3346. $code.=<<___;
  3347. #endif
  3348. ___
  3349. ########################################
  3350. if ($flavour =~ /64/) { ######## 64-bit code
  # Base instruction words for the AES instructions in the 64-bit flavour.
  # unaes() ORs the register numbers into these.
  my %opcode = (
      "aesd"   => 0x4e285800,
      "aese"   => 0x4e284800,
      "aesimc" => 0x4e287800,
      "aesmc"  => 0x4e286800,
  );

  # unaes($mnemonic, $arg)
  #
  # Hand-assemble "<mnemonic> <reg>,<reg>" into a raw ".inst" word, keeping
  # the original instruction text as a trailing comment.  The number of the
  # first register is OR-ed into the low bits of the opcode and the number
  # of the second register is OR-ed in shifted left by 5.
  #
  #   $mnemonic - one of the keys of %opcode
  #   $arg      - operand text; registers may be spelled qN or vN
  #
  # Returns the ".inst" line.  If the mnemonic is not in %opcode or the
  # operands do not look like two q/v registers, the instruction is handed
  # back as plain text instead of a false value, so that a s///e caller
  # cannot silently erase it (or emit a bogus encoding) and the assembler
  # gets to judge it.
  local *unaes = sub {
      my ($mnemonic, $arg) = @_;

      if (exists $opcode{$mnemonic}
          && $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
          return sprintf ".inst\t0x%08x\t//%s %s",
                         $opcode{$mnemonic} | $1 | ($2 << 5),
                         $mnemonic, $arg;
      }

      return "$mnemonic\t$arg";
  };
  # Walk the accumulated $code line by line, rewrite the legacy (32-bit
  # style) spelling into the 64-bit assembler spelling and print the result.
  # The order of the substitutions matters; do not reshuffle them.
  for (split("\n",$code)) {
      # `...` spans are evaluated as Perl and replaced by the result
      s/\`([^\`]*)\`/eval($1)/geo;

      # qN becomes vN.16b; register numbers from 8 upwards are shifted by 8
      s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
      s/@\s/\/\//o;                                    # old->new style commentary

      # The chain below stops at the first substitution that matches, so at
      # most one of these rewrites is applied to any given line.
      #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
      s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
      s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
      s/vmov\.i8/movi/o or      # fix up legacy mnemonics
      s/vext\.8/ext/o or
      s/vrev32\.8/rev32/o or
      s/vtst\.8/cmtst/o or
      s/vshr/ushr/o or
      s/^(\s+)v/$1/o or         # strip off v prefix
      s/\bbx\s+lr\b/ret/o;

      # fix up remaining legacy suffixes
      s/\.[ui]?8//o;
      if (m/\],#8/o) {
          s/\.16b/\.8b/go;
      }
      if (s/\.[ui]?32//o) {
          s/\.16b/\.4s/go;
      }
      if (s/\.[ui]?64//o) {
          s/\.16b/\.2d/go;
      }
      s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

      # Switch preprocessor checks to aarch64 versions.
      s/__ARME([BL])__/__AARCH64E$1__/go;

      print $_,"\n";
  }
  3385. } else { ######## 32-bit code
  # Base instruction words for the AES instructions in the 32-bit flavour.
  # unaes() ORs the register numbers into these.
  my %opcode = (
      "aesd"   => 0xf3b00340,
      "aese"   => 0xf3b00300,
      "aesimc" => 0xf3b003c0,
      "aesmc"  => 0xf3b00380,
  );

  # unaes($mnemonic, $arg)
  #
  # Hand-assemble "<mnemonic> <reg>,<reg>" into an INST(b0,b1,b2,b3) line
  # whose four arguments are the bytes of the instruction word, lowest byte
  # first, followed by the original text as an "@" comment.
  #
  # Register placement: for the first register the low three bits of its
  # number go to bit 13 and bit 3 goes to bit 22; for the second register
  # the low three bits go to bit 1 and bit 3 goes to bit 5.
  #
  #   $mnemonic - one of the keys of %opcode
  #   $arg      - operand text; registers may be spelled qN or vN
  #
  # If the mnemonic is not in %opcode or the operands do not look like two
  # q/v registers, the instruction is handed back as plain text instead of
  # a false value, so that the s///e caller cannot silently erase it (or
  # emit a bogus encoding) and the assembler gets to judge it.
  local *unaes = sub {
      my ($mnemonic, $arg) = @_;

      if (exists $opcode{$mnemonic}
          && $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
          my $word = $opcode{$mnemonic}
                   | (($1 & 7) << 13) | (($1 & 8) << 19)
                   | (($2 & 7) << 1)  | (($2 & 8) << 2);

          # since ARMv7 instructions are always encoded little-endian.
          # correct solution is to use .inst directive, but older
          # assemblers don't implement it:-(
          return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                         $word & 0xff,         ($word >> 8) & 0xff,
                         ($word >> 16) & 0xff, ($word >> 24) & 0xff,
                         $mnemonic, $arg;
      }

      return "$mnemonic\t$arg";
  };
  # unvtbl($arg)
  #
  # Split a table lookup written with q registers, "qD,{qT},qS", into two
  # vtbl.8 instructions working on the d-register halves: d(2*D) with
  # d(2*S), then d(2*D+1) with d(2*S+1).  The table operand {qT} is kept.
  #
  #   $arg - operand text following the "vtbl.8" mnemonic
  #
  # Returns the two instructions joined by "\n\t".  Operands that are not
  # in the q-register form are returned behind the mnemonic unchanged;
  # returning a false value here would make the s///e caller delete the
  # instruction from the output.
  sub unvtbl {
      my $arg = shift;

      if ($arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o) {
          return sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
                         "vtbl.8 d%d,{q%d},d%d",
                         2*$1,   $2, 2*$3,
                         2*$1+1, $2, 2*$3+1;
      }

      return "vtbl.8 $arg";
  }
  # unvdup32($arg)
  #
  # Rewrite "qD,qS[i]" (32-bit lane i of a q register, i = 0..3) so that the
  # source is named through the d register holding that lane:
  # d(2*S + i/2), lane i%2.
  #
  #   $arg - operand text following the "vdup.32" mnemonic
  #
  # Returns the rewritten instruction.  Operands in any other form are
  # returned behind the mnemonic unchanged; returning a false value here
  # would make the s///e caller delete the instruction from the output.
  sub unvdup32 {
      my $arg = shift;

      if ($arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o) {
          return sprintf "vdup.32 q%d,d%d[%d]",
                         $1, 2*$2 + ($3 >> 1), $3 & 1;
      }

      return "vdup.32 $arg";
  }
  # unvmov32($arg)
  #
  # Rewrite "qD[i],<src>" (32-bit lane i of a q register, i = 0..3) so that
  # the destination is named through the d register holding that lane:
  # d(2*D + i/2), lane i%2.  Everything after the comma is copied verbatim.
  #
  #   $arg - operand text following the "vmov.32" mnemonic
  #
  # Returns the rewritten instruction.  Operands in any other form are
  # returned behind the mnemonic unchanged; returning a false value here
  # would make the s///e caller delete the instruction from the output.
  sub unvmov32 {
      my $arg = shift;

      if ($arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o) {
          return sprintf "vmov.32 d%d[%d],%s",
                         2*$1 + ($2 >> 1), $2 & 1, $3;
      }

      return "vmov.32 $arg";
  }
  # Walk the accumulated $code line by line, rewrite the new-style spelling
  # into the 32-bit assembler spelling and print the result.  The order of
  # the substitutions matters; do not reshuffle them.
  for (split("\n",$code)) {
      # `...` spans are evaluated as Perl and replaced by the result
      s/\`([^\`]*)\`/eval($1)/geo;

      s/\b[wx]([0-9]+)\b/r$1/go;                # new->old registers
      s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;     # new->old registers
      s/\/\/\s?/@ /o;                           # new->old style commentary

      # fix up remaining new-style suffixes: a "],#8" post-increment on a
      # {qN} list becomes the {d(2N)} form with "!", any other "],#imm"
      # simply becomes "]!"
      s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
      s/\],#[0-9]+/]!/o;

      # The chain below stops at the first substitution that matches, so at
      # most one of these rewrites is applied to any given line.
      s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
      s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
      s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
      s/vdup\.32\s+(.*)/unvdup32($1)/geo or
      s/vmov\.32\s+(.*)/unvmov32($1)/geo or
      s/^(\s+)b\./$1b/o or
      s/^(\s+)ret/$1bx\tlr/o;

      # "mov.<cond>" is folded into "mov<cond>", preceded by an "it <cond>"
      # line of its own
      print " it $2\n" if s/^(\s+)mov\.([a-z]+)/$1mov$2/;

      print $_,"\n";
  }
  3439. }
  3440. close STDOUT or die "error closing STDOUT: $!";