aes-gcm-armv8-unroll8_64.pl 317 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2020-2021 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. #========================================================================
  10. # Written by Xiaokang Qian <xiaokang.qian@arm.com> for the OpenSSL project,
  11. # derived from https://github.com/ARM-software/AArch64cryptolib, original
  12. # author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
  13. # licensed under OpenSSL and SPDX BSD-3-Clause licenses depending on where you
  14. # obtain it.
  15. #========================================================================
  16. #
  17. # Approach - We want to reload constants as we have plenty of spare ASIMD slots around crypto units for loading
  18. # Unroll x8 in main loop, main loop to act on 8 16B blocks per iteration, and then do modulo of the accumulated
  19. # intermediate hashes from the 8 blocks.
  20. #
  21. # ____________________________________________________
  22. # | |
  23. # | PRE |
  24. # |____________________________________________________|
  25. # | | | |
  26. # | CTR block 8k+13| AES block 8k+8 | GHASH block 8k+0 |
  27. # |________________|________________|__________________|
  28. # | | | |
  29. # | CTR block 8k+14| AES block 8k+9 | GHASH block 8k+1 |
  30. # |________________|________________|__________________|
  31. # | | | |
  32. # | CTR block 8k+15| AES block 8k+10| GHASH block 8k+2 |
  33. # |________________|________________|__________________|
  34. # | | | |
  35. # | CTR block 8k+16| AES block 8k+11| GHASH block 8k+3 |
  36. # |________________|________________|__________________|
  37. # | | | |
  38. # | CTR block 8k+17| AES block 8k+12| GHASH block 8k+4 |
  39. # |________________|________________|__________________|
  40. # | | | |
  41. # | CTR block 8k+18| AES block 8k+13| GHASH block 8k+5 |
  42. # |________________|________________|__________________|
  43. # | | | |
  44. # | CTR block 8k+19| AES block 8k+14| GHASH block 8k+6 |
  45. # |________________|________________|__________________|
  46. # | | | |
  47. # | CTR block 8k+20| AES block 8k+15| GHASH block 8k+7 |
  48. # |________________|____(mostly)____|__________________|
  49. # | |
  50. # | MODULO |
  51. # |____________________________________________________|
  52. #
  53. # PRE:
  54. # Ensure the previously generated intermediate hash is aligned and merged with the result for GHASH 8k+0
  55. # EXT low_acc, low_acc, low_acc, #8
  56. # EOR res_curr (8k+0), res_curr (8k+0), low_acc
  57. #
  58. # CTR block:
  59. # Increment and byte reverse counter in scalar registers and transfer to SIMD registers
  60. # REV ctr32, rev_ctr32
  61. # ORR ctr64, constctr96_top32, ctr32, LSL #32
  62. # INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF
  63. # INS ctr_next.d[1], ctr64X
  64. # ADD rev_ctr32, #1
  65. #
  66. # AES block:
  67. # Do AES encryption/decryption on CTR block X and EOR it with input block X. Take a 256-bit key below for example.
  68. # Small trick: load the input into scalar registers, EOR it with the last round key there, and then transfer the result to the ASIMD registers.
  69. # Given we are very constrained in our ASIMD registers this is quite important.
  70. #
  71. # Encrypt:
  72. # LDR input_low, [ input_ptr ], #8
  73. # LDR input_high, [ input_ptr ], #8
  74. # EOR input_low, k14_low
  75. # EOR input_high, k14_high
  76. # INS res_curr.d[0], input_low
  77. # INS res_curr.d[1], input_high
  78. # AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
  79. # AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
  80. # AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
  81. # AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
  82. # AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
  83. # AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
  84. # AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
  85. # AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
  86. # AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
  87. # AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
  88. # AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
  89. # AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
  90. # AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
  91. # AESE ctr_curr, k13
  92. # EOR res_curr, res_curr, ctr_curr
  93. # ST1 { res_curr.16b }, [ output_ptr ], #16
  94. #
  95. # Decrypt:
  96. # AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
  97. # AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
  98. # AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
  99. # AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
  100. # AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
  101. # AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
  102. # AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
  103. # AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
  104. # AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
  105. # AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
  106. # AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
  107. # AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
  108. # AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
  109. # AESE ctr_curr, k13
  110. # LDR res_curr, [ input_ptr ], #16
  111. # EOR res_curr, res_curr, ctr_curr
  112. # MOV output_low, res_curr.d[0]
  113. # MOV output_high, res_curr.d[1]
  114. # EOR output_low, k14_low
  115. # EOR output_high, k14_high
  116. # STP output_low, output_high, [ output_ptr ], #16
  117. # GHASH block X:
  118. # Do 128b karatsuba polynomial multiplication on block
  119. # We only have 64b->128b polynomial multipliers; naively that means we need to do 4 64b multiplies to generate a 128b
  120. # multiplication:
  121. #
  122. # Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
  123. #
  124. # The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
  125. # Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
  126. #
  127. # There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
  128. # multiplying with "twisted" powers of H
  129. #
  130. # Note: We can PMULL directly into the acc_x in first GHASH of the loop
  131. # Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
  132. # path latency dominates the performance
  133. #
  134. # This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
  135. # than indicated here
  136. # REV64 res_curr, res_curr
  137. # INS t_m.d[0], res_curr.d[1]
  138. # EOR t_m.8B, t_m.8B, res_curr.8B
  139. # PMULL2 t_h, res_curr, HX
  140. # PMULL t_l, res_curr, HX
  141. # PMULL t_m, t_m, HX_k
  142. # EOR acc_h, acc_h, t_h
  143. # EOR acc_l, acc_l, t_l
  144. # EOR acc_m, acc_m, t_m
  145. #
  146. # MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
  147. # There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
  148. # with a reversed constant
  149. # EOR3 acc_m, acc_m, acc_l, acc_h // Finish off karatsuba processing
  150. # PMULL t_mod, acc_h, mod_constant
  151. # EXT acc_h, acc_h, acc_h, #8
  152. # EOR3 acc_m, acc_m, t_mod, acc_h
  153. # PMULL acc_h, acc_m, mod_constant
  154. # EXT acc_m, acc_m, acc_m, #8
  155. # EOR3 acc_l, acc_l, acc_m, acc_h
  156. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; # last argument is the output file when it ends in an extension
  157. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; # first argument is the flavour when it contains no dot
  158. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of the path this script was invoked with
  159. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or # look for the translator next to this script first,
  160. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or # then in the shared perlasm directory,
  161. die "can't locate arm-xlate.pl"; # and give up if neither exists
  162. die "only for 64 bit" if $flavour !~ /64/; # this generator only emits AArch64 code
  163. open OUT,"| \"$^X\" $xlate $flavour ".(defined $output ? "\"$output\"" : "") or die "can't call $xlate: $!"; # quote the output path (it may contain spaces) and fail loudly if the pipe cannot be started
  164. *STDOUT=*OUT; # from here on everything printed to STDOUT is piped through arm-xlate.pl
  165. $code=<<___; # start the generated assembly: include arm_arch.h and open the __ARM_MAX_ARCH__ guard
  166. #include "arm_arch.h"
  167. #if __ARM_MAX_ARCH__>=8
  168. ___
  169. $code.=".arch armv8.2-a+crypto\n.arch_extension sha3\n.text\n"; # assembler directives; the kernel below uses aese/pmull and the three-way eor3
  170. $input_ptr="x0"; # argument block: x0 = input pointer, advanced by the post-indexed loads
  171. $bit_length="x1"; # input length in bits (shifted right by 3 to obtain the byte length)
  172. $output_ptr="x2"; # output pointer, advanced by the post-indexed stores
  173. $current_tag="x3"; # base address the current tag is loaded from; hash key powers are read at offsets from it
  174. $counter="x16"; # copy of incoming x4: address of the initial counter block
  175. $constant_temp="x15"; # scratch register used to build the counter increment vector
  176. $modulo_constant="x10"; # address of the GHASH reduction constant that the prologue stores on the stack
  177. $cc="x8"; # copy of incoming x5: base address the round keys are loaded from
  178. {
  179. my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7)); # x4..x7; x4/x5 are reused only after the incoming arguments were copied to $counter/$cc
  180. my ($temp2_x,$temp3_x)=map("x$_",(13..14)); # x13, x14: additional scalar temporaries
  181. my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15)); # byte-vector views: counter blocks in v0..v7, result blocks in v8..v15
  182. my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15)); # same registers without an arrangement suffix (the suffix is appended at the use site)
  183. my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7)); # 64-bit scalar views of the counter registers
  184. my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7)); # 128-bit scalar views of the counter registers
  185. my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15)); # 128-bit scalar views of the result registers (used by ldp/stp)
  186. my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15)); # loaded input blocks; these alias the $res* registers v8..v15
  187. my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15)); # byte-vector views of the same v8..v15
  188. my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15)); # 128-bit scalar views of the same v8..v15
  189. my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19)); # GHASH accumulators (high, mid, low) in v17..v19, byte-vector view
  190. my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19)); # same accumulators without an arrangement suffix
  191. my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25)); # hash key powers 1..4 and their "k" companions in v20..v25
  192. my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25)); # powers 5..8 share v20..v25 with powers 1..4, so each register is reloaded before its other name is used
  193. my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25)); # 128-bit scalar views used by the ldr instructions
  194. my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25)); # 128-bit scalar views for powers 5..8 (same q20..q25)
  195. my $t0="v16"; # GHASH temporary
  196. my $t0d="d16"; # 64-bit scalar view of $t0
  197. my $t1="v29"; # GHASH temporary
  198. my $t2=$res1; # temporaries $t2..$t10 reuse result registers (or $t0/$t1); here v9
  199. my $t3=$t1; # v29
  200. my $t4=$res0; # v8
  201. my $t5=$res2; # v10
  202. my $t6=$t0; # v16
  203. my $t7=$res3; # v11
  204. my $t8=$res4; # v12
  205. my $t9=$res5; # v13
  206. my $t10=$res6; # v14
  207. my $t11="v21"; # same register as $h12k/$h56k
  208. my $t12=$t1; # v29
  209. my $rtmp_ctr="v30"; # running counter kept in the rev32-swapped form that the vector add increments
  210. my $rtmp_ctrq="q30"; # 128-bit scalar view of $rtmp_ctr
  211. my $rctr_inc="v31"; # counter increment vector added to $rtmp_ctr for every block
  212. my $rctr_incd="d31"; # 64-bit scalar view of $rctr_inc
  213. my $mod_constantd=$t0d; # the modulo constant is loaded into d16, i.e. it shares a register with $t0
  214. my $mod_constant=$t0; # v16
  215. my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28)); # round keys rotate through v26..v28: each group of three names maps to the same registers
  216. my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28)); # so a round key must be (re)loaded before the rounds that use it
  217. my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
  218. my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
  219. my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
  220. my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28)); # 128-bit scalar views of the round-key registers (used by ldp/ldr)
  221. my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
  222. my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
  223. my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
  224. my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
  225. my $rk2q1="v28.1q"; # .1q view of the register also named $rk2
  226. my $rk3q1="v26.1q"; # .1q view of the register also named $rk3
  227. my $rk4v="v27"; # suffix-less view of the register also named $rk4
  228. #########################################################################################
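  229. # size_t unroll8_eor3_aes_gcm_enc_128_kernel(const unsigned char *in,
  230. # size_t len,
  231. # unsigned char *out,
  232. # u64 *Xi,
  233. # unsigned char ivec[16],
  234. # const void *key);
  235. # len is in bits; the code reads the tag via x3 (Xi), the counter block via x4 (ivec) and the round keys via x5 (key).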
  236. $code.=<<___;
  237. .global unroll8_eor3_aes_gcm_enc_128_kernel
  238. .type unroll8_eor3_aes_gcm_enc_128_kernel,%function
  239. .align 4
  240. unroll8_eor3_aes_gcm_enc_128_kernel:
  241. AARCH64_VALID_CALL_TARGET
  242. cbz x1, .L128_enc_ret
  243. stp d8, d9, [sp, #-80]!
  244. mov $counter, x4
  245. mov $cc, x5
  246. stp d10, d11, [sp, #16]
  247. stp d12, d13, [sp, #32]
  248. stp d14, d15, [sp, #48]
  249. mov x5, #0xc200000000000000
  250. stp x5, xzr, [sp, #64]
  251. add $modulo_constant, sp, #64
  252. mov $constant_temp, #0x100000000 @ set up counter increment
  253. movi $rctr_inc.16b, #0x0
  254. mov $rctr_inc.d[1], $constant_temp
  255. lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
  256. ld1 { $ctr0b}, [$counter] @ CTR block 0
  257. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  258. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  259. rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
  260. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
  261. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
  262. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
  263. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
  264. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
  265. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
  266. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
  267. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
  268. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
  269. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
  270. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
  271. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  272. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
  273. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
  274. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
  275. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
  276. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
  277. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
  278. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  279. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  280. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  281. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  282. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
  283. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
  284. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  285. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  286. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
  287. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
  288. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
  289. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  290. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
  291. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  292. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
  293. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  294. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  295. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  296. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  297. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
  298. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  299. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
  300. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
  301. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  302. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  303. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
  304. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  305. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
  306. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  307. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
  308. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
  309. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
  310. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  311. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
  312. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
  313. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
  314. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  315. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  316. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  317. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  318. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
  319. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  320. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  321. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  322. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  323. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  324. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
  325. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
  326. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
  327. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
  328. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  329. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  330. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
  331. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
  332. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
  333. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  334. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  335. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  336. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
  337. ld1 { $acc_lb}, [$current_tag]
  338. ext $acc_lb, $acc_lb, $acc_lb, #8
  339. rev64 $acc_lb, $acc_lb
  340. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
  341. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
  342. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  343. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
  344. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  345. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  346. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  347. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  348. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  349. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  350. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  351. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  352. ldr $rk10q, [$cc, #160] @ load rk10
  353. aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
  354. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  355. aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
  356. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  357. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  358. aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
  359. aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
  360. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  361. aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
  362. aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
  363. aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
  364. aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
  365. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  366. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  367. b.ge .L128_enc_tail @ handle tail
  368. ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext
  369. ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext
  370. ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
  371. ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
  372. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  373. eor3 $res0b, $ctr_t0b, $ctr0b, $rk10 @ AES block 0 - result
  374. rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
  375. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
  376. eor3 $res1b, $ctr_t1b, $ctr1b, $rk10 @ AES block 1 - result
  377. stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result
  378. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
  379. eor3 $res5b, $ctr_t5b, $ctr5b, $rk10 @ AES block 5 - result
  380. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
  381. eor3 $res2b, $ctr_t2b, $ctr2b, $rk10 @ AES block 2 - result
  382. eor3 $res6b, $ctr_t6b, $ctr6b, $rk10 @ AES block 6 - result
  383. eor3 $res4b, $ctr_t4b, $ctr4b, $rk10 @ AES block 4 - result
  384. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
  385. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
  386. eor3 $res3b, $ctr_t3b, $ctr3b, $rk10 @ AES block 3 - result
  387. eor3 $res7b, $ctr_t7b, $ctr7b,$rk10 @ AES block 7 - result
  388. stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result
  389. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
  390. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
  391. stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
  392. stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
  393. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
  394. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
  395. b.ge .L128_enc_prepretail @ do prepretail
  396. .L128_enc_main_loop: @ main loop start
  397. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  398. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  399. ext $h5.16b, $h5.16b, $h5.16b, #8
  400. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  401. ext $h6.16b, $h6.16b, $h6.16b, #8
  402. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  403. rev64 $res1b, $res1b @ GHASH block 8k+1
  404. rev64 $res0b, $res0b @ GHASH block 8k
  405. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  406. ext $h7.16b, $h7.16b, $h7.16b, #8
  407. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  408. ext $h8.16b, $h8.16b, $h8.16b, #8
  409. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  410. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  411. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  412. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  413. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  414. rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
  415. rev64 $res3b, $res3b @ GHASH block 8k+3
  416. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  417. eor $res0b, $res0b, $acc_lb @ PRE 1
  418. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  419. rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
  420. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  421. rev64 $res2b, $res2b @ GHASH block 8k+2
  422. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  423. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  424. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  425. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  426. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  427. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  428. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  429. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  430. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  431. ext $h3.16b, $h3.16b, $h3.16b, #8
  432. ldr $h4q, [$current_tag, #112] @ load h3l | h3h
  433. ext $h4.16b, $h4.16b, $h4.16b, #8
  434. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  435. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  436. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  437. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  438. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  439. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  440. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  441. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  442. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  443. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  444. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  445. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  446. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  447. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  448. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  449. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  450. eor3 $acc_hb, $acc_hb, $t1.16b,$t2.16b @ GHASH block 8k+2, 8k+3 - high
  451. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  452. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  453. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  454. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  455. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  456. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  457. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  458. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  459. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  460. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  461. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  462. rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
  463. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  464. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  465. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  466. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  467. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  468. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  469. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  470. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  471. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  472. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  473. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  474. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  475. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  476. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  477. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  478. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  479. rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
  480. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  481. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  482. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  483. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  484. ext $h1.16b, $h1.16b, $h1.16b, #8
  485. ldr $h2q, [$current_tag, #64] @ load h1l | h1h
  486. ext $h2.16b, $h2.16b, $h2.16b, #8
  487. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  488. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  489. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  490. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  491. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  492. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  493. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  494. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  495. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  496. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  497. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  498. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  499. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  500. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  501. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  502. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  503. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  504. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  505. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  506. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  507. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  508. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  509. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  510. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  511. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  512. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  513. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  514. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  515. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  516. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  517. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  518. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  519. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  520. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  521. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  522. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  523. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  524. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  525. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  526. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  527. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  528. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  529. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  530. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  531. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  532. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  533. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  534. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  535. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  536. ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext
  537. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  538. rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
  539. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
  540. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  541. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  542. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  543. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  544. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  545. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  546. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  547. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  548. ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext
  549. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  550. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  551. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  552. pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  553. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  554. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  555. rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
  556. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  557. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  558. ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load plaintext
  559. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
  560. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  561. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  562. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  563. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  564. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  565. ldr $rk10q, [$cc, #160] @ load rk10
  566. ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  567. rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
  568. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
  569. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  570. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  571. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  572. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  573. aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
  574. aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
  575. aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
  576. ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load plaintext
  577. rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
  578. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
  579. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  580. eor3 $res4b, $ctr_t4b, $ctr4b, $rk10 @ AES block 4 - result
  581. aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
  582. aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
  583. aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
  584. eor3 $res2b, $ctr_t2b, $ctr2b, $rk10 @ AES block 8k+10 - result
  585. mov $ctr2.16b, $h3.16b @ CTR block 8k+18
  586. aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
  587. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
  588. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
  589. eor3 $res7b, $ctr_t7b, $ctr7b, $rk10 @ AES block 7 - result
  590. aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
  591. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  592. eor3 $res1b, $ctr_t1b, $ctr1b, $rk10 @ AES block 8k+9 - result
  593. eor3 $res3b, $ctr_t3b, $ctr3b, $rk10 @ AES block 8k+11 - result
  594. mov $ctr3.16b, $h4.16b @ CTR block 8k+19
  595. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  596. eor3 $res5b, $ctr_t5b, $ctr5b, $rk10 @ AES block 5 - result
  597. mov $ctr1.16b, $h2.16b @ CTR block 8k+17
  598. eor3 $res0b, $ctr_t0b, $ctr0b, $rk10 @ AES block 8k+8 - result
  599. mov $ctr0.16b, $h1.16b @ CTR block 8k+16
  600. stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
  601. stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
  602. eor3 $res6b, $ctr_t6b, $ctr6b, $rk10 @ AES block 6 - result
  603. stp $res4q, $res5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
  604. eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
  605. stp $res6q, $res7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
  606. b.lt .L128_enc_main_loop
  607. .L128_enc_prepretail: @ PREPRETAIL
  608. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  609. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  610. ext $h7.16b, $h7.16b, $h7.16b, #8
  611. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  612. ext $h8.16b, $h8.16b, $h8.16b, #8
  613. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  614. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  615. ext $h5.16b, $h5.16b, $h5.16b, #8
  616. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  617. ext $h6.16b, $h6.16b, $h6.16b, #8
  618. rev64 $res0b, $res0b @ GHASH block 8k
  619. rev64 $res1b, $res1b @ GHASH block 8k+1
  620. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  621. ldr $h78kq, [$current_tag, #192] @ load h6k | h5k
  622. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  623. rev64 $res3b, $res3b @ GHASH block 8k+3
  624. rev64 $res2b, $res2b @ GHASH block 8k+2
  625. eor $res0b, $res0b, $acc_lb @ PRE 1
  626. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  627. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  628. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  629. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  630. rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
  631. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  632. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  633. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  634. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  635. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  636. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  637. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  638. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  639. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  640. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  641. rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
  642. rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
  643. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  644. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  645. rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
  646. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  647. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  648. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  649. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  650. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  651. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  652. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  653. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  654. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  655. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  656. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  657. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  658. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  659. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  660. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  661. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  662. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  663. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  664. ext $h3.16b, $h3.16b, $h3.16b, #8
  665. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  666. ext $h4.16b, $h4.16b, $h4.16b, #8
  667. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  668. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  669. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  670. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  671. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  672. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  673. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  674. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  675. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  676. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  677. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  678. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  679. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  680. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  681. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  682. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  683. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  684. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  685. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  686. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  687. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  688. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  689. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  690. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  691. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  692. ext $h1.16b, $h1.16b, $h1.16b, #8
  693. ldr $h2q, [$current_tag, #64] @ load h1l | h1h
  694. ext $h2.16b, $h2.16b, $h2.16b, #8
  695. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  696. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  697. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  698. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  699. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  700. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  701. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  702. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  703. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  704. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  705. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  706. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  707. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  708. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  709. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  710. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  711. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  712. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  713. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  714. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  715. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  716. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  717. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  718. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  719. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  720. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  721. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  722. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  723. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  724. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  725. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  726. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  727. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  728. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  729. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  730. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  731. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  732. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  733. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  734. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  735. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  736. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  737. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  738. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  739. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  740. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  741. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  742. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  743. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  744. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  745. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  746. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  747. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  748. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  749. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  750. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  751. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  752. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  753. pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  754. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  755. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  756. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  757. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  758. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  759. ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  760. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  761. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  762. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  763. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  764. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  765. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  766. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  767. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  768. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  769. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  770. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  771. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  772. eor3 $acc_lb, $acc_lb, $acc_hb, $acc_mb @ MODULO - fold into low
  773. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  774. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  775. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  776. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  777. ldr $rk10q, [$cc, #160] @ load rk10
  778. aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
  779. aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
  780. aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
  781. aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
  782. aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
  783. aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
  784. aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
  785. aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
  786. .L128_enc_tail: @ TAIL
  787. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  788. ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext
  789. mov $t1.16b, $rk10
  790. ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h
  791. ext $h5.16b, $h5.16b, $h5.16b, #8
  792. eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result
  793. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  794. ldp $h6q, $h7q, [$current_tag, #160] @ load h6k | h5k
  795. ext $h6.16b, $h6.16b, $h6.16b, #8
  796. ext $h7.16b, $h7.16b, $h7.16b, #8
  797. ldp $h78kq, $h8q, [$current_tag, #192] @ load h7l | h7h
  798. ext $h8.16b, $h8.16b, $h8.16b, #8
  799. cmp $main_end_input_ptr, #112
  800. b.gt .L128_enc_blocks_more_than_7
  801. mov $ctr7b, $ctr6b
  802. mov $ctr6b, $ctr5b
  803. movi $acc_h.8b, #0
  804. cmp $main_end_input_ptr, #96
  805. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  806. mov $ctr5b, $ctr4b
  807. mov $ctr4b, $ctr3b
  808. mov $ctr3b, $ctr2b
  809. mov $ctr2b, $ctr1b
  810. movi $acc_l.8b, #0
  811. movi $acc_m.8b, #0
  812. b.gt .L128_enc_blocks_more_than_6
  813. mov $ctr7b, $ctr6b
  814. cmp $main_end_input_ptr, #80
  815. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  816. mov $ctr6b, $ctr5b
  817. mov $ctr5b, $ctr4b
  818. mov $ctr4b, $ctr3b
  819. mov $ctr3b, $ctr1b
  820. b.gt .L128_enc_blocks_more_than_5
  821. cmp $main_end_input_ptr, #64
  822. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  823. mov $ctr7b, $ctr6b
  824. mov $ctr6b, $ctr5b
  825. mov $ctr5b, $ctr4b
  826. mov $ctr4b, $ctr1b
  827. b.gt .L128_enc_blocks_more_than_4
  828. mov $ctr7b, $ctr6b
  829. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  830. mov $ctr6b, $ctr5b
  831. mov $ctr5b, $ctr1b
  832. cmp $main_end_input_ptr, #48
  833. b.gt .L128_enc_blocks_more_than_3
  834. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  835. mov $ctr7b, $ctr6b
  836. mov $ctr6b, $ctr1b
  837. cmp $main_end_input_ptr, #32
  838. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  839. b.gt .L128_enc_blocks_more_than_2
  840. cmp $main_end_input_ptr, #16
  841. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  842. mov $ctr7b, $ctr1b
  843. b.gt .L128_enc_blocks_more_than_1
  844. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  845. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  846. b .L128_enc_blocks_less_than_1
  847. .L128_enc_blocks_more_than_7: @ blocks left > 7
  848. st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result
  849. rev64 $res0b, $res1b @ GHASH final-7 block
  850. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext
  851. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  852. ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
  853. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
  854. ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
  855. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
  856. movi $t0.8b, #0 @ surpress further partial tag feed in
  857. eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result
  858. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
  859. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
  860. .L128_enc_blocks_more_than_6: @ blocks left > 6
  861. st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result
  862. rev64 $res0b, $res1b @ GHASH final-6 block
  863. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext
  864. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  865. ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
  866. eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result
  867. pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
  868. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
  869. movi $t0.8b, #0 @ surpress further partial tag feed in
  870. pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
  871. pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
  872. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
  873. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
  874. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
  875. .L128_enc_blocks_more_than_5: @ blocks left > 5
  876. st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result
  877. rev64 $res0b, $res1b @ GHASH final-5 block
  878. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  879. ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
  880. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext
  881. pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
  882. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
  883. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
  884. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
  885. eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result
  886. pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
  887. movi $t0.8b, #0 @ surpress further partial tag feed in
  888. pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
  889. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
  890. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
  891. .L128_enc_blocks_more_than_4: @ blocks left > 4
  892. st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result
  893. rev64 $res0b, $res1b @ GHASH final-4 block
  894. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext
  895. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  896. ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
  897. movi $t0.8b, #0 @ surpress further partial tag feed in
  898. pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
  899. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
  900. pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
  901. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
  902. pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
  903. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
  904. eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result
  905. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
  906. .L128_enc_blocks_more_than_3: @ blocks left > 3
  907. st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
  908. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  909. ext $h4.16b, $h4.16b, $h4.16b, #8
  910. rev64 $res0b, $res1b @ GHASH final-3 block
  911. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  912. movi $t0.8b, #0 @ surpress further partial tag feed in
  913. ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
  914. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  915. pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
  916. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext
  917. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  918. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
  919. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
  920. eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result
  921. pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
  922. pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
  923. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
  924. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
  925. .L128_enc_blocks_more_than_2: @ blocks left > 2
  926. st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
  927. rev64 $res0b, $res1b @ GHASH final-2 block
  928. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  929. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext
  930. ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
  931. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  932. ext $h3.16b, $h3.16b, $h3.16b, #8
  933. movi $t0.8b, #0 @ surpress further partial tag feed in
  934. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  935. eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result
  936. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  937. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  938. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  939. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  940. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  941. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  942. .L128_enc_blocks_more_than_1: @ blocks left > 1
  943. st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
  944. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  945. ext $h2.16b, $h2.16b, $h2.16b, #8
  946. rev64 $res0b, $res1b @ GHASH final-1 block
  947. ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext
  948. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  949. movi $t0.8b, #0 @ surpress further partial tag feed in
  950. ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
  951. eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result
  952. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  953. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  954. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  955. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  956. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  957. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  958. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  959. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  960. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  961. .L128_enc_blocks_less_than_1: @ blocks left <= 1
  962. rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
  963. str $rtmp_ctrq, [$counter] @ store the updated counter
  964. and $bit_length, $bit_length, #127 @ bit_length %= 128
  965. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  966. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  967. mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
  968. ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
  969. and $bit_length, $bit_length, #127 @ bit_length %= 128
  970. lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
  971. mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
  972. cmp $bit_length, #64
  973. csel $temp2_x, $temp1_x, $temp0_x, lt
  974. csel $temp3_x, $temp0_x, xzr, lt
  975. mov $ctr0.d[1], $temp3_x
  976. mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
  977. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  978. rev64 $res0b, $res1b @ GHASH final block
  979. bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
  980. st1 { $res1b}, [$output_ptr] @ store all 16B
  981. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  982. ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
  983. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  984. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  985. ext $h1.16b, $h1.16b, $h1.16b, #8
  986. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  987. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  988. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  989. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  990. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  991. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  992. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  993. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  994. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  995. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  996. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  997. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  998. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  999. eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
  1000. ext $acc_lb, $acc_lb, $acc_lb, #8
  1001. rev64 $acc_lb, $acc_lb
  1002. st1 { $acc_l.16b }, [$current_tag]
  1003. lsr x0, $bit_length, #3 @ return sizes
  1004. ldp d10, d11, [sp, #16]
  1005. ldp d12, d13, [sp, #32]
  1006. ldp d14, d15, [sp, #48]
  1007. ldp d8, d9, [sp], #80
  1008. ret
  1009. .L128_enc_ret:
  1010. mov w0, #0x0
  1011. ret
  1012. .size unroll8_eor3_aes_gcm_enc_128_kernel,.-unroll8_eor3_aes_gcm_enc_128_kernel
  1013. ___
  1014. #########################################################################################
  1015. # size_t unroll8_eor3_aes_gcm_dec_128_kernel(const unsigned char *in,
  1016. # size_t len, /* length in bits, not bytes */
  1017. # unsigned char *out,
  1018. # u64 *Xi,
  1019. # unsigned char ivec[16],
  1020. # const void *key);
  1021. #
  1022. $code.=<<___;
  1023. .global unroll8_eor3_aes_gcm_dec_128_kernel
  1024. .type unroll8_eor3_aes_gcm_dec_128_kernel,%function
  1025. .align 4
  1026. unroll8_eor3_aes_gcm_dec_128_kernel:
  1027. AARCH64_VALID_CALL_TARGET
  1028. cbz x1, .L128_dec_ret
  1029. stp d8, d9, [sp, #-80]!
  1030. mov $counter, x4
  1031. mov $cc, x5
  1032. stp d10, d11, [sp, #16]
  1033. stp d12, d13, [sp, #32]
  1034. stp d14, d15, [sp, #48]
  1035. mov x5, #0xc200000000000000
  1036. stp x5, xzr, [sp, #64]
  1037. add $modulo_constant, sp, #64
  1038. lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
  1039. ld1 { $ctr0b}, [$counter] @ CTR block 0
  1040. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  1041. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  1042. mov $constant_temp, #0x100000000 @ set up counter increment
  1043. movi $rctr_inc.16b, #0x0
  1044. mov $rctr_inc.d[1], $constant_temp
  1045. ld1 { $acc_lb}, [$current_tag]
  1046. ext $acc_lb, $acc_lb, $acc_lb, #8
  1047. rev64 $acc_lb, $acc_lb
  1048. rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
  1049. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  1050. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
  1051. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
  1052. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
  1053. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  1054. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
  1055. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
  1056. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  1057. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
  1058. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
  1059. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  1060. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  1061. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
  1062. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
  1063. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
  1064. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
  1065. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  1066. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
  1067. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
  1068. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
  1069. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  1070. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
  1071. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
  1072. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
  1073. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  1074. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
  1075. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  1076. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
  1077. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
  1078. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
  1079. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
  1080. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
  1081. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  1082. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  1083. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
  1084. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  1085. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
  1086. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
  1087. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  1088. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  1089. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
  1090. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  1091. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  1092. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
  1093. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  1094. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
  1095. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  1096. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  1097. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  1098. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
  1099. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
  1100. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
  1101. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  1102. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  1103. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
  1104. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
  1105. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  1106. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  1107. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  1108. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  1109. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
  1110. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  1111. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
  1112. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
  1113. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
  1114. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  1115. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  1116. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  1117. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
  1118. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
  1119. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  1120. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  1121. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
  1122. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
  1123. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  1124. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
  1125. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  1126. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
  1127. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
  1128. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  1129. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
  1130. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  1131. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  1132. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  1133. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
  1134. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
  1135. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  1136. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  1137. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
  1138. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  1139. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
  1140. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  1141. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
  1142. aese $ctr0b, $rk9 @ AES block 0 - round 9
  1143. aese $ctr1b, $rk9 @ AES block 1 - round 9
  1144. aese $ctr6b, $rk9 @ AES block 6 - round 9
  1145. ldr $rk10q, [$cc, #160] @ load rk10
  1146. aese $ctr4b, $rk9 @ AES block 4 - round 9
  1147. aese $ctr3b, $rk9 @ AES block 3 - round 9
  1148. aese $ctr2b, $rk9 @ AES block 2 - round 9
  1149. aese $ctr5b, $rk9 @ AES block 5 - round 9
  1150. aese $ctr7b, $rk9 @ AES block 7 - round 9
  1151. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  1152. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  1153. b.ge .L128_dec_tail @ handle tail
  1154. ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext
  1155. eor3 $ctr0b, $res0b, $ctr0b, $rk10 @ AES block 0 - result
  1156. eor3 $ctr1b, $res1b, $ctr1b, $rk10 @ AES block 1 - result
  1157. stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result
  1158. rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
  1159. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
  1160. ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext
  1161. ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext
  1162. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
  1163. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
  1164. ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext
  1165. eor3 $ctr3b, $res3b, $ctr3b, $rk10 @ AES block 3 - result
  1166. eor3 $ctr2b, $res2b, $ctr2b, $rk10 @ AES block 2 - result
  1167. stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result
  1168. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
  1169. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
  1170. eor3 $ctr6b, $res6b, $ctr6b, $rk10 @ AES block 6 - result
  1171. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
  1172. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
  1173. eor3 $ctr4b, $res4b, $ctr4b, $rk10 @ AES block 4 - result
  1174. eor3 $ctr5b, $res5b, $ctr5b, $rk10 @ AES block 5 - result
  1175. stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result
  1176. eor3 $ctr7b, $res7b, $ctr7b, $rk10 @ AES block 7 - result
  1177. stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result
  1178. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
  1179. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  1180. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
  1181. b.ge .L128_dec_prepretail @ do prepretail
  1182. .L128_dec_main_loop: @ main loop start
  1183. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  1184. ext $h7.16b, $h7.16b, $h7.16b, #8
  1185. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  1186. ext $h8.16b, $h8.16b, $h8.16b, #8
  1187. rev64 $res1b, $res1b @ GHASH block 8k+1
  1188. rev64 $res0b, $res0b @ GHASH block 8k
  1189. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  1190. rev64 $res6b, $res6b @ GHASH block 8k+6
  1191. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  1192. ext $h5.16b, $h5.16b, $h5.16b, #8
  1193. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  1194. ext $h6.16b, $h6.16b, $h6.16b, #8
  1195. eor $res0b, $res0b, $acc_lb @ PRE 1
  1196. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  1197. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  1198. rev64 $res2b, $res2b @ GHASH block 8k+2
  1199. rev64 $res4b, $res4b @ GHASH block 8k+4
  1200. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  1201. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  1202. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  1203. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  1204. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  1205. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  1206. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  1207. rev64 $res3b, $res3b @ GHASH block 8k+3
  1208. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  1209. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  1210. rev64 $res5b, $res5b @ GHASH block 8k+5
  1211. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  1212. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  1213. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  1214. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  1215. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  1216. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  1217. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  1218. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  1219. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  1220. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  1221. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  1222. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  1223. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  1224. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  1225. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  1226. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  1227. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  1228. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  1229. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  1230. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  1231. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  1232. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  1233. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  1234. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  1235. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  1236. ext $h3.16b, $h3.16b, $h3.16b, #8
  1237. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  1238. ext $h4.16b, $h4.16b, $h4.16b, #8
  1239. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  1240. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  1241. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  1242. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  1243. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  1244. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  1245. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  1246. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  1247. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  1248. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  1249. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  1250. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  1251. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  1252. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  1253. ext $h1.16b, $h1.16b, $h1.16b, #8
  1254. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  1255. ext $h2.16b, $h2.16b, $h2.16b, #8
  1256. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  1257. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  1258. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  1259. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  1260. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  1261. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  1262. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  1263. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  1264. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  1265. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  1266. rev64 $res7b, $res7b @ GHASH block 8k+7
  1267. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  1268. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  1269. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  1270. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  1271. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  1272. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  1273. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  1274. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  1275. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  1276. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  1277. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  1278. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  1279. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  1280. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  1281. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  1282. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  1283. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  1284. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  1285. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  1286. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  1287. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  1288. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  1289. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  1290. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  1291. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  1292. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  1293. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  1294. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  1295. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  1296. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  1297. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  1298. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  1299. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  1300. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  1301. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  1302. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  1303. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  1304. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  1305. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  1306. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  1307. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  1308. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  1309. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  1310. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  1311. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  1312. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  1313. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  1314. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  1315. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  1316. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  1317. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  1318. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  1319. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  1320. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  1321. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  1322. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  1323. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  1324. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  1325. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  1326. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  1327. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  1328. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  1329. rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
  1330. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  1331. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
  1332. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  1333. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  1334. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  1335. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  1336. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  1337. rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
  1338. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  1339. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  1340. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  1341. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  1342. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  1343. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
  1344. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  1345. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  1346. ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext
  1347. ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext
  1348. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  1349. rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
  1350. ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext
  1351. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  1352. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  1353. ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext
  1354. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  1355. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
  1356. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  1357. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  1358. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  1359. aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
  1360. aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
  1361. ldr $rk10q, [$cc, #160] @ load rk10
  1362. aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
  1363. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  1364. aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
  1365. aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
  1366. aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
  1367. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  1368. rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
  1369. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
  1370. aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
  1371. aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
  1372. eor3 $ctr1b, $res1b, $ctr1b, $rk10 @ AES block 8k+9 - result
  1373. eor3 $ctr0b, $res0b, $ctr0b, $rk10 @ AES block 8k+8 - result
  1374. eor3 $ctr7b, $res7b, $ctr7b, $rk10 @ AES block 8k+15 - result
  1375. eor3 $ctr6b, $res6b, $ctr6b, $rk10 @ AES block 8k+14 - result
  1376. eor3 $ctr2b, $res2b, $ctr2b, $rk10 @ AES block 8k+10 - result
  1377. stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
  1378. mov $ctr1.16b, $h2.16b @ CTR block 8k+17
  1379. eor3 $ctr4b, $res4b, $ctr4b, $rk10 @ AES block 8k+12 - result
  1380. eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
  1381. mov $ctr0.16b, $h1.16b @ CTR block 8k+16
  1382. eor3 $ctr3b, $res3b, $ctr3b, $rk10 @ AES block 8k+11 - result
  1383. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  1384. stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
  1385. eor3 $ctr5b, $res5b, $ctr5b, $rk10 @ AES block 8k+13 - result
  1386. mov $ctr2.16b, $h3.16b @ CTR block 8k+18
  1387. stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
  1388. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
  1389. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
  1390. stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
  1391. mov $ctr3.16b, $h4.16b @ CTR block 8k+19
  1392. b.lt .L128_dec_main_loop
  1393. .L128_dec_prepretail: @ PREPRETAIL
  1394. rev64 $res3b, $res3b @ GHASH block 8k+3
  1395. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  1396. rev64 $res0b, $res0b @ GHASH block 8k
  1397. rev64 $res2b, $res2b @ GHASH block 8k+2
  1398. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  1399. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  1400. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  1401. ext $h7.16b, $h7.16b, $h7.16b, #8
  1402. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  1403. ext $h8.16b, $h8.16b, $h8.16b, #8
  1404. eor $res0b, $res0b, $acc_lb @ PRE 1
  1405. rev64 $res1b, $res1b @ GHASH block 8k+1
  1406. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  1407. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  1408. ext $h5.16b, $h5.16b, $h5.16b, #8
  1409. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  1410. ext $h6.16b, $h6.16b, $h6.16b, #8
  1411. rev64 $res5b, $res5b @ GHASH block 8k+5
  1412. rev64 $res4b, $res4b @ GHASH block 8k+4
  1413. rev64 $res6b, $res6b @ GHASH block 8k+6
  1414. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  1415. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  1416. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  1417. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  1418. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  1419. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  1420. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  1421. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  1422. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  1423. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  1424. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  1425. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  1426. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  1427. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  1428. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  1429. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  1430. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  1431. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  1432. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  1433. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  1434. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  1435. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  1436. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  1437. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  1438. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  1439. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  1440. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  1441. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  1442. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  1443. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  1444. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  1445. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  1446. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  1447. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  1448. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  1449. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  1450. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  1451. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  1452. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  1453. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  1454. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  1455. ext $h3.16b, $h3.16b, $h3.16b, #8
  1456. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  1457. ext $h4.16b, $h4.16b, $h4.16b, #8
  1458. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  1459. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  1460. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  1461. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  1462. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  1463. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  1464. ext $h1.16b, $h1.16b, $h1.16b, #8
  1465. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  1466. ext $h2.16b, $h2.16b, $h2.16b, #8
  1467. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  1468. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  1469. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  1470. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  1471. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  1472. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  1473. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  1474. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  1475. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  1476. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  1477. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  1478. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  1479. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  1480. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  1481. rev64 $res7b, $res7b @ GHASH block 8k+7
  1482. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  1483. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  1484. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  1485. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  1486. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  1487. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  1488. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  1489. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  1490. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  1491. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  1492. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  1493. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  1494. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  1495. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  1496. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  1497. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  1498. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  1499. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  1500. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  1501. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  1502. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  1503. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  1504. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  1505. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  1506. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  1507. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  1508. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  1509. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  1510. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  1511. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  1512. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  1513. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  1514. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  1515. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  1516. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  1517. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  1518. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  1519. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  1520. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  1521. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  1522. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  1523. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  1524. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  1525. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  1526. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  1527. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  1528. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  1529. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  1530. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  1531. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  1532. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  1533. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  1534. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  1535. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  1536. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  1537. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  1538. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  1539. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  1540. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  1541. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  1542. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  1543. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  1544. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  1545. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  1546. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  1547. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  1548. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  1549. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  1550. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  1551. ldr $rk10q, [$cc, #160] @ load rk10
  1552. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  1553. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  1554. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  1555. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  1556. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  1557. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  1558. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  1559. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  1560. aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
  1561. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  1562. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  1563. eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
  1564. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  1565. aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
  1566. aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
  1567. aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
  1568. aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
  1569. aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
  1570. aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
  1571. aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
  1572. .L128_dec_tail: @ TAIL - dispatch on bytes left; 1..8 blocks are handled by the labelled sections below
  1573. mov $t1.16b, $rk10 @ keep rk10 in t1: it is the last operand of every tail eor3 result
  1574. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  1575. cmp $main_end_input_ptr, #112 @ more than 7 blocks left?
  1576. ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k and h8l | h8h
  1577. ext $h8.16b, $h8.16b, $h8.16b, #8
  1578. ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext
  1579. ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h and h6k | h5k
  1580. ext $h5.16b, $h5.16b, $h5.16b, #8
  1581. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  1582. ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h and h7l | h7h
  1583. ext $h6.16b, $h6.16b, $h6.16b, #8
  1584. ext $h7.16b, $h7.16b, $h7.16b, #8
  1585. eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result
  1586. b.gt .L128_dec_blocks_more_than_7
  1587. cmp $main_end_input_ptr, #96 @ more than 6 blocks left?
  1588. mov $ctr7b, $ctr6b @ final-7 section skipped: shift keystream ctr1..ctr6 up into ctr2..ctr7
  1589. movi $acc_l.8b, #0 @ final-7 section skipped: GHASH accumulators start from zero
  1590. movi $acc_h.8b, #0
  1591. mov $ctr6b, $ctr5b
  1592. mov $ctr5b, $ctr4b
  1593. mov $ctr4b, $ctr3b
  1594. mov $ctr3b, $ctr2b
  1595. mov $ctr2b, $ctr1b
  1596. movi $acc_m.8b, #0
  1597. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ one counter block unused - step the counter back
  1598. b.gt .L128_dec_blocks_more_than_6
  1599. cmp $main_end_input_ptr, #80 @ more than 5 blocks left?
  1600. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  1601. mov $ctr7b, $ctr6b
  1602. mov $ctr6b, $ctr5b
  1603. mov $ctr5b, $ctr4b
  1604. mov $ctr4b, $ctr3b
  1605. mov $ctr3b, $ctr1b
  1606. b.gt .L128_dec_blocks_more_than_5
  1607. cmp $main_end_input_ptr, #64 @ more than 4 blocks left?
  1608. mov $ctr7b, $ctr6b
  1609. mov $ctr6b, $ctr5b
  1610. mov $ctr5b, $ctr4b
  1611. mov $ctr4b, $ctr1b
  1612. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  1613. b.gt .L128_dec_blocks_more_than_4
  1614. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  1615. mov $ctr7b, $ctr6b
  1616. mov $ctr6b, $ctr5b
  1617. mov $ctr5b, $ctr1b
  1618. cmp $main_end_input_ptr, #48 @ more than 3 blocks left?
  1619. b.gt .L128_dec_blocks_more_than_3
  1620. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  1621. mov $ctr7b, $ctr6b
  1622. cmp $main_end_input_ptr, #32 @ more than 2 blocks left?
  1623. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  1624. mov $ctr6b, $ctr1b
  1625. b.gt .L128_dec_blocks_more_than_2
  1626. cmp $main_end_input_ptr, #16 @ more than 1 block left?
  1627. mov $ctr7b, $ctr1b
  1628. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  1629. b.gt .L128_dec_blocks_more_than_1
  1630. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  1631. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  1632. b .L128_dec_blocks_less_than_1
  1633. .L128_dec_blocks_more_than_7: @ blocks left > 7
  1634. rev64 $res0b, $res1b @ GHASH final-7 block
  1635. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1636. ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
  1637. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
  1638. ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
  1639. movi $t0.8b, #0 @ surpress further partial tag feed in
  1640. ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext
  1641. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
  1642. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
  1643. st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result
  1644. eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result
  1645. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
  1646. .L128_dec_blocks_more_than_6: @ blocks left > 6
  1647. rev64 $res0b, $res1b @ GHASH final-6 block
  1648. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1649. ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
  1650. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
  1651. pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
  1652. ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext
  1653. movi $t0.8b, #0 @ surpress further partial tag feed in
  1654. pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
  1655. st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result
  1656. pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
  1657. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
  1658. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
  1659. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
  1660. eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result
  1661. .L128_dec_blocks_more_than_5: @ blocks left > 5
  1662. rev64 $res0b, $res1b @ GHASH final-5 block
  1663. ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext
  1664. st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result
  1665. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1666. ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
  1667. eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result
  1668. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
  1669. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
  1670. pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
  1671. movi $t0.8b, #0 @ surpress further partial tag feed in
  1672. pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
  1673. pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
  1674. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
  1675. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
  1676. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
  1677. .L128_dec_blocks_more_than_4: @ blocks left > 4
  1678. rev64 $res0b, $res1b @ GHASH final-4 block
  1679. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1680. ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext
  1681. ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
  1682. movi $t0.8b, #0 @ surpress further partial tag feed in
  1683. pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
  1684. pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
  1685. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
  1686. st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result
  1687. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
  1688. eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result
  1689. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
  1690. pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
  1691. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
  1692. .L128_dec_blocks_more_than_3: @ blocks left > 3
  1693. st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result
  1694. rev64 $res0b, $res1b @ GHASH final-3 block
  1695. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1696. ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
  1697. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  1698. ext $h4.16b, $h4.16b, $h4.16b, #8
  1699. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  1700. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  1701. ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext
  1702. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
  1703. pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
  1704. pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
  1705. movi $t0.8b, #0 @ surpress further partial tag feed in
  1706. eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result
  1707. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
  1708. pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
  1709. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
  1710. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
  1711. .L128_dec_blocks_more_than_2: @ blocks left > 2
  1712. rev64 $res0b, $res1b @ GHASH final-2 block
  1713. st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result
  1714. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1715. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  1716. ext $h3.16b, $h3.16b, $h3.16b, #8
  1717. movi $t0.8b, #0 @ surpress further partial tag feed in
  1718. ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
  1719. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  1720. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  1721. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  1722. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  1723. ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext
  1724. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  1725. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  1726. eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result
  1727. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  1728. .L128_dec_blocks_more_than_1: @ blocks left > 1
  1729. st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result
  1730. rev64 $res0b, $res1b @ GHASH final-1 block
  1731. ldr $h2q, [$current_tag, #64] @ load h1l | h1h
  1732. ext $h2.16b, $h2.16b, $h2.16b, #8
  1733. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1734. movi $t0.8b, #0 @ surpress further partial tag feed in
  1735. ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
  1736. ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext
  1737. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  1738. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  1739. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  1740. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  1741. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  1742. eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result
  1743. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  1744. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  1745. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  1746. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  1747. .L128_dec_blocks_less_than_1: @ blocks left <= 1
  1748. and $bit_length, $bit_length, #127 @ bit_length %= 128
  1749. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  1750. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  1751. mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
  1752. and $bit_length, $bit_length, #127 @ bit_length %= 128
  1753. lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
  1754. cmp $bit_length, #64
  1755. mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
  1756. csel $temp2_x, $temp1_x, $temp0_x, lt
  1757. csel $temp3_x, $temp0_x, xzr, lt
  1758. mov $ctr0.d[1], $temp3_x
  1759. mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
  1760. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  1761. ext $h1.16b, $h1.16b, $h1.16b, #8
  1762. ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
  1763. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  1764. rev64 $res0b, $res1b @ GHASH final block
  1765. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1766. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  1767. ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
  1768. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  1769. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  1770. bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
  1771. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  1772. st1 { $res4b}, [$output_ptr] @ store all 16B
  1773. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  1774. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  1775. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  1776. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  1777. eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  1778. pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  1779. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  1780. eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up
  1781. eor3 $acc_mb, $acc_mb, $acc_hb, $t11.16b @ MODULO - fold into mid
  1782. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  1783. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  1784. eor3 $acc_lb, $acc_lb, $acc_mb, $acc_hb @ MODULO - fold into low
  1785. ext $acc_lb, $acc_lb, $acc_lb, #8
  1786. rev64 $acc_lb, $acc_lb
  1787. st1 { $acc_l.16b }, [$current_tag]
  1788. rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
  1789. str $rtmp_ctrq, [$counter] @ store the updated counter
  1790. lsr x0, $bit_length, #3
  1791. ldp d10, d11, [sp, #16]
  1792. ldp d12, d13, [sp, #32]
  1793. ldp d14, d15, [sp, #48]
  1794. ldp d8, d9, [sp], #80
  1795. ret
  1796. .L128_dec_ret:
  1797. mov w0, #0x0
  1798. ret
  1799. .size unroll8_eor3_aes_gcm_dec_128_kernel,.-unroll8_eor3_aes_gcm_dec_128_kernel
  1800. ___
  1801. }
  1802. {
  1803. my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
  1804. my ($temp2_x,$temp3_x)=map("x$_",(13..14));
  1805. my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
  1806. my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
  1807. my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
  1808. my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
  1809. my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));
  1810. my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
  1811. my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
  1812. my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
  1813. my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
  1814. my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
  1815. my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
  1816. my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
  1817. my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
  1818. my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
  1819. my $t0="v16";
  1820. my $t0d="d16";
  1821. my $t1="v29";
  1822. my $t2=$res1;
  1823. my $t3=$t1;
  1824. my $t4=$res0;
  1825. my $t5=$res2;
  1826. my $t6=$t0;
  1827. my $t7=$res3;
  1828. my $t8=$res4;
  1829. my $t9=$res5;
  1830. my $t10=$res6;
  1831. my $t11="v21";
  1832. my $t12=$t1;
  1833. my $rtmp_ctr="v30";
  1834. my $rtmp_ctrq="q30";
  1835. my $rctr_inc="v31";
  1836. my $rctr_incd="d31";
  1837. my $mod_constantd=$t0d;
  1838. my $mod_constant=$t0;
  1839. my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
  1840. my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
  1841. my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
  1842. my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
  1843. my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
  1844. my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
  1845. my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
  1846. my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
  1847. my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
  1848. my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
  1849. my $rk2q1="v28.1q";
  1850. my $rk3q1="v26.1q";
  1851. my $rk4v="v27";
  1852. #########################################################################################
  1853. # size_t unroll8_eor3_aes_gcm_enc_192_kernel(const unsigned char *in,
  1854. # size_t len,
  1855. # unsigned char *out,
  1856. # u64 *Xi,
  1857. # unsigned char ivec[16],
  1858. # const void *key);
  1859. #
  1860. $code.=<<___;
  1861. .global unroll8_eor3_aes_gcm_enc_192_kernel
  1862. .type unroll8_eor3_aes_gcm_enc_192_kernel,%function
  1863. .align 4
  1864. unroll8_eor3_aes_gcm_enc_192_kernel:
  1865. AARCH64_VALID_CALL_TARGET
  1866. cbz x1, .L192_enc_ret
  1867. stp d8, d9, [sp, #-80]!
  1868. mov $counter, x4
  1869. mov $cc, x5
  1870. stp d10, d11, [sp, #16]
  1871. stp d12, d13, [sp, #32]
  1872. stp d14, d15, [sp, #48]
  1873. mov x5, #0xc200000000000000
  1874. stp x5, xzr, [sp, #64]
  1875. add $modulo_constant, sp, #64
  1876. lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
  1877. ld1 { $ctr0b}, [$counter] @ CTR block 0
  1878. mov $constant_temp, #0x100000000 @ set up counter increment
  1879. movi $rctr_inc.16b, #0x0
  1880. mov $rctr_inc.d[1], $constant_temp
  1881. rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
  1882. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
  1883. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
  1884. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
  1885. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
  1886. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
  1887. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
  1888. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
  1889. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
  1890. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
  1891. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  1892. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  1893. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
  1894. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
  1895. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  1896. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  1897. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
  1898. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
  1899. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
  1900. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
  1901. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
  1902. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  1903. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  1904. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  1905. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
  1906. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
  1907. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  1908. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  1909. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
  1910. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
  1911. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  1912. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  1913. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
  1914. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
  1915. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
  1916. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  1917. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  1918. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
  1919. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  1920. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  1921. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  1922. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  1923. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
  1924. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
  1925. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  1926. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
  1927. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
  1928. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  1929. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  1930. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  1931. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  1932. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
  1933. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  1934. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  1935. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
  1936. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  1937. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  1938. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
  1939. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
  1940. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
  1941. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
  1942. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  1943. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  1944. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  1945. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
  1946. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
  1947. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  1948. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
  1949. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
  1950. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  1951. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
  1952. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
  1953. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
  1954. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  1955. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  1956. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
  1957. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  1958. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  1959. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
  1960. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  1961. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
  1962. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  1963. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
  1964. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  1965. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
  1966. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  1967. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  1968. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
  1969. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
  1970. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  1971. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
  1972. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  1973. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
  1974. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  1975. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  1976. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
  1977. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  1978. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  1979. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
  1980. ld1 { $acc_lb}, [$current_tag]
  1981. ext $acc_lb, $acc_lb, $acc_lb, #8
  1982. rev64 $acc_lb, $acc_lb
  1983. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  1984. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
  1985. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
  1986. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
  1987. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
  1988. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
  1989. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
  1990. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 14 - round 10
  1991. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
  1992. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 11 - round 10
  1993. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 9 - round 10
  1994. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 13 - round 10
  1995. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 12 - round 10
  1996. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8 - round 10
  1997. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 10 - round 10
  1998. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 15 - round 10
  1999. aese $ctr6b, $rk11 @ AES block 14 - round 11
  2000. aese $ctr3b, $rk11 @ AES block 11 - round 11
  2001. aese $ctr4b, $rk11 @ AES block 12 - round 11
  2002. aese $ctr7b, $rk11 @ AES block 15 - round 11
  2003. ldr $rk12q, [$cc, #192] @ load rk12
  2004. aese $ctr1b, $rk11 @ AES block 9 - round 11
  2005. aese $ctr5b, $rk11 @ AES block 13 - round 11
  2006. aese $ctr2b, $rk11 @ AES block 10 - round 11
  2007. aese $ctr0b, $rk11 @ AES block 8 - round 11
  2008. b.ge .L192_enc_tail @ handle tail
  2009. ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext
  2010. ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext
  2011. ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
  2012. ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
  2013. eor3 $res0b, $ctr_t0b, $ctr0b, $rk12 @ AES block 0 - result
  2014. rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
  2015. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
  2016. eor3 $res3b, $ctr_t3b, $ctr3b, $rk12 @ AES block 3 - result
  2017. eor3 $res1b, $ctr_t1b, $ctr1b, $rk12 @ AES block 1 - result
  2018. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
  2019. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
  2020. eor3 $res4b, $ctr_t4b, $ctr4b, $rk12 @ AES block 4 - result
  2021. eor3 $res5b, $ctr_t5b, $ctr5b, $rk12 @ AES block 5 - result
  2022. eor3 $res7b, $ctr_t7b, $ctr7b, $rk12 @ AES block 7 - result
  2023. stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result
  2024. eor3 $res2b, $ctr_t2b, $ctr2b, $rk12 @ AES block 2 - result
  2025. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
  2026. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
  2027. stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result
  2028. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  2029. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
  2030. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
  2031. eor3 $res6b, $ctr_t6b, $ctr6b, $rk12 @ AES block 6 - result
  2032. stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
  2033. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
  2034. stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
  2035. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
  2036. b.ge .L192_enc_prepretail @ do prepretail
  2037. .L192_enc_main_loop: @ main loop start
  2038. rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
  2039. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  2040. rev64 $res2b, $res2b @ GHASH block 8k+2
  2041. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  2042. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  2043. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  2044. ext $h7.16b, $h7.16b, $h7.16b, #8
  2045. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  2046. ext $h8.16b, $h8.16b, $h8.16b, #8
  2047. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  2048. rev64 $res0b, $res0b @ GHASH block 8k
  2049. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  2050. ext $h5.16b, $h5.16b, $h5.16b, #8
  2051. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  2052. ext $h6.16b, $h6.16b, $h6.16b, #8
  2053. rev64 $res1b, $res1b @ GHASH block 8k+1
  2054. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  2055. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  2056. eor $res0b, $res0b, $acc_lb @ PRE 1
  2057. rev64 $res3b, $res3b @ GHASH block 8k+3
  2058. rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
  2059. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  2060. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  2061. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  2062. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  2063. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  2064. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  2065. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  2066. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  2067. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  2068. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  2069. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  2070. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  2071. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  2072. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  2073. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  2074. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  2075. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  2076. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  2077. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  2078. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  2079. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  2080. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  2081. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  2082. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  2083. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  2084. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  2085. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  2086. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  2087. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  2088. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  2089. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  2090. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  2091. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  2092. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  2093. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  2094. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  2095. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  2096. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  2097. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  2098. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  2099. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  2100. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  2101. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  2102. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  2103. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  2104. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  2105. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  2106. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  2107. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  2108. ext $h3.16b, $h3.16b, $h3.16b, #8
  2109. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  2110. ext $h4.16b, $h4.16b, $h4.16b, #8
  2111. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  2112. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  2113. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  2114. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  2115. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  2116. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  2117. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  2118. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  2119. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  2120. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  2121. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  2122. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  2123. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  2124. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  2125. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  2126. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  2127. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  2128. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  2129. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  2130. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  2131. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  2132. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  2133. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  2134. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  2135. ext $h1.16b, $h1.16b, $h1.16b, #8
  2136. ldr $h2q, [$current_tag, #64] @ load h1l | h1h
  2137. ext $h2.16b, $h2.16b, $h2.16b, #8
  2138. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  2139. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  2140. rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
  2141. rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
  2142. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  2143. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  2144. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  2145. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  2146. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  2147. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  2148. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  2149. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  2150. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  2151. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  2152. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  2153. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  2154. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  2155. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  2156. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  2157. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  2158. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  2159. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  2160. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  2161. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  2162. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  2163. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  2164. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  2165. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  2166. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  2167. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  2168. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  2169. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  2170. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  2171. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  2172. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  2173. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  2174. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  2175. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  2176. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  2177. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  2178. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  2179. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  2180. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  2181. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  2182. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  2183. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  2184. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  2185. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  2186. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  2187. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  2188. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  2189. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  2190. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  2191. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  2192. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  2193. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  2194. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  2195. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  2196. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  2197. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  2198. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  2199. rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
  2200. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
  2201. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
  2202. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  2203. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  2204. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
  2205. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
  2206. ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext
  2207. pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  2208. rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
  2209. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
  2210. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
  2211. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
  2212. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
  2213. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  2214. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
  2215. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
  2216. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
  2217. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
  2218. ldr $rk12q, [$cc, #192] @ load rk12
  2219. ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  2220. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
  2221. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
  2222. ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext
  2223. aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
  2224. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  2225. ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load plaintext
  2226. ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load plaintext
  2227. aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
  2228. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
  2229. rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
  2230. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
  2231. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
  2232. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  2233. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
  2234. aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
  2235. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
  2236. aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
  2237. aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
  2238. eor3 $res4b, $ctr_t4b, $ctr4b, $rk12 @ AES block 4 - result
  2239. aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
  2240. aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
  2241. aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
  2242. rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
  2243. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
  2244. eor3 $res7b, $ctr_t7b, $ctr7b, $rk12 @ AES block 7 - result
  2245. eor3 $res2b, $ctr_t2b, $ctr2b, $rk12 @ AES block 8k+10 - result
  2246. eor3 $res0b, $ctr_t0b, $ctr0b, $rk12 @ AES block 8k+8 - result
  2247. mov $ctr2.16b, $h3.16b @ CTR block 8k+18
  2248. eor3 $res1b, $ctr_t1b, $ctr1b, $rk12 @ AES block 8k+9 - result
  2249. mov $ctr1.16b, $h2.16b @ CTR block 8k+17
  2250. stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
  2251. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  2252. eor3 $res6b, $ctr_t6b, $ctr6b, $rk12 @ AES block 6 - result
  2253. mov $ctr0.16b, $h1.16b @ CTR block 8k+16
  2254. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
  2255. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
  2256. eor3 $res5b, $ctr_t5b, $ctr5b, $rk12 @ AES block 5 - result
  2257. eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
  2258. eor3 $res3b, $ctr_t3b, $ctr3b, $rk12 @ AES block 8k+11 - result
  2259. mov $ctr3.16b, $h4.16b @ CTR block 8k+19
  2260. stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
  2261. stp $res4q, $res5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
  2262. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  2263. stp $res6q, $res7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
  2264. b.lt .L192_enc_main_loop
  2265. .L192_enc_prepretail: @ PREPRETAIL
  2266. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  2267. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 (PREPRETAIL hashes res0-res7 and runs AES rounds 0-11 on ctr0-ctr7; it loads no input and stores no output)
  2268. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  2269. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  2270. ext $h7.16b, $h7.16b, $h7.16b, #8
  2271. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  2272. ext $h8.16b, $h8.16b, $h8.16b, #8
  2273. rev64 $res0b, $res0b @ GHASH block 8k
  2274. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 - swap the 64-bit halves of the running tag before it is folded into block 8k (PRE 1)
  2275. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  2276. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  2277. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  2278. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  2279. rev64 $res3b, $res3b @ GHASH block 8k+3
  2280. rev64 $res2b, $res2b @ GHASH block 8k+2
  2281. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  2282. ext $h5.16b, $h5.16b, $h5.16b, #8
  2283. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  2284. ext $h6.16b, $h6.16b, $h6.16b, #8
  2285. eor $res0b, $res0b, $acc_lb @ PRE 1
  2286. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  2287. rev64 $res1b, $res1b @ GHASH block 8k+1
  2288. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  2289. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  2290. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  2291. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  2292. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  2293. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  2294. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  2295. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  2296. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  2297. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  2298. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  2299. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  2300. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  2301. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  2302. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  2303. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  2304. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  2305. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  2306. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  2307. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  2308. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  2309. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  2310. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  2311. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  2312. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  2313. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  2314. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  2315. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  2316. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  2317. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  2318. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  2319. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  2320. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  2321. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  2322. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  2323. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  2324. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  2325. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  2326. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  2327. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  2328. rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
  2329. rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
  2330. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  2331. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  2332. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  2333. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  2334. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  2335. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  2336. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  2337. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  2338. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  2339. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  2340. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  2341. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  2342. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  2343. ext $h3.16b, $h3.16b, $h3.16b, #8
  2344. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  2345. ext $h4.16b, $h4.16b, $h4.16b, #8
  2346. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  2347. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  2348. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  2349. ext $h1.16b, $h1.16b, $h1.16b, #8
  2350. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  2351. ext $h2.16b, $h2.16b, $h2.16b, #8
  2352. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  2353. rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
  2354. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  2355. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  2356. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  2357. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  2358. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  2359. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  2360. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  2361. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  2362. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  2363. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  2364. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  2365. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  2366. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  2367. rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
  2368. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  2369. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  2370. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  2371. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  2372. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  2373. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  2374. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  2375. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  2376. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  2377. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  2378. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  2379. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  2380. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  2381. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  2382. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  2383. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  2384. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  2385. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  2386. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  2387. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  2388. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  2389. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  2390. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  2391. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  2392. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  2393. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  2394. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  2395. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  2396. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  2397. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  2398. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  2399. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  2400. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  2401. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  2402. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  2403. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  2404. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  2405. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  2406. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  2407. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  2408. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  2409. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  2410. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  2411. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  2412. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  2413. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  2414. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  2415. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  2416. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  2417. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  2418. ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  2419. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  2420. pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  2421. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  2422. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  2423. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  2424. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  2425. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  2426. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  2427. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
  2428. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  2429. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  2430. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  2431. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  2432. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
  2433. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  2434. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
  2435. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
  2436. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
  2437. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  2438. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
  2439. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
  2440. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
  2441. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  2442. ldr $rk12q, [$cc, #192] @ load rk12 (not applied in this section; the TAIL copies it to t1 for the final eor3 of each block)
  2443. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
  2444. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
  2445. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
  2446. eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
  2447. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
  2448. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
  2449. aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
  2450. aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
  2451. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
  2452. aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
  2453. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
  2454. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
  2455. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  2456. aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
  2457. aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
  2458. aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
  2459. aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
  2460. aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
  2461. .L192_enc_tail: @ TAIL
  2462. ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h and h6k | h5k (TAIL: encrypt the first remaining block, then dispatch on the number of bytes left)
  2463. ext $h5.16b, $h5.16b, $h5.16b, #8
  2464. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  2465. ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext
  2466. ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k and h8l | h8h
  2467. ext $h8.16b, $h8.16b, $h8.16b, #8
  2468. mov $t1.16b, $rk12
  2469. ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h and h7l | h7h
  2470. ext $h6.16b, $h6.16b, $h6.16b, #8
  2471. ext $h7.16b, $h7.16b, $h7.16b, #8
  2472. cmp $main_end_input_ptr, #112 @ more than 7 blocks (7 * 16 bytes) left?
  2473. eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result
  2474. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  2475. b.gt .L192_enc_blocks_more_than_7
  2476. cmp $main_end_input_ptr, #96 @ more than 6 blocks left?
  2477. mov $ctr7b, $ctr6b
  2478. movi $acc_h.8b, #0 @ at most 7 blocks left: GHASH high accumulator starts from zero (the tag was saved in t0 above)
  2479. mov $ctr6b, $ctr5b
  2480. movi $acc_l.8b, #0 @ GHASH low accumulator starts from zero
  2481. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  2482. mov $ctr5b, $ctr4b
  2483. mov $ctr4b, $ctr3b
  2484. mov $ctr3b, $ctr2b
  2485. mov $ctr2b, $ctr1b
  2486. movi $acc_m.8b, #0 @ GHASH mid accumulator starts from zero
  2487. b.gt .L192_enc_blocks_more_than_6
  2488. mov $ctr7b, $ctr6b
  2489. cmp $main_end_input_ptr, #80 @ more than 5 blocks left?
  2490. mov $ctr6b, $ctr5b
  2491. mov $ctr5b, $ctr4b
  2492. mov $ctr4b, $ctr3b
  2493. mov $ctr3b, $ctr1b
  2494. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  2495. b.gt .L192_enc_blocks_more_than_5
  2496. cmp $main_end_input_ptr, #64 @ more than 4 blocks left?
  2497. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  2498. mov $ctr7b, $ctr6b
  2499. mov $ctr6b, $ctr5b
  2500. mov $ctr5b, $ctr4b
  2501. mov $ctr4b, $ctr1b
  2502. b.gt .L192_enc_blocks_more_than_4
  2503. mov $ctr7b, $ctr6b
  2504. mov $ctr6b, $ctr5b
  2505. mov $ctr5b, $ctr1b
  2506. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  2507. cmp $main_end_input_ptr, #48 @ more than 3 blocks left?
  2508. b.gt .L192_enc_blocks_more_than_3
  2509. mov $ctr7b, $ctr6b
  2510. mov $ctr6b, $ctr1b
  2511. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  2512. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  2513. cmp $main_end_input_ptr, #32 @ more than 2 blocks left?
  2514. b.gt .L192_enc_blocks_more_than_2
  2515. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  2516. cmp $main_end_input_ptr, #16 @ more than 1 block left?
  2517. mov $ctr7b, $ctr1b
  2518. b.gt .L192_enc_blocks_more_than_1
  2519. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  2520. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  2521. b .L192_enc_blocks_less_than_1
  2522. .L192_enc_blocks_more_than_7: @ blocks left > 7
  2523. st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result (this block initialises acc_l/acc_h/acc_m directly with pmull, so they are not zeroed first)
  2524. rev64 $res0b, $res1b @ GHASH final-7 block
  2525. ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
  2526. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2527. ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
  2528. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext
  2529. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
  2530. movi $t0.8b, #0 @ suppress further partial tag feed in
  2531. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
  2532. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
  2533. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
  2534. eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result
  2535. .L192_enc_blocks_more_than_6: @ blocks left > 6
  2536. st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result
  2537. rev64 $res0b, $res1b @ GHASH final-6 block
  2538. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext (encrypted below with ctr2)
  2539. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2540. ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
  2541. pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
  2542. eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result
  2543. movi $t0.8b, #0 @ suppress further partial tag feed in
  2544. pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
  2545. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
  2546. pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
  2547. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
  2548. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
  2549. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
  2550. .L192_enc_blocks_more_than_5: @ blocks left > 5
  2551. st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result
  2552. rev64 $res0b, $res1b @ GHASH final-5 block
  2553. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2554. ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
  2555. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext (encrypted below with ctr3)
  2556. pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
  2557. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
  2558. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
  2559. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
  2560. pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
  2561. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
  2562. pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
  2563. eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result
  2564. movi $t0.8b, #0 @ suppress further partial tag feed in
  2565. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
  2566. .L192_enc_blocks_more_than_4: @ blocks left > 4
  2567. st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result
  2568. rev64 $res0b, $res1b @ GHASH final-4 block
  2569. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2570. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext
  2571. pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
  2572. ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
  2573. pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
  2574. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
  2575. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
  2576. movi $t0.8b, #0 @ surpress further partial tag feed in
  2577. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
  2578. pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
  2579. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
  2580. eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result
  2581. .L192_enc_blocks_more_than_3: @ blocks left > 3
  2582. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  2583. st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
  2584. rev64 $res0b, $res1b @ GHASH final-3 block
  2585. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2586. movi $t0.8b, #0 @ surpress further partial tag feed in
  2587. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext
  2588. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  2589. ext $h4.16b, $h4.16b, $h4.16b, #8
  2590. ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
  2591. eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result
  2592. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  2593. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
  2594. pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
  2595. pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
  2596. pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
  2597. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
  2598. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
  2599. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
  2600. .L192_enc_blocks_more_than_2: @ blocks left > 2
  2601. st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
  2602. rev64 $res0b, $res1b @ GHASH final-2 block
  2603. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  2604. ext $h3.16b, $h3.16b, $h3.16b, #8
  2605. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2606. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext
  2607. ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
  2608. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  2609. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  2610. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  2611. movi $t0.8b, #0 @ surpress further partial tag feed in
  2612. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  2613. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  2614. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  2615. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  2616. eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result
  2617. .L192_enc_blocks_more_than_1: @ blocks left > 1
  2618. ldr $h2q, [$current_tag, #64] @ load h1l | h1h
  2619. ext $h2.16b, $h2.16b, $h2.16b, #8
  2620. st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
  2621. rev64 $res0b, $res1b @ GHASH final-1 block
  2622. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2623. ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
  2624. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  2625. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  2626. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  2627. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  2628. ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext
  2629. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  2630. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  2631. eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result
  2632. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  2633. movi $t0.8b, #0 @ surpress further partial tag feed in
  2634. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  2635. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  2636. .L192_enc_blocks_less_than_1: @ blocks left <= 1
  2637. mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
  2638. and $bit_length, $bit_length, #127 @ bit_length %= 128
  2639. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  2640. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  2641. and $bit_length, $bit_length, #127 @ bit_length %= 128
  2642. lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
  2643. cmp $bit_length, #64
  2644. mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
  2645. csel $temp2_x, $temp1_x, $temp0_x, lt
  2646. csel $temp3_x, $temp0_x, xzr, lt
  2647. mov $ctr0.d[1], $temp3_x
  2648. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  2649. ext $h1.16b, $h1.16b, $h1.16b, #8
  2650. ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
  2651. mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
  2652. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  2653. rev64 $res0b, $res1b @ GHASH final block
  2654. bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
  2655. st1 { $res1b}, [$output_ptr] @ store all 16B
  2656. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2657. ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
  2658. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  2659. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  2660. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  2661. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  2662. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  2663. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  2664. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  2665. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  2666. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  2667. rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
  2668. str $rtmp_ctrq, [$counter] @ store the updated counter
  2669. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  2670. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  2671. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  2672. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  2673. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  2674. eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
  2675. ext $acc_lb, $acc_lb, $acc_lb, #8
  2676. rev64 $acc_lb, $acc_lb
  2677. st1 { $acc_l.16b }, [$current_tag]
  2678. lsr x0, $bit_length, #3 @ return sizes
  2679. ldp d10, d11, [sp, #16]
  2680. ldp d12, d13, [sp, #32]
  2681. ldp d14, d15, [sp, #48]
  2682. ldp d8, d9, [sp], #80
  2683. ret
  2684. .L192_enc_ret:
  2685. mov w0, #0x0
  2686. ret
  2687. .size unroll8_eor3_aes_gcm_enc_192_kernel,.-unroll8_eor3_aes_gcm_enc_192_kernel
  2688. ___
  2689. #########################################################################################
  2690. # size_t unroll8_eor3_aes_gcm_dec_192_kernel(const unsigned char *in,
  2691. # size_t len,
  2692. # unsigned char *out,
  2693. # u64 *Xi,
  2694. # unsigned char ivec[16],
  2695. # const void *key);
  2696. #
  2697. $code.=<<___;
  2698. .global unroll8_eor3_aes_gcm_dec_192_kernel
  2699. .type unroll8_eor3_aes_gcm_dec_192_kernel,%function
  2700. .align 4
  2701. unroll8_eor3_aes_gcm_dec_192_kernel:
  2702. AARCH64_VALID_CALL_TARGET
  2703. cbz x1, .L192_dec_ret
  2704. stp d8, d9, [sp, #-80]!
  2705. mov $counter, x4
  2706. mov $cc, x5
  2707. stp d10, d11, [sp, #16]
  2708. stp d12, d13, [sp, #32]
  2709. stp d14, d15, [sp, #48]
  2710. mov x5, #0xc200000000000000
  2711. stp x5, xzr, [sp, #64]
  2712. add $modulo_constant, sp, #64
  2713. lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
  2714. ld1 { $ctr0b}, [$counter] @ CTR block 0
  2715. ld1 { $acc_lb}, [$current_tag]
  2716. mov $constant_temp, #0x100000000 @ set up counter increment
  2717. movi $rctr_inc.16b, #0x0
  2718. mov $rctr_inc.d[1], $constant_temp
  2719. rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
  2720. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
  2721. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
  2722. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
  2723. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
  2724. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
  2725. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
  2726. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
  2727. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
  2728. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
  2729. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
  2730. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
  2731. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  2732. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
  2733. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
  2734. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
  2735. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  2736. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
  2737. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
  2738. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  2739. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  2740. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
  2741. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  2742. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
  2743. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  2744. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  2745. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  2746. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  2747. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  2748. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
  2749. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
  2750. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
  2751. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
  2752. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  2753. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
  2754. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
  2755. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  2756. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  2757. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  2758. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
  2759. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
  2760. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
  2761. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  2762. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  2763. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
  2764. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  2765. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  2766. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
  2767. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  2768. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
  2769. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  2770. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  2771. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
  2772. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  2773. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
  2774. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
  2775. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  2776. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
  2777. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
  2778. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
  2779. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  2780. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  2781. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
  2782. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  2783. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  2784. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  2785. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
  2786. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  2787. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
  2788. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
  2789. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  2790. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  2791. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  2792. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
  2793. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
  2794. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  2795. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  2796. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
  2797. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  2798. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
  2799. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  2800. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  2801. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
  2802. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
  2803. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  2804. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
  2805. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  2806. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  2807. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  2808. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
  2809. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
  2810. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
  2811. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
  2812. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  2813. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  2814. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  2815. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
  2816. ld1 { $acc_lb}, [$current_tag]
  2817. ext $acc_lb, $acc_lb, $acc_lb, #8
  2818. rev64 $acc_lb, $acc_lb
  2819. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  2820. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
  2821. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  2822. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
  2823. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
  2824. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
  2825. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  2826. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
  2827. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
  2828. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
  2829. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
  2830. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
  2831. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10
  2832. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10
  2833. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
  2834. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
  2835. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10
  2836. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10
  2837. ldr $rk12q, [$cc, #192] @ load rk12
  2838. aese $ctr0b, $rk11 @ AES block 0 - round 11
  2839. aese $ctr1b, $rk11 @ AES block 1 - round 11
  2840. aese $ctr4b, $rk11 @ AES block 4 - round 11
  2841. aese $ctr6b, $rk11 @ AES block 6 - round 11
  2842. aese $ctr5b, $rk11 @ AES block 5 - round 11
  2843. aese $ctr7b, $rk11 @ AES block 7 - round 11
  2844. aese $ctr2b, $rk11 @ AES block 2 - round 11
  2845. aese $ctr3b, $rk11 @ AES block 3 - round 11
  2846. b.ge .L192_dec_tail @ handle tail
  2847. ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext
  2848. ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext
  2849. ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext
  2850. eor3 $ctr1b, $res1b, $ctr1b, $rk12 @ AES block 1 - result
  2851. eor3 $ctr0b, $res0b, $ctr0b, $rk12 @ AES block 0 - result
  2852. stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result
  2853. rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
  2854. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
  2855. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
  2856. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
  2857. eor3 $ctr3b, $res3b, $ctr3b, $rk12 @ AES block 3 - result
  2858. eor3 $ctr2b, $res2b, $ctr2b, $rk12 @ AES block 2 - result
  2859. stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result
  2860. ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext
  2861. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
  2862. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
  2863. eor3 $ctr4b, $res4b, $ctr4b, $rk12 @ AES block 4 - result
  2864. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
  2865. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
  2866. eor3 $ctr5b, $res5b, $ctr5b, $rk12 @ AES block 5 - result
  2867. stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result
  2868. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  2869. eor3 $ctr6b, $res6b, $ctr6b, $rk12 @ AES block 6 - result
  2870. eor3 $ctr7b, $res7b, $ctr7b, $rk12 @ AES block 7 - result
  2871. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
  2872. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
  2873. stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result
  2874. b.ge .L192_dec_prepretail @ do prepretail
  2875. .L192_dec_main_loop: @ main loop start
  2876. rev64 $res1b, $res1b @ GHASH block 8k+1
  2877. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  2878. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  2879. rev64 $res0b, $res0b @ GHASH block 8k
  2880. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  2881. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  2882. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  2883. ext $h7.16b, $h7.16b, $h7.16b, #8
  2884. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  2885. ext $h8.16b, $h8.16b, $h8.16b, #8
  2886. rev64 $res4b, $res4b @ GHASH block 8k+4
  2887. rev64 $res3b, $res3b @ GHASH block 8k+3
  2888. eor $res0b, $res0b, $acc_lb @ PRE 1
  2889. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  2890. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  2891. rev64 $res5b, $res5b @ GHASH block 8k+5
  2892. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  2893. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  2894. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  2895. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  2896. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  2897. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  2898. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  2899. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  2900. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  2901. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  2902. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  2903. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  2904. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  2905. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  2906. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  2907. ext $h5.16b, $h5.16b, $h5.16b, #8
  2908. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  2909. ext $h6.16b, $h6.16b, $h6.16b, #8
  2910. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  2911. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  2912. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  2913. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  2914. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  2915. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  2916. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  2917. rev64 $res2b, $res2b @ GHASH block 8k+2
  2918. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  2919. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  2920. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  2921. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  2922. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  2923. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  2924. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  2925. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  2926. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  2927. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  2928. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  2929. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  2930. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  2931. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  2932. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  2933. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  2934. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  2935. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  2936. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  2937. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  2938. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  2939. ext $h3.16b, $h3.16b, $h3.16b, #8
  2940. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  2941. ext $h4.16b, $h4.16b, $h4.16b, #8
  2942. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  2943. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  2944. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  2945. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  2946. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  2947. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  2948. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  2949. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  2950. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  2951. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  2952. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  2953. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  2954. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  2955. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  2956. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  2957. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  2958. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  2959. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  2960. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  2961. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  2962. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  2963. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  2964. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  2965. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  2966. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  2967. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  2968. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  2969. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  2970. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  2971. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  2972. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  2973. ext $h1.16b, $h1.16b, $h1.16b, #8
  2974. ldr $h2q, [$current_tag, #64] @ load h1l | h1h
  2975. ext $h2.16b, $h2.16b, $h2.16b, #8
  2976. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  2977. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  2978. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  2979. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  2980. rev64 $res7b, $res7b @ GHASH block 8k+7
  2981. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  2982. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  2983. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  2984. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  2985. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  2986. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  2987. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  2988. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  2989. rev64 $res6b, $res6b @ GHASH block 8k+6
  2990. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  2991. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  2992. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  2993. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  2994. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  2995. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  2996. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  2997. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  2998. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  2999. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  3000. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  3001. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  3002. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  3003. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  3004. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  3005. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  3006. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  3007. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  3008. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  3009. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  3010. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  3011. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  3012. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  3013. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  3014. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  3015. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  3016. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  3017. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  3018. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  3019. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  3020. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  3021. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  3022. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  3023. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  3024. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  3025. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  3026. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  3027. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  3028. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  3029. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  3030. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  3031. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  3032. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  3033. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  3034. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  3035. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  3036. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  3037. rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
  3038. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
  3039. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
  3040. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  3041. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
  3042. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
  3043. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
  3044. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  3045. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  3046. ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext
  3047. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
  3048. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
  3049. ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext
  3050. rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
  3051. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  3052. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
  3053. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
  3054. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
  3055. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  3056. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
  3057. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
  3058. ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext
  3059. rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
  3060. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
  3061. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  3062. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
  3063. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
  3064. ldr $rk12q, [$cc, #192] @ load rk12
  3065. ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext
  3066. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
  3067. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
  3068. aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
  3069. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  3070. aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
  3071. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
  3072. aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
  3073. aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
  3074. eor3 $ctr0b, $res0b, $ctr0b, $rk12 @ AES block 8k+8 - result
  3075. rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
  3076. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
  3077. aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
  3078. aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
  3079. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
  3080. aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
  3081. aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
  3082. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  3083. eor3 $ctr1b, $res1b, $ctr1b, $rk12 @ AES block 8k+9 - result
  3084. stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
  3085. eor3 $ctr3b, $res3b, $ctr3b, $rk12 @ AES block 8k+11 - result
  3086. eor3 $ctr2b, $res2b, $ctr2b, $rk12 @ AES block 8k+10 - result
  3087. eor3 $ctr7b, $res7b, $ctr7b, $rk12 @ AES block 8k+15 - result
  3088. stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
  3089. eor3 $ctr5b, $res5b, $ctr5b, $rk12 @ AES block 8k+13 - result
  3090. eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
  3091. mov $ctr3.16b, $h4.16b @ CTR block 8k+19
  3092. eor3 $ctr4b, $res4b, $ctr4b, $rk12 @ AES block 8k+12 - result
  3093. stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
  3094. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  3095. eor3 $ctr6b, $res6b, $ctr6b, $rk12 @ AES block 8k+14 - result
  3096. stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
  3097. mov $ctr0.16b, $h1.16b @ CTR block 8k+16
  3098. mov $ctr1.16b, $h2.16b @ CTR block 8k+17
  3099. mov $ctr2.16b, $h3.16b @ CTR block 8k+18
  3100. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
  3101. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
  3102. b.lt .L192_dec_main_loop
  3103. .L192_dec_prepretail: @ PREPRETAIL
  3104. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  3105. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  3106. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  3107. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  3108. ext $h7.16b, $h7.16b, $h7.16b, #8
  3109. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  3110. ext $h8.16b, $h8.16b, $h8.16b, #8
  3111. rev64 $res0b, $res0b @ GHASH block 8k
  3112. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  3113. rev64 $res3b, $res3b @ GHASH block 8k+3
  3114. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  3115. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  3116. eor $res0b, $res0b, $acc_lb @ PRE 1
  3117. rev64 $res2b, $res2b @ GHASH block 8k+2
  3118. rev64 $res1b, $res1b @ GHASH block 8k+1
  3119. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  3120. ext $h5.16b, $h5.16b, $h5.16b, #8
  3121. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  3122. ext $h6.16b, $h6.16b, $h6.16b, #8
  3123. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  3124. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  3125. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  3126. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  3127. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  3128. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  3129. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  3130. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  3131. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  3132. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  3133. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  3134. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  3135. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  3136. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  3137. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  3138. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  3139. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  3140. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  3141. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  3142. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  3143. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  3144. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  3145. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  3146. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  3147. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  3148. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  3149. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  3150. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  3151. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  3152. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  3153. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  3154. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  3155. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  3156. rev64 $res5b, $res5b @ GHASH block 8k+5
  3157. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  3158. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  3159. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  3160. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  3161. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  3162. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  3163. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  3164. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  3165. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  3166. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  3167. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  3168. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  3169. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  3170. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  3171. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  3172. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  3173. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  3174. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  3175. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  3176. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  3177. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  3178. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  3179. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  3180. ext $h3.16b, $h3.16b, $h3.16b, #8
  3181. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  3182. ext $h4.16b, $h4.16b, $h4.16b, #8
  3183. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  3184. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  3185. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  3186. ext $h1.16b, $h1.16b, $h1.16b, #8
  3187. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  3188. ext $h2.16b, $h2.16b, $h2.16b, #8
  3189. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  3190. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  3191. rev64 $res7b, $res7b @ GHASH block 8k+7
  3192. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  3193. rev64 $res4b, $res4b @ GHASH block 8k+4
  3194. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  3195. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  3196. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  3197. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  3198. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  3199. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  3200. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  3201. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  3202. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  3203. rev64 $res6b, $res6b @ GHASH block 8k+6
  3204. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  3205. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  3206. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  3207. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  3208. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  3209. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  3210. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  3211. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  3212. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  3213. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  3214. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  3215. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  3216. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  3217. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  3218. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  3219. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  3220. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  3221. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  3222. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  3223. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  3224. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  3225. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  3226. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  3227. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  3228. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  3229. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  3230. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  3231. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  3232. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  3233. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  3234. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  3235. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  3236. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  3237. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  3238. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  3239. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  3240. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  3241. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  3242. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  3243. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  3244. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  3245. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  3246. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  3247. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  3248. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  3249. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  3250. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  3251. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  3252. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  3253. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  3254. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  3255. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  3256. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  3257. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  3258. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  3259. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  3260. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  3261. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  3262. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  3263. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  3264. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  3265. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  3266. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  3267. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  3268. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  3269. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  3270. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
  3271. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
  3272. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
  3273. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
  3274. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
  3275. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
  3276. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
  3277. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
  3278. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  3279. ldr $rk12q, [$cc, #192] @ load rk12
  3280. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  3281. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
  3282. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
  3283. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
  3284. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
  3285. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
  3286. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
  3287. aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
  3288. eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
  3289. aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
  3290. aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
  3291. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
  3292. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
  3293. aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
  3294. aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
  3295. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  3296. aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
  3297. aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
  3298. aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
  3299. .L192_dec_tail: @ TAIL
  3300. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  3301. ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h
  3302. ext $h5.16b, $h5.16b, $h5.16b, #8
  3303. ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext
  3304. ldp $h78kq, $h8q, [$current_tag, #192] @ load h8l | h8h
  3305. ext $h8.16b, $h8.16b, $h8.16b, #8
  3306. mov $t1.16b, $rk12
  3307. ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h
  3308. ext $h6.16b, $h6.16b, $h6.16b, #8
  3309. ext $h7.16b, $h7.16b, $h7.16b, #8
  3310. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  3311. eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result
  3312. cmp $main_end_input_ptr, #112
  3313. b.gt .L192_dec_blocks_more_than_7
  3314. mov $ctr7b, $ctr6b
  3315. movi $acc_h.8b, #0
  3316. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  3317. mov $ctr6b, $ctr5b
  3318. mov $ctr5b, $ctr4b
  3319. mov $ctr4b, $ctr3b
  3320. cmp $main_end_input_ptr, #96
  3321. movi $acc_l.8b, #0
  3322. mov $ctr3b, $ctr2b
  3323. mov $ctr2b, $ctr1b
  3324. movi $acc_m.8b, #0
  3325. b.gt .L192_dec_blocks_more_than_6
  3326. mov $ctr7b, $ctr6b
  3327. mov $ctr6b, $ctr5b
  3328. mov $ctr5b, $ctr4b
  3329. mov $ctr4b, $ctr3b
  3330. mov $ctr3b, $ctr1b
  3331. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  3332. cmp $main_end_input_ptr, #80
  3333. b.gt .L192_dec_blocks_more_than_5
  3334. mov $ctr7b, $ctr6b
  3335. mov $ctr6b, $ctr5b
  3336. mov $ctr5b, $ctr4b
  3337. mov $ctr4b, $ctr1b
  3338. cmp $main_end_input_ptr, #64
  3339. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  3340. b.gt .L192_dec_blocks_more_than_4
  3341. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  3342. mov $ctr7b, $ctr6b
  3343. mov $ctr6b, $ctr5b
  3344. mov $ctr5b, $ctr1b
  3345. cmp $main_end_input_ptr, #48
  3346. b.gt .L192_dec_blocks_more_than_3
  3347. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  3348. mov $ctr7b, $ctr6b
  3349. cmp $main_end_input_ptr, #32
  3350. mov $ctr6b, $ctr1b
  3351. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  3352. b.gt .L192_dec_blocks_more_than_2
  3353. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  3354. mov $ctr7b, $ctr1b
  3355. cmp $main_end_input_ptr, #16
  3356. b.gt .L192_dec_blocks_more_than_1
  3357. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  3358. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  3359. b .L192_dec_blocks_less_than_1
  3360. .L192_dec_blocks_more_than_7: @ blocks left > 7
  3361. rev64 $res0b, $res1b @ GHASH final-7 block
  3362. ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
  3363. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3364. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
  3365. ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
  3366. ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext
  3367. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
  3368. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
  3369. st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result
  3370. eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result
  3371. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
  3372. movi $t0.8b, #0 @ suppress further partial tag feed in
  3373. .L192_dec_blocks_more_than_6: @ blocks left > 6
  3374. rev64 $res0b, $res1b @ GHASH final-6 block
  3375. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3376. ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext
  3377. ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
  3378. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
  3379. movi $t0.8b, #0 @ suppress further partial tag feed in
  3380. pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
  3381. st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result
  3382. eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result
  3383. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
  3384. pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
  3385. pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
  3386. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
  3387. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
  3388. .L192_dec_blocks_more_than_5: @ blocks left > 5
  3389. rev64 $res0b, $res1b @ GHASH final-5 block
  3390. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3391. ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
  3392. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
  3393. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
  3394. pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
  3395. ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext
  3396. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
  3397. pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
  3398. pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
  3399. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
  3400. movi $t0.8b, #0 @ suppress further partial tag feed in
  3401. st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result
  3402. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
  3403. eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result
  3404. .L192_dec_blocks_more_than_4: @ blocks left > 4
  3405. rev64 $res0b, $res1b @ GHASH final-4 block
  3406. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3407. movi $t0.8b, #0 @ suppress further partial tag feed in
  3408. ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext
  3409. ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
  3410. pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
  3411. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
  3412. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
  3413. pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
  3414. st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result
  3415. pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
  3416. eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result
  3417. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
  3418. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
  3419. .L192_dec_blocks_more_than_3: @ blocks left > 3
  3420. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  3421. ext $h4.16b, $h4.16b, $h4.16b, #8
  3422. rev64 $res0b, $res1b @ GHASH final-3 block
  3423. ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext
  3424. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3425. ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
  3426. pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
  3427. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
  3428. movi $t0.8b, #0 @ suppress further partial tag feed in
  3429. pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
  3430. st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result
  3431. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  3432. eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result
  3433. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
  3434. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  3435. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
  3436. pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
  3437. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
  3438. .L192_dec_blocks_more_than_2: @ blocks left > 2
  3439. rev64 $res0b, $res1b @ GHASH final-2 block
  3440. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  3441. ext $h3.16b, $h3.16b, $h3.16b, #8
  3442. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3443. ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
  3444. ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext
  3445. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  3446. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  3447. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  3448. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  3449. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  3450. movi $t0.8b, #0 @ suppress further partial tag feed in
  3451. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  3452. st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result
  3453. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  3454. eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result
  3455. .L192_dec_blocks_more_than_1: @ blocks left > 1
  3456. rev64 $res0b, $res1b @ GHASH final-1 block
  3457. ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext
  3458. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  3459. ext $h2.16b, $h2.16b, $h2.16b, #8
  3460. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3461. movi $t0.8b, #0 @ suppress further partial tag feed in
  3462. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  3463. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  3464. ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
  3465. st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result
  3466. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  3467. eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result
  3468. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  3469. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  3470. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  3471. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  3472. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  3473. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  3474. .L192_dec_blocks_less_than_1: @ blocks left <= 1
  3475. rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
  3476. and $bit_length, $bit_length, #127 @ bit_length %= 128
  3477. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  3478. str $rtmp_ctrq, [$counter] @ store the updated counter
  3479. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  3480. mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
  3481. and $bit_length, $bit_length, #127 @ bit_length %= 128
  3482. mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
  3483. lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
  3484. cmp $bit_length, #64
  3485. csel $temp2_x, $temp1_x, $temp0_x, lt
  3486. csel $temp3_x, $temp0_x, xzr, lt
  3487. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  3488. ext $h1.16b, $h1.16b, $h1.16b, #8
  3489. mov $ctr0.d[1], $temp3_x
  3490. ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
  3491. mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
  3492. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  3493. bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
  3494. rev64 $res0b, $res1b @ GHASH final block
  3495. st1 { $res4b}, [$output_ptr] @ store all 16B
  3496. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3497. ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
  3498. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  3499. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  3500. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  3501. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  3502. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  3503. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  3504. eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  3505. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  3506. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  3507. pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  3508. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  3509. eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up
  3510. eor3 $acc_mb, $acc_mb, $acc_hb, $t11.16b @ MODULO - fold into mid
  3511. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  3512. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  3513. eor3 $acc_lb, $acc_lb, $acc_mb, $acc_hb @ MODULO - fold into low
  3514. ext $acc_lb, $acc_lb, $acc_lb, #8
  3515. rev64 $acc_lb, $acc_lb
  3516. st1 { $acc_l.16b }, [$current_tag]
  3517. ldp d10, d11, [sp, #16]
  3518. ldp d12, d13, [sp, #32]
  3519. ldp d14, d15, [sp, #48]
  3520. ldp d8, d9, [sp], #80
  3521. ret
  3522. .L192_dec_ret:
  3523. mov w0, #0x0
  3524. ret
  3525. .size unroll8_eor3_aes_gcm_dec_192_kernel,.-unroll8_eor3_aes_gcm_dec_192_kernel
  3526. ___
  3527. }
  3528. {
  # Register aliases for the assembly templates that follow these declarations
  # (the first one visible is unroll8_eor3_aes_gcm_enc_256_kernel).  Each Perl
  # scalar holds the textual name of an AArch64 register and is interpolated
  # into the heredoc.  Many aliases intentionally name the SAME physical
  # register; the comments below list every overlap that these declarations
  # create, because two overlapping aliases can never be live at once.
  #
  # General-purpose scratch registers x4..x7.  x4 and x5 are also read as
  # incoming arguments by the kernel prologue (copied to $counter and $cc)
  # before $main_end_input_ptr (x5) is first written.
  3529. my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
  # Further scratch registers: x13, x14.
  3530. my ($temp2_x,$temp3_x)=map("x$_",(13..14));
  # v0..v7 are the eight counter/keystream blocks, v8..v15 the eight data
  # ("res") blocks.  The same registers are named below with a .16b
  # arrangement suffix, bare (caller appends the arrangement), and as d/q views.
  3531. my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
  3532. my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
  3533. my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
  3534. my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
  3535. my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));
  # $ctr_t0..$ctr_t7 are v8..v15 again, i.e. the same registers as
  # $res0..$res7 under a second set of names.
  3536. my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
  3537. my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
  3538. my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
  # GHASH accumulator parts (high / mid / low) in v17..v19.
  3539. my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
  3540. my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
  # Hash-key powers.  Both six-name groups map onto the SAME registers
  # v20..v25: $h1/$h5 -> v20, $h12k/$h56k -> v21, $h2/$h6 -> v22,
  # $h3/$h7 -> v23, $h34k/$h78k -> v24, $h4/$h8 -> v25.  Loading a name from
  # one group therefore overwrites the corresponding name of the other group.
  3541. my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
  3542. my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
  3543. my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
  3544. my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
  # Temporaries.  Only v16 and v29 are dedicated; every other $tN reuses a
  # register that already has another name:
  #   $t0, $t6            -> v16 (also $mod_constant below)
  #   $t1, $t3, $t12      -> v29
  #   $t4 -> v8 ($res0),  $t2 -> v9 ($res1),  $t5 -> v10 ($res2),
  #   $t7 -> v11 ($res3), $t8 -> v12 ($res4), $t9 -> v13 ($res5),
  #   $t10 -> v14 ($res6)
  #   $t11                -> v21 (same register as $h12k / $h56k)
  3545. my $t0="v16";
  3546. my $t0d="d16";
  3547. my $t1="v29";
  3548. my $t2=$res1;
  3549. my $t3=$t1;
  3550. my $t4=$res0;
  3551. my $t5=$res2;
  3552. my $t6=$t0;
  3553. my $t7=$res3;
  3554. my $t8=$res4;
  3555. my $t9=$res5;
  3556. my $t10=$res6;
  3557. my $t11="v21";
  3558. my $t12=$t1;
  # v30: running counter kept in byte-reversed (rev32) form; v31: the value
  # added to it for each new counter block.
  3559. my $rtmp_ctr="v30";
  3560. my $rtmp_ctrq="q30";
  3561. my $rctr_inc="v31";
  3562. my $rctr_incd="d31";
  # GHASH reduction constant; shares v16/d16 with $t0 and $t6.
  3563. my $mod_constantd=$t0d;
  3564. my $mod_constant=$t0;
  # Round keys rk0..rk14 rotate through just three registers:
  #   rk0, rk3, rk6, rk9,  rk12 -> v26
  #   rk1, rk4, rk7, rk10, rk13 -> v27
  #   rk2, rk5, rk8, rk11, rk14 -> v28
  # so a round key must be (re)loaded before the rounds that use it.
  3565. my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
  3566. my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
  3567. my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
  3568. my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
  3569. my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
  3570. my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
  3571. my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
  3572. my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
  3573. my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
  3574. my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
  # Extra spellings of the round-key registers: v28 and v26 with a .1q
  # arrangement, and bare v27 (caller appends lane/arrangement).
  # NOTE(review): presumably for reusing these registers as GHASH scratch
  # in the tail code -- confirm against the templates that use them.
  3575. my $rk2q1="v28.1q";
  3576. my $rk3q1="v26.1q";
  3577. my $rk4v="v27";
  3578. #########################################################################################
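  3579. # size_t unroll8_eor3_aes_gcm_enc_256_kernel(const unsigned char *in,
  3580. # size_t len, /* length in bits: the code shifts it right by 3 to get bytes */
  3581. # unsigned char *out,
  3582. # u64 *Xi, /* presumably x3 via $current_tag - confirm against the register aliases */
  3583. # unsigned char ivec[16], /* x4, copied to $counter */
  3584. # const void *key); /* x5, copied to $cc and used as the round-key base */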
  3585. #
  3586. $code.=<<___;
  3587. .global unroll8_eor3_aes_gcm_enc_256_kernel
  3588. .type unroll8_eor3_aes_gcm_enc_256_kernel,%function
  3589. .align 4
  3590. unroll8_eor3_aes_gcm_enc_256_kernel:
  3591. AARCH64_VALID_CALL_TARGET
  3592. cbz x1, .L256_enc_ret
  3593. stp d8, d9, [sp, #-80]!
  3594. mov $counter, x4
  3595. mov $cc, x5
  3596. stp d10, d11, [sp, #16]
  3597. stp d12, d13, [sp, #32]
  3598. stp d14, d15, [sp, #48]
  3599. mov x5, #0xc200000000000000
  3600. stp x5, xzr, [sp, #64]
  3601. add $modulo_constant, sp, #64
  3602. ld1 { $ctr0b}, [$counter] @ CTR block 0
  3603. lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
  3604. mov $constant_temp, #0x100000000 @ set up counter increment
  3605. movi $rctr_inc.16b, #0x0
  3606. mov $rctr_inc.d[1], $constant_temp
  3607. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  3608. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  3609. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  3610. rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
  3611. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
  3612. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
  3613. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
  3614. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
  3615. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
  3616. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
  3617. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
  3618. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
  3619. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
  3620. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
  3621. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
  3622. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  3623. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
  3624. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
  3625. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
  3626. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  3627. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
  3628. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  3629. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  3630. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  3631. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
  3632. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
  3633. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
  3634. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  3635. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
  3636. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  3637. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  3638. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
  3639. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
  3640. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  3641. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
  3642. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  3643. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  3644. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  3645. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
  3646. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
  3647. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
  3648. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
  3649. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  3650. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  3651. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
  3652. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  3653. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  3654. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
  3655. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  3656. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
  3657. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
  3658. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  3659. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  3660. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
  3661. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
  3662. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  3663. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  3664. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  3665. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  3666. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
  3667. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
  3668. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  3669. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  3670. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  3671. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  3672. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
  3673. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
  3674. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  3675. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
  3676. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
  3677. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  3678. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
  3679. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
  3680. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  3681. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
  3682. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  3683. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
  3684. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  3685. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  3686. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  3687. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  3688. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
  3689. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
  3690. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  3691. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
  3692. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  3693. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
  3694. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
  3695. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  3696. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  3697. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  3698. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
  3699. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
  3700. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
  3701. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  3702. ld1 { $acc_lb}, [$current_tag]
  3703. ext $acc_lb, $acc_lb, $acc_lb, #8
  3704. rev64 $acc_lb, $acc_lb
  3705. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  3706. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
  3707. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
  3708. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
  3709. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
  3710. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
  3711. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
  3712. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
  3713. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10
  3714. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10
  3715. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
  3716. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
  3717. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10
  3718. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
  3719. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
  3720. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
  3721. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10
  3722. aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 11
  3723. ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
  3724. aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 11
  3725. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
  3726. aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 11
  3727. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
  3728. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
  3729. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
  3730. aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 11
  3731. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
  3732. ldr $rk14q, [$cc, #224] @ load rk14
  3733. aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 12
  3734. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
  3735. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
  3736. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
  3737. aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 12
  3738. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
  3739. aese $ctr2b, $rk13 @ AES block 2 - round 13
  3740. aese $ctr1b, $rk13 @ AES block 1 - round 13
  3741. aese $ctr4b, $rk13 @ AES block 4 - round 13
  3742. aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 12
  3743. aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 12
  3744. aese $ctr0b, $rk13 @ AES block 0 - round 13
  3745. aese $ctr5b, $rk13 @ AES block 5 - round 13
  3746. aese $ctr6b, $rk13 @ AES block 6 - round 13
  3747. aese $ctr7b, $rk13 @ AES block 7 - round 13
  3748. aese $ctr3b, $rk13 @ AES block 3 - round 13
  3749. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  3750. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  3751. b.ge .L256_enc_tail @ handle tail
  3752. ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext
  3753. ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext
  3754. eor3 $res0b, $ctr_t0b, $ctr0b, $rk14 @ AES block 0 - result
  3755. rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
  3756. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
  3757. eor3 $res1b, $ctr_t1b, $ctr1b, $rk14 @ AES block 1 - result
  3758. eor3 $res3b, $ctr_t3b, $ctr3b, $rk14 @ AES block 3 - result
  3759. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
  3760. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
  3761. ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
  3762. ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
  3763. eor3 $res2b, $ctr_t2b, $ctr2b, $rk14 @ AES block 2 - result
  3764. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  3765. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
  3766. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
  3767. stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result
  3768. stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result
  3769. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
  3770. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
  3771. eor3 $res4b, $ctr_t4b, $ctr4b, $rk14 @ AES block 4 - result
  3772. eor3 $res7b, $ctr_t7b, $ctr7b, $rk14 @ AES block 7 - result
  3773. eor3 $res6b, $ctr_t6b, $ctr6b, $rk14 @ AES block 6 - result
  3774. eor3 $res5b, $ctr_t5b, $ctr5b, $rk14 @ AES block 5 - result
  3775. stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
  3776. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
  3777. stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
  3778. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
  3779. b.ge .L256_enc_prepretail @ do prepretail
  3780. .L256_enc_main_loop: @ main loop start
  3781. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  3782. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  3783. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  3784. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  3785. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  3786. rev64 $res3b, $res3b @ GHASH block 8k+3
  3787. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  3788. ext $h5.16b, $h5.16b, $h5.16b, #8
  3789. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  3790. ext $h6.16b, $h6.16b, $h6.16b, #8
  3791. rev64 $res1b, $res1b @ GHASH block 8k+1
  3792. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  3793. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  3794. rev64 $res0b, $res0b @ GHASH block 8k
  3795. rev64 $res4b, $res4b @ GHASH block 8k+4
  3796. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  3797. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  3798. ext $h7.16b, $h7.16b, $h7.16b, #8
  3799. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  3800. ext $h8.16b, $h8.16b, $h8.16b, #8
  3801. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  3802. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  3803. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  3804. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  3805. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  3806. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  3807. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  3808. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  3809. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  3810. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  3811. eor $res0b, $res0b, $acc_lb @ PRE 1
  3812. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  3813. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  3814. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  3815. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  3816. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  3817. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  3818. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  3819. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  3820. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  3821. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  3822. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  3823. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  3824. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  3825. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  3826. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  3827. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  3828. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  3829. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  3830. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  3831. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  3832. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  3833. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  3834. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  3835. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  3836. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  3837. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  3838. rev64 $res6b, $res6b @ GHASH block 8k+6
  3839. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  3840. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  3841. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  3842. rev64 $res2b, $res2b @ GHASH block 8k+2
  3843. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  3844. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  3845. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  3846. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  3847. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  3848. rev64 $res5b, $res5b @ GHASH block 8k+5
  3849. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  3850. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  3851. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  3852. ext $h3.16b, $h3.16b, $h3.16b, #8
  3853. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  3854. ext $h4.16b, $h4.16b, $h4.16b, #8
  3855. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  3856. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  3857. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  3858. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  3859. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  3860. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  3861. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  3862. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  3863. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  3864. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  3865. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  3866. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  3867. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  3868. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  3869. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  3870. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  3871. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  3872. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  3873. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  3874. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  3875. rev64 $res7b, $res7b @ GHASH block 8k+7
  3876. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  3877. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  3878. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  3879. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  3880. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  3881. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  3882. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  3883. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  3884. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  3885. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  3886. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  3887. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  3888. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  3889. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  3890. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  3891. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  3892. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  3893. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  3894. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  3895. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  3896. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  3897. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  3898. ext $h1.16b, $h1.16b, $h1.16b, #8
  3899. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  3900. ext $h2.16b, $h2.16b, $h2.16b, #8
  3901. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  3902. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  3903. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  3904. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  3905. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  3906. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  3907. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  3908. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  3909. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  3910. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  3911. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  3912. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  3913. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  3914. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  3915. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  3916. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  3917. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  3918. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  3919. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  3920. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
  3921. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  3922. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  3923. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  3924. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  3925. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  3926. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  3927. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  3928. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  3929. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  3930. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  3931. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  3932. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
  3933. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
  3934. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  3935. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
  3936. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
  3937. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  3938. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
  3939. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
  3940. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  3941. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  3942. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  3943. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  3944. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  3945. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  3946. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
  3947. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  3948. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  3949. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  3950. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
  3951. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
  3952. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
  3953. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
  3954. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
  3955. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  3956. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
  3957. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
  3958. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
  3959. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  3960. ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
  3961. rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
  3962. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  3963. ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext
  3964. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
  3965. aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
  3966. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
  3967. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
  3968. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
  3969. aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
  3970. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  3971. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
  3972. aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
  3973. aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
  3974. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
  3975. aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
  3976. rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
  3977. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
  3978. aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
  3979. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  3980. aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
  3981. ldr $rk14q, [$cc, #224] @ load rk14
  3982. aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
  3983. ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext
  3984. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
  3985. aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
  3986. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  3987. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
  3988. ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
  3989. ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
  3990. aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
  3991. aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
  3992. rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
  3993. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
  3994. aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
  3995. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
  3996. aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
  3997. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  3998. eor3 $res2b, $ctr_t2b, $ctr2b, $rk14 @ AES block 8k+10 - result
  3999. rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
  4000. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
  4001. aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
  4002. aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
  4003. eor3 $res5b, $ctr_t5b, $ctr5b, $rk14 @ AES block 5 - result
  4004. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  4005. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  4006. aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
  4007. eor3 $res4b, $ctr_t4b, $ctr4b, $rk14 @ AES block 4 - result
  4008. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
  4009. eor3 $res3b, $ctr_t3b, $ctr3b, $rk14 @ AES block 8k+11 - result
  4010. mov $ctr3.16b, $h4.16b @ CTR block 8k+19
  4011. eor3 $res1b, $ctr_t1b, $ctr1b, $rk14 @ AES block 8k+9 - result
  4012. eor3 $res0b, $ctr_t0b, $ctr0b, $rk14 @ AES block 8k+8 - result
  4013. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
  4014. stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
  4015. mov $ctr2.16b, $h3.16b @ CTR block 8k+18
  4016. eor3 $res7b, $ctr_t7b, $ctr7b, $rk14 @ AES block 7 - result
  4017. eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
  4018. stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
  4019. eor3 $res6b, $ctr_t6b, $ctr6b, $rk14 @ AES block 6 - result
  4020. mov $ctr1.16b, $h2.16b @ CTR block 8k+17
  4021. stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
  4022. stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
  4023. mov $ctr0.16b, $h1.16b @ CTR block 8k+16
  4024. b.lt .L256_enc_main_loop
  4025. .L256_enc_prepretail: @ PREPRETAIL
  4026. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  4027. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  4028. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  4029. rev64 $res2b, $res2b @ GHASH block 8k+2
  4030. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  4031. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  4032. rev64 $res5b, $res5b @ GHASH block 8k+5
  4033. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  4034. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  4035. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  4036. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  4037. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  4038. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  4039. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  4040. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  4041. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  4042. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  4043. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  4044. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  4045. rev64 $res0b, $res0b @ GHASH block 8k
  4046. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  4047. rev64 $res1b, $res1b @ GHASH block 8k+1
  4048. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  4049. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  4050. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  4051. ext $h7.16b, $h7.16b, $h7.16b, #8
  4052. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  4053. ext $h8.16b, $h8.16b, $h8.16b, #8
  4054. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  4055. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  4056. ext $h5.16b, $h5.16b, $h5.16b, #8
  4057. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  4058. ext $h6.16b, $h6.16b, $h6.16b, #8
  4059. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  4060. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  4061. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  4062. eor $res0b, $res0b, $acc_lb @ PRE 1
  4063. rev64 $res3b, $res3b @ GHASH block 8k+3
  4064. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  4065. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  4066. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  4067. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  4068. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  4069. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  4070. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  4071. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  4072. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  4073. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  4074. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  4075. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  4076. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  4077. rev64 $res6b, $res6b @ GHASH block 8k+6
  4078. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  4079. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  4080. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  4081. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  4082. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  4083. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  4084. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  4085. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  4086. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  4087. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  4088. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  4089. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  4090. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  4091. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  4092. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  4093. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  4094. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  4095. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  4096. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  4097. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  4098. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  4099. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  4100. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  4101. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  4102. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  4103. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  4104. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  4105. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  4106. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  4107. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  4108. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  4109. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  4110. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  4111. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  4112. rev64 $res4b, $res4b @ GHASH block 8k+4
  4113. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  4114. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  4115. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  4116. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  4117. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  4118. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  4119. ext $h3.16b, $h3.16b, $h3.16b, #8
  4120. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  4121. ext $h4.16b, $h4.16b, $h4.16b, #8
  4122. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  4123. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  4124. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  4125. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  4126. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  4127. rev64 $res7b, $res7b @ GHASH block 8k+7
  4128. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  4129. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  4130. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  4131. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  4132. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  4133. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  4134. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  4135. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  4136. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  4137. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  4138. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  4139. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  4140. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  4141. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  4142. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  4143. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  4144. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  4145. ext $h1.16b, $h1.16b, $h1.16b, #8
  4146. ldr $h2q, [$current_tag, #64] @ load h1l | h1h
  4147. ext $h2.16b, $h2.16b, $h2.16b, #8
  4148. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  4149. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  4150. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  4151. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  4152. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  4153. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  4154. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  4155. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  4156. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  4157. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  4158. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  4159. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  4160. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  4161. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  4162. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  4163. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  4164. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  4165. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  4166. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  4167. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  4168. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  4169. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  4170. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  4171. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  4172. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  4173. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  4174. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  4175. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  4176. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  4177. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  4178. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  4179. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  4180. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  4181. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  4182. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  4183. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  4184. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
  4185. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
  4186. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  4187. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  4188. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  4189. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  4190. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
  4191. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
  4192. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
  4193. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
  4194. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
  4195. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
  4196. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
  4197. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
  4198. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
  4199. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
  4200. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
  4201. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
  4202. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
  4203. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
  4204. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  4205. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  4206. aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
  4207. ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
  4208. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  4209. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
  4210. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  4211. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
  4212. aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
  4213. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
  4214. aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
  4215. aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
  4216. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  4217. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
  4218. ldr $rk14q, [$cc, #224] @ load rk14
  4219. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
  4220. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
  4221. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
  4222. aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
  4223. aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
  4224. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  4225. aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
  4226. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  4227. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
  4228. aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
  4229. aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
  4230. eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
  4231. aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
  4232. aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
  4233. aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
  4234. aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
  4235. aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
  4236. aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
  4237. aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
  4238. .L256_enc_tail: @ TAIL
  4239. ldp $h78kq, $h8q, [$current_tag, #192] @ load h8l | h8h
  4240. ext $h8.16b, $h8.16b, $h8.16b, #8
  4241. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  4242. ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext
  4243. ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h
  4244. ext $h5.16b, $h5.16b, $h5.16b, #8
  4245. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  4246. ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h
  4247. ext $h6.16b, $h6.16b, $h6.16b, #8
  4248. ext $h7.16b, $h7.16b, $h7.16b, #8
  4249. mov $t1.16b, $rk14
  4250. cmp $main_end_input_ptr, #112
  4251. eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result
  4252. b.gt .L256_enc_blocks_more_than_7
  4253. movi $acc_l.8b, #0
  4254. mov $ctr7b, $ctr6b
  4255. movi $acc_h.8b, #0
  4256. mov $ctr6b, $ctr5b
  4257. mov $ctr5b, $ctr4b
  4258. mov $ctr4b, $ctr3b
  4259. mov $ctr3b, $ctr2b
  4260. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  4261. mov $ctr2b, $ctr1b
  4262. movi $acc_m.8b, #0
  4263. cmp $main_end_input_ptr, #96
  4264. b.gt .L256_enc_blocks_more_than_6
  4265. mov $ctr7b, $ctr6b
  4266. mov $ctr6b, $ctr5b
  4267. cmp $main_end_input_ptr, #80
  4268. mov $ctr5b, $ctr4b
  4269. mov $ctr4b, $ctr3b
  4270. mov $ctr3b, $ctr1b
  4271. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  4272. b.gt .L256_enc_blocks_more_than_5
  4273. mov $ctr7b, $ctr6b
  4274. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  4275. mov $ctr6b, $ctr5b
  4276. mov $ctr5b, $ctr4b
  4277. cmp $main_end_input_ptr, #64
  4278. mov $ctr4b, $ctr1b
  4279. b.gt .L256_enc_blocks_more_than_4
  4280. cmp $main_end_input_ptr, #48
  4281. mov $ctr7b, $ctr6b
  4282. mov $ctr6b, $ctr5b
  4283. mov $ctr5b, $ctr1b
  4284. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  4285. b.gt .L256_enc_blocks_more_than_3
  4286. cmp $main_end_input_ptr, #32
  4287. mov $ctr7b, $ctr6b
  4288. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  4289. mov $ctr6b, $ctr1b
  4290. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  4291. b.gt .L256_enc_blocks_more_than_2
  4292. mov $ctr7b, $ctr1b
  4293. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  4294. cmp $main_end_input_ptr, #16
  4295. b.gt .L256_enc_blocks_more_than_1
  4296. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  4297. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  4298. b .L256_enc_blocks_less_than_1
  4299. .L256_enc_blocks_more_than_7: @ blocks left > 7
  4300. st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result
  4301. rev64 $res0b, $res1b @ GHASH final-7 block
  4302. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  4303. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext
  4304. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
  4305. ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
  4306. ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
  4307. movi $t0.8b, #0 @ surpress further partial tag feed in
  4308. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
  4309. eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result
  4310. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
  4311. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
  4312. .L256_enc_blocks_more_than_6: @ blocks left > 6
  4313. st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result
  4314. rev64 $res0b, $res1b @ GHASH final-6 block
  4315. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  4316. pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
  4317. ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
  4318. pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
  4319. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext
  4320. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
  4321. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
  4322. pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
  4323. eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result
  4324. movi $t0.8b, #0 @ surpress further partial tag feed in
  4325. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
  4326. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
  4327. .L256_enc_blocks_more_than_5: @ blocks left > 5
  4328. st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result
  4329. rev64 $res0b, $res1b @ GHASH final-5 block
  4330. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  4331. ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
  4332. pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
  4333. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
  4334. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
  4335. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
  4336. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext
  4337. pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
  4338. pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
  4339. movi $t0.8b, #0 @ surpress further partial tag feed in
  4340. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
  4341. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
  4342. eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result
  4343. .L256_enc_blocks_more_than_4: @ blocks left > 4
  4344. st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result
  4345. rev64 $res0b, $res1b @ GHASH final-4 block
  4346. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext
  4347. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  4348. ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
  4349. pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
  4350. eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result
  4351. pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
  4352. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
  4353. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
  4354. pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
  4355. movi $t0.8b, #0 @ surpress further partial tag feed in
  4356. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
  4357. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
  4358. .L256_enc_blocks_more_than_3: @ blocks left > 3
  4359. st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
  4360. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  4361. ext $h4.16b, $h4.16b, $h4.16b, #8
  4362. rev64 $res0b, $res1b @ GHASH final-3 block
  4363. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  4364. ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
  4365. pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
  4366. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
  4367. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  4368. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  4369. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
  4370. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext
  4371. pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
  4372. pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
  4373. eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result
  4374. movi $t0.8b, #0 @ surpress further partial tag feed in
  4375. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
  4376. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
  4377. .L256_enc_blocks_more_than_2: @ blocks left > 2
  4378. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  4379. ext $h3.16b, $h3.16b, $h3.16b, #8
  4380. st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
  4381. rev64 $res0b, $res1b @ GHASH final-2 block
  4382. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext
  4383. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  4384. ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
  4385. movi $t0.8b, #0 @ surpress further partial tag feed in
  4386. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  4387. eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result
  4388. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  4389. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  4390. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  4391. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  4392. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  4393. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  4394. .L256_enc_blocks_more_than_1: @ blocks left > 1
  4395. st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
  4396. ldr $h2q, [$current_tag, #64] @ load h1l | h1h
  4397. ext $h2.16b, $h2.16b, $h2.16b, #8
  4398. rev64 $res0b, $res1b @ GHASH final-1 block
  4399. ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext
  4400. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  4401. movi $t0.8b, #0 @ surpress further partial tag feed in
  4402. ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
  4403. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  4404. eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result
  4405. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  4406. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  4407. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  4408. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  4409. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  4410. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  4411. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  4412. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  4413. .L256_enc_blocks_less_than_1: @ blocks left <= 1
  4414. and $bit_length, $bit_length, #127 @ bit_length %= 128
  4415. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  4416. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  4417. mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
  4418. and $bit_length, $bit_length, #127 @ bit_length %= 128
  4419. lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
  4420. cmp $bit_length, #64
  4421. mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
  4422. csel $temp3_x, $temp0_x, xzr, lt
  4423. csel $temp2_x, $temp1_x, $temp0_x, lt
  4424. mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
  4425. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  4426. ext $h1.16b, $h1.16b, $h1.16b, #8
  4427. ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
  4428. mov $ctr0.d[1], $temp3_x
  4429. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  4430. rev64 $res0b, $res1b @ GHASH final block
  4431. rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
  4432. bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
  4433. str $rtmp_ctrq, [$counter] @ store the updated counter
  4434. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  4435. st1 { $res1b}, [$output_ptr] @ store all 16B
  4436. ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
  4437. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  4438. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  4439. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  4440. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  4441. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  4442. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  4443. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  4444. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  4445. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  4446. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  4447. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  4448. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  4449. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  4450. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  4451. eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
  4452. ext $acc_lb, $acc_lb, $acc_lb, #8
  4453. rev64 $acc_lb, $acc_lb
  4454. st1 { $acc_l.16b }, [$current_tag]
  4455. lsr x0, $bit_length, #3 @ return sizes
  4456. ldp d10, d11, [sp, #16]
  4457. ldp d12, d13, [sp, #32]
  4458. ldp d14, d15, [sp, #48]
  4459. ldp d8, d9, [sp], #80
  4460. ret
  4461. .L256_enc_ret:
  4462. mov w0, #0x0
  4463. ret
  4464. .size unroll8_eor3_aes_gcm_enc_256_kernel,.-unroll8_eor3_aes_gcm_enc_256_kernel
  4465. ___
  4466. {
  4467. #########################################################################################
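  4468. # size_t unroll8_eor3_aes_gcm_dec_256_kernel(const unsigned char *in,
  4469. # size_t len, /* length in bits; shifted right by 3 to get bytes */
  4470. # unsigned char *out,
  4471. # u64 *Xi,
  4472. # unsigned char ivec[16],
  4473. # const void *key);
  4474. # NOTE(review): order follows the register use below (x4 = counter block, x5 = round keys); Xi as 4th argument is inferred - confirm against the C declaration.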
  4475. $code.=<<___;
  4476. .global unroll8_eor3_aes_gcm_dec_256_kernel
  4477. .type unroll8_eor3_aes_gcm_dec_256_kernel,%function
  4478. .align 4
  4479. unroll8_eor3_aes_gcm_dec_256_kernel:
  4480. AARCH64_VALID_CALL_TARGET
  4481. cbz x1, .L256_dec_ret
  4482. stp d8, d9, [sp, #-80]!
  4483. mov $counter, x4
  4484. mov $cc, x5
  4485. stp d10, d11, [sp, #16]
  4486. stp d12, d13, [sp, #32]
  4487. stp d14, d15, [sp, #48]
  4488. mov x5, #0xc200000000000000
  4489. stp x5, xzr, [sp, #64]
  4490. add $modulo_constant, sp, #64
  4491. ld1 { $ctr0b}, [$counter] @ CTR block 0
  4492. mov $constant_temp, #0x100000000 @ set up counter increment
  4493. movi $rctr_inc.16b, #0x0
  4494. mov $rctr_inc.d[1], $constant_temp
  4495. lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
  4496. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  4497. rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
  4498. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
  4499. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
  4500. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
  4501. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
  4502. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
  4503. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  4504. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
  4505. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
  4506. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
  4507. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
  4508. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  4509. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
  4510. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
  4511. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  4512. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  4513. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
  4514. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
  4515. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
  4516. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
  4517. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
  4518. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
  4519. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  4520. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
  4521. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  4522. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
  4523. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
  4524. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  4525. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
  4526. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
  4527. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  4528. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  4529. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  4530. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  4531. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  4532. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
  4533. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  4534. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
  4535. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
  4536. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  4537. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
  4538. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  4539. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  4540. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  4541. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  4542. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
  4543. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
  4544. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
  4545. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  4546. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
  4547. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
  4548. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  4549. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
  4550. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  4551. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  4552. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
  4553. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  4554. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
  4555. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  4556. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
  4557. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  4558. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
  4559. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
  4560. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
  4561. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  4562. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  4563. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  4564. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
  4565. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  4566. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
  4567. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
  4568. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  4569. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
  4570. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  4571. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  4572. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  4573. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
  4574. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  4575. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  4576. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  4577. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
  4578. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
  4579. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  4580. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
  4581. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  4582. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
  4583. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
  4584. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  4585. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  4586. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  4587. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
  4588. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  4589. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
  4590. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
  4591. ld1 { $acc_lb}, [$current_tag]
  4592. ext $acc_lb, $acc_lb, $acc_lb, #8
  4593. rev64 $acc_lb, $acc_lb
  4594. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  4595. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  4596. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  4597. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
  4598. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
  4599. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
  4600. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
  4601. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
  4602. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
  4603. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
  4604. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10
  4605. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10
  4606. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10
  4607. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
  4608. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
  4609. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
  4610. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10
  4611. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
  4612. ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
  4613. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
  4614. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
  4615. aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 11
  4616. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
  4617. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
  4618. aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 11
  4619. aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 11
  4620. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
  4621. aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 11
  4622. ldr $rk14q, [$cc, #224] @ load rk14
  4623. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
  4624. aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 12
  4625. aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 12
  4626. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  4627. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
  4628. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
  4629. aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 12
  4630. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
  4631. aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 12
  4632. aese $ctr5b, $rk13 @ AES block 5 - round 13
  4633. aese $ctr1b, $rk13 @ AES block 1 - round 13
  4634. aese $ctr2b, $rk13 @ AES block 2 - round 13
  4635. aese $ctr0b, $rk13 @ AES block 0 - round 13
  4636. aese $ctr4b, $rk13 @ AES block 4 - round 13
  4637. aese $ctr6b, $rk13 @ AES block 6 - round 13
  4638. aese $ctr3b, $rk13 @ AES block 3 - round 13
  4639. aese $ctr7b, $rk13 @ AES block 7 - round 13
  4640. b.ge .L256_dec_tail @ handle tail
  4641. ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext
  4642. ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext
  4643. ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext
  4644. ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext
  4645. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  4646. eor3 $ctr1b, $res1b, $ctr1b, $rk14 @ AES block 1 - result
  4647. eor3 $ctr0b, $res0b, $ctr0b, $rk14 @ AES block 0 - result
  4648. stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result
  4649. rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
  4650. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
  4651. eor3 $ctr3b, $res3b, $ctr3b, $rk14 @ AES block 3 - result
  4652. eor3 $ctr5b, $res5b, $ctr5b, $rk14 @ AES block 5 - result
  4653. eor3 $ctr4b, $res4b, $ctr4b, $rk14 @ AES block 4 - result
  4654. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
  4655. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
  4656. eor3 $ctr2b, $res2b, $ctr2b, $rk14 @ AES block 2 - result
  4657. stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result
  4658. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
  4659. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
  4660. eor3 $ctr6b, $res6b, $ctr6b, $rk14 @ AES block 6 - result
  4661. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
  4662. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
  4663. stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result
  4664. eor3 $ctr7b, $res7b, $ctr7b, $rk14 @ AES block 7 - result
  4665. stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result
  4666. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
  4667. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
  4668. b.ge .L256_dec_prepretail @ do prepretail
  4669. .L256_dec_main_loop: @ main loop start - per pass: AES-CTR on blocks 8k+8..8k+15 (128 bytes loaded and stored), GHASH on ciphertext blocks 8k..8k+7
  4670. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  4671. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  4672. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  4673. rev64 $res1b, $res1b @ GHASH block 8k+1
  4674. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  4675. ext $h7.16b, $h7.16b, $h7.16b, #8 @ swap the 64-bit halves of h7
  4676. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  4677. ext $h8.16b, $h8.16b, $h8.16b, #8 @ swap the 64-bit halves of h8
  4678. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  4679. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  4680. rev64 $res0b, $res0b @ GHASH block 8k
  4681. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 - swap the 64-bit halves of the accumulated hash
  4682. rev64 $res4b, $res4b @ GHASH block 8k+4
  4683. rev64 $res3b, $res3b @ GHASH block 8k+3
  4684. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  4685. rev64 $res7b, $res7b @ GHASH block 8k+7
  4686. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  4687. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  4688. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  4689. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  4690. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  4691. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  4692. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  4693. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  4694. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  4695. eor $res0b, $res0b, $acc_lb @ PRE 1 - fold the accumulated hash into GHASH block 8k
  4696. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  4697. ext $h5.16b, $h5.16b, $h5.16b, #8 @ swap the 64-bit halves of h5
  4698. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  4699. ext $h6.16b, $h6.16b, $h6.16b, #8 @ swap the 64-bit halves of h6
  4700. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  4701. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  4702. rev64 $res2b, $res2b @ GHASH block 8k+2
  4703. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  4704. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  4705. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  4706. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  4707. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  4708. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  4709. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  4710. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  4711. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  4712. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  4713. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  4714. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  4715. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  4716. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  4717. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  4718. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  4719. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  4720. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  4721. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  4722. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  4723. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  4724. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  4725. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  4726. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  4727. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  4728. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  4729. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  4730. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  4731. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  4732. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  4733. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  4734. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  4735. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  4736. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  4737. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  4738. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  4739. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  4740. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  4741. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  4742. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  4743. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  4744. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  4745. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  4746. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  4747. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  4748. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  4749. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  4750. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  4751. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  4752. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  4753. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  4754. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  4755. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  4756. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  4757. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  4758. rev64 $res5b, $res5b @ GHASH block 8k+5
  4759. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  4760. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  4761. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  4762. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  4763. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  4764. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  4765. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  4766. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  4767. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  4768. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  4769. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  4770. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  4771. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  4772. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  4773. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  4774. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  4775. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  4776. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  4777. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  4778. ext $h3.16b, $h3.16b, $h3.16b, #8 @ swap the 64-bit halves of h3
  4779. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  4780. ext $h4.16b, $h4.16b, $h4.16b, #8 @ swap the 64-bit halves of h4
  4781. rev64 $res6b, $res6b @ GHASH block 8k+6
  4782. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  4783. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  4784. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  4785. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  4786. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  4787. ext $h1.16b, $h1.16b, $h1.16b, #8 @ swap the 64-bit halves of h1
  4788. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  4789. ext $h2.16b, $h2.16b, $h2.16b, #8 @ swap the 64-bit halves of h2
  4790. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  4791. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  4792. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  4793. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  4794. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  4795. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  4796. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  4797. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  4798. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  4799. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  4800. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  4801. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  4802. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  4803. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  4804. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  4805. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  4806. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  4807. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  4808. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  4809. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  4810. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  4811. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  4812. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  4813. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  4814. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  4815. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  4816. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  4817. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  4818. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  4819. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
  4820. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
  4821. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  4822. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
  4823. ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext
  4824. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  4825. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
  4826. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  4827. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
  4828. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
  4829. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  4830. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  4831. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  4832. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  4833. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
  4834. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
  4835. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  4836. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
  4837. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  4838. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
  4839. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  4840. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  4841. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
  4842. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
  4843. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
  4844. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
  4845. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
  4846. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
  4847. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  4848. rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16 - staged in the h1 register, copied to ctr0 after ctr0 is stored
  4849. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  4850. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
  4851. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
  4852. ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
  4853. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
  4854. aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
  4855. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  4856. rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17 - staged in the h2 register, copied to ctr1 after ctr1 is stored
  4857. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
  4858. ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext
  4859. aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
  4860. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  4861. aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
  4862. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
  4863. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
  4864. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
  4865. aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
  4866. aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
  4867. rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18 - staged in the h3 register, copied to ctr2 after ctr2 is stored
  4868. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
  4869. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  4870. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  4871. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
  4872. aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
  4873. ldr $rk14q, [$cc, #224] @ load rk14
  4874. aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
  4875. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
  4876. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  4877. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
  4878. aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
  4879. ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext
  4880. aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
  4881. aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
  4882. ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext
  4883. aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
  4884. aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
  4885. rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19 - staged in the h4 register, copied to ctr3 after ctr3 is stored
  4886. eor3 $ctr2b, $res2b, $ctr2b, $rk14 @ AES block 8k+10 - result
  4887. eor3 $ctr1b, $res1b, $ctr1b, $rk14 @ AES block 8k+9 - result
  4888. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  4889. aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
  4890. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
  4891. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  4892. aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
  4893. eor3 $ctr5b, $res5b, $ctr5b, $rk14 @ AES block 8k+13 - result
  4894. eor3 $ctr0b, $res0b, $ctr0b, $rk14 @ AES block 8k+8 - result
  4895. aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
  4896. stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
  4897. mov $ctr0.16b, $h1.16b @ CTR block 8k+16
  4898. eor3 $ctr4b, $res4b, $ctr4b, $rk14 @ AES block 8k+12 - result
  4899. eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
  4900. eor3 $ctr3b, $res3b, $ctr3b, $rk14 @ AES block 8k+11 - result
  4901. stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
  4902. mov $ctr3.16b, $h4.16b @ CTR block 8k+19
  4903. mov $ctr2.16b, $h3.16b @ CTR block 8k+18
  4904. aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
  4905. mov $ctr1.16b, $h2.16b @ CTR block 8k+17
  4906. stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
  4907. eor3 $ctr7b, $res7b, $ctr7b, $rk14 @ AES block 8k+15 - result
  4908. eor3 $ctr6b, $res6b, $ctr6b, $rk14 @ AES block 8k+14 - result
  4909. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
  4910. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
  4911. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL - compare the advanced input pointer with the end of the main-loop region
  4912. stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
  4913. b.lt .L256_dec_main_loop @ LOOP CONTROL - repeat while input_ptr is less than main_end_input_ptr
  4914. .L256_dec_prepretail: @ PREPRETAIL
  4915. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  4916. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  4917. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  4918. rev64 $res4b, $res4b @ GHASH block 8k+4
  4919. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  4920. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  4921. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  4922. rev64 $res0b, $res0b @ GHASH block 8k
  4923. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  4924. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  4925. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  4926. ext $h7.16b, $h7.16b, $h7.16b, #8
  4927. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  4928. ext $h8.16b, $h8.16b, $h8.16b, #8
  4929. rev64 $res1b, $res1b @ GHASH block 8k+1
  4930. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  4931. rev64 $res2b, $res2b @ GHASH block 8k+2
  4932. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  4933. ext $h5.16b, $h5.16b, $h5.16b, #8
  4934. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  4935. ext $h6.16b, $h6.16b, $h6.16b, #8
  4936. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  4937. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  4938. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  4939. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  4940. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  4941. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  4942. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  4943. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  4944. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  4945. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  4946. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  4947. eor $res0b, $res0b, $acc_lb @ PRE 1
  4948. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  4949. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  4950. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  4951. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  4952. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  4953. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  4954. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  4955. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  4956. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  4957. rev64 $res3b, $res3b @ GHASH block 8k+3
  4958. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  4959. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  4960. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  4961. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  4962. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  4963. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  4964. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  4965. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  4966. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  4967. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  4968. rev64 $res6b, $res6b @ GHASH block 8k+6
  4969. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  4970. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  4971. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  4972. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  4973. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  4974. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  4975. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  4976. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  4977. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  4978. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  4979. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  4980. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  4981. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  4982. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  4983. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  4984. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  4985. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  4986. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  4987. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  4988. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  4989. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  4990. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  4991. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  4992. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  4993. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  4994. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  4995. ext $h1.16b, $h1.16b, $h1.16b, #8
  4996. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  4997. ext $h2.16b, $h2.16b, $h2.16b, #8
  4998. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  4999. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  5000. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  5001. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  5002. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  5003. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  5004. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  5005. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  5006. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  5007. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  5008. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  5009. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  5010. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  5011. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  5012. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  5013. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  5014. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  5015. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  5016. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  5017. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  5018. ext $h3.16b, $h3.16b, $h3.16b, #8
  5019. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  5020. ext $h4.16b, $h4.16b, $h4.16b, #8
  5021. rev64 $res7b, $res7b @ GHASH block 8k+7
  5022. rev64 $res5b, $res5b @ GHASH block 8k+5
  5023. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  5024. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  5025. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  5026. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  5027. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  5028. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  5029. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  5030. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  5031. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  5032. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  5033. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  5034. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  5035. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  5036. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  5037. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  5038. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  5039. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  5040. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  5041. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  5042. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  5043. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  5044. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  5045. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  5046. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  5047. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  5048. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  5049. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  5050. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  5051. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  5052. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  5053. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  5054. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  5055. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  5056. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  5057. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  5058. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  5059. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  5060. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  5061. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
  5062. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  5063. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
  5064. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
  5065. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  5066. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
  5067. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
  5068. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  5069. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  5070. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  5071. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  5072. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  5073. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  5074. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  5075. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  5076. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  5077. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  5078. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
  5079. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
  5080. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
  5081. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  5082. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  5083. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  5084. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  5085. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
  5086. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
  5087. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
  5088. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
  5089. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
  5090. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
  5091. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  5092. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
  5093. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
  5094. ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
  5095. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  5096. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
  5097. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
  5098. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
  5099. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  5100. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
  5101. aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
  5102. aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
  5103. aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
  5104. aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
  5105. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
  5106. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  5107. aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
  5108. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
  5109. aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
  5110. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  5111. aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
  5112. aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
  5113. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
  5114. ldr $rk14q, [$cc, #224] @ load rk14
  5115. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
  5116. aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
  5117. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  5118. aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
  5119. aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
  5120. aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
  5121. aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
  5122. aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
  5123. eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
  5124. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  5125. aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
  5126. aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
  5127. .L256_dec_tail: @ TAIL
  5128. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  5129. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  5130. cmp $main_end_input_ptr, #112
  5131. ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext
  5132. ldp $h78kq, $h8q, [$current_tag, #192] @ load h8l | h8h
  5133. ext $h8.16b, $h8.16b, $h8.16b, #8
  5134. mov $t1.16b, $rk14
  5135. ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h
  5136. ext $h5.16b, $h5.16b, $h5.16b, #8
  5137. eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result
  5138. ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h
  5139. ext $h6.16b, $h6.16b, $h6.16b, #8
  5140. ext $h7.16b, $h7.16b, $h7.16b, #8
  5141. b.gt .L256_dec_blocks_more_than_7
  5142. mov $ctr7b, $ctr6b
  5143. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  5144. mov $ctr6b, $ctr5b
  5145. mov $ctr5b, $ctr4b
  5146. mov $ctr4b, $ctr3b
  5147. movi $acc_l.8b, #0
  5148. movi $acc_h.8b, #0
  5149. movi $acc_m.8b, #0
  5150. mov $ctr3b, $ctr2b
  5151. cmp $main_end_input_ptr, #96
  5152. mov $ctr2b, $ctr1b
  5153. b.gt .L256_dec_blocks_more_than_6
  5154. mov $ctr7b, $ctr6b
  5155. mov $ctr6b, $ctr5b
  5156. mov $ctr5b, $ctr4b
  5157. cmp $main_end_input_ptr, #80
  5158. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  5159. mov $ctr4b, $ctr3b
  5160. mov $ctr3b, $ctr1b
  5161. b.gt .L256_dec_blocks_more_than_5
  5162. cmp $main_end_input_ptr, #64
  5163. mov $ctr7b, $ctr6b
  5164. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  5165. mov $ctr6b, $ctr5b
  5166. mov $ctr5b, $ctr4b
  5167. mov $ctr4b, $ctr1b
  5168. b.gt .L256_dec_blocks_more_than_4
  5169. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  5170. mov $ctr7b, $ctr6b
  5171. cmp $main_end_input_ptr, #48
  5172. mov $ctr6b, $ctr5b
  5173. mov $ctr5b, $ctr1b
  5174. b.gt .L256_dec_blocks_more_than_3
  5175. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  5176. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  5177. mov $ctr7b, $ctr6b
  5178. cmp $main_end_input_ptr, #32
  5179. mov $ctr6b, $ctr1b
  5180. b.gt .L256_dec_blocks_more_than_2
  5181. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  5182. mov $ctr7b, $ctr1b
  5183. cmp $main_end_input_ptr, #16
  5184. b.gt .L256_dec_blocks_more_than_1
  5185. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  5186. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  5187. b .L256_dec_blocks_less_than_1
  5188. .L256_dec_blocks_more_than_7: @ blocks left > 7
  5189. rev64 $res0b, $res1b @ GHASH final-7 block
  5190. ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext
  5191. st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result
  5192. ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
  5193. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  5194. ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
  5195. eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result
  5196. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
  5197. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
  5198. movi $t0.8b, #0 @ surpress further partial tag feed in
  5199. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
  5200. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
  5201. .L256_dec_blocks_more_than_6: @ blocks left > 6
  5202. rev64 $res0b, $res1b @ GHASH final-6 block
  5203. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  5204. ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext
  5205. movi $t0.8b, #0 @ surpress further partial tag feed in
  5206. ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
  5207. st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result
  5208. pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
  5209. pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
  5210. eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result
  5211. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
  5212. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
  5213. pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
  5214. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
  5215. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
  5216. .L256_dec_blocks_more_than_5: @ blocks left > 5
  5217. rev64 $res0b, $res1b @ GHASH final-5 block
  5218. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  5219. pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
  5220. ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
  5221. ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext
  5222. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
  5223. st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result
  5224. pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
  5225. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
  5226. pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
  5227. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
  5228. eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result
  5229. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
  5230. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
  5231. movi $t0.8b, #0 @ surpress further partial tag feed in
  5232. .L256_dec_blocks_more_than_4: @ blocks left > 4
  5233. rev64 $res0b, $res1b @ GHASH final-4 block
  5234. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  5235. ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
  5236. ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext
  5237. movi $t0.8b, #0 @ surpress further partial tag feed in
  5238. pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
  5239. pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
  5240. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
  5241. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
  5242. pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
  5243. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
  5244. st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result
  5245. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
  5246. eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result
  5247. .L256_dec_blocks_more_than_3: @ blocks left > 3
  5248. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  5249. ext $h4.16b, $h4.16b, $h4.16b, #8
  5250. rev64 $res0b, $res1b @ GHASH final-3 block
  5251. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  5252. ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext
  5253. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  5254. ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
  5255. st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result
  5256. eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result
  5257. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  5258. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
  5259. pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
  5260. pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
  5261. movi $t0.8b, #0 @ surpress further partial tag feed in
  5262. pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
  5263. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
  5264. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
  5265. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
  5266. .L256_dec_blocks_more_than_2: @ blocks left > 2
  5267. rev64 $res0b, $res1b @ GHASH final-2 block
  5268. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  5269. ext $h3.16b, $h3.16b, $h3.16b, #8
  5270. ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext
  5271. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  5272. ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
  5273. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  5274. st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result
  5275. eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result
  5276. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  5277. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  5278. movi $t0.8b, #0 @ surpress further partial tag feed in
  5279. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  5280. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  5281. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  5282. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  5283. .L256_dec_blocks_more_than_1: @ blocks left > 1
  5284. rev64 $res0b, $res1b @ GHASH final-1 block
  5285. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  5286. ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
  5287. ldr $h2q, [$current_tag, #64] @ load h1l | h1h
  5288. ext $h2.16b, $h2.16b, $h2.16b, #8
  5289. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  5290. ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext
  5291. st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result
  5292. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  5293. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  5294. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  5295. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  5296. eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result
  5297. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  5298. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  5299. movi $t0.8b, #0 @ surpress further partial tag feed in
  5300. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  5301. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  5302. .L256_dec_blocks_less_than_1: @ blocks left <= 1
  5303. ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
  5304. mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
  5305. and $bit_length, $bit_length, #127 @ bit_length %= 128
  5306. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  5307. rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
  5308. str $rtmp_ctrq, [$counter] @ store the updated counter
  5309. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  5310. and $bit_length, $bit_length, #127 @ bit_length %= 128
  5311. lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
  5312. cmp $bit_length, #64
  5313. mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
  5314. csel $temp3_x, $temp0_x, xzr, lt
  5315. csel $temp2_x, $temp1_x, $temp0_x, lt
  5316. mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
  5317. mov $ctr0.d[1], $temp3_x
  5318. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  5319. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  5320. ext $h1.16b, $h1.16b, $h1.16b, #8
  5321. bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
  5322. rev64 $res0b, $res1b @ GHASH final block
  5323. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  5324. ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
  5325. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  5326. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  5327. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  5328. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  5329. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  5330. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  5331. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  5332. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  5333. pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  5334. eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  5335. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  5336. st1 { $res4b}, [$output_ptr] @ store all 16B
  5337. eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up
  5338. eor $t11.16b, $acc_hb, $t11.16b @ MODULO - fold into mid
  5339. eor $acc_mb, $acc_mb, $t11.16b @ MODULO - fold into mid
  5340. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  5341. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  5342. eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
  5343. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  5344. ext $acc_lb, $acc_lb, $acc_lb, #8
  5345. rev64 $acc_lb, $acc_lb
  5346. st1 { $acc_l.16b }, [$current_tag]
  5347. lsr x0, $bit_length, #3 @ return sizes
  5348. ldp d10, d11, [sp, #16]
  5349. ldp d12, d13, [sp, #32]
  5350. ldp d14, d15, [sp, #48]
  5351. ldp d8, d9, [sp], #80
  5352. ret
  5353. .L256_dec_ret:
  5354. mov w0, #0x0
  5355. ret
  5356. .size unroll8_eor3_aes_gcm_dec_256_kernel,.-unroll8_eor3_aes_gcm_dec_256_kernel
  5357. ___
  5358. }
  5359. }
  # Append the trailer of the generated assembly: an identification string,
  # an alignment directive, and the "#endif" line (presumably closing a
  # preprocessor conditional opened earlier in the generated output --
  # confirm against the top of the file).
  $code .= join("\n",
      qq{.asciz "AES GCM module for ARMv8, SPDX BSD-3-Clause by <xiaokang.qian\@arm.com>"},
      q{.align 2},
      q{#endif},
      q{});    # trailing empty element yields the final newline
  5365. {
  # Base opcodes of the instructions that unsha3() encodes by hand; the
  # register and immediate fields are OR-ed into these values.
  my %opcode = (
      "rax1" => 0xce608c00,   "eor3" => 0xce000000,
      "bcax" => 0xce200000,   "xar"  => 0xce800000 );

  # unsha3($mnemonic, $arg)
  #
  # Translate one rax1/eor3/bcax/xar instruction into a raw ".inst" word,
  # presumably so that assemblers lacking these mnemonics can still build
  # the output.
  #
  #   $mnemonic - instruction name, a key of %opcode
  #   $arg      - operand text following the mnemonic (may carry a trailing
  #               comment), e.g. "v0.16b, v1.16b, v2.16b, v3.16b"
  #
  # The register numbers found in $arg are placed at bit 0 (1st operand),
  # bit 5 (2nd), bit 16 (3rd) and bit 10 (4th operand, which may also be a
  # "#imm" immediate).  Operands three and four are optional and count as 0
  # when absent.
  #
  # Returns the ".inst" line with the original instruction appended as a
  # comment.  If the mnemonic is unknown or the operands cannot be parsed,
  # the instruction is returned unchanged ("mnemonic<TAB>operands") so the
  # assembler gets to see (and reject) it, instead of the instruction being
  # silently replaced by an empty string.
  sub unsha3 {
      my ($mnemonic,$arg)=@_;

      if (exists $opcode{$mnemonic} &&
          $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/) {
          # Copy the captures before the string eval below can disturb them.
          my ($rd, $rn, $rm, $last) = ($1, $2, $3, $4);

          $rm = 0 unless defined $rm;
          # The pattern restricts $last to digits and '-', so the eval can
          # only see a trivial arithmetic expression such as "63" or "64-8".
          my $ra = defined $last ? eval($last) : 0;
          $ra = 0 unless defined $ra;

          return sprintf ".inst\t0x%08x\t//%s %s",
                         $opcode{$mnemonic}|$rd|($rn<<5)|($rm<<16)|($ra<<10),
                         $mnemonic,$arg;
      }

      # Not something we know how to encode: hand it back untouched.
      return "$mnemonic\t$arg";
  }
  # unvmov($arg)
  #
  # Rewrite an operand pair of the form "qN#lo, qM#hi" into an
  # "ins vX.d[i],vY.d[j]" instruction.  "lo" selects lane 0 and "hi" lane 1;
  # register numbers below 8 are kept, numbers 8 and above are shifted up
  # by 8.  Returns a false value (empty string) when the operands do not
  # have the expected shape.
  sub unvmov {
      my ($operands) = @_;

      my ($dst, $dst_half, $src, $src_half) =
          $operands =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o
          or return !1;

      my $dst_reg  = $dst < 8 ? $dst : $dst + 8;
      my $src_reg  = $src < 8 ? $src : $src + 8;
      my $dst_lane = $dst_half eq "lo" ? 0 : 1;
      my $src_lane = $src_half eq "lo" ? 0 : 1;

      return sprintf "ins v%d.d[%d],v%d.d[%d]",
                     $dst_reg, $dst_lane, $src_reg, $src_lane;
  }
  # Post-process the accumulated assembly text line by line and print it:
  #   1. turn the first "@ " comment marker into "//";
  #   2. replace every `...` section by the result of evaluating it as Perl;
  #   3. on ld1r lines rewrite ".16b" to ".2d"; on every other line (and on
  #      ld1r lines where nothing was rewritten) hand eor3/rax1/xar/bcax
  #      instructions to unsha3() for manual encoding.
  for (split(/\n/, $code)) {
      s/@\s/\/\//o; # old->new style commentary
      s/\`([^\`]*)\`/eval($1)/ge;

      unless (m/\bld1r\b/ && s/\.16b/.2d/g) {
          s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
      }

      print $_, "\n";
  }
  5390. }
  # Close STDOUT explicitly so that the final flush happens here and a
  # failed write aborts the script instead of going unnoticed.
  close(STDOUT) or die("error closing STDOUT: $!");