aes-gcm-armv8-unroll8_64.pl 318 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2020-2023 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. #========================================================================
  10. # Written by Xiaokang Qian <xiaokang.qian@arm.com> for the OpenSSL project,
  11. # derived from https://github.com/ARM-software/AArch64cryptolib, original
  12. # author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
  13. # licensed under OpenSSL and SPDX BSD-3-Clause licenses depending on where you
  14. # obtain it.
  15. #========================================================================
  16. #
  17. # Approach - We want to reload constants as we have plenty of spare ASIMD slots around crypto units for loading
  18. # Unroll x8 in main loop, main loop to act on 8 16B blocks per iteration, and then do modulo of the accumulated
  19. # intermediate hashes from the 8 blocks.
  20. #
  21. #  ____________________________________________________
  22. # |                                                    |
  23. # |                        PRE                         |
  24. # |____________________________________________________|
  25. # |                |                |                  |
  26. # | CTR block 8k+13| AES block 8k+8 | GHASH block 8k+0 |
  27. # |________________|________________|__________________|
  28. # |                |                |                  |
  29. # | CTR block 8k+14| AES block 8k+9 | GHASH block 8k+1 |
  30. # |________________|________________|__________________|
  31. # |                |                |                  |
  32. # | CTR block 8k+15| AES block 8k+10| GHASH block 8k+2 |
  33. # |________________|________________|__________________|
  34. # |                |                |                  |
  35. # | CTR block 8k+16| AES block 8k+11| GHASH block 8k+3 |
  36. # |________________|________________|__________________|
  37. # |                |                |                  |
  38. # | CTR block 8k+17| AES block 8k+12| GHASH block 8k+4 |
  39. # |________________|________________|__________________|
  40. # |                |                |                  |
  41. # | CTR block 8k+18| AES block 8k+13| GHASH block 8k+5 |
  42. # |________________|________________|__________________|
  43. # |                |                |                  |
  44. # | CTR block 8k+19| AES block 8k+14| GHASH block 8k+6 |
  45. # |________________|________________|__________________|
  46. # |                |                |                  |
  47. # | CTR block 8k+20| AES block 8k+15| GHASH block 8k+7 |
  48. # |________________|____(mostly)____|__________________|
  49. # |                                                    |
  50. # |                       MODULO                       |
  51. # |____________________________________________________|
  52. #
  53. # PRE:
  54. # Ensure the previously generated intermediate hash is aligned and merged with the result for GHASH 8k+0
  55. # EXT low_acc, low_acc, low_acc, #8
  56. # EOR res_curr (8k+0), res_curr (8k+0), low_acc
  57. #
  58. # CTR block:
  59. # Increment and byte reverse counter in scalar registers and transfer to SIMD registers
  60. # REV ctr32, rev_ctr32
  61. # ORR ctr64, constctr96_top32, ctr32, LSL #32
  62. # INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF
  63. # INS ctr_next.d[1], ctr64
  64. # ADD rev_ctr32, #1
  65. #
  66. # AES block:
  67. # Do AES encryption/decryption on CTR block X and EOR it with input block X. Take a 256-bit key below for example.
  68. # Doing a small trick here of loading input in scalar registers, EORing with the last key and then transferring to SIMD registers.
  69. # Given we are very constrained in our ASIMD registers this is quite important.
  70. #
  71. # Encrypt:
  72. # LDR input_low, [ input_ptr ], #8
  73. # LDR input_high, [ input_ptr ], #8
  74. # EOR input_low, k14_low
  75. # EOR input_high, k14_high
  76. # INS res_curr.d[0], input_low
  77. # INS res_curr.d[1], input_high
  78. # AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
  79. # AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
  80. # AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
  81. # AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
  82. # AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
  83. # AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
  84. # AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
  85. # AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
  86. # AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
  87. # AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
  88. # AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
  89. # AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
  90. # AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
  91. # AESE ctr_curr, k13
  92. # EOR res_curr, res_curr, ctr_curr
  93. # ST1 { res_curr.16b }, [ output_ptr ], #16
  94. #
  95. # Decrypt:
  96. # AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
  97. # AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
  98. # AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
  99. # AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
  100. # AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
  101. # AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
  102. # AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
  103. # AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
  104. # AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
  105. # AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
  106. # AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
  107. # AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
  108. # AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
  109. # AESE ctr_curr, k13
  110. # LDR res_curr, [ input_ptr ], #16
  111. # EOR res_curr, res_curr, ctr_curr
  112. # MOV output_low, res_curr.d[0]
  113. # MOV output_high, res_curr.d[1]
  114. # EOR output_low, k14_low
  115. # EOR output_high, k14_high
  116. # STP output_low, output_high, [ output_ptr ], #16
  117. # GHASH block X:
  118. # Do 128b karatsuba polynomial multiplication on block
  119. # We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a
  120. # 128b multiplication:
  121. #
  122. # Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
  123. #
  124. # The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
  125. # Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
  126. #
  127. # There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
  128. # multiplying with "twisted" powers of H
  129. #
  130. # Note: We can PMULL directly into the acc_x in first GHASH of the loop
  131. # Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
  132. # path latency dominates the performance
  133. #
  134. # This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
  135. # than indicated here
  136. # REV64 res_curr, res_curr
  137. # INS t_m.d[0], res_curr.d[1]
  138. # EOR t_m.8B, t_m.8B, res_curr.8B
  139. # PMULL2 t_h, res_curr, HX
  140. # PMULL t_l, res_curr, HX
  141. # PMULL t_m, t_m, HX_k
  142. # EOR acc_h, acc_h, t_h
  143. # EOR acc_l, acc_l, t_l
  144. # EOR acc_m, acc_m, t_m
  145. #
  146. # MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
  147. # There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
  148. # with a reversed constant
  149. # EOR3 acc_m, acc_m, acc_l, acc_h // Finish off karatsuba processing
  150. # PMULL t_mod, acc_h, mod_constant
  151. # EXT acc_h, acc_h, acc_h, #8
  152. # EOR3 acc_m, acc_m, t_mod, acc_h
  153. # PMULL acc_h, acc_m, mod_constant
  154. # EXT acc_m, acc_m, acc_m, #8
  155. # EOR3 acc_l, acc_l, acc_m, acc_h
  156. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; # a trailing argument ending in ".ext" is taken as the output file name
  157. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; # a leading argument without a dot is taken as the flavour
  158. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's own path (either slash style)
  159. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or # look for the translator next to this script first ...
  160. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or # ... then under ../../perlasm relative to it
  161. die "can't locate arm-xlate.pl";
  162. die "only for 64 bit" if $flavour !~ /64/; # rejects any flavour not containing "64" (an undefined flavour also fails this match)
  163. open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!"; # pipe the generated text through arm-xlate.pl, run by the current perl ($^X); abort if the pipe cannot be started instead of silently losing the output
  164. *STDOUT=*OUT; # from here on, anything printed to STDOUT is fed to the translator
  165. $code=<<___; # $code accumulates the assembly source text
  166. #include "arm_arch.h"
  167. #if __ARM_MAX_ARCH__>=8
  168. ___
  169. $code.=".arch armv8-a+crypto\n.text\n";
  170. $input_ptr="x0"; # x0: input pointer; the kernel loads from it with post-increment
  171. $bit_length="x1"; # x1: input length; the kernel shifts it right by 3 to get a byte count
  172. $byte_length="x9"; # x9: receives $bit_length >> 3
  173. $output_ptr="x2"; # x2: output pointer; the kernel stores through it with post-increment
  174. $current_tag="x3"; # x3: base address from which the GHASH accumulator and the hash-key powers (at fixed offsets) are loaded
  175. $counter="x16"; # x16: the kernel copies incoming x4 here and loads the initial counter block from this address
  176. $constant_temp="x15"; # x15: scratch used to build the counter-increment constant
  177. $modulo_constant="x10"; # x10: address of the stack slot holding 0xc200000000000000 (the reduction constant)
  178. $cc="x8"; # x8: the kernel copies incoming x5 here and uses it as the base address for round-key loads
  179. {
  180. my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7)); # register-name table for the kernels below: x4..x7 (the kernel copies the incoming x4/x5 arguments elsewhere before reusing those registers)
  181. my ($temp2_x,$temp3_x)=map("x$_",(13..14)); # x13, x14
  182. my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15)); # counter blocks v0-v7 and result blocks v8-v15, spelled with the .16b arrangement
  183. my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15)); # same v0-v15, bare names so an arrangement suffix can be appended at the use site
  184. my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7)); # d0-d7 spellings of the counter registers
  185. my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7)); # q0-q7 spellings of the counter registers
  186. my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15)); # q8-q15 spellings of the result registers
  187. my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15)); # v8-v15 again: ctr_tN names the same register as resN (input is loaded here, then overwritten by the result)
  188. my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15)); # .16b spellings of ctr_t0..7
  189. my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15)); # q spellings of ctr_t0..7
  190. my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19)); # GHASH accumulators high/mid/low = v17/v18/v19, .16b spelling
  191. my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19)); # same accumulators, bare names
  192. my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25)); # hash-key powers 1-4 and their "k" companions in v20-v25
  193. my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25)); # powers 5-8 use the SAME v20-v25, so the two groups cannot be live together; the main loop reloads them as needed
  194. my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25)); # q spellings of the h1..h4 group
  195. my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25)); # q spellings of the h5..h8 group (same registers)
  196. my $t0="v16"; # temporaries: only v16 and v29 are dedicated, the rest alias other registers
  197. my $t0d="d16"; # d spelling of $t0
  198. my $t1="v29";
  199. my $t2=$res1; # v9
  200. my $t3=$t1; # v29
  201. my $t4=$res0; # v8
  202. my $t5=$res2; # v10
  203. my $t6=$t0; # v16
  204. my $t7=$res3; # v11
  205. my $t8=$res4; # v12
  206. my $t9=$res5; # v13
  207. my $t10=$res6; # v14
  208. my $t11="v21"; # same register as $h12k / $h56k
  209. my $t12=$t1; # v29
  210. my $rtmp_ctr="v30"; # running counter kept in byte-reversed form so it can be incremented with a vector add
  211. my $rtmp_ctrq="q30"; # q spelling of $rtmp_ctr
  212. my $rctr_inc="v31"; # per-block counter increment vector
  213. my $rctr_incd="d31"; # d spelling of $rctr_inc
  214. my $mod_constantd=$t0d; # d16: the reduction constant is loaded into the $t0 register
  215. my $mod_constant=$t0; # v16
  216. my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28)); # round keys: every group of three maps onto v26-v28, so only three keys are resident at once
  217. my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28)); # rk3/rk4/rk5 -> v26/v27/v28
  218. my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28)); # rk6/rk7/rk8 -> v26/v27/v28
  219. my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28)); # rk9/rk10/rk11 -> v26/v27/v28
  220. my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28)); # rk12/rk13/rk14 -> v26/v27/v28
  221. my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28)); # q spellings used by the round-key loads
  222. my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
  223. my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
  224. my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
  225. my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
  226. my $rk2q1="v28.1q"; # .1q spelling of v28 (not referenced in the portion of the file reviewed here)
  227. my $rk3q1="v26.1q"; # .1q spelling of v26 (not referenced in the portion of the file reviewed here)
  228. my $rk4v="v27"; # bare spelling of v27 (not referenced in the portion of the file reviewed here)
  229. #########################################################################################
  230. # size_t unroll8_eor3_aes_gcm_enc_128_kernel(const uint8_t * plaintext,
  231. # uint64_t plaintext_length,
  232. # uint8_t * ciphertext,
  233. # uint64_t *Xi,
  234. # unsigned char ivec[16],
  235. # const void *key);
  236. #
  237. $code.=<<___;
  238. .global unroll8_eor3_aes_gcm_enc_128_kernel
  239. .type unroll8_eor3_aes_gcm_enc_128_kernel,%function
  240. .align 4
  241. unroll8_eor3_aes_gcm_enc_128_kernel:
  242. AARCH64_VALID_CALL_TARGET
  243. cbz x1, .L128_enc_ret
  244. stp d8, d9, [sp, #-80]!
  245. lsr $byte_length, $bit_length, #3
  246. mov $counter, x4
  247. mov $cc, x5
  248. stp d10, d11, [sp, #16]
  249. stp d12, d13, [sp, #32]
  250. stp d14, d15, [sp, #48]
  251. mov x5, #0xc200000000000000
  252. stp x5, xzr, [sp, #64]
  253. add $modulo_constant, sp, #64
  254. mov $constant_temp, #0x100000000 @ set up counter increment
  255. movi $rctr_inc.16b, #0x0
  256. mov $rctr_inc.d[1], $constant_temp
  257. mov $main_end_input_ptr, $byte_length
  258. ld1 { $ctr0b}, [$counter] @ CTR block 0
  259. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  260. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  261. rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
  262. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
  263. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
  264. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
  265. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
  266. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
  267. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
  268. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
  269. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
  270. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
  271. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
  272. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
  273. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  274. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
  275. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
  276. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
  277. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
  278. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
  279. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
  280. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  281. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  282. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  283. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  284. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
  285. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
  286. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  287. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  288. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
  289. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
  290. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
  291. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  292. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
  293. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  294. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
  295. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  296. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  297. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  298. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  299. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
  300. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  301. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
  302. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
  303. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  304. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  305. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
  306. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  307. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
  308. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  309. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
  310. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
  311. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
  312. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  313. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
  314. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
  315. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
  316. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  317. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  318. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  319. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  320. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
  321. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  322. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  323. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  324. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  325. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  326. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
  327. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
  328. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
  329. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
  330. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  331. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  332. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
  333. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
  334. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
  335. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  336. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  337. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  338. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
  339. ld1 { $acc_lb}, [$current_tag]
  340. ext $acc_lb, $acc_lb, $acc_lb, #8
  341. rev64 $acc_lb, $acc_lb
  342. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
  343. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
  344. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  345. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
  346. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  347. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  348. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  349. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  350. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  351. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  352. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  353. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  354. ldr $rk10q, [$cc, #160] @ load rk10
  355. aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
  356. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  357. aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
  358. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  359. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  360. aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
  361. aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
  362. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  363. aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
  364. aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
  365. aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
  366. aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
  367. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  368. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  369. b.ge .L128_enc_tail @ handle tail
  370. ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext
  371. ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext
  372. ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
  373. ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
  374. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  375. eor3 $res0b, $ctr_t0b, $ctr0b, $rk10 @ AES block 0 - result
  376. rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
  377. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
  378. eor3 $res1b, $ctr_t1b, $ctr1b, $rk10 @ AES block 1 - result
  379. stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result
  380. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
  381. eor3 $res5b, $ctr_t5b, $ctr5b, $rk10 @ AES block 5 - result
  382. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
  383. eor3 $res2b, $ctr_t2b, $ctr2b, $rk10 @ AES block 2 - result
  384. eor3 $res6b, $ctr_t6b, $ctr6b, $rk10 @ AES block 6 - result
  385. eor3 $res4b, $ctr_t4b, $ctr4b, $rk10 @ AES block 4 - result
  386. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
  387. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
  388. eor3 $res3b, $ctr_t3b, $ctr3b, $rk10 @ AES block 3 - result
  389. eor3 $res7b, $ctr_t7b, $ctr7b,$rk10 @ AES block 7 - result
  390. stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result
  391. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
  392. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
  393. stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
  394. stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
  395. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
  396. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
  397. b.ge .L128_enc_prepretail @ do prepretail
  398. .L128_enc_main_loop: @ main loop start
  399. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  400. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  401. ext $h5.16b, $h5.16b, $h5.16b, #8
  402. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  403. ext $h6.16b, $h6.16b, $h6.16b, #8
  404. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  405. rev64 $res1b, $res1b @ GHASH block 8k+1
  406. rev64 $res0b, $res0b @ GHASH block 8k
  407. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  408. ext $h7.16b, $h7.16b, $h7.16b, #8
  409. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  410. ext $h8.16b, $h8.16b, $h8.16b, #8
  411. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  412. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  413. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  414. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  415. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  416. rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
  417. rev64 $res3b, $res3b @ GHASH block 8k+3
  418. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  419. eor $res0b, $res0b, $acc_lb @ PRE 1
  420. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  421. rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
  422. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  423. rev64 $res2b, $res2b @ GHASH block 8k+2
  424. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  425. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  426. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  427. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  428. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  429. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  430. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  431. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  432. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  433. ext $h3.16b, $h3.16b, $h3.16b, #8
  434. ldr $h4q, [$current_tag, #112] @ load h3l | h3h
  435. ext $h4.16b, $h4.16b, $h4.16b, #8
  436. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  437. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  438. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  439. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  440. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  441. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  442. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  443. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  444. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  445. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  446. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  447. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  448. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  449. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  450. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  451. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  452. eor3 $acc_hb, $acc_hb, $t1.16b,$t2.16b @ GHASH block 8k+2, 8k+3 - high
  453. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  454. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  455. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  456. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  457. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  458. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  459. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  460. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  461. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  462. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  463. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  464. rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
  465. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  466. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  467. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  468. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  469. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  470. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  471. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  472. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  473. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  474. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  475. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  476. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  477. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  478. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  479. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  480. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  481. rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
  482. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  483. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  484. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  485. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  486. ext $h1.16b, $h1.16b, $h1.16b, #8
  487. ldr $h2q, [$current_tag, #64] @ load h1l | h1h
  488. ext $h2.16b, $h2.16b, $h2.16b, #8
  489. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  490. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  491. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  492. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  493. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  494. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  495. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  496. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  497. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  498. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  499. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  500. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  501. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  502. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  503. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  504. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  505. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  506. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  507. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  508. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  509. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  510. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  511. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  512. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  513. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  514. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  515. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  516. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  517. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  518. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  519. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  520. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  521. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  522. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  523. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  524. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  525. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  526. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  527. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  528. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  529. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  530. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  531. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  532. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  533. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  534. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  535. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  536. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  537. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  538. ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext
  539. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  540. rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
  541. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
  542. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  543. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  544. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  545. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  546. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  547. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  548. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  549. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  550. ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext
  551. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  552. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  553. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  554. pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  555. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  556. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  557. rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
  558. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  559. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  560. ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load plaintext
  561. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
  562. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  563. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  564. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  565. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  566. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  567. ldr $rk10q, [$cc, #160] @ load rk10
  568. ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  569. rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
  570. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
  571. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  572. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  573. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  574. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  575. aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
  576. aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
  577. aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
  578. ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load plaintext
  579. rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
  580. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
  581. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  582. eor3 $res4b, $ctr_t4b, $ctr4b, $rk10 @ AES block 4 - result
  583. aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
  584. aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
  585. aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
  586. eor3 $res2b, $ctr_t2b, $ctr2b, $rk10 @ AES block 8k+10 - result
  587. mov $ctr2.16b, $h3.16b @ CTR block 8k+18
  588. aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
  589. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
  590. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
  591. eor3 $res7b, $ctr_t7b, $ctr7b, $rk10 @ AES block 7 - result
  592. aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
  593. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  594. eor3 $res1b, $ctr_t1b, $ctr1b, $rk10 @ AES block 8k+9 - result
  595. eor3 $res3b, $ctr_t3b, $ctr3b, $rk10 @ AES block 8k+11 - result
  596. mov $ctr3.16b, $h4.16b @ CTR block 8k+19
  597. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  598. eor3 $res5b, $ctr_t5b, $ctr5b, $rk10 @ AES block 5 - result
  599. mov $ctr1.16b, $h2.16b @ CTR block 8k+17
  600. eor3 $res0b, $ctr_t0b, $ctr0b, $rk10 @ AES block 8k+8 - result
  601. mov $ctr0.16b, $h1.16b @ CTR block 8k+16
  602. stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
  603. stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
  604. eor3 $res6b, $ctr_t6b, $ctr6b, $rk10 @ AES block 6 - result
  605. stp $res4q, $res5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
  606. eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
  607. stp $res6q, $res7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
  608. b.lt .L128_enc_main_loop
  609. .L128_enc_prepretail: @ PREPRETAIL
  610. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  611. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  612. ext $h7.16b, $h7.16b, $h7.16b, #8
  613. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  614. ext $h8.16b, $h8.16b, $h8.16b, #8
  615. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  616. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  617. ext $h5.16b, $h5.16b, $h5.16b, #8
  618. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  619. ext $h6.16b, $h6.16b, $h6.16b, #8
  620. rev64 $res0b, $res0b @ GHASH block 8k
  621. rev64 $res1b, $res1b @ GHASH block 8k+1
  622. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  623. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  624. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  625. rev64 $res3b, $res3b @ GHASH block 8k+3
  626. rev64 $res2b, $res2b @ GHASH block 8k+2
  627. eor $res0b, $res0b, $acc_lb @ PRE 1
  628. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  629. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  630. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  631. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  632. rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
  633. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  634. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  635. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  636. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  637. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  638. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  639. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  640. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  641. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  642. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  643. rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
  644. rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
  645. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  646. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  647. rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
  648. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  649. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  650. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  651. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  652. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  653. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  654. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  655. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  656. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  657. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  658. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  659. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  660. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  661. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  662. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  663. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  664. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  665. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  666. ext $h3.16b, $h3.16b, $h3.16b, #8
  667. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  668. ext $h4.16b, $h4.16b, $h4.16b, #8
  669. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  670. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  671. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  672. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  673. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  674. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  675. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  676. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  677. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  678. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  679. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  680. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  681. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  682. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  683. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  684. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  685. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  686. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  687. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  688. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  689. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  690. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  691. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  692. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  693. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  694. ext $h1.16b, $h1.16b, $h1.16b, #8
  695. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  696. ext $h2.16b, $h2.16b, $h2.16b, #8
  697. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  698. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  699. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  700. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  701. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  702. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  703. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  704. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  705. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  706. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  707. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  708. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  709. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  710. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  711. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  712. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  713. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  714. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  715. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  716. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  717. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  718. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  719. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  720. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  721. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  722. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  723. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  724. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  725. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  726. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  727. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  728. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  729. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  730. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  731. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  732. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  733. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  734. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  735. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  736. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  737. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  738. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  739. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  740. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  741. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  742. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  743. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  744. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  745. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  746. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  747. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  748. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  749. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  750. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  751. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  752. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  753. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  754. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  755. pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  756. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  757. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  758. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  759. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  760. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  761. ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  762. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  763. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  764. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  765. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  766. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  767. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  768. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  769. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  770. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  771. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  772. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  773. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  774. eor3 $acc_lb, $acc_lb, $acc_hb, $acc_mb @ MODULO - fold into low
  775. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  776. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  777. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  778. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  779. ldr $rk10q, [$cc, #160] @ load rk10
  780. aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
  781. aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
  782. aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
  783. aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
  784. aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
  785. aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
  786. aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
  787. aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
  788. .L128_enc_tail: @ TAIL
  789. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  790. ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext
  791. mov $t1.16b, $rk10
  792. ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h
  793. ext $h5.16b, $h5.16b, $h5.16b, #8
  794. eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result
  795. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  796. ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h
  797. ext $h6.16b, $h6.16b, $h6.16b, #8
  798. ext $h7.16b, $h7.16b, $h7.16b, #8
  799. ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k
  800. ext $h8.16b, $h8.16b, $h8.16b, #8
  801. cmp $main_end_input_ptr, #112
  802. b.gt .L128_enc_blocks_more_than_7
  803. mov $ctr7b, $ctr6b
  804. mov $ctr6b, $ctr5b
  805. movi $acc_h.8b, #0
  806. cmp $main_end_input_ptr, #96
  807. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  808. mov $ctr5b, $ctr4b
  809. mov $ctr4b, $ctr3b
  810. mov $ctr3b, $ctr2b
  811. mov $ctr2b, $ctr1b
  812. movi $acc_l.8b, #0
  813. movi $acc_m.8b, #0
  814. b.gt .L128_enc_blocks_more_than_6
  815. mov $ctr7b, $ctr6b
  816. cmp $main_end_input_ptr, #80
  817. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  818. mov $ctr6b, $ctr5b
  819. mov $ctr5b, $ctr4b
  820. mov $ctr4b, $ctr3b
  821. mov $ctr3b, $ctr1b
  822. b.gt .L128_enc_blocks_more_than_5
  823. cmp $main_end_input_ptr, #64
  824. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  825. mov $ctr7b, $ctr6b
  826. mov $ctr6b, $ctr5b
  827. mov $ctr5b, $ctr4b
  828. mov $ctr4b, $ctr1b
  829. b.gt .L128_enc_blocks_more_than_4
  830. mov $ctr7b, $ctr6b
  831. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  832. mov $ctr6b, $ctr5b
  833. mov $ctr5b, $ctr1b
  834. cmp $main_end_input_ptr, #48
  835. b.gt .L128_enc_blocks_more_than_3
  836. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  837. mov $ctr7b, $ctr6b
  838. mov $ctr6b, $ctr1b
  839. cmp $main_end_input_ptr, #32
  840. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  841. b.gt .L128_enc_blocks_more_than_2
  842. cmp $main_end_input_ptr, #16
  843. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  844. mov $ctr7b, $ctr1b
  845. b.gt .L128_enc_blocks_more_than_1
  846. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  847. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  848. b .L128_enc_blocks_less_than_1
  849. .L128_enc_blocks_more_than_7: @ blocks left > 7
  850. st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result
  851. rev64 $res0b, $res1b @ GHASH final-7 block
  852. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext
  853. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  854. ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
  855. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
  856. ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
  857. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
  858. movi $t0.8b, #0 @ suppress further partial tag feed in
  859. eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result
  860. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
  861. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
  862. .L128_enc_blocks_more_than_6: @ blocks left > 6
  863. st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result
  864. rev64 $res0b, $res1b @ GHASH final-6 block
  865. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext
  866. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  867. ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
  868. eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result
  869. pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
  870. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
  871. movi $t0.8b, #0 @ suppress further partial tag feed in
  872. pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
  873. pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
  874. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
  875. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
  876. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
  877. .L128_enc_blocks_more_than_5: @ blocks left > 5
  878. st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result
  879. rev64 $res0b, $res1b @ GHASH final-5 block
  880. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  881. ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
  882. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext
  883. pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
  884. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
  885. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
  886. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
  887. eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result
  888. pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
  889. movi $t0.8b, #0 @ suppress further partial tag feed in
  890. pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
  891. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
  892. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
  893. .L128_enc_blocks_more_than_4: @ blocks left > 4
  894. st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result
  895. rev64 $res0b, $res1b @ GHASH final-4 block
  896. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext
  897. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  898. ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
  899. movi $t0.8b, #0 @ suppress further partial tag feed in
  900. pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
  901. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
  902. pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
  903. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
  904. pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
  905. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
  906. eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result
  907. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
  908. .L128_enc_blocks_more_than_3: @ blocks left > 3
  909. st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
  910. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  911. ext $h4.16b, $h4.16b, $h4.16b, #8
  912. rev64 $res0b, $res1b @ GHASH final-3 block
  913. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  914. movi $t0.8b, #0 @ suppress further partial tag feed in
  915. ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
  916. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  917. pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
  918. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext
  919. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  920. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
  921. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
  922. eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result
  923. pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
  924. pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
  925. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
  926. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
  927. .L128_enc_blocks_more_than_2: @ blocks left > 2
  928. st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
  929. rev64 $res0b, $res1b @ GHASH final-2 block
  930. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  931. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext
  932. ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
  933. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  934. ext $h3.16b, $h3.16b, $h3.16b, #8
  935. movi $t0.8b, #0 @ suppress further partial tag feed in
  936. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  937. eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result
  938. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  939. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  940. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  941. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  942. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  943. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  944. .L128_enc_blocks_more_than_1: @ blocks left > 1
  945. st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
  946. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  947. ext $h2.16b, $h2.16b, $h2.16b, #8
  948. rev64 $res0b, $res1b @ GHASH final-1 block
  949. ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext
  950. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  951. movi $t0.8b, #0 @ suppress further partial tag feed in
  952. ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
  953. eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result
  954. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  955. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  956. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  957. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  958. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  959. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  960. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  961. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  962. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  963. .L128_enc_blocks_less_than_1: @ blocks left <= 1
  964. rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
  965. str $rtmp_ctrq, [$counter] @ store the updated counter
  966. and $bit_length, $bit_length, #127 @ bit_length %= 128
  967. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  968. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  969. mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
  970. ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
  971. and $bit_length, $bit_length, #127 @ bit_length %= 128
  972. lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
  973. mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
  974. cmp $bit_length, #64
  975. csel $temp2_x, $temp1_x, $temp0_x, lt
  976. csel $temp3_x, $temp0_x, xzr, lt
  977. mov $ctr0.d[1], $temp3_x
  978. mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
  979. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  980. rev64 $res0b, $res1b @ GHASH final block
  981. bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
  982. st1 { $res1b}, [$output_ptr] @ store all 16B
  983. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  984. ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
  985. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  986. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  987. ext $h1.16b, $h1.16b, $h1.16b, #8
  988. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  989. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  990. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  991. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  992. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  993. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  994. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  995. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  996. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  997. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  998. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  999. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  1000. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  1001. eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
  1002. ext $acc_lb, $acc_lb, $acc_lb, #8
  1003. rev64 $acc_lb, $acc_lb
  1004. st1 { $acc_l.16b }, [$current_tag]
  1005. mov x0, $byte_length
  1006. ldp d10, d11, [sp, #16]
  1007. ldp d12, d13, [sp, #32]
  1008. ldp d14, d15, [sp, #48]
  1009. ldp d8, d9, [sp], #80
  1010. ret
  1011. .L128_enc_ret:
  1012. mov w0, #0x0
  1013. ret
  1014. .size unroll8_eor3_aes_gcm_enc_128_kernel,.-unroll8_eor3_aes_gcm_enc_128_kernel
  1015. ___
  1016. #########################################################################################
  1017. # size_t unroll8_eor3_aes_gcm_dec_128_kernel(const uint8_t * ciphertext,
  1018. # uint64_t plaintext_length, /* presumably a bit count: the kernel derives byte_length with lsr #3 - confirm against caller */
  1019. # uint8_t * plaintext,
  1020. # uint64_t *Xi,
  1021. # unsigned char ivec[16],
  1022. # const void *key);
  1023. #
  1024. $code.=<<___;
  1025. .global unroll8_eor3_aes_gcm_dec_128_kernel
  1026. .type unroll8_eor3_aes_gcm_dec_128_kernel,%function
  1027. .align 4
  1028. unroll8_eor3_aes_gcm_dec_128_kernel:
  1029. AARCH64_VALID_CALL_TARGET
  1030. cbz x1, .L128_dec_ret
  1031. stp d8, d9, [sp, #-80]!
  1032. lsr $byte_length, $bit_length, #3
  1033. mov $counter, x4
  1034. mov $cc, x5
  1035. stp d10, d11, [sp, #16]
  1036. stp d12, d13, [sp, #32]
  1037. stp d14, d15, [sp, #48]
  1038. mov x5, #0xc200000000000000
  1039. stp x5, xzr, [sp, #64]
  1040. add $modulo_constant, sp, #64
  1041. mov $main_end_input_ptr, $byte_length
  1042. ld1 { $ctr0b}, [$counter] @ CTR block 0
  1043. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  1044. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  1045. mov $constant_temp, #0x100000000 @ set up counter increment
  1046. movi $rctr_inc.16b, #0x0
  1047. mov $rctr_inc.d[1], $constant_temp
  1048. ld1 { $acc_lb}, [$current_tag]
  1049. ext $acc_lb, $acc_lb, $acc_lb, #8
  1050. rev64 $acc_lb, $acc_lb
  1051. rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
  1052. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  1053. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
  1054. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
  1055. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
  1056. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  1057. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
  1058. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
  1059. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  1060. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
  1061. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
  1062. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  1063. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  1064. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
  1065. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
  1066. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
  1067. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
  1068. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  1069. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
  1070. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
  1071. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
  1072. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  1073. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
  1074. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
  1075. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
  1076. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  1077. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
  1078. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  1079. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
  1080. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
  1081. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
  1082. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
  1083. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
  1084. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  1085. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  1086. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
  1087. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  1088. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
  1089. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
  1090. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  1091. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  1092. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
  1093. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  1094. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  1095. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
  1096. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  1097. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
  1098. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  1099. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  1100. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  1101. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
  1102. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
  1103. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
  1104. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  1105. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  1106. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
  1107. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
  1108. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  1109. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  1110. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  1111. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  1112. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
  1113. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  1114. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
  1115. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
  1116. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
  1117. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  1118. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  1119. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  1120. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
  1121. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
  1122. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  1123. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  1124. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
  1125. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
  1126. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  1127. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
  1128. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  1129. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
  1130. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
  1131. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  1132. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
  1133. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  1134. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  1135. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  1136. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
  1137. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
  1138. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  1139. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  1140. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
  1141. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  1142. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
  1143. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  1144. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
  1145. aese $ctr0b, $rk9 @ AES block 0 - round 9
  1146. aese $ctr1b, $rk9 @ AES block 1 - round 9
  1147. aese $ctr6b, $rk9 @ AES block 6 - round 9
  1148. ldr $rk10q, [$cc, #160] @ load rk10
  1149. aese $ctr4b, $rk9 @ AES block 4 - round 9
  1150. aese $ctr3b, $rk9 @ AES block 3 - round 9
  1151. aese $ctr2b, $rk9 @ AES block 2 - round 9
  1152. aese $ctr5b, $rk9 @ AES block 5 - round 9
  1153. aese $ctr7b, $rk9 @ AES block 7 - round 9
  1154. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  1155. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  1156. b.ge .L128_dec_tail @ handle tail
  1157. ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext
  1158. eor3 $ctr0b, $res0b, $ctr0b, $rk10 @ AES block 0 - result
  1159. eor3 $ctr1b, $res1b, $ctr1b, $rk10 @ AES block 1 - result
  1160. stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result
  1161. rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
  1162. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
  1163. ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext
  1164. ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext
  1165. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
  1166. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
  1167. ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext
  1168. eor3 $ctr3b, $res3b, $ctr3b, $rk10 @ AES block 3 - result
  1169. eor3 $ctr2b, $res2b, $ctr2b, $rk10 @ AES block 2 - result
  1170. stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result
  1171. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
  1172. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
  1173. eor3 $ctr6b, $res6b, $ctr6b, $rk10 @ AES block 6 - result
  1174. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
  1175. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
  1176. eor3 $ctr4b, $res4b, $ctr4b, $rk10 @ AES block 4 - result
  1177. eor3 $ctr5b, $res5b, $ctr5b, $rk10 @ AES block 5 - result
  1178. stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result
  1179. eor3 $ctr7b, $res7b, $ctr7b, $rk10 @ AES block 7 - result
  1180. stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result
  1181. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
  1182. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  1183. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
  1184. b.ge .L128_dec_prepretail @ do prepretail
  1185. .L128_dec_main_loop: @ main loop start
        @ One iteration handles 8 blocks (128 bytes). Three instruction streams
        @ are interleaved:
        @   GHASH - folds the eight ciphertext blocks already held in
        @           res0..res7 (loaded ahead of the loop or by the previous
        @           iteration) into acc_h / acc_m / acc_l and reduces the
        @           result into acc_l.
        @   AES   - runs rounds 0..9 on the counter blocks in ctr0..ctr7.
        @   CTR   - derives further counter blocks from rtmp_ctr.
        @ On entry ctr0..ctr4 already hold counter blocks 8k+8..8k+12; ctr5..ctr7
        @ are generated at the top of the loop.
        @ Towards the end the next 128 bytes of ciphertext are loaded into
        @ res0..res7, combined with the keystream and rk10 by eor3, and stored
        @ as plaintext. Those res registers are hashed by the next iteration or
        @ by the prepretail.
        @ Round keys (from cc) and hash-key values (from current_tag) are
        @ reloaded every iteration: the h registers are overwritten below by
        @ pmull products and, for h1..h4, by counter temporaries.
        @ Each ext of an h register with itself by #8 swaps the two 64-bit
        @ halves of the value just loaded.
  1186. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  1187. ext $h7.16b, $h7.16b, $h7.16b, #8
  1188. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  1189. ext $h8.16b, $h8.16b, $h8.16b, #8
  1190. rev64 $res1b, $res1b @ GHASH block 8k+1
  1191. rev64 $res0b, $res0b @ GHASH block 8k
  1192. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  1193. rev64 $res6b, $res6b @ GHASH block 8k+6
  1194. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  1195. ext $h5.16b, $h5.16b, $h5.16b, #8
  1196. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  1197. ext $h6.16b, $h6.16b, $h6.16b, #8
  1198. eor $res0b, $res0b, $acc_lb @ PRE 1
  1199. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  1200. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  1201. rev64 $res2b, $res2b @ GHASH block 8k+2
  1202. rev64 $res4b, $res4b @ GHASH block 8k+4
  1203. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  1204. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  1205. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  1206. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  1207. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  1208. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  1209. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  1210. rev64 $res3b, $res3b @ GHASH block 8k+3
  1211. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
        @ Karatsuba middle term, same pattern for every block pair below:
        @ trn1 / trn2 followed by eor leave the XOR of the two 64-bit halves
        @ of the odd block in the low lane and of the even block in the high
        @ lane, so one pmull2 / pmull pair against the matching hNNk value
        @ covers both blocks of the pair.
  1212. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  1213. rev64 $res5b, $res5b @ GHASH block 8k+5
  1214. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  1215. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  1216. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  1217. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  1218. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  1219. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  1220. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  1221. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  1222. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  1223. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  1224. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  1225. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  1226. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  1227. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  1228. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  1229. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  1230. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  1231. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  1232. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  1233. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  1234. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  1235. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  1236. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  1237. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  1238. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  1239. ext $h3.16b, $h3.16b, $h3.16b, #8
  1240. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  1241. ext $h4.16b, $h4.16b, $h4.16b, #8
  1242. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  1243. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  1244. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  1245. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  1246. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  1247. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  1248. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  1249. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  1250. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  1251. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  1252. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  1253. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  1254. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  1255. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  1256. ext $h1.16b, $h1.16b, $h1.16b, #8
  1257. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  1258. ext $h2.16b, $h2.16b, $h2.16b, #8
  1259. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  1260. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  1261. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  1262. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  1263. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  1264. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  1265. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  1266. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  1267. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  1268. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  1269. rev64 $res7b, $res7b @ GHASH block 8k+7
  1270. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  1271. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  1272. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  1273. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  1274. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  1275. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  1276. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  1277. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  1278. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  1279. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  1280. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  1281. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  1282. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  1283. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  1284. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  1285. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  1286. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  1287. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  1288. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  1289. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  1290. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  1291. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  1292. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  1293. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  1294. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  1295. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  1296. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  1297. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  1298. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  1299. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  1300. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  1301. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  1302. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  1303. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  1304. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  1305. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  1306. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  1307. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  1308. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  1309. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  1310. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  1311. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  1312. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  1313. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  1314. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  1315. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  1316. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  1317. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  1318. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  1319. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  1320. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  1321. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  1322. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  1323. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  1324. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  1325. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  1326. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  1327. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  1328. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
        @ MODULO: reduce the three-part accumulator. acc_h is multiplied by
        @ the constant loaded from modulo_constant and folded into acc_m;
        @ acc_m is then multiplied by the same constant and folded into acc_l.
  1329. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  1330. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  1331. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
        @ The GHASH products in h1..h4 have been consumed by the low eor3
        @ instructions above, so h1..h4 are borrowed from here on to stage
        @ counter blocks 8k+16..8k+19 until ctr0..ctr3 have been stored.
  1332. rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
  1333. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  1334. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
  1335. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  1336. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  1337. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  1338. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  1339. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  1340. rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
  1341. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  1342. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  1343. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  1344. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  1345. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  1346. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
  1347. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  1348. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
        @ Load the next 128 bytes of ciphertext. res0..res7 were last read by
        @ the GHASH multiplies above, so they can be overwritten here.
  1349. ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext
  1350. ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext
  1351. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  1352. rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
  1353. ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext
  1354. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  1355. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  1356. ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext
  1357. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  1358. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
  1359. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  1360. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  1361. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  1362. aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
  1363. aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
  1364. ldr $rk10q, [$cc, #160] @ load rk10
  1365. aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
  1366. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  1367. aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
  1368. aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
  1369. aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
  1370. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  1371. rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
  1372. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
  1373. aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
  1374. aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
  1375. eor3 $ctr1b, $res1b, $ctr1b, $rk10 @ AES block 8k+9 - result
  1376. eor3 $ctr0b, $res0b, $ctr0b, $rk10 @ AES block 8k+8 - result
  1377. eor3 $ctr7b, $res7b, $ctr7b, $rk10 @ AES block 8k+15 - result
  1378. eor3 $ctr6b, $res6b, $ctr6b, $rk10 @ AES block 8k+14 - result
  1379. eor3 $ctr2b, $res2b, $ctr2b, $rk10 @ AES block 8k+10 - result
  1380. stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
  1381. mov $ctr1.16b, $h2.16b @ CTR block 8k+17
  1382. eor3 $ctr4b, $res4b, $ctr4b, $rk10 @ AES block 8k+12 - result
  1383. eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
  1384. mov $ctr0.16b, $h1.16b @ CTR block 8k+16
  1385. eor3 $ctr3b, $res3b, $ctr3b, $rk10 @ AES block 8k+11 - result
        @ The flags set by this cmp are consumed by the b.lt at the bottom of
        @ the loop; the vector and store instructions in between leave the
        @ condition flags untouched.
  1386. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  1387. stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
  1388. eor3 $ctr5b, $res5b, $ctr5b, $rk10 @ AES block 8k+13 - result
  1389. mov $ctr2.16b, $h3.16b @ CTR block 8k+18
  1390. stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
  1391. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
  1392. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
  1393. stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
  1394. mov $ctr3.16b, $h4.16b @ CTR block 8k+19
  1395. b.lt .L128_dec_main_loop
  1396. .L128_dec_prepretail: @ PREPRETAIL
        @ Entered once input_ptr has reached main_end_input_ptr, either from
        @ the b.ge ahead of the main loop or by falling out of the main loop.
        @ Finishes GHASH, including the modulo reduction into acc_l, for the
        @ eight ciphertext blocks held in res0..res7, and runs AES rounds 0..9
        @ on the counter blocks in ctr0..ctr7 (ctr5..ctr7 are generated here).
        @ Nothing is read from input_ptr or written to output_ptr in this
        @ section and the rk10 XOR is not applied; execution falls through to
        @ .L128_dec_tail with rk10 loaded.
        @ Each ext of an h register with itself by #8 swaps the two 64-bit
        @ halves of the value just loaded.
  1397. rev64 $res3b, $res3b @ GHASH block 8k+3
  1398. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  1399. rev64 $res0b, $res0b @ GHASH block 8k
  1400. rev64 $res2b, $res2b @ GHASH block 8k+2
  1401. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  1402. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  1403. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  1404. ext $h7.16b, $h7.16b, $h7.16b, #8
  1405. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  1406. ext $h8.16b, $h8.16b, $h8.16b, #8
  1407. eor $res0b, $res0b, $acc_lb @ PRE 1
  1408. rev64 $res1b, $res1b @ GHASH block 8k+1
  1409. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  1410. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  1411. ext $h5.16b, $h5.16b, $h5.16b, #8
  1412. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  1413. ext $h6.16b, $h6.16b, $h6.16b, #8
  1414. rev64 $res5b, $res5b @ GHASH block 8k+5
  1415. rev64 $res4b, $res4b @ GHASH block 8k+4
  1416. rev64 $res6b, $res6b @ GHASH block 8k+6
  1417. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  1418. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  1419. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  1420. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  1421. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  1422. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  1423. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
        @ Karatsuba middle term, same pattern for every block pair below:
        @ trn1 / trn2 followed by eor leave the XOR of the two 64-bit halves
        @ of the odd block in the low lane and of the even block in the high
        @ lane, so one pmull2 / pmull pair against the matching hNNk value
        @ covers both blocks of the pair.
  1424. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  1425. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  1426. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  1427. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  1428. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  1429. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  1430. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  1431. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  1432. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  1433. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  1434. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  1435. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  1436. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  1437. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  1438. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  1439. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  1440. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  1441. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  1442. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  1443. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  1444. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  1445. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  1446. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  1447. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  1448. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  1449. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  1450. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  1451. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  1452. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  1453. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  1454. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  1455. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  1456. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  1457. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  1458. ext $h3.16b, $h3.16b, $h3.16b, #8
  1459. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  1460. ext $h4.16b, $h4.16b, $h4.16b, #8
  1461. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  1462. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  1463. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  1464. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  1465. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  1466. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  1467. ext $h1.16b, $h1.16b, $h1.16b, #8
  1468. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  1469. ext $h2.16b, $h2.16b, $h2.16b, #8
  1470. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  1471. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  1472. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  1473. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  1474. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  1475. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  1476. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  1477. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  1478. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  1479. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  1480. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  1481. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  1482. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  1483. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  1484. rev64 $res7b, $res7b @ GHASH block 8k+7
  1485. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  1486. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  1487. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  1488. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  1489. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  1490. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  1491. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  1492. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  1493. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  1494. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  1495. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  1496. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  1497. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  1498. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  1499. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  1500. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  1501. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  1502. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  1503. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  1504. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  1505. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  1506. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  1507. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  1508. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  1509. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  1510. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  1511. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  1512. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  1513. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  1514. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  1515. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  1516. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  1517. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  1518. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  1519. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
        @ MODULO: reduce the three-part accumulator. acc_h is multiplied by
        @ the constant loaded from modulo_constant and folded into acc_m;
        @ acc_m is then multiplied by the same constant and folded into acc_l.
  1520. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  1521. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  1522. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  1523. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  1524. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  1525. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  1526. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  1527. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  1528. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  1529. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  1530. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  1531. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  1532. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  1533. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  1534. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  1535. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  1536. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  1537. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  1538. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  1539. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  1540. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  1541. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  1542. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  1543. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  1544. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  1545. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  1546. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  1547. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  1548. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  1549. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  1550. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  1551. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  1552. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  1553. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  1554. ldr $rk10q, [$cc, #160] @ load rk10
  1555. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  1556. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  1557. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  1558. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  1559. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  1560. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  1561. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  1562. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  1563. aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
  1564. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  1565. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  1566. eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
  1567. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  1568. aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
  1569. aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
  1570. aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
  1571. aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
  1572. aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
  1573. aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
  1574. aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
  1575. .L128_dec_tail: @ TAIL - pick the entry label for the number of blocks left (fewer than 8 fall through the chain below)
  1576. mov $t1.16b, $rk10 @ keep rk10 in t1; it is the last operand of every eor3 "result" in the tail
  1577. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  1578. cmp $main_end_input_ptr, #112 @ more than 7 blocks (112 bytes) left? consumed by the b.gt below
  1579. ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k
  1580. ext $h8.16b, $h8.16b, $h8.16b, #8 @ swap the 64-bit halves of h8
  1581. ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext
  1582. ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h
  1583. ext $h5.16b, $h5.16b, $h5.16b, #8 @ swap the 64-bit halves of h5
  1584. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  1585. ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h
  1586. ext $h6.16b, $h6.16b, $h6.16b, #8 @ swap the 64-bit halves of h6
  1587. ext $h7.16b, $h7.16b, $h7.16b, #8 @ swap the 64-bit halves of h7
  1588. eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result
  1589. b.gt .L128_dec_blocks_more_than_7 @ flags are still those of the cmp against #112 (no flag-setting instruction in between)
  1590. cmp $main_end_input_ptr, #96 @ more than 6 blocks left?
  1591. mov $ctr7b, $ctr6b @ at most 7 blocks: slide ctr1..ctr6 up one register (ctr1 itself is kept)
  1592. movi $acc_l.8b, #0 @ clear GHASH low accumulator
  1593. movi $acc_h.8b, #0 @ clear GHASH high accumulator
  1594. mov $ctr6b, $ctr5b
  1595. mov $ctr5b, $ctr4b
  1596. mov $ctr4b, $ctr3b
  1597. mov $ctr3b, $ctr2b
  1598. mov $ctr2b, $ctr1b @ ctr2 is the register the "more_than_6" section consumes next
  1599. movi $acc_m.8b, #0 @ clear GHASH mid accumulator
  1600. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ take back one counter increment
  1601. b.gt .L128_dec_blocks_more_than_6
  1602. cmp $main_end_input_ptr, #80 @ more than 5 blocks left?
  1603. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ take back one counter increment
  1604. mov $ctr7b, $ctr6b @ slide up again; ctr1 now has to land in ctr3
  1605. mov $ctr6b, $ctr5b
  1606. mov $ctr5b, $ctr4b
  1607. mov $ctr4b, $ctr3b
  1608. mov $ctr3b, $ctr1b
  1609. b.gt .L128_dec_blocks_more_than_5
  1610. cmp $main_end_input_ptr, #64 @ more than 4 blocks left?
  1611. mov $ctr7b, $ctr6b @ slide up again; ctr1 now has to land in ctr4
  1612. mov $ctr6b, $ctr5b
  1613. mov $ctr5b, $ctr4b
  1614. mov $ctr4b, $ctr1b
  1615. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ take back one counter increment
  1616. b.gt .L128_dec_blocks_more_than_4
  1617. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ take back one counter increment
  1618. mov $ctr7b, $ctr6b @ slide up again; ctr1 now has to land in ctr5
  1619. mov $ctr6b, $ctr5b
  1620. mov $ctr5b, $ctr1b
  1621. cmp $main_end_input_ptr, #48 @ more than 3 blocks left?
  1622. b.gt .L128_dec_blocks_more_than_3
  1623. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ take back one counter increment
  1624. mov $ctr7b, $ctr6b
  1625. cmp $main_end_input_ptr, #32 @ more than 2 blocks left?
  1626. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  1627. mov $ctr6b, $ctr1b @ ctr1 now has to land in ctr6
  1628. b.gt .L128_dec_blocks_more_than_2
  1629. cmp $main_end_input_ptr, #16 @ more than 1 block left?
  1630. mov $ctr7b, $ctr1b @ ctr1 now has to land in ctr7
  1631. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ take back one counter increment
  1632. b.gt .L128_dec_blocks_more_than_1 @ local label needs its leading dot, like every other branch in this chain
  1633. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ take back one counter increment
  1634. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  1635. b .L128_dec_blocks_less_than_1 @ at most one (possibly partial) block left
  1636. .L128_dec_blocks_more_than_7: @ blocks left > 7
  1637. rev64 $res0b, $res1b @ GHASH final-7 block
  1638. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1639. ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
  1640. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
  1641. ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
  1642. movi $t0.8b, #0 @ suppress further partial tag feed in
  1643. ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext
  1644. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
  1645. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
  1646. st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result
  1647. eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result
  1648. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
  1649. .L128_dec_blocks_more_than_6: @ blocks left > 6
  1650. rev64 $res0b, $res1b @ GHASH final-6 block
  1651. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1652. ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
  1653. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
  1654. pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
  1655. ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext
  1656. movi $t0.8b, #0 @ suppress further partial tag feed in
  1657. pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
  1658. st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result
  1659. pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
  1660. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
  1661. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
  1662. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
  1663. eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result
  1664. .L128_dec_blocks_more_than_5: @ blocks left > 5
  1665. rev64 $res0b, $res1b @ GHASH final-5 block
  1666. ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext
  1667. st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result
  1668. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1669. ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
  1670. eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result
  1671. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
  1672. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
  1673. pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
  1674. movi $t0.8b, #0 @ suppress further partial tag feed in
  1675. pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
  1676. pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
  1677. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
  1678. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
  1679. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
  1680. .L128_dec_blocks_more_than_4: @ blocks left > 4
  1681. rev64 $res0b, $res1b @ GHASH final-4 block
  1682. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1683. ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext
  1684. ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
  1685. movi $t0.8b, #0 @ suppress further partial tag feed in
  1686. pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
  1687. pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
  1688. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
  1689. st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result
  1690. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
  1691. eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result
  1692. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
  1693. pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
  1694. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
  1695. .L128_dec_blocks_more_than_3: @ blocks left > 3
  1696. st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result
  1697. rev64 $res0b, $res1b @ GHASH final-3 block
  1698. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1699. ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
  1700. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  1701. ext $h4.16b, $h4.16b, $h4.16b, #8
  1702. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  1703. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  1704. ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext
  1705. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
  1706. pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
  1707. pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
  1708. movi $t0.8b, #0 @ suppress further partial tag feed in
  1709. eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result
  1710. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
  1711. pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
  1712. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
  1713. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
  1714. .L128_dec_blocks_more_than_2: @ blocks left > 2
  1715. rev64 $res0b, $res1b @ GHASH final-2 block
  1716. st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result
  1717. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1718. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  1719. ext $h3.16b, $h3.16b, $h3.16b, #8
  1720. movi $t0.8b, #0 @ suppress further partial tag feed in
  1721. ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
  1722. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  1723. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  1724. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  1725. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  1726. ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext
  1727. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  1728. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  1729. eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result
  1730. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  1731. .L128_dec_blocks_more_than_1: @ blocks left > 1
  1732. st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result
  1733. rev64 $res0b, $res1b @ GHASH final-1 block
  1734. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  1735. ext $h2.16b, $h2.16b, $h2.16b, #8
  1736. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1737. movi $t0.8b, #0 @ suppress further partial tag feed in
  1738. ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
  1739. ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext
  1740. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  1741. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  1742. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  1743. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  1744. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  1745. eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result
  1746. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  1747. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  1748. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  1749. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  1750. .L128_dec_blocks_less_than_1: @ blocks left <= 1
  1751. and $bit_length, $bit_length, #127 @ bit_length %= 128
  1752. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  1753. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  1754. mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
  1755. and $bit_length, $bit_length, #127 @ bit_length %= 128
  1756. lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
  1757. cmp $bit_length, #64
  1758. mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
  1759. csel $temp2_x, $temp1_x, $temp0_x, lt
  1760. csel $temp3_x, $temp0_x, xzr, lt
  1761. mov $ctr0.d[1], $temp3_x
  1762. mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
  1763. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  1764. ext $h1.16b, $h1.16b, $h1.16b, #8
  1765. ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
  1766. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  1767. rev64 $res0b, $res1b @ GHASH final block
  1768. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1769. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  1770. ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
  1771. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  1772. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  1773. bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
  1774. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  1775. st1 { $res4b}, [$output_ptr] @ store all 16B
  1776. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  1777. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  1778. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  1779. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  1780. eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  1781. pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  1782. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  1783. eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up
  1784. eor3 $acc_mb, $acc_mb, $acc_hb, $t11.16b @ MODULO - fold into mid
  1785. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  1786. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  1787. eor3 $acc_lb, $acc_lb, $acc_mb, $acc_hb @ MODULO - fold into low
  1788. ext $acc_lb, $acc_lb, $acc_lb, #8
  1789. rev64 $acc_lb, $acc_lb
  1790. st1 { $acc_l.16b }, [$current_tag]
  1791. rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
  1792. str $rtmp_ctrq, [$counter] @ store the updated counter
  1793. mov x0, $byte_length
  1794. ldp d10, d11, [sp, #16]
  1795. ldp d12, d13, [sp, #32]
  1796. ldp d14, d15, [sp, #48]
  1797. ldp d8, d9, [sp], #80
  1798. ret
  1799. .L128_dec_ret:
  1800. mov w0, #0x0
  1801. ret
  1802. .size unroll8_eor3_aes_gcm_dec_128_kernel,.-unroll8_eor3_aes_gcm_dec_128_kernel
  1803. ___
  1804. }
  1805. {
  1806. my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7)); # register-name aliases interpolated into the assembly heredoc below; these four are x4..x7
  1807. my ($temp2_x,$temp3_x)=map("x$_",(13..14)); # x13, x14
  1808. my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15)); # ctr0..7 = v0..v7, res0..7 = v8..v15, with the .16b arrangement suffix
  1809. my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15)); # same v0..v15 without a suffix, so the heredoc can append .2d/.1q/.d[n] itself
  1810. my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7)); # d-register names of ctr0..7
  1811. my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7)); # q-register names of ctr0..7
  1812. my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15)); # q-register names of res0..7
  1813. my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15)); # ctr_t0..7 are v8..v15, i.e. the SAME registers as res0..7
  1814. my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15)); # .16b form of ctr_t0..7 (same text as res0b..res7b)
  1815. my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15)); # q form of ctr_t0..7 (same text as res0q..res7q)
  1816. my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19)); # GHASH accumulators: high = v17, mid = v18, low = v19
  1817. my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19)); # same accumulators without a suffix
  1818. my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25)); # lower hash-key set in v20..v25
  1819. my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25)); # upper hash-key set in the SAME v20..v25 - the two sets cannot be live at once
  1820. my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25)); # q names for the lower set
  1821. my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25)); # q names for the upper set (same q20..q25)
  1822. my $t0="v16"; # temporaries: t0 is v16
  1823. my $t0d="d16"; # d view of t0
  1824. my $t1="v29"; # t1 is v29
  1825. my $t2=$res1; # t2 shares v9 with res1
  1826. my $t3=$t1; # t3 shares v29 with t1
  1827. my $t4=$res0; # t4 shares v8 with res0
  1828. my $t5=$res2; # t5 shares v10 with res2
  1829. my $t6=$t0; # t6 shares v16 with t0
  1830. my $t7=$res3; # t7 shares v11 with res3
  1831. my $t8=$res4; # t8 shares v12 with res4
  1832. my $t9=$res5; # t9 shares v13 with res5
  1833. my $t10=$res6; # t10 shares v14 with res6
  1834. my $t11="v21"; # t11 is v21, the register also named $h12k / $h56k above
  1835. my $t12=$t1; # t12 shares v29 with t1 and t3
  1836. my $rtmp_ctr="v30"; # byte-reversed counter (built with rev32 in the kernel prologue)
  1837. my $rtmp_ctrq="q30"; # q view of rtmp_ctr
  1838. my $rctr_inc="v31"; # per-block counter increment vector
  1839. my $rctr_incd="d31"; # d view of rctr_inc
  1840. my $mod_constantd=$t0d; # modulo constant lives in d16, i.e. it shares a register with t0/t6
  1841. my $mod_constant=$t0; # v16 view of the modulo constant
  1842. my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28)); # round keys rotate through only three registers: rk0/3/6/9/12 = v26, rk1/4/7/10/13 = v27, rk2/5/8/11/14 = v28
  1843. my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28)); # loading rk3..5 therefore overwrites rk0..2, and so on for each later triple
  1844. my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
  1845. my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
  1846. my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
  1847. my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28)); # q names used by the ldp/ldr round-key loads (same rotation through q26..q28)
  1848. my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
  1849. my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
  1850. my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
  1851. my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
  1852. my $rk2q1="v28.1q"; # v28 (the rk2/5/8/11/14 register) spelled with a .1q arrangement
  1853. my $rk3q1="v26.1q"; # v26 (the rk0/3/6/9/12 register) spelled with a .1q arrangement
  1854. my $rk4v="v27"; # v27 (the rk1/4/7/10/13 register) without a suffix
  1855. #########################################################################################
  1856. # size_t unroll8_eor3_aes_gcm_enc_192_kernel(const uint8_t * plaintext,
  1857. # uint64_t plaintext_length,
  1858. # uint8_t * ciphertext,
  1859. # uint64_t *Xi,
  1860. # unsigned char ivec[16],
  1861. # const void *key);
  1862. #
  1863. $code.=<<___;
  1864. .global unroll8_eor3_aes_gcm_enc_192_kernel
  1865. .type unroll8_eor3_aes_gcm_enc_192_kernel,%function
  1866. .align 4
  1867. unroll8_eor3_aes_gcm_enc_192_kernel:
  1868. AARCH64_VALID_CALL_TARGET
  1869. cbz x1, .L192_enc_ret
  1870. stp d8, d9, [sp, #-80]!
  1871. lsr $byte_length, $bit_length, #3
  1872. mov $counter, x4
  1873. mov $cc, x5
  1874. stp d10, d11, [sp, #16]
  1875. stp d12, d13, [sp, #32]
  1876. stp d14, d15, [sp, #48]
  1877. mov x5, #0xc200000000000000
  1878. stp x5, xzr, [sp, #64]
  1879. add $modulo_constant, sp, #64
  1880. mov $main_end_input_ptr, $byte_length
  1881. ld1 { $ctr0b}, [$counter] @ CTR block 0
  1882. mov $constant_temp, #0x100000000 @ set up counter increment
  1883. movi $rctr_inc.16b, #0x0
  1884. mov $rctr_inc.d[1], $constant_temp
  1885. rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
  1886. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
  1887. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
  1888. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
  1889. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
  1890. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
  1891. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
  1892. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
  1893. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
  1894. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
  1895. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  1896. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  1897. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
  1898. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
  1899. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  1900. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  1901. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
  1902. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
  1903. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
  1904. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
  1905. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
  1906. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  1907. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  1908. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  1909. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
  1910. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
  1911. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  1912. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  1913. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
  1914. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
  1915. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  1916. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  1917. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
  1918. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
  1919. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
  1920. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  1921. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  1922. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
  1923. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  1924. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  1925. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  1926. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  1927. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
  1928. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
  1929. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  1930. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
  1931. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
  1932. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  1933. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  1934. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  1935. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  1936. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
  1937. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  1938. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  1939. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
  1940. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  1941. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  1942. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
  1943. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
  1944. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
  1945. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
  1946. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  1947. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  1948. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  1949. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
  1950. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
  1951. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  1952. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
  1953. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
  1954. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  1955. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
  1956. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
  1957. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
  1958. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  1959. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  1960. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
  1961. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  1962. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  1963. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
  1964. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  1965. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
  1966. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  1967. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
  1968. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  1969. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
  1970. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  1971. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  1972. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
  1973. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
  1974. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  1975. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
  1976. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  1977. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
  1978. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  1979. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  1980. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
  1981. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  1982. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  1983. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
  1984. ld1 { $acc_lb}, [$current_tag]
  1985. ext $acc_lb, $acc_lb, $acc_lb, #8
  1986. rev64 $acc_lb, $acc_lb
  1987. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  1988. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
  1989. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
  1990. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
  1991. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
  1992. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
  1993. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
  1994. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 14 - round 10
  1995. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
  1996. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 11 - round 10
  1997. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 9 - round 10
  1998. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 13 - round 10
  1999. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 12 - round 10
  2000. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8 - round 10
  2001. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 10 - round 10
  2002. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 15 - round 10
  2003. aese $ctr6b, $rk11 @ AES block 14 - round 11
  2004. aese $ctr3b, $rk11 @ AES block 11 - round 11
  2005. aese $ctr4b, $rk11 @ AES block 12 - round 11
  2006. aese $ctr7b, $rk11 @ AES block 15 - round 11
  2007. ldr $rk12q, [$cc, #192] @ load rk12
  2008. aese $ctr1b, $rk11 @ AES block 9 - round 11
  2009. aese $ctr5b, $rk11 @ AES block 13 - round 11
  2010. aese $ctr2b, $rk11 @ AES block 10 - round 11
  2011. aese $ctr0b, $rk11 @ AES block 8 - round 11
  2012. b.ge .L192_enc_tail @ handle tail
  2013. ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext
  2014. ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext
  2015. ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
  2016. ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
  2017. eor3 $res0b, $ctr_t0b, $ctr0b, $rk12 @ AES block 0 - result
  2018. rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
  2019. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
  2020. eor3 $res3b, $ctr_t3b, $ctr3b, $rk12 @ AES block 3 - result
  2021. eor3 $res1b, $ctr_t1b, $ctr1b, $rk12 @ AES block 1 - result
  2022. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
  2023. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
  2024. eor3 $res4b, $ctr_t4b, $ctr4b, $rk12 @ AES block 4 - result
  2025. eor3 $res5b, $ctr_t5b, $ctr5b, $rk12 @ AES block 5 - result
  2026. eor3 $res7b, $ctr_t7b, $ctr7b, $rk12 @ AES block 7 - result
  2027. stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result
  2028. eor3 $res2b, $ctr_t2b, $ctr2b, $rk12 @ AES block 2 - result
  2029. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
  2030. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
  2031. stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result
  2032. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  2033. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
  2034. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
  2035. eor3 $res6b, $ctr_t6b, $ctr6b, $rk12 @ AES block 6 - result
  2036. stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
  2037. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
  2038. stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
  2039. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
  2040. b.ge .L192_enc_prepretail @ do prepretail
  2041. .L192_enc_main_loop: @ main loop start
  2042. rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
  2043. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  2044. rev64 $res2b, $res2b @ GHASH block 8k+2
  2045. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  2046. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  2047. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  2048. ext $h7.16b, $h7.16b, $h7.16b, #8
  2049. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  2050. ext $h8.16b, $h8.16b, $h8.16b, #8
  2051. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  2052. rev64 $res0b, $res0b @ GHASH block 8k
  2053. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  2054. ext $h5.16b, $h5.16b, $h5.16b, #8
  2055. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  2056. ext $h6.16b, $h6.16b, $h6.16b, #8
  2057. rev64 $res1b, $res1b @ GHASH block 8k+1
  2058. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  2059. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  2060. eor $res0b, $res0b, $acc_lb @ PRE 1
  2061. rev64 $res3b, $res3b @ GHASH block 8k+3
  2062. rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
  2063. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  2064. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  2065. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  2066. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  2067. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  2068. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  2069. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  2070. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  2071. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  2072. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  2073. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  2074. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  2075. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  2076. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  2077. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  2078. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  2079. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  2080. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  2081. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  2082. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  2083. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  2084. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  2085. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  2086. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  2087. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  2088. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  2089. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  2090. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  2091. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  2092. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  2093. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  2094. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  2095. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  2096. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  2097. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  2098. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  2099. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  2100. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  2101. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  2102. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  2103. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  2104. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  2105. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  2106. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  2107. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  2108. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  2109. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  2110. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  2111. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  2112. ext $h3.16b, $h3.16b, $h3.16b, #8
  2113. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  2114. ext $h4.16b, $h4.16b, $h4.16b, #8
  2115. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  2116. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  2117. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  2118. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  2119. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  2120. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  2121. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  2122. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  2123. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  2124. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  2125. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  2126. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  2127. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  2128. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  2129. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  2130. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  2131. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  2132. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  2133. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  2134. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  2135. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  2136. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  2137. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  2138. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  2139. ext $h1.16b, $h1.16b, $h1.16b, #8
  2140. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  2141. ext $h2.16b, $h2.16b, $h2.16b, #8
  2142. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  2143. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  2144. rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
  2145. rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
  2146. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  2147. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  2148. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  2149. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  2150. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  2151. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  2152. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  2153. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  2154. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  2155. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  2156. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  2157. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  2158. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  2159. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  2160. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  2161. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  2162. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  2163. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  2164. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  2165. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  2166. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  2167. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  2168. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  2169. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  2170. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  2171. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  2172. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  2173. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  2174. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  2175. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  2176. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  2177. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  2178. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  2179. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  2180. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  2181. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  2182. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  2183. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  2184. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  2185. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  2186. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  2187. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  2188. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  2189. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  2190. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  2191. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  2192. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  2193. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  2194. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  2195. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  2196. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  2197. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  2198. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  2199. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  2200. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  2201. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  2202. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  2203. rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
  2204. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
  2205. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
  2206. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  2207. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  2208. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
  2209. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
  2210. ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext
  2211. pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  2212. rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
  2213. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
  2214. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
  2215. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
  2216. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
  2217. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  2218. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
  2219. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
  2220. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
  2221. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
  2222. ldr $rk12q, [$cc, #192] @ load rk12
  2223. ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  2224. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
  2225. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
  2226. ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext
  2227. aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
  2228. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  2229. ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load plaintext
  2230. ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load plaintext
  2231. aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
  2232. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
  2233. rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
  2234. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
  2235. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
  2236. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  2237. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
  2238. aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
  2239. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
  2240. aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
  2241. aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
  2242. eor3 $res4b, $ctr_t4b, $ctr4b, $rk12 @ AES block 4 - result
  2243. aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
  2244. aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
  2245. aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
  2246. rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
  2247. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
  2248. eor3 $res7b, $ctr_t7b, $ctr7b, $rk12 @ AES block 7 - result
  2249. eor3 $res2b, $ctr_t2b, $ctr2b, $rk12 @ AES block 8k+10 - result
  2250. eor3 $res0b, $ctr_t0b, $ctr0b, $rk12 @ AES block 8k+8 - result
  2251. mov $ctr2.16b, $h3.16b @ CTR block 8k+18
  2252. eor3 $res1b, $ctr_t1b, $ctr1b, $rk12 @ AES block 8k+9 - result
  2253. mov $ctr1.16b, $h2.16b @ CTR block 8k+17
  2254. stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
  2255. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  2256. eor3 $res6b, $ctr_t6b, $ctr6b, $rk12 @ AES block 6 - result
  2257. mov $ctr0.16b, $h1.16b @ CTR block 8k+16
  2258. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
  2259. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
  2260. eor3 $res5b, $ctr_t5b, $ctr5b, $rk12 @ AES block 5 - result
  2261. eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
  2262. eor3 $res3b, $ctr_t3b, $ctr3b, $rk12 @ AES block 8k+11 - result
  2263. mov $ctr3.16b, $h4.16b @ CTR block 8k+19
  2264. stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
  2265. stp $res4q, $res5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
  2266. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  2267. stp $res6q, $res7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
  2268. b.lt .L192_enc_main_loop
  2269. .L192_enc_prepretail: @ PREPRETAIL - GHASH res0-res7 and run AES rounds 0-11 on ctr0-ctr7; no plaintext is loaded or stored in this section
  2270. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  2271. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  2272. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  2273. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  2274. ext $h7.16b, $h7.16b, $h7.16b, #8 @ swap the 64-bit halves of h7
  2275. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  2276. ext $h8.16b, $h8.16b, $h8.16b, #8 @ swap the 64-bit halves of h8
  2277. rev64 $res0b, $res0b @ GHASH block 8k
  2278. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 - swap the 64-bit halves of the accumulated hash
  2279. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  2280. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  2281. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  2282. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  2283. rev64 $res3b, $res3b @ GHASH block 8k+3
  2284. rev64 $res2b, $res2b @ GHASH block 8k+2
  2285. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  2286. ext $h5.16b, $h5.16b, $h5.16b, #8 @ swap the 64-bit halves of h5
  2287. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  2288. ext $h6.16b, $h6.16b, $h6.16b, #8 @ swap the 64-bit halves of h6
  2289. eor $res0b, $res0b, $acc_lb @ PRE 1 - fold the accumulated hash into block 8k
  2290. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  2291. rev64 $res1b, $res1b @ GHASH block 8k+1
  2292. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  2293. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  2294. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  2295. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  2296. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  2297. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  2298. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  2299. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  2300. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  2301. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  2302. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  2303. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  2304. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  2305. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  2306. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  2307. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  2308. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  2309. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  2310. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  2311. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  2312. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  2313. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  2314. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  2315. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  2316. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  2317. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  2318. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  2319. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  2320. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  2321. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  2322. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  2323. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  2324. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  2325. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  2326. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  2327. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  2328. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  2329. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  2330. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  2331. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  2332. rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
  2333. rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
  2334. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  2335. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  2336. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  2337. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  2338. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  2339. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  2340. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  2341. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  2342. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  2343. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  2344. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  2345. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  2346. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  2347. ext $h3.16b, $h3.16b, $h3.16b, #8 @ swap the 64-bit halves of h3
  2348. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  2349. ext $h4.16b, $h4.16b, $h4.16b, #8 @ swap the 64-bit halves of h4
  2350. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  2351. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  2352. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  2353. ext $h1.16b, $h1.16b, $h1.16b, #8 @ swap the 64-bit halves of h1
  2354. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  2355. ext $h2.16b, $h2.16b, $h2.16b, #8 @ swap the 64-bit halves of h2
  2356. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  2357. rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
  2358. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  2359. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  2360. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  2361. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  2362. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  2363. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  2364. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  2365. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  2366. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  2367. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  2368. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  2369. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  2370. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  2371. rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
  2372. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  2373. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  2374. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  2375. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  2376. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  2377. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  2378. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  2379. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  2380. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  2381. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  2382. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  2383. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  2384. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  2385. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  2386. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  2387. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  2388. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  2389. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  2390. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  2391. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  2392. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  2393. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  2394. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  2395. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  2396. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  2397. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  2398. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  2399. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  2400. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  2401. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  2402. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  2403. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  2404. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  2405. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  2406. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  2407. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  2408. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  2409. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  2410. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  2411. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  2412. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  2413. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  2414. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  2415. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  2416. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  2417. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  2418. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  2419. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  2420. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  2421. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  2422. ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  2423. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  2424. pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  2425. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  2426. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  2427. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  2428. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  2429. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  2430. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  2431. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
  2432. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  2433. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  2434. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  2435. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  2436. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
  2437. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  2438. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
  2439. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
  2440. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
  2441. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  2442. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
  2443. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
  2444. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
  2445. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  2446. ldr $rk12q, [$cc, #192] @ load rk12 (not applied in this section; TAIL copies it to t1 and xors it in together with the plaintext)
  2447. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
  2448. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
  2449. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
  2450. eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
  2451. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
  2452. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
  2453. aese $ctr1b, $rk11 @ AES block 8k+9 - round 11 (round 11 is aese only, no aesmc, for every block)
  2454. aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
  2455. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
  2456. aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
  2457. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
  2458. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
  2459. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  2460. aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
  2461. aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
  2462. aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
  2463. aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
  2464. aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
  2465. .L192_enc_tail: @ TAIL - encrypt the first remaining block with ctr0, then dispatch on the number of bytes left
  2466. ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h and h6k | h5k
  2467. ext $h5.16b, $h5.16b, $h5.16b, #8 @ swap the 64-bit halves of h5
  2468. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  2469. ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext
  2470. ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k and h8l | h8h
  2471. ext $h8.16b, $h8.16b, $h8.16b, #8 @ swap the 64-bit halves of h8
  2472. mov $t1.16b, $rk12 @ t1 = rk12, xored into every tail block result below
  2473. ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h and h7l | h7h
  2474. ext $h6.16b, $h6.16b, $h6.16b, #8 @ swap the 64-bit halves of h6
  2475. ext $h7.16b, $h7.16b, $h7.16b, #8 @ swap the 64-bit halves of h7
  2476. cmp $main_end_input_ptr, #112 @ more than 7 blocks (7 x 16 bytes) left?
  2477. eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result
  2478. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  2479. b.gt .L192_enc_blocks_more_than_7
  2480. cmp $main_end_input_ptr, #96 @ more than 6 blocks left? (tested by the b.gt after the vector moves, which leave the flags untouched)
  2481. mov $ctr7b, $ctr6b @ at most 7 blocks left - shift ctr1-ctr6 up into ctr2-ctr7
  2482. movi $acc_h.8b, #0 @ zero GHASH high accumulator (blocks_more_than_7, which initialises it, is skipped)
  2483. mov $ctr6b, $ctr5b
  2484. movi $acc_l.8b, #0 @ zero GHASH low accumulator
  2485. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ undo one counter increment
  2486. mov $ctr5b, $ctr4b
  2487. mov $ctr4b, $ctr3b
  2488. mov $ctr3b, $ctr2b
  2489. mov $ctr2b, $ctr1b @ blocks_more_than_6 takes its next keystream block from ctr2
  2490. movi $acc_m.8b, #0 @ zero GHASH mid accumulator
  2491. b.gt .L192_enc_blocks_more_than_6
  2492. mov $ctr7b, $ctr6b @ at most 6 blocks left - shift up by one more register
  2493. cmp $main_end_input_ptr, #80 @ more than 5 blocks left?
  2494. mov $ctr6b, $ctr5b
  2495. mov $ctr5b, $ctr4b
  2496. mov $ctr4b, $ctr3b
  2497. mov $ctr3b, $ctr1b @ blocks_more_than_5 takes its next keystream block from ctr3
  2498. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ undo one counter increment
  2499. b.gt .L192_enc_blocks_more_than_5
  2500. cmp $main_end_input_ptr, #64 @ more than 4 blocks left?
  2501. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ undo one counter increment
  2502. mov $ctr7b, $ctr6b
  2503. mov $ctr6b, $ctr5b
  2504. mov $ctr5b, $ctr4b
  2505. mov $ctr4b, $ctr1b @ blocks_more_than_4 takes its next keystream block from ctr4
  2506. b.gt .L192_enc_blocks_more_than_4
  2507. mov $ctr7b, $ctr6b
  2508. mov $ctr6b, $ctr5b
  2509. mov $ctr5b, $ctr1b @ blocks_more_than_3 takes its next keystream block from ctr5
  2510. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ undo one counter increment
  2511. cmp $main_end_input_ptr, #48 @ more than 3 blocks left?
  2512. b.gt .L192_enc_blocks_more_than_3
  2513. mov $ctr7b, $ctr6b
  2514. mov $ctr6b, $ctr1b @ presumably consumed by blocks_more_than_2 (outside this view) - verify
  2515. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ undo one counter increment
  2516. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  2517. cmp $main_end_input_ptr, #32 @ more than 2 blocks left?
  2518. b.gt .L192_enc_blocks_more_than_2
  2519. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ undo one counter increment
  2520. cmp $main_end_input_ptr, #16 @ more than 1 block left?
  2521. mov $ctr7b, $ctr1b @ presumably consumed by blocks_more_than_1 (outside this view) - verify
  2522. b.gt .L192_enc_blocks_more_than_1
  2523. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ undo one counter increment
  2524. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  2525. b .L192_enc_blocks_less_than_1
  2526. .L192_enc_blocks_more_than_7: @ blocks left > 7 - store the final-7 result, start the GHASH accumulators from it, encrypt final-6 with ctr1
  2527. st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result
  2528. rev64 $res0b, $res1b @ GHASH final-7 block
  2529. ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid (upper half of h78k moved to the low lane for pmull)
  2530. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2531. ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid (rk4v is reused as scratch)
  2532. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext
  2533. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
  2534. movi $t0.8b, #0 @ suppress further partial tag feed in
  2535. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low (written directly, not accumulated)
  2536. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high (written directly, not accumulated)
  2537. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid (written directly, not accumulated)
  2538. eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result
  2539. .L192_enc_blocks_more_than_6: @ blocks left > 6 - store the final-6 result, accumulate its GHASH using h7, encrypt final-5 with ctr2
  2540. st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result
  2541. rev64 $res0b, $res1b @ GHASH final-6 block
  2542. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext
  2543. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2544. ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
  2545. pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low (rk3q1 is presumably the 1q view of the rk3 register, defined outside this view)
  2546. eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result
  2547. movi $t0.8b, #0 @ suppress further partial tag feed in
  2548. pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high (rk2q1 is presumably the 1q view of the rk2 register, defined outside this view)
  2549. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
  2550. pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
  2551. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
  2552. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
  2553. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
  2554. .L192_enc_blocks_more_than_5: @ blocks left > 5 - store the final-5 result, accumulate its GHASH using h6, encrypt final-4 with ctr3
  2555. st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result
  2556. rev64 $res0b, $res1b @ GHASH final-5 block
  2557. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2558. ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
  2559. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext
  2560. pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
  2561. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
  2562. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
  2563. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid (copy to the upper lane, which the pmull2 below reads)
  2564. pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
  2565. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
  2566. pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid (uses the upper half of h56k)
  2567. eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result
  2568. movi $t0.8b, #0 @ suppress further partial tag feed in
  2569. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
  2570. .L192_enc_blocks_more_than_4: @ blocks left > 4
  2571. st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result
  2572. rev64 $res0b, $res1b @ GHASH final-4 block
  2573. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2574. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext
  2575. pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
  2576. ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
  2577. pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
  2578. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
  2579. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
  2580. movi $t0.8b, #0 @ suppress further partial tag feed in
  2581. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
  2582. pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
  2583. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
  2584. eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result
  2585. .L192_enc_blocks_more_than_3: @ blocks left > 3
  2586. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  2587. st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
  2588. rev64 $res0b, $res1b @ GHASH final-3 block
  2589. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2590. movi $t0.8b, #0 @ suppress further partial tag feed in
  2591. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext
  2592. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  2593. ext $h4.16b, $h4.16b, $h4.16b, #8
  2594. ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
  2595. eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result
  2596. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  2597. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
  2598. pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
  2599. pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
  2600. pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
  2601. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
  2602. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
  2603. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
  2604. .L192_enc_blocks_more_than_2: @ blocks left > 2
  2605. st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
  2606. rev64 $res0b, $res1b @ GHASH final-2 block
  2607. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  2608. ext $h3.16b, $h3.16b, $h3.16b, #8
  2609. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2610. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext
  2611. ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
  2612. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  2613. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  2614. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  2615. movi $t0.8b, #0 @ suppress further partial tag feed in
  2616. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  2617. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  2618. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  2619. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  2620. eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result
  2621. .L192_enc_blocks_more_than_1: @ blocks left > 1
  2622. ldr $h2q, [$current_tag, #64] @ load h1l | h1h
  2623. ext $h2.16b, $h2.16b, $h2.16b, #8
  2624. st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
  2625. rev64 $res0b, $res1b @ GHASH final-1 block
  2626. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2627. ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
  2628. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  2629. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  2630. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  2631. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  2632. ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext
  2633. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  2634. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  2635. eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result
  2636. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  2637. movi $t0.8b, #0 @ suppress further partial tag feed in
  2638. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  2639. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  2640. .L192_enc_blocks_less_than_1: @ blocks left <= 1
  2641. mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
  2642. and $bit_length, $bit_length, #127 @ bit_length %= 128
  2643. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  2644. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  2645. and $bit_length, $bit_length, #127 @ bit_length %= 128
  2646. lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
  2647. cmp $bit_length, #64
  2648. mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
  2649. csel $temp2_x, $temp1_x, $temp0_x, lt
  2650. csel $temp3_x, $temp0_x, xzr, lt
  2651. mov $ctr0.d[1], $temp3_x
  2652. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  2653. ext $h1.16b, $h1.16b, $h1.16b, #8
  2654. ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
  2655. mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
  2656. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  2657. rev64 $res0b, $res1b @ GHASH final block
  2658. bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
  2659. st1 { $res1b}, [$output_ptr] @ store all 16B
  2660. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2661. ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
  2662. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  2663. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  2664. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  2665. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  2666. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  2667. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  2668. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  2669. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  2670. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  2671. rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
  2672. str $rtmp_ctrq, [$counter] @ store the updated counter
  2673. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  2674. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  2675. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  2676. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  2677. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  2678. eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
  2679. ext $acc_lb, $acc_lb, $acc_lb, #8
  2680. rev64 $acc_lb, $acc_lb
  2681. st1 { $acc_l.16b }, [$current_tag]
  2682. mov x0, $byte_length @ return sizes
  2683. ldp d10, d11, [sp, #16]
  2684. ldp d12, d13, [sp, #32]
  2685. ldp d14, d15, [sp, #48]
  2686. ldp d8, d9, [sp], #80
  2687. ret
  2688. .L192_enc_ret:
  2689. mov w0, #0x0
  2690. ret
  2691. .size unroll8_eor3_aes_gcm_enc_192_kernel,.-unroll8_eor3_aes_gcm_enc_192_kernel
  2692. ___
  2693. #########################################################################################
  2694. # size_t unroll8_eor3_aes_gcm_dec_192_kernel(const uint8_t * ciphertext,
  2695. # uint64_t plaintext_length, /* length in BITS; the kernel shifts it right by 3 to get the byte count */
  2696. # uint8_t * plaintext,
  2697. # uint64_t *Xi,
  2698. # unsigned char ivec[16],
  2699. # const void *key);
  2700. #
  2701. $code.=<<___;
  2702. .global unroll8_eor3_aes_gcm_dec_192_kernel
  2703. .type unroll8_eor3_aes_gcm_dec_192_kernel,%function
  2704. .align 4
  2705. unroll8_eor3_aes_gcm_dec_192_kernel:
  2706. AARCH64_VALID_CALL_TARGET
  2707. cbz x1, .L192_dec_ret
  2708. stp d8, d9, [sp, #-80]!
  2709. lsr $byte_length, $bit_length, #3
  2710. mov $counter, x4
  2711. mov $cc, x5
  2712. stp d10, d11, [sp, #16]
  2713. stp d12, d13, [sp, #32]
  2714. stp d14, d15, [sp, #48]
  2715. mov x5, #0xc200000000000000
  2716. stp x5, xzr, [sp, #64]
  2717. add $modulo_constant, sp, #64
  2718. mov $main_end_input_ptr, $byte_length
  2719. ld1 { $ctr0b}, [$counter] @ CTR block 0
  2720. ld1 { $acc_lb}, [$current_tag]
  2721. mov $constant_temp, #0x100000000 @ set up counter increment
  2722. movi $rctr_inc.16b, #0x0
  2723. mov $rctr_inc.d[1], $constant_temp
  2724. rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
  2725. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
  2726. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
  2727. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
  2728. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
  2729. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
  2730. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
  2731. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
  2732. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
  2733. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
  2734. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
  2735. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
  2736. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  2737. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
  2738. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
  2739. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
  2740. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  2741. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
  2742. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
  2743. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  2744. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  2745. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
  2746. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  2747. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
  2748. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  2749. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  2750. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  2751. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  2752. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  2753. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
  2754. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
  2755. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
  2756. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
  2757. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  2758. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
  2759. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
  2760. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  2761. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  2762. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  2763. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
  2764. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
  2765. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
  2766. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  2767. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  2768. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
  2769. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  2770. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  2771. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
  2772. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  2773. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
  2774. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  2775. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  2776. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
  2777. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  2778. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
  2779. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
  2780. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  2781. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
  2782. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
  2783. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
  2784. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  2785. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  2786. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
  2787. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  2788. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  2789. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  2790. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
  2791. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  2792. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
  2793. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
  2794. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  2795. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  2796. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  2797. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
  2798. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
  2799. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  2800. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  2801. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
  2802. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  2803. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
  2804. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  2805. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  2806. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
  2807. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
  2808. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  2809. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
  2810. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  2811. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  2812. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  2813. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
  2814. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
  2815. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
  2816. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
  2817. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  2818. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  2819. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  2820. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
  2821. ld1 { $acc_lb}, [$current_tag]
  2822. ext $acc_lb, $acc_lb, $acc_lb, #8
  2823. rev64 $acc_lb, $acc_lb
  2824. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  2825. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
  2826. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  2827. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
  2828. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
  2829. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
  2830. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  2831. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
  2832. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
  2833. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
  2834. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
  2835. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
  2836. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10
  2837. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10
  2838. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
  2839. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
  2840. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10
  2841. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10
  2842. ldr $rk12q, [$cc, #192] @ load rk12
  2843. aese $ctr0b, $rk11 @ AES block 0 - round 11
  2844. aese $ctr1b, $rk11 @ AES block 1 - round 11
  2845. aese $ctr4b, $rk11 @ AES block 4 - round 11
  2846. aese $ctr6b, $rk11 @ AES block 6 - round 11
  2847. aese $ctr5b, $rk11 @ AES block 5 - round 11
  2848. aese $ctr7b, $rk11 @ AES block 7 - round 11
  2849. aese $ctr2b, $rk11 @ AES block 2 - round 11
  2850. aese $ctr3b, $rk11 @ AES block 3 - round 11
  2851. b.ge .L192_dec_tail @ handle tail
  2852. ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext
  2853. ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext
  2854. ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext
  2855. eor3 $ctr1b, $res1b, $ctr1b, $rk12 @ AES block 1 - result
  2856. eor3 $ctr0b, $res0b, $ctr0b, $rk12 @ AES block 0 - result
  2857. stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result
  2858. rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
  2859. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
  2860. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
  2861. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
  2862. eor3 $ctr3b, $res3b, $ctr3b, $rk12 @ AES block 3 - result
  2863. eor3 $ctr2b, $res2b, $ctr2b, $rk12 @ AES block 2 - result
  2864. stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result
  2865. ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext
  2866. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
  2867. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
  2868. eor3 $ctr4b, $res4b, $ctr4b, $rk12 @ AES block 4 - result
  2869. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
  2870. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
  2871. eor3 $ctr5b, $res5b, $ctr5b, $rk12 @ AES block 5 - result
  2872. stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result
  2873. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  2874. eor3 $ctr6b, $res6b, $ctr6b, $rk12 @ AES block 6 - result
  2875. eor3 $ctr7b, $res7b, $ctr7b, $rk12 @ AES block 7 - result
  2876. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
  2877. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
  2878. stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result
  2879. b.ge .L192_dec_prepretail @ do prepretail
  2880. .L192_dec_main_loop: @ main loop start
  2881. rev64 $res1b, $res1b @ GHASH block 8k+1
  2882. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  2883. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  2884. rev64 $res0b, $res0b @ GHASH block 8k
  2885. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  2886. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  2887. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  2888. ext $h7.16b, $h7.16b, $h7.16b, #8
  2889. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  2890. ext $h8.16b, $h8.16b, $h8.16b, #8
  2891. rev64 $res4b, $res4b @ GHASH block 8k+4
  2892. rev64 $res3b, $res3b @ GHASH block 8k+3
  2893. eor $res0b, $res0b, $acc_lb @ PRE 1
  2894. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  2895. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  2896. rev64 $res5b, $res5b @ GHASH block 8k+5
  2897. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  2898. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  2899. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  2900. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  2901. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  2902. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  2903. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  2904. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  2905. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  2906. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  2907. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  2908. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  2909. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  2910. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  2911. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  2912. ext $h5.16b, $h5.16b, $h5.16b, #8
  2913. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  2914. ext $h6.16b, $h6.16b, $h6.16b, #8
  2915. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  2916. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  2917. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  2918. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  2919. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  2920. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  2921. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  2922. rev64 $res2b, $res2b @ GHASH block 8k+2
  2923. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  2924. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  2925. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  2926. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  2927. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  2928. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  2929. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  2930. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  2931. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  2932. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  2933. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  2934. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  2935. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  2936. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  2937. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  2938. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  2939. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  2940. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  2941. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  2942. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  2943. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  2944. ext $h3.16b, $h3.16b, $h3.16b, #8
  2945. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  2946. ext $h4.16b, $h4.16b, $h4.16b, #8
  2947. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  2948. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  2949. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  2950. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  2951. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  2952. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  2953. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  2954. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  2955. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  2956. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  2957. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  2958. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  2959. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  2960. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  2961. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  2962. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  2963. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  2964. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  2965. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  2966. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  2967. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  2968. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  2969. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  2970. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  2971. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  2972. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  2973. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  2974. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  2975. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  2976. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  2977. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  2978. ext $h1.16b, $h1.16b, $h1.16b, #8
  2979. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  2980. ext $h2.16b, $h2.16b, $h2.16b, #8
  2981. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  2982. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  2983. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  2984. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  2985. rev64 $res7b, $res7b @ GHASH block 8k+7
  2986. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  2987. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  2988. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  2989. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  2990. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  2991. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  2992. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  2993. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  2994. rev64 $res6b, $res6b @ GHASH block 8k+6
  2995. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  2996. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  2997. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  2998. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  2999. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  3000. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  3001. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  3002. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  3003. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  3004. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  3005. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  3006. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  3007. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  3008. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  3009. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  3010. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  3011. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  3012. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  3013. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  3014. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  3015. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  3016. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  3017. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  3018. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  3019. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  3020. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  3021. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  3022. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  3023. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  3024. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  3025. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  3026. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  3027. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  3028. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  3029. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  3030. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  3031. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  3032. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  3033. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  3034. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  3035. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  3036. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  3037. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  3038. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  3039. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  3040. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  3041. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  3042. rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
  3043. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
  3044. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
  3045. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  3046. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
  3047. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
  3048. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
  3049. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  3050. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  3051. ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext
  3052. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
  3053. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
  3054. ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext
  3055. rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
  3056. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  3057. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
  3058. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
  3059. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
  3060. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  3061. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
  3062. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
  3063. ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext
  3064. rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
  3065. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
  3066. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  3067. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
  3068. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
  3069. ldr $rk12q, [$cc, #192] @ load rk12
  3070. ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext
  3071. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
  3072. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
  3073. aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
  3074. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  3075. aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
  3076. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
  3077. aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
  3078. aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
  3079. eor3 $ctr0b, $res0b, $ctr0b, $rk12 @ AES block 8k+8 - result
  3080. rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
  3081. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
  3082. aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
  3083. aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
  3084. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
  3085. aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
  3086. aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
  3087. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  3088. eor3 $ctr1b, $res1b, $ctr1b, $rk12 @ AES block 8k+9 - result
  3089. stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
  3090. eor3 $ctr3b, $res3b, $ctr3b, $rk12 @ AES block 8k+11 - result
  3091. eor3 $ctr2b, $res2b, $ctr2b, $rk12 @ AES block 8k+10 - result
  3092. eor3 $ctr7b, $res7b, $ctr7b, $rk12 @ AES block 8k+15 - result
  3093. stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
  3094. eor3 $ctr5b, $res5b, $ctr5b, $rk12 @ AES block 8k+13 - result
  3095. eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
  3096. mov $ctr3.16b, $h4.16b @ CTR block 8k+19
  3097. eor3 $ctr4b, $res4b, $ctr4b, $rk12 @ AES block 8k+12 - result
  3098. stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
  3099. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  3100. eor3 $ctr6b, $res6b, $ctr6b, $rk12 @ AES block 8k+14 - result
  3101. stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
  3102. mov $ctr0.16b, $h1.16b @ CTR block 8k+16
  3103. mov $ctr1.16b, $h2.16b @ CTR block 8k+17
  3104. mov $ctr2.16b, $h3.16b @ CTR block 8k+18
  3105. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
  3106. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
  3107. b.lt .L192_dec_main_loop
  3108. .L192_dec_prepretail: @ PREPRETAIL
  3109. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  3110. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  3111. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  3112. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  3113. ext $h7.16b, $h7.16b, $h7.16b, #8
  3114. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  3115. ext $h8.16b, $h8.16b, $h8.16b, #8
  3116. rev64 $res0b, $res0b @ GHASH block 8k
  3117. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  3118. rev64 $res3b, $res3b @ GHASH block 8k+3
  3119. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  3120. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  3121. eor $res0b, $res0b, $acc_lb @ PRE 1
  3122. rev64 $res2b, $res2b @ GHASH block 8k+2
  3123. rev64 $res1b, $res1b @ GHASH block 8k+1
  3124. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  3125. ext $h5.16b, $h5.16b, $h5.16b, #8
  3126. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  3127. ext $h6.16b, $h6.16b, $h6.16b, #8
  3128. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  3129. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  3130. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  3131. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  3132. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  3133. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  3134. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  3135. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  3136. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  3137. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  3138. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  3139. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  3140. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  3141. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  3142. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  3143. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  3144. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  3145. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  3146. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  3147. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  3148. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  3149. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  3150. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  3151. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  3152. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  3153. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  3154. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  3155. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  3156. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  3157. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  3158. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  3159. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  3160. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  3161. rev64 $res5b, $res5b @ GHASH block 8k+5
  3162. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  3163. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  3164. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  3165. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  3166. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  3167. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  3168. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  3169. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  3170. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  3171. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  3172. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  3173. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  3174. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  3175. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  3176. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  3177. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  3178. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  3179. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  3180. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  3181. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  3182. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  3183. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  3184. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  3185. ext $h3.16b, $h3.16b, $h3.16b, #8
  3186. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  3187. ext $h4.16b, $h4.16b, $h4.16b, #8
  3188. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  3189. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  3190. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  3191. ext $h1.16b, $h1.16b, $h1.16b, #8
  3192. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  3193. ext $h2.16b, $h2.16b, $h2.16b, #8
  3194. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  3195. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  3196. rev64 $res7b, $res7b @ GHASH block 8k+7
  3197. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  3198. rev64 $res4b, $res4b @ GHASH block 8k+4
  3199. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  3200. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  3201. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  3202. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  3203. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  3204. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  3205. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  3206. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  3207. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  3208. rev64 $res6b, $res6b @ GHASH block 8k+6
  3209. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  3210. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  3211. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  3212. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  3213. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  3214. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  3215. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  3216. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  3217. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  3218. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  3219. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  3220. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  3221. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  3222. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  3223. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  3224. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  3225. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  3226. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  3227. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  3228. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  3229. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  3230. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  3231. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  3232. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  3233. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  3234. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  3235. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  3236. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  3237. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  3238. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  3239. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  3240. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  3241. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  3242. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  3243. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  3244. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  3245. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  3246. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  3247. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  3248. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  3249. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  3250. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  3251. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  3252. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  3253. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  3254. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  3255. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  3256. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  3257. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  3258. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  3259. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  3260. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  3261. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  3262. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  3263. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  3264. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  3265. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  3266. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  3267. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  3268. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  3269. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  3270. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  3271. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  3272. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  3273. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  3274. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  3275. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
  3276. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
  3277. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
  3278. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
  3279. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
  3280. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
  3281. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
  3282. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
  3283. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  3284. ldr $rk12q, [$cc, #192] @ load rk12
  3285. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  3286. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
  3287. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
  3288. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
  3289. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
  3290. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
  3291. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
  3292. aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
  3293. eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
  3294. aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
  3295. aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
  3296. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
  3297. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
  3298. aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
  3299. aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
  3300. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  3301. aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
  3302. aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
  3303. aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
  3304. .L192_dec_tail: @ TAIL - handle the remaining blocks (the last one may be partial)
  3305. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  3306. ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h and h6k | h5k
  3307. ext $h5.16b, $h5.16b, $h5.16b, #8 @ swap the 64-bit halves of h5
  3308. ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext
  3309. ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k and h8l | h8h
  3310. ext $h8.16b, $h8.16b, $h8.16b, #8 @ swap the 64-bit halves of h8
  3311. mov $t1.16b, $rk12 @ copy rk12 aside - the tail uses the rk2, rk3 and rk4 names as GHASH temporaries
  3312. ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h and h7l | h7h
  3313. ext $h6.16b, $h6.16b, $h6.16b, #8 @ swap the 64-bit halves of h6
  3314. ext $h7.16b, $h7.16b, $h7.16b, #8 @ swap the 64-bit halves of h7
  3315. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  3316. eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result
  3317. cmp $main_end_input_ptr, #112 @ more than 7 blocks (112 bytes) left?
  3318. b.gt .L192_dec_blocks_more_than_7
  3319. mov $ctr7b, $ctr6b @ 7 blocks or fewer: shift the AES states up one register so later sections find them
  3320. movi $acc_h.8b, #0 @ GHASH high accumulator starts at zero
  3321. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ step the counter back by one block
  3322. mov $ctr6b, $ctr5b
  3323. mov $ctr5b, $ctr4b
  3324. mov $ctr4b, $ctr3b
  3325. cmp $main_end_input_ptr, #96 @ more than 6 blocks (96 bytes) left?
  3326. movi $acc_l.8b, #0 @ GHASH low accumulator starts at zero
  3327. mov $ctr3b, $ctr2b
  3328. mov $ctr2b, $ctr1b
  3329. movi $acc_m.8b, #0 @ GHASH mid accumulator starts at zero
  3330. b.gt .L192_dec_blocks_more_than_6
  3331. mov $ctr7b, $ctr6b @ 6 blocks or fewer: shift up once more
  3332. mov $ctr6b, $ctr5b
  3333. mov $ctr5b, $ctr4b
  3334. mov $ctr4b, $ctr3b
  3335. mov $ctr3b, $ctr1b
  3336. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ step the counter back by one block
  3337. cmp $main_end_input_ptr, #80 @ more than 5 blocks (80 bytes) left?
  3338. b.gt .L192_dec_blocks_more_than_5
  3339. mov $ctr7b, $ctr6b @ 5 blocks or fewer: shift up once more
  3340. mov $ctr6b, $ctr5b
  3341. mov $ctr5b, $ctr4b
  3342. mov $ctr4b, $ctr1b
  3343. cmp $main_end_input_ptr, #64 @ more than 4 blocks (64 bytes) left?
  3344. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ step the counter back by one block
  3345. b.gt .L192_dec_blocks_more_than_4
  3346. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ step the counter back by one block
  3347. mov $ctr7b, $ctr6b @ 4 blocks or fewer: shift up once more
  3348. mov $ctr6b, $ctr5b
  3349. mov $ctr5b, $ctr1b
  3350. cmp $main_end_input_ptr, #48 @ more than 3 blocks (48 bytes) left?
  3351. b.gt .L192_dec_blocks_more_than_3
  3352. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ step the counter back by one block
  3353. mov $ctr7b, $ctr6b @ 3 blocks or fewer: shift up once more
  3354. cmp $main_end_input_ptr, #32 @ more than 2 blocks (32 bytes) left?
  3355. mov $ctr6b, $ctr1b
  3356. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  3357. b.gt .L192_dec_blocks_more_than_2
  3358. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ step the counter back by one block
  3359. mov $ctr7b, $ctr1b @ 2 blocks or fewer: the final-block section reads ctr7
  3360. cmp $main_end_input_ptr, #16 @ more than 1 block (16 bytes) left?
  3361. b.gt .L192_dec_blocks_more_than_1
  3362. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ step the counter back by one block
  3363. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  3364. b .L192_dec_blocks_less_than_1
  3365. .L192_dec_blocks_more_than_7: @ blocks left > 7
  3366. rev64 $res0b, $res1b @ GHASH final-7 block
  3367. ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
  3368. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3369. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
  3370. ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
  3371. ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext
  3372. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
  3373. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
  3374. st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result
  3375. eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result
  3376. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
  3377. movi $t0.8b, #0 @ suppress further partial tag feed in
  3378. .L192_dec_blocks_more_than_6: @ blocks left > 6
  3379. rev64 $res0b, $res1b @ GHASH final-6 block
  3380. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3381. ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext
  3382. ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
  3383. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
  3384. movi $t0.8b, #0 @ suppress further partial tag feed in
  3385. pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
  3386. st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result
  3387. eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result
  3388. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
  3389. pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
  3390. pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
  3391. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
  3392. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
  3393. .L192_dec_blocks_more_than_5: @ blocks left > 5
  3394. rev64 $res0b, $res1b @ GHASH final-5 block
  3395. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3396. ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
  3397. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
  3398. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
  3399. pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
  3400. ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext
  3401. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
  3402. pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
  3403. pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
  3404. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
  3405. movi $t0.8b, #0 @ suppress further partial tag feed in
  3406. st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result
  3407. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
  3408. eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result
  3409. .L192_dec_blocks_more_than_4: @ blocks left > 4
  3410. rev64 $res0b, $res1b @ GHASH final-4 block
  3411. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3412. movi $t0.8b, #0 @ suppress further partial tag feed in
  3413. ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext
  3414. ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
  3415. pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
  3416. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
  3417. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
  3418. pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
  3419. st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result
  3420. pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
  3421. eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result
  3422. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
  3423. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
  3424. .L192_dec_blocks_more_than_3: @ blocks left > 3
  3425. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  3426. ext $h4.16b, $h4.16b, $h4.16b, #8
  3427. rev64 $res0b, $res1b @ GHASH final-3 block
  3428. ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext
  3429. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3430. ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
  3431. pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
  3432. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
  3433. movi $t0.8b, #0 @ suppress further partial tag feed in
  3434. pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
  3435. st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result
  3436. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  3437. eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result
  3438. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
  3439. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  3440. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
  3441. pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
  3442. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
  3443. .L192_dec_blocks_more_than_2: @ blocks left > 2
  3444. rev64 $res0b, $res1b @ GHASH final-2 block
  3445. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  3446. ext $h3.16b, $h3.16b, $h3.16b, #8
  3447. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3448. ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
  3449. ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext
  3450. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  3451. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  3452. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  3453. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  3454. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  3455. movi $t0.8b, #0 @ suppress further partial tag feed in
  3456. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  3457. st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result
  3458. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  3459. eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result
  3460. .L192_dec_blocks_more_than_1: @ blocks left > 1
  3461. rev64 $res0b, $res1b @ GHASH final-1 block
  3462. ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext
  3463. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  3464. ext $h2.16b, $h2.16b, $h2.16b, #8 @ swap the 64-bit halves of h2
  3465. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3466. movi $t0.8b, #0 @ suppress further partial tag feed in
  3467. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  3468. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  3469. ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
  3470. st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result
  3471. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  3472. eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result
  3473. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  3474. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  3475. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  3476. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  3477. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  3478. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  3479. .L192_dec_blocks_less_than_1: @ blocks left <= 1
  3480. rev32 $rtmp_ctr.16b, $rtmp_ctr.16b @ reverse the bytes of each 32-bit word of the counter before it is stored
  3481. and $bit_length, $bit_length, #127 @ bit_length %= 128
  3482. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  3483. str $rtmp_ctrq, [$counter] @ store the updated counter
  3484. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  3485. mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
  3486. and $bit_length, $bit_length, #127 @ bit_length %= 128
  3487. mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
  3488. lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
  3489. cmp $bit_length, #64 @ does the masked-off part stay within one 64-bit lane?
  3490. csel $temp2_x, $temp1_x, $temp0_x, lt @ low lane mask: all ones if bit_length < 64, else temp0_x
  3491. csel $temp3_x, $temp0_x, xzr, lt @ high lane mask: temp0_x if bit_length < 64, else zero
  3492. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  3493. ext $h1.16b, $h1.16b, $h1.16b, #8 @ swap the 64-bit halves of h1
  3494. mov $ctr0.d[1], $temp3_x @ high lane of the last-block mask
  3495. ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
  3496. mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
  3497. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  3498. bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
  3499. rev64 $res0b, $res1b @ GHASH final block
  3500. st1 { $res4b}, [$output_ptr] @ store all 16B
  3501. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3502. ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
  3503. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  3504. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  3505. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  3506. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  3507. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  3508. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  3509. eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  3510. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  3511. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  3512. pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  3513. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  3514. eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up
  3515. eor3 $acc_mb, $acc_mb, $acc_hb, $t11.16b @ MODULO - fold into mid
  3516. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  3517. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  3518. eor3 $acc_lb, $acc_lb, $acc_mb, $acc_hb @ MODULO - fold into low
  3519. ext $acc_lb, $acc_lb, $acc_lb, #8 @ swap the 64-bit halves - with the rev64 below this reverses all 16 bytes
  3520. rev64 $acc_lb, $acc_lb @ reverse the bytes inside each 64-bit half
  3521. st1 { $acc_l.16b }, [$current_tag] @ write the reduced GHASH value back through current_tag
  3522. mov x0, $byte_length @ return value is byte_length
  3523. ldp d10, d11, [sp, #16] @ restore d10-d15 from the stack frame
  3524. ldp d12, d13, [sp, #32]
  3525. ldp d14, d15, [sp, #48]
  3526. ldp d8, d9, [sp], #80 @ restore d8, d9 and release the 80-byte frame
  3527. ret
  3528. .L192_dec_ret: @ NOTE(review): the branch to this label is outside this section - presumably the zero-length early exit
  3529. mov w0, #0x0 @ return 0
  3530. ret
  3531. .size unroll8_eor3_aes_gcm_dec_192_kernel,.-unroll8_eor3_aes_gcm_dec_192_kernel
  3532. ___
  3533. }
  3534. {
  3535. my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7)); # Register-name strings interpolated into the assembly template below; these four are x4..x7
  3536. my ($temp2_x,$temp3_x)=map("x$_",(13..14)); # x13, x14
  3537. my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15)); # ctr0..7 = v0..v7, res0..7 = v8..v15, with the .16b arrangement
  3538. my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15)); # same registers, bare name so the template can append its own arrangement
  3539. my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7)); # d-register names for ctr0..7
  3540. my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7)); # q-register names for ctr0..7
  3541. my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15)); # q-register names for res0..7
  3542. my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15)); # ctr_t0..7 name the SAME registers (v8..v15) as res0..7
  3543. my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
  3544. my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
  3545. my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19)); # accumulators acc_h/acc_m/acc_l = v17/v18/v19
  3546. my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
  3547. my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25)); # v20..v25
  3548. my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25)); # the SAME v20..v25 again: h5..h8 overwrite h1..h4 and vice versa
  3549. my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
  3550. my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
  3551. my $t0="v16";
  3552. my $t0d="d16";
  3553. my $t1="v29";
  3554. my $t2=$res1; # t2, t4, t5 and t7..t10 are aliases of res registers, not extra registers
  3555. my $t3=$t1; # alias of t1 (v29)
  3556. my $t4=$res0;
  3557. my $t5=$res2;
  3558. my $t6=$t0; # alias of t0 (v16)
  3559. my $t7=$res3;
  3560. my $t8=$res4;
  3561. my $t9=$res5;
  3562. my $t10=$res6;
  3563. my $t11="v21"; # same register as $h12k / $h56k
  3564. my $t12=$t1; # alias of t1 (v29)
  3565. my $rtmp_ctr="v30";
  3566. my $rtmp_ctrq="q30";
  3567. my $rctr_inc="v31";
  3568. my $rctr_incd="d31";
  3569. my $mod_constantd=$t0d; # the modulo constant shares v16 with t0 and t6
  3570. my $mod_constant=$t0;
  3571. my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28)); # round keys rotate through three registers: rkN names v(26 + N mod 3)
  3572. my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
  3573. my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
  3574. my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
  3575. my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
  3576. my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28)); # q-register names follow the same rotation
  3577. my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
  3578. my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
  3579. my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
  3580. my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
  3581. my $rk2q1="v28.1q"; # v28 (the $rk2 register) with the .1q arrangement
  3582. my $rk3q1="v26.1q"; # v26 (the $rk3 register) with the .1q arrangement
  3583. my $rk4v="v27"; # v27 (the $rk4 register), bare name
  3584. #########################################################################################
  3585. # size_t unroll8_eor3_aes_gcm_enc_256_kernel(const uint8_t * plaintext,
  3586. # uint64_t plaintext_length,
  3587. # uint8_t * ciphertext,
  3588. # uint64_t *Xi,
  3589. # unsigned char ivec[16],
  3590. # const void *key);
  3591. #
  3592. $code.=<<___;
  3593. .global unroll8_eor3_aes_gcm_enc_256_kernel
  3594. .type unroll8_eor3_aes_gcm_enc_256_kernel,%function
  3595. .align 4
  3596. unroll8_eor3_aes_gcm_enc_256_kernel:
  3597. AARCH64_VALID_CALL_TARGET
  3598. cbz x1, .L256_enc_ret
  3599. stp d8, d9, [sp, #-80]!
  3600. lsr $byte_length, $bit_length, #3
  3601. mov $counter, x4
  3602. mov $cc, x5
  3603. stp d10, d11, [sp, #16]
  3604. stp d12, d13, [sp, #32]
  3605. stp d14, d15, [sp, #48]
  3606. mov x5, #0xc200000000000000
  3607. stp x5, xzr, [sp, #64]
  3608. add $modulo_constant, sp, #64
  3609. ld1 { $ctr0b}, [$counter] @ CTR block 0
  3610. mov $main_end_input_ptr, $byte_length
  3611. mov $constant_temp, #0x100000000 @ set up counter increment
  3612. movi $rctr_inc.16b, #0x0
  3613. mov $rctr_inc.d[1], $constant_temp
  3614. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  3615. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  3616. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  3617. rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
  3618. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
  3619. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
  3620. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
  3621. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
  3622. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
  3623. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
  3624. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
  3625. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
  3626. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
  3627. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
  3628. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
  3629. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  3630. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
  3631. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
  3632. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
  3633. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  3634. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
  3635. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  3636. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  3637. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  3638. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
  3639. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
  3640. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
  3641. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  3642. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
  3643. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  3644. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  3645. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
  3646. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
  3647. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  3648. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
  3649. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  3650. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  3651. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  3652. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
  3653. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
  3654. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
  3655. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
  3656. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  3657. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  3658. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
  3659. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  3660. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  3661. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
  3662. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  3663. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
  3664. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
  3665. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  3666. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  3667. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
  3668. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
  3669. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  3670. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  3671. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  3672. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  3673. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
  3674. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
  3675. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  3676. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  3677. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  3678. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  3679. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
  3680. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
  3681. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  3682. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
  3683. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
  3684. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  3685. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
  3686. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
  3687. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  3688. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
  3689. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  3690. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
  3691. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  3692. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  3693. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  3694. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  3695. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
  3696. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
  3697. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  3698. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
  3699. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  3700. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
  3701. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
  3702. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  3703. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  3704. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  3705. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
  3706. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
  3707. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
  3708. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  3709. ld1 { $acc_lb}, [$current_tag]
  3710. ext $acc_lb, $acc_lb, $acc_lb, #8
  3711. rev64 $acc_lb, $acc_lb
  3712. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  3713. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
  3714. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
  3715. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
  3716. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
  3717. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
  3718. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
  3719. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
  3720. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10
  3721. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10
  3722. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
  3723. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
  3724. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10
  3725. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
  3726. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
  3727. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
  3728. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10
  3729. aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 11
  3730. ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
  3731. aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 11
  3732. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
  3733. aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 11
  3734. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
  3735. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
  3736. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
  3737. aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 11
  3738. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
  3739. ldr $rk14q, [$cc, #224] @ load rk14
  3740. aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 12
  3741. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
  3742. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
  3743. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
  3744. aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 12
  3745. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
  3746. aese $ctr2b, $rk13 @ AES block 2 - round 13
  3747. aese $ctr1b, $rk13 @ AES block 1 - round 13
  3748. aese $ctr4b, $rk13 @ AES block 4 - round 13
  3749. aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 12
  3750. aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 12
  3751. aese $ctr0b, $rk13 @ AES block 0 - round 13
  3752. aese $ctr5b, $rk13 @ AES block 5 - round 13
  3753. aese $ctr6b, $rk13 @ AES block 6 - round 13
  3754. aese $ctr7b, $rk13 @ AES block 7 - round 13
  3755. aese $ctr3b, $rk13 @ AES block 3 - round 13
  3756. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  3757. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  3758. b.ge .L256_enc_tail @ handle tail
  3759. ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext
  3760. ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext
  3761. eor3 $res0b, $ctr_t0b, $ctr0b, $rk14 @ AES block 0 - result
  3762. rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
  3763. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
  3764. eor3 $res1b, $ctr_t1b, $ctr1b, $rk14 @ AES block 1 - result
  3765. eor3 $res3b, $ctr_t3b, $ctr3b, $rk14 @ AES block 3 - result
  3766. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
  3767. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
  3768. ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
  3769. ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
  3770. eor3 $res2b, $ctr_t2b, $ctr2b, $rk14 @ AES block 2 - result
  3771. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  3772. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
  3773. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
  3774. stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result
  3775. stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result
  3776. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
  3777. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
  3778. eor3 $res4b, $ctr_t4b, $ctr4b, $rk14 @ AES block 4 - result
  3779. eor3 $res7b, $ctr_t7b, $ctr7b, $rk14 @ AES block 7 - result
  3780. eor3 $res6b, $ctr_t6b, $ctr6b, $rk14 @ AES block 6 - result
  3781. eor3 $res5b, $ctr_t5b, $ctr5b, $rk14 @ AES block 5 - result
  3782. stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
  3783. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
  3784. stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
  3785. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
  3786. b.ge .L256_enc_prepretail @ do prepretail
  3787. .L256_enc_main_loop: @ main loop start
  3788. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  3789. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  3790. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  3791. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  3792. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  3793. rev64 $res3b, $res3b @ GHASH block 8k+3
  3794. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  3795. ext $h5.16b, $h5.16b, $h5.16b, #8
  3796. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  3797. ext $h6.16b, $h6.16b, $h6.16b, #8
  3798. rev64 $res1b, $res1b @ GHASH block 8k+1
  3799. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  3800. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  3801. rev64 $res0b, $res0b @ GHASH block 8k
  3802. rev64 $res4b, $res4b @ GHASH block 8k+4
  3803. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  3804. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  3805. ext $h7.16b, $h7.16b, $h7.16b, #8
  3806. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  3807. ext $h8.16b, $h8.16b, $h8.16b, #8
  3808. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  3809. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  3810. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  3811. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  3812. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  3813. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  3814. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  3815. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  3816. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  3817. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  3818. eor $res0b, $res0b, $acc_lb @ PRE 1
  3819. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  3820. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  3821. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  3822. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  3823. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  3824. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  3825. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  3826. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  3827. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  3828. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  3829. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  3830. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  3831. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  3832. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  3833. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  3834. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  3835. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  3836. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  3837. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  3838. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  3839. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  3840. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  3841. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  3842. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  3843. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  3844. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  3845. rev64 $res6b, $res6b @ GHASH block 8k+6
  3846. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  3847. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  3848. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  3849. rev64 $res2b, $res2b @ GHASH block 8k+2
  3850. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  3851. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  3852. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  3853. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  3854. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  3855. rev64 $res5b, $res5b @ GHASH block 8k+5
  3856. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  3857. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  3858. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  3859. ext $h3.16b, $h3.16b, $h3.16b, #8
  3860. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  3861. ext $h4.16b, $h4.16b, $h4.16b, #8
  3862. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  3863. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  3864. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  3865. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  3866. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  3867. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  3868. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  3869. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  3870. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  3871. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  3872. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  3873. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  3874. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  3875. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  3876. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  3877. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  3878. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  3879. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  3880. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  3881. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  3882. rev64 $res7b, $res7b @ GHASH block 8k+7
  3883. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  3884. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  3885. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  3886. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  3887. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  3888. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  3889. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  3890. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  3891. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  3892. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  3893. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  3894. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  3895. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  3896. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  3897. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  3898. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  3899. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  3900. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  3901. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  3902. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  3903. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  3904. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  3905. ext $h1.16b, $h1.16b, $h1.16b, #8
  3906. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  3907. ext $h2.16b, $h2.16b, $h2.16b, #8
  3908. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  3909. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  3910. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  3911. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  3912. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  3913. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  3914. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  3915. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  3916. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  3917. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  3918. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  3919. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  3920. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  3921. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  3922. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  3923. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  3924. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  3925. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  3926. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  3927. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
  3928. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  3929. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  3930. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  3931. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  3932. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  3933. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  3934. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  3935. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  3936. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  3937. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  3938. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  3939. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
  3940. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
  3941. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  3942. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
  3943. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
  3944. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  3945. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
  3946. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
  3947. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  3948. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  3949. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  3950. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  3951. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  3952. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  3953. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
  3954. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  3955. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  3956. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  3957. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
  3958. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
  3959. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
  3960. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
  3961. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
  3962. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  3963. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
  3964. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
  3965. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
  3966. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  3967. ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
  3968. rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
  3969. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  3970. ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext
  3971. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
  3972. aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
  3973. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
  3974. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
  3975. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
  3976. aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
  3977. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  3978. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
  3979. aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
  3980. aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
  3981. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
  3982. aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
  3983. rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
  3984. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
  3985. aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
  3986. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  3987. aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
  3988. ldr $rk14q, [$cc, #224] @ load rk14
  3989. aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
  3990. ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext
  3991. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
  3992. aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
  3993. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  3994. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
  3995. ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
  3996. ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
  3997. aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
  3998. aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
  3999. rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
  4000. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
  4001. aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
  4002. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
  4003. aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
  4004. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  4005. eor3 $res2b, $ctr_t2b, $ctr2b, $rk14 @ AES block 8k+10 - result
  4006. rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
  4007. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
  4008. aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
  4009. aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
  4010. eor3 $res5b, $ctr_t5b, $ctr5b, $rk14 @ AES block 5 - result
  4011. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  4012. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  4013. aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
  4014. eor3 $res4b, $ctr_t4b, $ctr4b, $rk14 @ AES block 4 - result
  4015. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
  4016. eor3 $res3b, $ctr_t3b, $ctr3b, $rk14 @ AES block 8k+11 - result
  4017. mov $ctr3.16b, $h4.16b @ CTR block 8k+19
  4018. eor3 $res1b, $ctr_t1b, $ctr1b, $rk14 @ AES block 8k+9 - result
  4019. eor3 $res0b, $ctr_t0b, $ctr0b, $rk14 @ AES block 8k+8 - result
  4020. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
  4021. stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
  4022. mov $ctr2.16b, $h3.16b @ CTR block 8k+18
  4023. eor3 $res7b, $ctr_t7b, $ctr7b, $rk14 @ AES block 7 - result
  4024. eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
  4025. stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
  4026. eor3 $res6b, $ctr_t6b, $ctr6b, $rk14 @ AES block 6 - result
  4027. mov $ctr1.16b, $h2.16b @ CTR block 8k+17
  4028. stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
  4029. stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
  4030. mov $ctr0.16b, $h1.16b @ CTR block 8k+16
  4031. b.lt .L256_enc_main_loop
  4032. .L256_enc_prepretail: @ PREPRETAIL
  4033. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  4034. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  4035. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  4036. rev64 $res2b, $res2b @ GHASH block 8k+2
  4037. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  4038. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  4039. rev64 $res5b, $res5b @ GHASH block 8k+5
  4040. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  4041. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  4042. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  4043. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  4044. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  4045. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  4046. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  4047. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  4048. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  4049. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  4050. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  4051. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  4052. rev64 $res0b, $res0b @ GHASH block 8k
  4053. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  4054. rev64 $res1b, $res1b @ GHASH block 8k+1
  4055. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  4056. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  4057. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  4058. ext $h7.16b, $h7.16b, $h7.16b, #8
  4059. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  4060. ext $h8.16b, $h8.16b, $h8.16b, #8
  4061. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  4062. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  4063. ext $h5.16b, $h5.16b, $h5.16b, #8
  4064. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  4065. ext $h6.16b, $h6.16b, $h6.16b, #8
  4066. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  4067. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  4068. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  4069. eor $res0b, $res0b, $acc_lb @ PRE 1
  4070. rev64 $res3b, $res3b @ GHASH block 8k+3
  4071. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  4072. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  4073. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  4074. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  4075. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  4076. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  4077. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  4078. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  4079. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  4080. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  4081. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  4082. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  4083. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  4084. rev64 $res6b, $res6b @ GHASH block 8k+6
  4085. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  4086. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  4087. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  4088. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  4089. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  4090. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  4091. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  4092. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  4093. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  4094. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  4095. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  4096. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  4097. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  4098. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  4099. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  4100. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  4101. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  4102. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  4103. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  4104. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  4105. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  4106. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  4107. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  4108. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  4109. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  4110. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  4111. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  4112. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  4113. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  4114. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  4115. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  4116. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  4117. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  4118. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  4119. rev64 $res4b, $res4b @ GHASH block 8k+4
  4120. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  4121. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  4122. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  4123. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  4124. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  4125. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  4126. ext $h3.16b, $h3.16b, $h3.16b, #8
  4127. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  4128. ext $h4.16b, $h4.16b, $h4.16b, #8
  4129. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  4130. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  4131. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  4132. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  4133. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  4134. rev64 $res7b, $res7b @ GHASH block 8k+7
  4135. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  4136. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  4137. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  4138. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  4139. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  4140. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  4141. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  4142. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  4143. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  4144. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  4145. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  4146. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  4147. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  4148. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  4149. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  4150. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  4151. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  4152. ext $h1.16b, $h1.16b, $h1.16b, #8
  4153. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  4154. ext $h2.16b, $h2.16b, $h2.16b, #8
  4155. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  4156. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  4157. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  4158. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  4159. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  4160. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  4161. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  4162. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  4163. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  4164. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  4165. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  4166. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  4167. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  4168. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  4169. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  4170. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  4171. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  4172. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  4173. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  4174. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  4175. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  4176. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  4177. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  4178. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  4179. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  4180. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  4181. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  4182. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  4183. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  4184. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  4185. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  4186. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  4187. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  4188. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  4189. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  4190. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  4191. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
  4192. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
  4193. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  4194. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  4195. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  4196. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  4197. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
  4198. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
  4199. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
  4200. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
  4201. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
  4202. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
  4203. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
  4204. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
  4205. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
  4206. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
  4207. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
  4208. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
  4209. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
  4210. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
  4211. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  4212. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  4213. aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
  4214. ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
  4215. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  4216. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
  4217. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  4218. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
  4219. aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
  4220. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
  4221. aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
  4222. aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
  4223. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  4224. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
  4225. ldr $rk14q, [$cc, #224] @ load rk14
  4226. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
  4227. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
  4228. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
  4229. aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
  4230. aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
  4231. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  4232. aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
  4233. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  4234. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
  4235. aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
  4236. aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
  4237. eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
  4238. aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
  4239. aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
  4240. aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
  4241. aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
  4242. aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
  4243. aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
  4244. aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
  4245. .L256_enc_tail: @ TAIL
  4246. ldp $h78kq, $h8q, [$current_tag, #192] @ load h8l | h8h
  4247. ext $h8.16b, $h8.16b, $h8.16b, #8
  4248. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  4249. ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext
  4250. ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h
  4251. ext $h5.16b, $h5.16b, $h5.16b, #8
  4252. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  4253. ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h
  4254. ext $h6.16b, $h6.16b, $h6.16b, #8
  4255. ext $h7.16b, $h7.16b, $h7.16b, #8
  4256. mov $t1.16b, $rk14
  4257. cmp $main_end_input_ptr, #112
  4258. eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result
  4259. b.gt .L256_enc_blocks_more_than_7
  4260. movi $acc_l.8b, #0
  4261. mov $ctr7b, $ctr6b
  4262. movi $acc_h.8b, #0
  4263. mov $ctr6b, $ctr5b
  4264. mov $ctr5b, $ctr4b
  4265. mov $ctr4b, $ctr3b
  4266. mov $ctr3b, $ctr2b
  4267. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  4268. mov $ctr2b, $ctr1b
  4269. movi $acc_m.8b, #0
  4270. cmp $main_end_input_ptr, #96
  4271. b.gt .L256_enc_blocks_more_than_6
  4272. mov $ctr7b, $ctr6b
  4273. mov $ctr6b, $ctr5b
  4274. cmp $main_end_input_ptr, #80
  4275. mov $ctr5b, $ctr4b
  4276. mov $ctr4b, $ctr3b
  4277. mov $ctr3b, $ctr1b
  4278. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  4279. b.gt .L256_enc_blocks_more_than_5
  4280. mov $ctr7b, $ctr6b
  4281. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  4282. mov $ctr6b, $ctr5b
  4283. mov $ctr5b, $ctr4b
  4284. cmp $main_end_input_ptr, #64
  4285. mov $ctr4b, $ctr1b
  4286. b.gt .L256_enc_blocks_more_than_4
  4287. cmp $main_end_input_ptr, #48
  4288. mov $ctr7b, $ctr6b
  4289. mov $ctr6b, $ctr5b
  4290. mov $ctr5b, $ctr1b
  4291. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  4292. b.gt .L256_enc_blocks_more_than_3
  4293. cmp $main_end_input_ptr, #32
  4294. mov $ctr7b, $ctr6b
  4295. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  4296. mov $ctr6b, $ctr1b
  4297. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  4298. b.gt .L256_enc_blocks_more_than_2
  4299. mov $ctr7b, $ctr1b
  4300. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  4301. cmp $main_end_input_ptr, #16
  4302. b.gt .L256_enc_blocks_more_than_1
  4303. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  4304. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  4305. b .L256_enc_blocks_less_than_1
  4306. .L256_enc_blocks_more_than_7: @ blocks left > 7
  4307. st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result
  4308. rev64 $res0b, $res1b @ GHASH final-7 block
  4309. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  4310. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext
  4311. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
  4312. ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
  4313. ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
  4314. movi $t0.8b, #0 @ suppress further partial tag feed in
  4315. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
  4316. eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result
  4317. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
  4318. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
  4319. .L256_enc_blocks_more_than_6: @ blocks left > 6
  4320. st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result
  4321. rev64 $res0b, $res1b @ GHASH final-6 block
  4322. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  4323. pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
  4324. ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
  4325. pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
  4326. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext
  4327. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
  4328. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
  4329. pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
  4330. eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result
  4331. movi $t0.8b, #0 @ suppress further partial tag feed in
  4332. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
  4333. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
  4334. .L256_enc_blocks_more_than_5: @ blocks left > 5
  4335. st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result
  4336. rev64 $res0b, $res1b @ GHASH final-5 block
  4337. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  4338. ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
  4339. pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
  4340. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
  4341. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
  4342. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
  4343. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext
  4344. pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
  4345. pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
  4346. movi $t0.8b, #0 @ suppress further partial tag feed in
  4347. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
  4348. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
  4349. eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result
  4350. .L256_enc_blocks_more_than_4: @ blocks left > 4
  4351. st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result
  4352. rev64 $res0b, $res1b @ GHASH final-4 block
  4353. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext
  4354. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  4355. ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
  4356. pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
  4357. eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result
  4358. pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
  4359. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
  4360. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
  4361. pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
  4362. movi $t0.8b, #0 @ suppress further partial tag feed in
  4363. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
  4364. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
  4365. .L256_enc_blocks_more_than_3: @ blocks left > 3
  4366. st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
  4367. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  4368. ext $h4.16b, $h4.16b, $h4.16b, #8
  4369. rev64 $res0b, $res1b @ GHASH final-3 block
  4370. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  4371. ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
  4372. pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
  4373. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
  4374. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  4375. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  4376. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
  4377. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext
  4378. pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
  4379. pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
  4380. eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result
  4381. movi $t0.8b, #0 @ suppress further partial tag feed in
  4382. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
  4383. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
  4384. .L256_enc_blocks_more_than_2: @ blocks left > 2
  4385. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  4386. ext $h3.16b, $h3.16b, $h3.16b, #8
  4387. st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
  4388. rev64 $res0b, $res1b @ GHASH final-2 block
  4389. ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext
  4390. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  4391. ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
  4392. movi $t0.8b, #0 @ suppress further partial tag feed in
  4393. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  4394. eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result
  4395. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  4396. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  4397. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  4398. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  4399. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  4400. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  4401. .L256_enc_blocks_more_than_1: @ blocks left > 1
  4402. st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
  4403. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  4404. ext $h2.16b, $h2.16b, $h2.16b, #8
  4405. rev64 $res0b, $res1b @ GHASH final-1 block
  4406. ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext
  4407. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  4408. movi $t0.8b, #0 @ suppress further partial tag feed in
  4409. ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
  4410. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  4411. eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result
  4412. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  4413. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  4414. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  4415. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  4416. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  4417. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  4418. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  4419. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  4420. .L256_enc_blocks_less_than_1: @ blocks left <= 1
  4421. and $bit_length, $bit_length, #127 @ bit_length %= 128
  4422. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  4423. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  4424. mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
  4425. and $bit_length, $bit_length, #127 @ bit_length %= 128
  4426. lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
  4427. cmp $bit_length, #64
  4428. mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
  4429. csel $temp3_x, $temp0_x, xzr, lt
  4430. csel $temp2_x, $temp1_x, $temp0_x, lt
  4431. mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
  4432. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  4433. ext $h1.16b, $h1.16b, $h1.16b, #8
  4434. ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
  4435. mov $ctr0.d[1], $temp3_x
  4436. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  4437. rev64 $res0b, $res1b @ GHASH final block
  4438. rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
  4439. bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
  4440. str $rtmp_ctrq, [$counter] @ store the updated counter
  4441. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  4442. st1 { $res1b}, [$output_ptr] @ store all 16B
  4443. ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
  4444. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  4445. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  4446. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  4447. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  4448. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  4449. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  4450. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  4451. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  4452. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  4453. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  4454. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  4455. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  4456. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  4457. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  4458. eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
  4459. ext $acc_lb, $acc_lb, $acc_lb, #8
  4460. rev64 $acc_lb, $acc_lb
  4461. st1 { $acc_l.16b }, [$current_tag]
  4462. mov x0, $byte_length @ return sizes
  4463. ldp d10, d11, [sp, #16]
  4464. ldp d12, d13, [sp, #32]
  4465. ldp d14, d15, [sp, #48]
  4466. ldp d8, d9, [sp], #80
  4467. ret
  4468. .L256_enc_ret:
  4469. mov w0, #0x0
  4470. ret
  4471. .size unroll8_eor3_aes_gcm_enc_256_kernel,.-unroll8_eor3_aes_gcm_enc_256_kernel
  4472. ___
  4473. {
  4474. #########################################################################################
  4475. # size_t unroll8_eor3_aes_gcm_dec_256_kernel(const uint8_t * ciphertext,
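  4476. # uint64_t plaintext_length, /* presumably a length in BITS: the kernel derives its byte count with a right shift by 3 - confirm against callers */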
  4477. # uint8_t * plaintext,
  4478. # uint64_t *Xi,
  4479. # unsigned char ivec[16],
  4480. # const void *key);
  4481. #
  4482. $code.=<<___;
  4483. .global unroll8_eor3_aes_gcm_dec_256_kernel
  4484. .type unroll8_eor3_aes_gcm_dec_256_kernel,%function
  4485. .align 4
  4486. unroll8_eor3_aes_gcm_dec_256_kernel:
  4487. AARCH64_VALID_CALL_TARGET
  4488. cbz x1, .L256_dec_ret
  4489. stp d8, d9, [sp, #-80]!
  4490. lsr $byte_length, $bit_length, #3
  4491. mov $counter, x4
  4492. mov $cc, x5
  4493. stp d10, d11, [sp, #16]
  4494. stp d12, d13, [sp, #32]
  4495. stp d14, d15, [sp, #48]
  4496. mov x5, #0xc200000000000000
  4497. stp x5, xzr, [sp, #64]
  4498. add $modulo_constant, sp, #64
  4499. ld1 { $ctr0b}, [$counter] @ CTR block 0
  4500. mov $constant_temp, #0x100000000 @ set up counter increment
  4501. movi $rctr_inc.16b, #0x0
  4502. mov $rctr_inc.d[1], $constant_temp
  4503. mov $main_end_input_ptr, $byte_length
  4504. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  4505. rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
  4506. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
  4507. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
  4508. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
  4509. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
  4510. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
  4511. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  4512. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
  4513. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
  4514. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
  4515. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
  4516. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  4517. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
  4518. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
  4519. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  4520. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  4521. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
  4522. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
  4523. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
  4524. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
  4525. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
  4526. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
  4527. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  4528. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
  4529. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  4530. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
  4531. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
  4532. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  4533. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
  4534. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
  4535. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  4536. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  4537. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  4538. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  4539. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  4540. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
  4541. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  4542. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
  4543. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
  4544. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  4545. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
  4546. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  4547. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  4548. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  4549. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  4550. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
  4551. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
  4552. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
  4553. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  4554. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
  4555. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
  4556. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  4557. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
  4558. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  4559. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  4560. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
  4561. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  4562. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
  4563. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  4564. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
  4565. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  4566. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
  4567. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
  4568. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
  4569. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  4570. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  4571. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  4572. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
  4573. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  4574. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
  4575. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
  4576. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  4577. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
  4578. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  4579. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  4580. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  4581. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
  4582. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  4583. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  4584. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  4585. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
  4586. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
  4587. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  4588. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
  4589. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  4590. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
  4591. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
  4592. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  4593. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  4594. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  4595. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
  4596. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  4597. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
  4598. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
  4599. ld1 { $acc_lb}, [$current_tag]
  4600. ext $acc_lb, $acc_lb, $acc_lb, #8
  4601. rev64 $acc_lb, $acc_lb
  4602. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  4603. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  4604. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  4605. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
  4606. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
  4607. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
  4608. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
  4609. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
  4610. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
  4611. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
  4612. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10
  4613. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10
  4614. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10
  4615. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
  4616. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
  4617. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
  4618. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10
  4619. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
  4620. ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
  4621. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
  4622. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
  4623. aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 11
  4624. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
  4625. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
  4626. aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 11
  4627. aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 11
  4628. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
  4629. aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 11
  4630. ldr $rk14q, [$cc, #224] @ load rk14
  4631. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
  4632. aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 12
  4633. aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 12
  4634. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  4635. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
  4636. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
  4637. aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 12
  4638. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
  4639. aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 12
  4640. aese $ctr5b, $rk13 @ AES block 5 - round 13
  4641. aese $ctr1b, $rk13 @ AES block 1 - round 13
  4642. aese $ctr2b, $rk13 @ AES block 2 - round 13
  4643. aese $ctr0b, $rk13 @ AES block 0 - round 13
  4644. aese $ctr4b, $rk13 @ AES block 4 - round 13
  4645. aese $ctr6b, $rk13 @ AES block 6 - round 13
  4646. aese $ctr3b, $rk13 @ AES block 3 - round 13
  4647. aese $ctr7b, $rk13 @ AES block 7 - round 13
  4648. b.ge .L256_dec_tail @ handle tail
  4649. ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext
  4650. ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext
  4651. ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext
  4652. ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext
  4653. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  4654. eor3 $ctr1b, $res1b, $ctr1b, $rk14 @ AES block 1 - result
  4655. eor3 $ctr0b, $res0b, $ctr0b, $rk14 @ AES block 0 - result
  4656. stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result
  4657. rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
  4658. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
  4659. eor3 $ctr3b, $res3b, $ctr3b, $rk14 @ AES block 3 - result
  4660. eor3 $ctr5b, $res5b, $ctr5b, $rk14 @ AES block 5 - result
  4661. eor3 $ctr4b, $res4b, $ctr4b, $rk14 @ AES block 4 - result
  4662. rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
  4663. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
  4664. eor3 $ctr2b, $res2b, $ctr2b, $rk14 @ AES block 2 - result
  4665. stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result
  4666. rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
  4667. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
  4668. eor3 $ctr6b, $res6b, $ctr6b, $rk14 @ AES block 6 - result
  4669. rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
  4670. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
  4671. stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result
  4672. eor3 $ctr7b, $res7b, $ctr7b, $rk14 @ AES block 7 - result
  4673. stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result
  4674. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
  4675. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
  4676. b.ge .L256_dec_prepretail @ do prepretail
  4677. .L256_dec_main_loop: @ main loop start - each pass GHASHes the 8 ciphertext blocks already loaded, runs the AES rounds on CTR blocks 8k+8..8k+15, then loads, decrypts and stores the next 8 blocks
  4678. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  4679. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  4680. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  4681. rev64 $res1b, $res1b @ GHASH block 8k+1
  4682. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  4683. ext $h7.16b, $h7.16b, $h7.16b, #8 @ swap the 64-bit halves of h7
  4684. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  4685. ext $h8.16b, $h8.16b, $h8.16b, #8 @ swap the 64-bit halves of h8
  4686. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  4687. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  4688. rev64 $res0b, $res0b @ GHASH block 8k
  4689. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 - swap the 64-bit halves of acc_l
  4690. rev64 $res4b, $res4b @ GHASH block 8k+4
  4691. rev64 $res3b, $res3b @ GHASH block 8k+3
  4692. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  4693. rev64 $res7b, $res7b @ GHASH block 8k+7
  4694. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  4695. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  4696. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  4697. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  4698. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  4699. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  4700. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  4701. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  4702. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  4703. eor $res0b, $res0b, $acc_lb @ PRE 1 - xor acc_l into GHASH block 8k
  4704. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  4705. ext $h5.16b, $h5.16b, $h5.16b, #8 @ swap the 64-bit halves of h5
  4706. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  4707. ext $h6.16b, $h6.16b, $h6.16b, #8 @ swap the 64-bit halves of h6
  4708. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  4709. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  4710. rev64 $res2b, $res2b @ GHASH block 8k+2
  4711. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  4712. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  4713. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  4714. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  4715. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  4716. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  4717. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  4718. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  4719. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  4720. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  4721. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  4722. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  4723. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  4724. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  4725. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  4726. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  4727. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  4728. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  4729. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  4730. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  4731. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  4732. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  4733. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  4734. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  4735. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  4736. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  4737. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  4738. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  4739. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  4740. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  4741. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  4742. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  4743. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  4744. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  4745. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  4746. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  4747. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  4748. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  4749. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  4750. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  4751. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  4752. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  4753. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  4754. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  4755. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  4756. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  4757. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  4758. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  4759. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  4760. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  4761. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  4762. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  4763. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  4764. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  4765. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  4766. rev64 $res5b, $res5b @ GHASH block 8k+5
  4767. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  4768. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  4769. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  4770. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  4771. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  4772. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  4773. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  4774. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  4775. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  4776. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  4777. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  4778. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  4779. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  4780. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  4781. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  4782. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  4783. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  4784. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  4785. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  4786. ext $h3.16b, $h3.16b, $h3.16b, #8 @ swap the 64-bit halves of h3
  4787. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  4788. ext $h4.16b, $h4.16b, $h4.16b, #8 @ swap the 64-bit halves of h4
  4789. rev64 $res6b, $res6b @ GHASH block 8k+6
  4790. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  4791. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  4792. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  4793. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  4794. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  4795. ext $h1.16b, $h1.16b, $h1.16b, #8 @ swap the 64-bit halves of h1
  4796. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  4797. ext $h2.16b, $h2.16b, $h2.16b, #8 @ swap the 64-bit halves of h2
  4798. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  4799. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  4800. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  4801. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  4802. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  4803. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  4804. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  4805. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  4806. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  4807. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  4808. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  4809. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  4810. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  4811. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  4812. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  4813. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  4814. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  4815. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  4816. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  4817. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  4818. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  4819. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  4820. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  4821. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  4822. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  4823. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  4824. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  4825. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  4826. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  4827. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
  4828. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
  4829. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  4830. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
  4831. ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext
  4832. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  4833. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
  4834. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  4835. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
  4836. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
  4837. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  4838. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  4839. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  4840. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  4841. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
  4842. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
  4843. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  4844. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
  4845. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  4846. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
  4847. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  4848. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  4849. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
  4850. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
  4851. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
  4852. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
  4853. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
  4854. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
  4855. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  4856. rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16 - staged in h1 until ctr0 has been stored
  4857. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  4858. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
  4859. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
  4860. ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
  4861. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
  4862. aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
  4863. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  4864. rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17 - staged in h2 until ctr1 has been stored
  4865. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
  4866. ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext
  4867. aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
  4868. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  4869. aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
  4870. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
  4871. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
  4872. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
  4873. aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
  4874. aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
  4875. rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18 - staged in h3 until ctr2 has been stored
  4876. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
  4877. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  4878. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  4879. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
  4880. aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
  4881. ldr $rk14q, [$cc, #224] @ load rk14
  4882. aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
  4883. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
  4884. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  4885. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
  4886. aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
  4887. ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext
  4888. aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
  4889. aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
  4890. ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext
  4891. aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
  4892. aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
  4893. rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19 - staged in h4 until ctr3 has been stored
  4894. eor3 $ctr2b, $res2b, $ctr2b, $rk14 @ AES block 8k+10 - result
  4895. eor3 $ctr1b, $res1b, $ctr1b, $rk14 @ AES block 8k+9 - result
  4896. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  4897. aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
  4898. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
  4899. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  4900. aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
  4901. eor3 $ctr5b, $res5b, $ctr5b, $rk14 @ AES block 8k+13 - result
  4902. eor3 $ctr0b, $res0b, $ctr0b, $rk14 @ AES block 8k+8 - result
  4903. aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
  4904. stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
  4905. mov $ctr0.16b, $h1.16b @ CTR block 8k+16
  4906. eor3 $ctr4b, $res4b, $ctr4b, $rk14 @ AES block 8k+12 - result
  4907. eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
  4908. eor3 $ctr3b, $res3b, $ctr3b, $rk14 @ AES block 8k+11 - result
  4909. stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
  4910. mov $ctr3.16b, $h4.16b @ CTR block 8k+19
  4911. mov $ctr2.16b, $h3.16b @ CTR block 8k+18
  4912. aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
  4913. mov $ctr1.16b, $h2.16b @ CTR block 8k+17
  4914. stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
  4915. eor3 $ctr7b, $res7b, $ctr7b, $rk14 @ AES block 8k+15 - result
  4916. eor3 $ctr6b, $res6b, $ctr6b, $rk14 @ AES block 8k+14 - result
  4917. rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
  4918. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
  4919. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  4920. stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
  4921. b.lt .L256_dec_main_loop @ loop again while input_ptr is below main_end_input_ptr (flags from the LOOP CONTROL cmp)
  4922. .L256_dec_prepretail: @ PREPRETAIL
  4923. ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
  4924. rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
  4925. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
  4926. rev64 $res4b, $res4b @ GHASH block 8k+4
  4927. ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
  4928. ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
  4929. rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
  4930. rev64 $res0b, $res0b @ GHASH block 8k
  4931. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
  4932. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  4933. ldr $h7q, [$current_tag, #176] @ load h7l | h7h
  4934. ext $h7.16b, $h7.16b, $h7.16b, #8
  4935. ldr $h8q, [$current_tag, #208] @ load h8l | h8h
  4936. ext $h8.16b, $h8.16b, $h8.16b, #8
  4937. rev64 $res1b, $res1b @ GHASH block 8k+1
  4938. rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
  4939. rev64 $res2b, $res2b @ GHASH block 8k+2
  4940. ldr $h5q, [$current_tag, #128] @ load h5l | h5h
  4941. ext $h5.16b, $h5.16b, $h5.16b, #8
  4942. ldr $h6q, [$current_tag, #160] @ load h6l | h6h
  4943. ext $h6.16b, $h6.16b, $h6.16b, #8
  4944. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
  4945. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
  4946. aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
  4947. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
  4948. aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
  4949. aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
  4950. aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
  4951. aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
  4952. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
  4953. ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
  4954. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
  4955. eor $res0b, $res0b, $acc_lb @ PRE 1
  4956. aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
  4957. aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
  4958. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
  4959. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
  4960. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
  4961. aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
  4962. pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
  4963. trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  4964. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
  4965. rev64 $res3b, $res3b @ GHASH block 8k+3
  4966. pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
  4967. aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
  4968. aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
  4969. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
  4970. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
  4971. aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
  4972. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
  4973. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
  4974. aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
  4975. aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
  4976. rev64 $res6b, $res6b @ GHASH block 8k+6
  4977. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
  4978. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
  4979. aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
  4980. pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
  4981. trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
  4982. aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
  4983. ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
  4984. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
  4985. pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
  4986. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
  4987. eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
  4988. eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
  4989. aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
  4990. pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
  4991. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
  4992. eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
  4993. trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  4994. trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
  4995. pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
  4996. pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
  4997. eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
  4998. pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
  4999. aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
  5000. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
  5001. eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
  5002. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  5003. ext $h1.16b, $h1.16b, $h1.16b, #8
  5004. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  5005. ext $h2.16b, $h2.16b, $h2.16b, #8
  5006. aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
  5007. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
  5008. aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
  5009. eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
  5010. eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  5011. aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
  5012. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
  5013. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
  5014. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
  5015. aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
  5016. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
  5017. pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
  5018. aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
  5019. aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
  5020. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
  5021. pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
  5022. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
  5023. aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
  5024. ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
  5025. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  5026. ext $h3.16b, $h3.16b, $h3.16b, #8
  5027. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  5028. ext $h4.16b, $h4.16b, $h4.16b, #8
  5029. rev64 $res7b, $res7b @ GHASH block 8k+7
  5030. rev64 $res5b, $res5b @ GHASH block 8k+5
  5031. eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
  5032. trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  5033. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
  5034. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  5035. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  5036. aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
  5037. aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
  5038. aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
  5039. pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
  5040. pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
  5041. pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
  5042. trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
  5043. pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
  5044. trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  5045. aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
  5046. pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
  5047. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
  5048. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
  5049. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
  5050. aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
  5051. ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
  5052. pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
  5053. aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
  5054. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
  5055. aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
  5056. aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
  5057. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
  5058. eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
  5059. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
  5060. trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
  5061. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
  5062. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
  5063. aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
  5064. aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
  5065. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
  5066. aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
  5067. aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
  5068. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
  5069. aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
  5070. eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  5071. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
  5072. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
  5073. eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  5074. aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
  5075. aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
  5076. pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
  5077. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
  5078. pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
  5079. pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
  5080. pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
  5081. pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
  5082. pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
  5083. ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
  5084. eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
  5085. eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
  5086. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
  5087. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
  5088. aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
  5089. eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
  5090. eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
  5091. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  5092. eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
  5093. aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
  5094. aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
  5095. aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
  5096. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
  5097. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
  5098. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
  5099. eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  5100. aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
  5101. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
  5102. ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
  5103. ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  5104. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
  5105. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
  5106. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
  5107. pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  5108. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
  5109. aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
  5110. aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
  5111. aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
  5112. aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
  5113. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
  5114. eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
  5115. aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
  5116. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
  5117. aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
  5118. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  5119. aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
  5120. aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
  5121. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
  5122. ldr $rk14q, [$cc, #224] @ load rk14
  5123. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
  5124. aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
  5125. ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  5126. aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
  5127. aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
  5128. aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
  5129. aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
  5130. aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
  5131. eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
  5132. add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
  5133. aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
  5134. aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
  5135. .L256_dec_tail: @ TAIL
  5136. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  5137. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  5138. cmp $main_end_input_ptr, #112
  5139. ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext
  5140. ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k
  5141. ext $h8.16b, $h8.16b, $h8.16b, #8
  5142. mov $t1.16b, $rk14
  5143. ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h
  5144. ext $h5.16b, $h5.16b, $h5.16b, #8
  5145. eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result
  5146. ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h
  5147. ext $h6.16b, $h6.16b, $h6.16b, #8
  5148. ext $h7.16b, $h7.16b, $h7.16b, #8
  5149. b.gt .L256_dec_blocks_more_than_7
  5150. mov $ctr7b, $ctr6b
  5151. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  5152. mov $ctr6b, $ctr5b
  5153. mov $ctr5b, $ctr4b
  5154. mov $ctr4b, $ctr3b
  5155. movi $acc_l.8b, #0
  5156. movi $acc_h.8b, #0
  5157. movi $acc_m.8b, #0
  5158. mov $ctr3b, $ctr2b
  5159. cmp $main_end_input_ptr, #96
  5160. mov $ctr2b, $ctr1b
  5161. b.gt .L256_dec_blocks_more_than_6
  5162. mov $ctr7b, $ctr6b
  5163. mov $ctr6b, $ctr5b
  5164. mov $ctr5b, $ctr4b
  5165. cmp $main_end_input_ptr, #80
  5166. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  5167. mov $ctr4b, $ctr3b
  5168. mov $ctr3b, $ctr1b
  5169. b.gt .L256_dec_blocks_more_than_5
  5170. cmp $main_end_input_ptr, #64
  5171. mov $ctr7b, $ctr6b
  5172. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  5173. mov $ctr6b, $ctr5b
  5174. mov $ctr5b, $ctr4b
  5175. mov $ctr4b, $ctr1b
  5176. b.gt .L256_dec_blocks_more_than_4
  5177. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  5178. mov $ctr7b, $ctr6b
  5179. cmp $main_end_input_ptr, #48
  5180. mov $ctr6b, $ctr5b
  5181. mov $ctr5b, $ctr1b
  5182. b.gt .L256_dec_blocks_more_than_3
  5183. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  5184. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  5185. mov $ctr7b, $ctr6b
  5186. cmp $main_end_input_ptr, #32
  5187. mov $ctr6b, $ctr1b
  5188. b.gt .L256_dec_blocks_more_than_2
  5189. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  5190. mov $ctr7b, $ctr1b
  5191. cmp $main_end_input_ptr, #16
  5192. b.gt .L256_dec_blocks_more_than_1
  5193. sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
  5194. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  5195. b .L256_dec_blocks_less_than_1
  5196. .L256_dec_blocks_more_than_7: @ blocks left > 7
  5197. rev64 $res0b, $res1b @ GHASH final-7 block
  5198. ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext
  5199. st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result
  5200. ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
  5201. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  5202. ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
  5203. eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result
  5204. pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
  5205. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
  5206. movi $t0.8b, #0 @ suppress further partial tag feed in
  5207. pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
  5208. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
  5209. .L256_dec_blocks_more_than_6: @ blocks left > 6
  5210. rev64 $res0b, $res1b @ GHASH final-6 block
  5211. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  5212. ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext
  5213. movi $t0.8b, #0 @ suppress further partial tag feed in
  5214. ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
  5215. st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result
  5216. pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
  5217. pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
  5218. eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result
  5219. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
  5220. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
  5221. pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
  5222. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
  5223. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
  5224. .L256_dec_blocks_more_than_5: @ blocks left > 5
  5225. rev64 $res0b, $res1b @ GHASH final-5 block
  5226. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  5227. pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
  5228. ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
  5229. ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext
  5230. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
  5231. st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result
  5232. pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
  5233. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
  5234. pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
  5235. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
  5236. eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result
  5237. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
  5238. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
  5239. movi $t0.8b, #0 @ suppress further partial tag feed in
  5240. .L256_dec_blocks_more_than_4: @ blocks left > 4
  5241. rev64 $res0b, $res1b @ GHASH final-4 block
  5242. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  5243. ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
  5244. ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext
  5245. movi $t0.8b, #0 @ suppress further partial tag feed in
  5246. pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
  5247. pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
  5248. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
  5249. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
  5250. pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
  5251. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
  5252. st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result
  5253. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
  5254. eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result
  5255. .L256_dec_blocks_more_than_3: @ blocks left > 3
  5256. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  5257. ext $h4.16b, $h4.16b, $h4.16b, #8
  5258. rev64 $res0b, $res1b @ GHASH final-3 block
  5259. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  5260. ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext
  5261. ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
  5262. ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
  5263. st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result
  5264. eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result
  5265. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  5266. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
  5267. pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
  5268. pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
  5269. movi $t0.8b, #0 @ suppress further partial tag feed in
  5270. pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
  5271. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
  5272. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
  5273. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
  5274. .L256_dec_blocks_more_than_2: @ blocks left > 2
  5275. rev64 $res0b, $res1b @ GHASH final-2 block
  5276. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  5277. ext $h3.16b, $h3.16b, $h3.16b, #8
  5278. ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext
  5279. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  5280. ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
  5281. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  5282. st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result
  5283. eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result
  5284. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  5285. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  5286. movi $t0.8b, #0 @ suppress further partial tag feed in
  5287. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  5288. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  5289. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  5290. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  5291. .L256_dec_blocks_more_than_1: @ blocks left > 1
  5292. rev64 $res0b, $res1b @ GHASH final-1 block
  5293. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  5294. ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
  5295. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  5296. ext $h2.16b, $h2.16b, $h2.16b, #8
  5297. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  5298. ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext
  5299. st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result
  5300. ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
  5301. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  5302. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  5303. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  5304. eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result
  5305. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  5306. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  5307. movi $t0.8b, #0 @ suppress further partial tag feed in
  5308. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  5309. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  5310. .L256_dec_blocks_less_than_1: @ blocks left <= 1
  5311. ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
  5312. mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
  5313. and $bit_length, $bit_length, #127 @ bit_length %= 128
  5314. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  5315. rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
  5316. str $rtmp_ctrq, [$counter] @ store the updated counter
  5317. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  5318. and $bit_length, $bit_length, #127 @ bit_length %= 128
  5319. lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
  5320. cmp $bit_length, #64
  5321. mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
  5322. csel $temp3_x, $temp0_x, xzr, lt
  5323. csel $temp2_x, $temp1_x, $temp0_x, lt
  5324. mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
  5325. mov $ctr0.d[1], $temp3_x
  5326. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  5327. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  5328. ext $h1.16b, $h1.16b, $h1.16b, #8
  5329. bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
  5330. rev64 $res0b, $res1b @ GHASH final block
  5331. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  5332. ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
  5333. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  5334. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  5335. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  5336. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  5337. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  5338. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  5339. ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
  5340. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  5341. pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  5342. eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
  5343. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  5344. st1 { $res4b}, [$output_ptr] @ store all 16B
  5345. eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up
  5346. eor $t11.16b, $acc_hb, $t11.16b @ MODULO - fold into mid
  5347. eor $acc_mb, $acc_mb, $t11.16b @ MODULO - fold into mid
  5348. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  5349. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  5350. eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
  5351. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  5352. ext $acc_lb, $acc_lb, $acc_lb, #8
  5353. rev64 $acc_lb, $acc_lb
  5354. st1 { $acc_l.16b }, [$current_tag]
  5355. mov x0, $byte_length
  5356. ldp d10, d11, [sp, #16]
  5357. ldp d12, d13, [sp, #32]
  5358. ldp d14, d15, [sp, #48]
  5359. ldp d8, d9, [sp], #80
  5360. ret
  5361. .L256_dec_ret:
  5362. mov w0, #0x0
  5363. ret
  5364. .size unroll8_eor3_aes_gcm_dec_256_kernel,.-unroll8_eor3_aes_gcm_dec_256_kernel
  5365. ___
  5366. }
  5367. }
  # Append the trailer of the generated assembly: an identification string,
  # an alignment directive, and an #endif (presumably closing a conditional
  # emitted earlier in the file -- the matching #if is not in this section).
  $code .= join("\n",
      '.asciz "AES GCM module for ARMv8, SPDX BSD-3-Clause by <xiaokang.qian@arm.com>"',
      '.align 2',
      '#endif',
      '');    # trailing empty element terminates the last line with "\n"
  5373. {
  # Base instruction words for the SHA3-extension mnemonics handled by
  # unsha3(); operand fields are OR-ed into these values.
  my %opcode = (
      "rax1" => 0xce608c00,    "eor3" => 0xce000000,
      "bcax" => 0xce200000,    "xar"  => 0xce800000 );

  # unsha3($mnemonic, $arg)
  #
  # Hand-assemble one eor3/rax1/bcax/xar instruction into a raw ".inst"
  # directive.
  #
  #   $mnemonic  one of the keys of %opcode
  #   $arg       the operand text, e.g. "v0.16b, v1.16b, v2.16b, v3.16b";
  #              two to four operands, each a q/v register, where the fourth
  #              may instead be a "#" immediate
  #
  # Returns ".inst\t0x<word>\t//<mnemonic> <arg>" where <word> is the base
  # opcode with the operand numbers placed at bit 0 (first), bit 5 (second),
  # bit 16 (third) and bit 10 (fourth).  Missing optional operands contribute
  # zero bits.
  #
  # If the mnemonic is unknown or the operands cannot be parsed, the
  # instruction is returned verbatim ("<mnemonic>\t<arg>") instead of an
  # empty string.  The caller substitutes the return value into the output
  # line, so an empty string would silently drop the instruction; passing it
  # through leaves the problem visible to the assembler.
  sub unsha3 {
      my ($mnemonic, $arg) = @_;
      my $verbatim = "$mnemonic\t$arg";

      return $verbatim unless exists $opcode{$mnemonic};

      my ($op1, $op2, $op3, $op4) = $arg =~
          m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
          or return $verbatim;

      $op3 = 0 unless defined $op3;

      # The fourth operand may be written as simple arithmetic; the pattern
      # admits only digits and '-', so it is evaluated rather than used as-is.
      # A value that fails to evaluate is treated as 0, as before.
      my $op4_value = defined $op4 ? eval($op4) : 0;
      $op4_value = 0 unless defined $op4_value;

      return sprintf ".inst\t0x%08x\t//%s %s",
          $opcode{$mnemonic} | $op1 | ($op2 << 5) | ($op3 << 16) | ($op4_value << 10),
          $mnemonic, $arg;
  }
  # unvmov($arg)
  #
  # Translate an operand pair of the form "qA#lo|hi, qB#lo|hi" into the text
  # of an "ins vX.d[i],vY.d[j]" instruction.  "lo" selects element 0 and "hi"
  # element 1; register numbers below 8 are kept, numbers 8 and above are
  # raised by 8.  Returns an empty string when $arg does not have that form.
  sub unvmov {
      my ($operands) = @_;

      my ($reg_a, $half_a, $reg_b, $half_b) =
          $operands =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o
          or return '';

      my $vreg_a = $reg_a < 8 ? $reg_a : $reg_a + 8;
      my $vreg_b = $reg_b < 8 ? $reg_b : $reg_b + 8;
      my $elem_a = $half_a eq "lo" ? 0 : 1;
      my $elem_b = $half_b eq "lo" ? 0 : 1;

      return sprintf "ins v%d.d[%d],v%d.d[%d]",
          $vreg_a, $elem_a, $vreg_b, $elem_b;
  }
  # Post-process the accumulated assembly text line by line and write each
  # resulting line to STDOUT.
  for (split(/\n/, $code)) {
      # old->new style commentary: the first "@" followed by whitespace
      # becomes "//".
      s/@\s/\/\//o;

      # Each `...` span is replaced by the result of evaluating its contents
      # as Perl.
      s/\`([^\`]*)\`/eval($1)/ge;

      # On a line mentioning ld1r, ".16b" is rewritten to ".2d".  Only when
      # that rewrite did not apply (no ld1r, or nothing to replace) is an
      # eor3/rax1/xar/bcax instruction handed to unsha3() for encoding.
      my $ld1r_rewritten = m/\bld1r\b/ && s/\.16b/.2d/g;
      unless ($ld1r_rewritten) {
          s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
      }

      print $_, "\n";
  }
  5398. }
  # Close STDOUT explicitly to enforce a flush; a failure to close is fatal.
  unless (close STDOUT) {
      die "error closing STDOUT: $!";
  }