aes-gcm-avx512.pl 199 KB


  1. # Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
  2. # Copyright (c) 2021, Intel Corporation. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. #
  10. # This implementation is based on the AES-GCM code (AVX512VAES + VPCLMULQDQ)
  11. # from Intel(R) Multi-Buffer Crypto for IPsec Library v1.1
  12. # (https://github.com/intel/intel-ipsec-mb).
  13. # Original author is Tomasz Kantecki <tomasz.kantecki@intel.com>.
  14. #
  15. # References:
# [1] Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on
# Intel Architecture Processors. August, 2010.
# [2] Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on
# Intel Architecture Processors. October, 2012.
# [3] Shay Gueron et al. Intel Carry-Less Multiplication Instruction and its
# Usage for Computing the GCM Mode. May, 2010.
  22. #
  23. #
  24. # December 2021
  25. #
  26. # Initial release.
  27. #
  28. # GCM128_CONTEXT structure has storage for 16 hkeys only, but this
  29. # implementation can use up to 48. To avoid extending the context size,
  30. # precompute and store in the context first 16 hkeys only, and compute the rest
  31. # on demand keeping them in the local frame.
  32. #
  33. #======================================================================
# Command-line handling and assembler capability probing.
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Windows x64 calling convention is selected either by the flavour name or
# by an output file with the ".asm" extension.
$win64 = 0;
$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Set to a true value below once an assembler that accepts the AVX512 VAES /
# VPCLMULQDQ instructions has been detected.
$avx512vaes = 0;

# Locate the x86_64-xlate.pl translator: first next to this script, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
($xlate = "${dir}x86_64-xlate.pl" and -f $xlate)
  or ($xlate = "${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate)
  or die "can't locate x86_64-xlate.pl";

# GNU assembler (queried through $ENV{CC}): version 2.30 or newer is required.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  $avx512vaes = ($1 >= 2.30);
}

# NASM (Windows builds only): accepted when the version is 2.13 with a patch
# level of at least 3, or when it is 2.14 or newer.
if (!$avx512vaes
  && $win64
  && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/)
  && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/)
{
  $avx512vaes = ($1 == 2.13 && $2 >= 3) + ($1 >= 2.14);
}

# clang / LLVM based compilers: the three version components are folded into
# one number so that they can be compared numerically.
if (!$avx512vaes && `$ENV{CC} -v 2>&1`
  =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
  my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
  if ($1) {
    # Apple conditions, they use a different version series, see
    # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
    # clang 7.0.0 is Apple clang 10.0.1
    $avx512vaes = ($ver>=10.0001)
  } else {
    $avx512vaes = ($ver>=7.0);
  }
}

# Everything printed to STDOUT from here on is piped through the translator,
# which is run with the same perl interpreter ($^X) as this script.
open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""
  or die "can't call $xlate: $!";
*STDOUT = *OUT;
  71. #======================================================================
  72. if ($avx512vaes>0) { #<<<
# Emit ossl_vaes_vpclmulqdq_capable(): a run-time capability probe.
# The generated function loads the 64-bit word at OPENSSL_ia32cap_P+8, masks
# it with the feature bits listed in the assembly comment below and returns
# the mask value in %rax when all of these bits are set; otherwise it returns
# zero (%eax is cleared up front and only overwritten by the cmove).
$code .= <<___;
.extern OPENSSL_ia32cap_P
.globl ossl_vaes_vpclmulqdq_capable
.type ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent
.align 32
ossl_vaes_vpclmulqdq_capable:
mov OPENSSL_ia32cap_P+8(%rip), %rcx
# avx512vpclmulqdq + avx512vaes + avx512vl + avx512bw + avx512dq + avx512f
mov \$`1<<42|1<<41|1<<31|1<<30|1<<17|1<<16`,%rdx
xor %eax,%eax
and %rdx,%rcx
cmp %rdx,%rcx
cmove %rcx,%rax
ret
.size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable
___
# ; Mapping key length -> AES rounds count.
# ; NOTE(review): the values match the "rounds 1 to 9/11/13" range handled by
# ; vaesenc in ZMM_AESENC_ROUND_BLOCKS_0_16, i.e. the final (vaesenclast)
# ; round is not counted here - confirm against the callers.
my %aes_rounds = (
  128 => 9,
  192 => 11,
  256 => 13);

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Code generation control switches
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# ; ABI-aware zeroing of volatile registers in EPILOG().
# ; Disabled due to performance reasons.
my $CLEAR_SCRATCH_REGISTERS = 0;

# ; Zero HKeys storage from the stack if they are stored there
my $CLEAR_HKEYS_STORAGE_ON_EXIT = 1;

# ; Enable / disable check of function arguments for null pointer
# ; Currently disabled, as this check is handled outside.
my $CHECK_FUNCTION_ARGUMENTS = 0;

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Global constants
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# AES block size in bytes
my $AES_BLOCK_SIZE = 16;

# Storage capacity in elements (one element is one 16-byte block)
my $HKEYS_STORAGE_CAPACITY = 48;
my $LOCAL_STORAGE_CAPACITY = 48;
my $HKEYS_CONTEXT_CAPACITY = 16;

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Stack frame definition
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# The items below are listed from the function entry towards the final %rsp.
# (1) -> +64(Win)/+48(Lin)-byte space for pushed GPRs
# (2) -> +8-byte space for 16-byte alignment of XMM storage
# (3) -> Frame pointer (%RBP)
# (4) -> +160-byte XMM storage (Windows only, zero on Linux)
# (5) -> +48-byte space for 64-byte alignment of %RSP from item (8)
# (6) -> +768-byte LOCAL storage (optional, can be omitted in some functions)
# (7) -> +768-byte HKEYS storage
# (8) -> Stack pointer (%RSP) aligned on 64-byte boundary
my $GP_STORAGE = $win64 ? 8 * 8 : 8 * 6; # ; space for saved non-volatile GP registers (pushed on stack)
my $XMM_STORAGE = $win64 ? (10 * 16) : 0; # ; space for saved XMM registers
my $HKEYS_STORAGE = ($HKEYS_STORAGE_CAPACITY * $AES_BLOCK_SIZE); # ; space for HKeys^i, i=1..48
my $LOCAL_STORAGE = ($LOCAL_STORAGE_CAPACITY * $AES_BLOCK_SIZE); # ; space for up to 48 AES blocks

# ; Offsets of both storages relative to the aligned %rsp: HKEYS storage comes
# ; first, LOCAL storage follows it immediately.
my $STACK_HKEYS_OFFSET = 0;
my $STACK_LOCAL_OFFSET = ($STACK_HKEYS_OFFSET + $HKEYS_STORAGE);
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Function arguments abstraction
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ; $arg1..$arg11 hold the assembly operand (register name or %rbp-relative
# ; memory reference) through which the generated code reads the n-th
# ; function argument on the target ABI.
my ($arg1, $arg2, $arg3, $arg4, $arg5, $arg6, $arg7, $arg8, $arg9, $arg10, $arg11);

# ; Counter used for assembly label generation
my $label_count = 0;

# ; This implementation follows the convention: for non-leaf functions (they
# ; must call PROLOG) %rbp is used as a frame pointer, and has fixed offset from
# ; the function entry: $GP_STORAGE + [8 bytes alignment (Windows only)]. This
# ; helps to facilitate SEH handlers writing.
#
# ; Leaf functions here do not use more than 4 input arguments.
if ($win64) {
  # ; Windows: the first four arguments are in registers, the rest are read
  # ; from the stack relative to the frame pointer.
  $arg1 = "%rcx";
  $arg2 = "%rdx";
  $arg3 = "%r8";
  $arg4 = "%r9";
  $arg5 = "`$GP_STORAGE + 8 + 8*5`(%rbp)"; # +8 - alignment bytes
  $arg6 = "`$GP_STORAGE + 8 + 8*6`(%rbp)";
  $arg7 = "`$GP_STORAGE + 8 + 8*7`(%rbp)";
  $arg8 = "`$GP_STORAGE + 8 + 8*8`(%rbp)";
  $arg9 = "`$GP_STORAGE + 8 + 8*9`(%rbp)";
  $arg10 = "`$GP_STORAGE + 8 + 8*10`(%rbp)";
  $arg11 = "`$GP_STORAGE + 8 + 8*11`(%rbp)";
} else {
  # ; Non-Windows: the first six arguments are in registers, the rest are
  # ; read from the stack relative to the frame pointer.
  $arg1 = "%rdi";
  $arg2 = "%rsi";
  $arg3 = "%rdx";
  $arg4 = "%rcx";
  $arg5 = "%r8";
  $arg6 = "%r9";
  $arg7 = "`$GP_STORAGE + 8*1`(%rbp)";
  $arg8 = "`$GP_STORAGE + 8*2`(%rbp)";
  $arg9 = "`$GP_STORAGE + 8*3`(%rbp)";
  $arg10 = "`$GP_STORAGE + 8*4`(%rbp)";
  $arg11 = "`$GP_STORAGE + 8*5`(%rbp)";
}

# ; Offsets in gcm128_context structure (see include/crypto/modes.h)
my $CTX_OFFSET_CurCount = (16 * 0); # ; (Yi) Current counter for generation of encryption key
my $CTX_OFFSET_PEncBlock = (16 * 1); # ; (repurposed EKi field) Partial block buffer
my $CTX_OFFSET_EK0 = (16 * 2); # ; (EK0) Encrypted Y0 counter (see gcm spec notation)
my $CTX_OFFSET_AadLen = (16 * 3); # ; (len.u[0]) Length of Hash which has been input
my $CTX_OFFSET_InLen = ((16 * 3) + 8); # ; (len.u[1]) Length of input data which will be encrypted or decrypted
my $CTX_OFFSET_AadHash = (16 * 4); # ; (Xi) Current hash
my $CTX_OFFSET_HTable = (16 * 6); # ; (Htable) Precomputed table (allows 16 values)
  176. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  177. # ;;; Helper functions
  178. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  179. sub BYTE {
  180. my ($reg) = @_;
  181. if ($reg =~ /%r[abcd]x/i) {
  182. $reg =~ s/%r([abcd])x/%${1}l/i;
  183. } elsif ($reg =~ /%r[sdb][ip]/i) {
  184. $reg =~ s/%r([sdb][ip])/%${1}l/i;
  185. } elsif ($reg =~ /%r[0-9]{1,2}/i) {
  186. $reg =~ s/%(r[0-9]{1,2})/%${1}b/i;
  187. } else {
  188. die "BYTE: unknown register: $reg\n";
  189. }
  190. return $reg;
  191. }
  192. sub WORD {
  193. my ($reg) = @_;
  194. if ($reg =~ /%r[abcdsdb][xip]/i) {
  195. $reg =~ s/%r([abcdsdb])([xip])/%${1}${2}/i;
  196. } elsif ($reg =~ /%r[0-9]{1,2}/) {
  197. $reg =~ s/%(r[0-9]{1,2})/%${1}w/i;
  198. } else {
  199. die "WORD: unknown register: $reg\n";
  200. }
  201. return $reg;
  202. }
  203. sub DWORD {
  204. my ($reg) = @_;
  205. if ($reg =~ /%r[abcdsdb][xip]/i) {
  206. $reg =~ s/%r([abcdsdb])([xip])/%e${1}${2}/i;
  207. } elsif ($reg =~ /%r[0-9]{1,2}/i) {
  208. $reg =~ s/%(r[0-9]{1,2})/%${1}d/i;
  209. } else {
  210. die "DWORD: unknown register: $reg\n";
  211. }
  212. return $reg;
  213. }
  214. sub XWORD {
  215. my ($reg) = @_;
  216. if ($reg =~ /%[xyz]mm/i) {
  217. $reg =~ s/%[xyz]mm/%xmm/i;
  218. } else {
  219. die "XWORD: unknown register: $reg\n";
  220. }
  221. return $reg;
  222. }
  223. sub YWORD {
  224. my ($reg) = @_;
  225. if ($reg =~ /%[xyz]mm/i) {
  226. $reg =~ s/%[xyz]mm/%ymm/i;
  227. } else {
  228. die "YWORD: unknown register: $reg\n";
  229. }
  230. return $reg;
  231. }
  232. sub ZWORD {
  233. my ($reg) = @_;
  234. if ($reg =~ /%[xyz]mm/i) {
  235. $reg =~ s/%[xyz]mm/%zmm/i;
  236. } else {
  237. die "ZWORD: unknown register: $reg\n";
  238. }
  239. return $reg;
  240. }
  241. # ; Helper function to construct effective address based on two kinds of
  242. # ; offsets: numerical or located in the register
  243. sub EffectiveAddress {
  244. my ($base, $offset, $displacement) = @_;
  245. $displacement = 0 if (!$displacement);
  246. if ($offset =~ /^\d+\z/) { # numerical offset
  247. return "`$offset + $displacement`($base)";
  248. } else { # offset resides in register
  249. return "$displacement($base,$offset,1)";
  250. }
  251. }
  252. # ; Provides memory location of corresponding HashKey power
  253. sub HashKeyByIdx {
  254. my ($idx, $base) = @_;
  255. my $base_str = ($base eq "%rsp") ? "frame" : "context";
  256. my $offset = &HashKeyOffsetByIdx($idx, $base_str);
  257. return "$offset($base)";
  258. }
  259. # ; Provides offset (in bytes) of corresponding HashKey power from the highest key in the storage
  260. sub HashKeyOffsetByIdx {
  261. my ($idx, $base) = @_;
  262. die "HashKeyOffsetByIdx: base should be either 'frame' or 'context'; base = $base"
  263. if (($base ne "frame") && ($base ne "context"));
  264. my $offset_base;
  265. my $offset_idx;
  266. if ($base eq "frame") { # frame storage
  267. die "HashKeyOffsetByIdx: idx out of bounds (1..48)! idx = $idx\n" if ($idx > $HKEYS_STORAGE_CAPACITY || $idx < 1);
  268. $offset_base = $STACK_HKEYS_OFFSET;
  269. $offset_idx = ($AES_BLOCK_SIZE * ($HKEYS_STORAGE_CAPACITY - $idx));
  270. } else { # context storage
  271. die "HashKeyOffsetByIdx: idx out of bounds (1..16)! idx = $idx\n" if ($idx > $HKEYS_CONTEXT_CAPACITY || $idx < 1);
  272. $offset_base = $CTX_OFFSET_HTable;
  273. $offset_idx = ($AES_BLOCK_SIZE * ($HKEYS_CONTEXT_CAPACITY - $idx));
  274. }
  275. return $offset_base + $offset_idx;
  276. }
  277. # ; Creates local frame and does back up of non-volatile registers.
  278. # ; Holds stack unwinding directives.
# ; Emits the function prolog: saves non-volatile registers, sets up %rbp as
# ; the frame pointer and, if requested, allocates the 64-byte aligned local
# ; frame. Holds the stack unwinding directives (.cfi_*) and the labels
# ; (.L<func_name>_seh_*) placed after every prolog step.
# ;
# ;   $need_hkeys_stack_storage - true to reserve $HKEYS_STORAGE bytes
# ;   $need_aes_stack_storage   - true to additionally reserve $LOCAL_STORAGE
# ;                               bytes; only valid together with the hkeys
# ;                               storage, dies otherwise
# ;   $func_name                - function name used to build the label names
sub PROLOG {
  my ($need_hkeys_stack_storage, $need_aes_stack_storage, $func_name) = @_;

  my $DYNAMIC_STACK_ALLOC_SIZE = 0;

  # ; Extra bytes subtracted from %rsp together with the storage size, before
  # ; %rsp is rounded down to a 64-byte boundary.
  my $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE = $win64 ? 48 : 52;

  if ($need_hkeys_stack_storage) {
    $DYNAMIC_STACK_ALLOC_SIZE += $HKEYS_STORAGE;
  }

  if ($need_aes_stack_storage) {
    if (!$need_hkeys_stack_storage) {
      die "PROLOG: unsupported case - aes storage without hkeys one";
    }
    $DYNAMIC_STACK_ALLOC_SIZE += $LOCAL_STORAGE;
  }

  # ; GP registers saved on every platform
  $code .= <<___;
push %rbx
.cfi_push %rbx
.L${func_name}_seh_push_rbx:
push %rbp
.cfi_push %rbp
.L${func_name}_seh_push_rbp:
push %r12
.cfi_push %r12
.L${func_name}_seh_push_r12:
push %r13
.cfi_push %r13
.L${func_name}_seh_push_r13:
push %r14
.cfi_push %r14
.L${func_name}_seh_push_r14:
push %r15
.cfi_push %r15
.L${func_name}_seh_push_r15:
___

  # ; Windows additionally saves %rdi/%rsi and reserves the XMM save area
  if ($win64) {
    $code .= <<___;
push %rdi
.L${func_name}_seh_push_rdi:
push %rsi
.L${func_name}_seh_push_rsi:
sub \$`$XMM_STORAGE+8`,%rsp # +8 alignment
.L${func_name}_seh_allocstack_xmm:
___
  }

  # ; Establish the frame pointer ($XMM_STORAGE is zero on non-Windows, so
  # ; there %rbp simply equals %rsp at this point)
  $code .= <<___;
# ; %rbp contains stack pointer right after GP regs pushed at stack + [8
# ; bytes of alignment (Windows only)]. It serves as a frame pointer in SEH
# ; handlers. The requirement for a frame pointer is that its offset from
# ; RSP shall be multiple of 16, and not exceed 240 bytes. The frame pointer
# ; itself seems to be reasonable to use here, because later we do 64-byte stack
# ; alignment which gives us non-determinate offsets and complicates writing
# ; SEH handlers.
#
# ; It also serves as an anchor for retrieving stack arguments on both Linux
# ; and Windows.
lea `$XMM_STORAGE`(%rsp),%rbp
.cfi_def_cfa_register %rbp
.L${func_name}_seh_setfp:
___
  if ($win64) {

    # ; xmm6:xmm15 need to be preserved on Windows
    # ; They are stored at consecutive 16-byte slots starting at (%rsp).
    foreach my $reg_idx (6 .. 15) {
      my $xmm_reg_offset = ($reg_idx - 6) * 16;
      $code .= <<___;
vmovdqu %xmm${reg_idx},$xmm_reg_offset(%rsp)
.L${func_name}_seh_save_xmm${reg_idx}:
___
    }
  }

  $code .= <<___;
# Prolog ends here. Next stack allocation is treated as "dynamic".
.L${func_name}_seh_prolog_end:
___

  # ; Optional local frame: reserve the requested storage and align %rsp
  # ; down to a 64-byte boundary.
  if ($DYNAMIC_STACK_ALLOC_SIZE) {
    $code .= <<___;
sub \$`$DYNAMIC_STACK_ALLOC_SIZE + $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE`,%rsp
and \$(-64),%rsp
___
  }
}
  358. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  359. # ;;; Restore register content for the caller.
  360. # ;;; And cleanup stack.
  361. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  362. sub EPILOG {
  363. my ($hkeys_storage_on_stack, $payload_len) = @_;
  364. my $label_suffix = $label_count++;
  365. if ($hkeys_storage_on_stack && $CLEAR_HKEYS_STORAGE_ON_EXIT) {
  366. # ; There is no need in hkeys cleanup if payload len was small, i.e. no hkeys
  367. # ; were stored in the local frame storage
  368. $code .= <<___;
  369. cmpq \$`16*16`,$payload_len
  370. jbe .Lskip_hkeys_cleanup_${label_suffix}
  371. vpxor %xmm0,%xmm0,%xmm0
  372. ___
  373. for (my $i = 0; $i < int($HKEYS_STORAGE / 64); $i++) {
  374. $code .= "vmovdqa64 %zmm0,`$STACK_HKEYS_OFFSET + 64*$i`(%rsp)\n";
  375. }
  376. $code .= ".Lskip_hkeys_cleanup_${label_suffix}:\n";
  377. }
  378. if ($CLEAR_SCRATCH_REGISTERS) {
  379. &clear_scratch_gps_asm();
  380. &clear_scratch_zmms_asm();
  381. } else {
  382. $code .= "vzeroupper\n";
  383. }
  384. if ($win64) {
  385. # ; restore xmm15:xmm6
  386. for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) {
  387. my $xmm_reg_offset = -$XMM_STORAGE + ($reg_idx - 6) * 16;
  388. $code .= <<___;
  389. vmovdqu $xmm_reg_offset(%rbp),%xmm${reg_idx},
  390. ___
  391. }
  392. }
  393. if ($win64) {
  394. # Forming valid epilog for SEH with use of frame pointer.
  395. # https://docs.microsoft.com/en-us/cpp/build/prolog-and-epilog?view=msvc-160#epilog-code
  396. $code .= "lea 8(%rbp),%rsp\n";
  397. } else {
  398. $code .= "lea (%rbp),%rsp\n";
  399. $code .= ".cfi_def_cfa_register %rsp\n";
  400. }
  401. if ($win64) {
  402. $code .= <<___;
  403. pop %rsi
  404. .cfi_pop %rsi
  405. pop %rdi
  406. .cfi_pop %rdi
  407. ___
  408. }
  409. $code .= <<___;
  410. pop %r15
  411. .cfi_pop %r15
  412. pop %r14
  413. .cfi_pop %r14
  414. pop %r13
  415. .cfi_pop %r13
  416. pop %r12
  417. .cfi_pop %r12
  418. pop %rbp
  419. .cfi_pop %rbp
  420. pop %rbx
  421. .cfi_pop %rbx
  422. ___
  423. }
  424. # ; Clears all scratch ZMM registers
  425. # ;
  426. # ; It should be called before restoring the XMM registers
  427. # ; for Windows (XMM6-XMM15).
  428. # ;
  429. sub clear_scratch_zmms_asm {
  430. # ; On Linux, all ZMM registers are scratch registers
  431. if (!$win64) {
  432. $code .= "vzeroall\n";
  433. } else {
  434. foreach my $i (0 .. 5) {
  435. $code .= "vpxorq %xmm${i},%xmm${i},%xmm${i}\n";
  436. }
  437. }
  438. foreach my $i (16 .. 31) {
  439. $code .= "vpxorq %xmm${i},%xmm${i},%xmm${i}\n";
  440. }
  441. }
  442. # Clears all scratch GP registers
  443. sub clear_scratch_gps_asm {
  444. foreach my $reg ("%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11") {
  445. $code .= "xor $reg,$reg\n";
  446. }
  447. if (!$win64) {
  448. foreach my $reg ("%rsi", "%rdi") {
  449. $code .= "xor $reg,$reg\n";
  450. }
  451. }
  452. }
  453. sub precompute_hkeys_on_stack {
  454. my $GCM128_CTX = $_[0];
  455. my $HKEYS_READY = $_[1];
  456. my $ZTMP0 = $_[2];
  457. my $ZTMP1 = $_[3];
  458. my $ZTMP2 = $_[4];
  459. my $ZTMP3 = $_[5];
  460. my $ZTMP4 = $_[6];
  461. my $ZTMP5 = $_[7];
  462. my $ZTMP6 = $_[8];
  463. my $HKEYS_RANGE = $_[9]; # ; "first16", "mid16", "all", "first32", "last32"
  464. die "precompute_hkeys_on_stack: Unexpected value of HKEYS_RANGE: $HKEYS_RANGE"
  465. if ($HKEYS_RANGE ne "first16"
  466. && $HKEYS_RANGE ne "mid16"
  467. && $HKEYS_RANGE ne "all"
  468. && $HKEYS_RANGE ne "first32"
  469. && $HKEYS_RANGE ne "last32");
  470. my $label_suffix = $label_count++;
  471. $code .= <<___;
  472. test $HKEYS_READY,$HKEYS_READY
  473. jnz .L_skip_hkeys_precomputation_${label_suffix}
  474. ___
  475. if ($HKEYS_RANGE eq "first16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "all") {
  476. # ; Fill the stack with the first 16 hkeys from the context
  477. $code .= <<___;
  478. # ; Move 16 hkeys from the context to stack
  479. vmovdqu64 @{[HashKeyByIdx(4,$GCM128_CTX)]},$ZTMP0
  480. vmovdqu64 $ZTMP0,@{[HashKeyByIdx(4,"%rsp")]}
  481. vmovdqu64 @{[HashKeyByIdx(8,$GCM128_CTX)]},$ZTMP1
  482. vmovdqu64 $ZTMP1,@{[HashKeyByIdx(8,"%rsp")]}
  483. # ; broadcast HashKey^8
  484. vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1
  485. vmovdqu64 @{[HashKeyByIdx(12,$GCM128_CTX)]},$ZTMP2
  486. vmovdqu64 $ZTMP2,@{[HashKeyByIdx(12,"%rsp")]}
  487. vmovdqu64 @{[HashKeyByIdx(16,$GCM128_CTX)]},$ZTMP3
  488. vmovdqu64 $ZTMP3,@{[HashKeyByIdx(16,"%rsp")]}
  489. ___
  490. }
  491. if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "last32") {
  492. $code .= <<___;
  493. vmovdqu64 @{[HashKeyByIdx(8,"%rsp")]},$ZTMP1
  494. # ; broadcast HashKey^8
  495. vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1
  496. vmovdqu64 @{[HashKeyByIdx(12,"%rsp")]},$ZTMP2
  497. vmovdqu64 @{[HashKeyByIdx(16,"%rsp")]},$ZTMP3
  498. ___
  499. }
  500. if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {
  501. # ; Precompute hkeys^i, i=17..32
  502. my $i = 20;
  503. foreach (1 .. int((32 - 16) / 8)) {
  504. # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
  505. &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
  506. $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
  507. $i += 4;
  508. # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
  509. &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
  510. $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
  511. $i += 4;
  512. }
  513. }
  514. if ($HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {
  515. # ; Precompute hkeys^i, i=33..48 (HKEYS_STORAGE_CAPACITY = 48)
  516. my $i = 36;
  517. foreach (1 .. int((48 - 32) / 8)) {
  518. # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
  519. &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
  520. $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
  521. $i += 4;
  522. # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
  523. &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
  524. $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
  525. $i += 4;
  526. }
  527. }
  528. $code .= ".L_skip_hkeys_precomputation_${label_suffix}:\n";
  529. }
  530. # ;; =============================================================================
  531. # ;; Generic macro to produce code that executes $OPCODE instruction
  532. # ;; on selected number of AES blocks (16 bytes long ) between 0 and 16.
  533. # ;; All three operands of the instruction come from registers.
  534. # ;; Note: if 3 blocks are left at the end instruction is produced to operate all
  535. # ;; 4 blocks (full width of ZMM)
  536. sub ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 {
  537. my $NUM_BLOCKS = $_[0]; # [in] numerical value, number of AES blocks (0 to 16)
  538. my $OPCODE = $_[1]; # [in] instruction name
  539. my @DST;
  540. $DST[0] = $_[2]; # [out] destination ZMM register
  541. $DST[1] = $_[3]; # [out] destination ZMM register
  542. $DST[2] = $_[4]; # [out] destination ZMM register
  543. $DST[3] = $_[5]; # [out] destination ZMM register
  544. my @SRC1;
  545. $SRC1[0] = $_[6]; # [in] source 1 ZMM register
  546. $SRC1[1] = $_[7]; # [in] source 1 ZMM register
  547. $SRC1[2] = $_[8]; # [in] source 1 ZMM register
  548. $SRC1[3] = $_[9]; # [in] source 1 ZMM register
  549. my @SRC2;
  550. $SRC2[0] = $_[10]; # [in] source 2 ZMM register
  551. $SRC2[1] = $_[11]; # [in] source 2 ZMM register
  552. $SRC2[2] = $_[12]; # [in] source 2 ZMM register
  553. $SRC2[3] = $_[13]; # [in] source 2 ZMM register
  554. die "ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
  555. if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
  556. my $reg_idx = 0;
  557. my $blocks_left = $NUM_BLOCKS;
  558. foreach (1 .. ($NUM_BLOCKS / 4)) {
  559. $code .= "$OPCODE $SRC2[$reg_idx],$SRC1[$reg_idx],$DST[$reg_idx]\n";
  560. $reg_idx++;
  561. $blocks_left -= 4;
  562. }
  563. my $DSTREG = $DST[$reg_idx];
  564. my $SRC1REG = $SRC1[$reg_idx];
  565. my $SRC2REG = $SRC2[$reg_idx];
  566. if ($blocks_left == 1) {
  567. $code .= "$OPCODE @{[XWORD($SRC2REG)]},@{[XWORD($SRC1REG)]},@{[XWORD($DSTREG)]}\n";
  568. } elsif ($blocks_left == 2) {
  569. $code .= "$OPCODE @{[YWORD($SRC2REG)]},@{[YWORD($SRC1REG)]},@{[YWORD($DSTREG)]}\n";
  570. } elsif ($blocks_left == 3) {
  571. $code .= "$OPCODE $SRC2REG,$SRC1REG,$DSTREG\n";
  572. }
  573. }
  574. # ;; =============================================================================
  575. # ;; Loads specified number of AES blocks into ZMM registers using mask register
  576. # ;; for the last loaded register (xmm, ymm or zmm).
  577. # ;; Loads take place at 1 byte granularity.
  578. sub ZMM_LOAD_MASKED_BLOCKS_0_16 {
  579. my $NUM_BLOCKS = $_[0]; # [in] numerical value, number of AES blocks (0 to 16)
  580. my $INP = $_[1]; # [in] input data pointer to read from
  581. my $DATA_OFFSET = $_[2]; # [in] offset to the output pointer (GP or numerical)
  582. my @DST;
  583. $DST[0] = $_[3]; # [out] ZMM register with loaded data
  584. $DST[1] = $_[4]; # [out] ZMM register with loaded data
  585. $DST[2] = $_[5]; # [out] ZMM register with loaded data
  586. $DST[3] = $_[6]; # [out] ZMM register with loaded data
  587. my $MASK = $_[7]; # [in] mask register
  588. die "ZMM_LOAD_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
  589. if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
  590. my $src_offset = 0;
  591. my $dst_idx = 0;
  592. my $blocks_left = $NUM_BLOCKS;
  593. if ($NUM_BLOCKS > 0) {
  594. foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) {
  595. $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DST[$dst_idx]\n";
  596. $src_offset += 64;
  597. $dst_idx++;
  598. $blocks_left -= 4;
  599. }
  600. }
  601. my $DSTREG = $DST[$dst_idx];
  602. if ($blocks_left == 1) {
  603. $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[XWORD($DSTREG)]}\{$MASK\}{z}\n";
  604. } elsif ($blocks_left == 2) {
  605. $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[YWORD($DSTREG)]}\{$MASK\}{z}\n";
  606. } elsif (($blocks_left == 3 || $blocks_left == 4)) {
  607. $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DSTREG\{$MASK\}{z}\n";
  608. }
  609. }
  610. # ;; =============================================================================
  611. # ;; Stores specified number of AES blocks from ZMM registers with mask register
  612. # ;; for the last loaded register (xmm, ymm or zmm).
  613. # ;; Stores take place at 1 byte granularity.
  614. sub ZMM_STORE_MASKED_BLOCKS_0_16 {
  615. my $NUM_BLOCKS = $_[0]; # [in] numerical value, number of AES blocks (0 to 16)
  616. my $OUTP = $_[1]; # [in] output data pointer to write to
  617. my $DATA_OFFSET = $_[2]; # [in] offset to the output pointer (GP or numerical)
  618. my @SRC;
  619. $SRC[0] = $_[3]; # [in] ZMM register with data to store
  620. $SRC[1] = $_[4]; # [in] ZMM register with data to store
  621. $SRC[2] = $_[5]; # [in] ZMM register with data to store
  622. $SRC[3] = $_[6]; # [in] ZMM register with data to store
  623. my $MASK = $_[7]; # [in] mask register
  624. die "ZMM_STORE_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
  625. if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
  626. my $dst_offset = 0;
  627. my $src_idx = 0;
  628. my $blocks_left = $NUM_BLOCKS;
  629. if ($NUM_BLOCKS > 0) {
  630. foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) {
  631. $code .= "vmovdqu8 $SRC[$src_idx],`$dst_offset`($OUTP,$DATA_OFFSET,1)\n";
  632. $dst_offset += 64;
  633. $src_idx++;
  634. $blocks_left -= 4;
  635. }
  636. }
  637. my $SRCREG = $SRC[$src_idx];
  638. if ($blocks_left == 1) {
  639. $code .= "vmovdqu8 @{[XWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  640. } elsif ($blocks_left == 2) {
  641. $code .= "vmovdqu8 @{[YWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  642. } elsif ($blocks_left == 3 || $blocks_left == 4) {
  643. $code .= "vmovdqu8 $SRCREG,`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  644. }
  645. }
  646. # ;;; ===========================================================================
  647. # ;;; Handles AES encryption rounds
  648. # ;;; It handles special cases: the last and first rounds
  649. # ;;; Optionally, it performs XOR with data after the last AES round.
  650. # ;;; Uses NROUNDS parameter to check what needs to be done for the current round.
  651. # ;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks).
  652. sub ZMM_AESENC_ROUND_BLOCKS_0_16 {
  653. my $L0B0_3 = $_[0]; # [in/out] zmm; blocks 0 to 3
  654. my $L0B4_7 = $_[1]; # [in/out] zmm; blocks 4 to 7
  655. my $L0B8_11 = $_[2]; # [in/out] zmm; blocks 8 to 11
  656. my $L0B12_15 = $_[3]; # [in/out] zmm; blocks 12 to 15
  657. my $KEY = $_[4]; # [in] zmm containing round key
  658. my $ROUND = $_[5]; # [in] round number
  659. my $D0_3 = $_[6]; # [in] zmm or no_data; plain/cipher text blocks 0-3
  660. my $D4_7 = $_[7]; # [in] zmm or no_data; plain/cipher text blocks 4-7
  661. my $D8_11 = $_[8]; # [in] zmm or no_data; plain/cipher text blocks 8-11
  662. my $D12_15 = $_[9]; # [in] zmm or no_data; plain/cipher text blocks 12-15
  663. my $NUMBL = $_[10]; # [in] number of blocks; numerical value
  664. my $NROUNDS = $_[11]; # [in] number of rounds; numerical value
  665. # ;;; === first AES round
  666. if ($ROUND < 1) {
  667. # ;; round 0
  668. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  669. $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
  670. $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
  671. }
  672. # ;;; === middle AES rounds
  673. if ($ROUND >= 1 && $ROUND <= $NROUNDS) {
  674. # ;; rounds 1 to 9/11/13
  675. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  676. $NUMBL, "vaesenc", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
  677. $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
  678. }
  679. # ;;; === last AES round
  680. if ($ROUND > $NROUNDS) {
  681. # ;; the last round - mix enclast with text xor's
  682. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  683. $NUMBL, "vaesenclast", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
  684. $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
  685. # ;;; === XOR with data
  686. if ( ($D0_3 ne "no_data")
  687. && ($D4_7 ne "no_data")
  688. && ($D8_11 ne "no_data")
  689. && ($D12_15 ne "no_data"))
  690. {
  691. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  692. $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
  693. $L0B4_7, $L0B8_11, $L0B12_15, $D0_3, $D4_7, $D8_11, $D12_15);
  694. }
  695. }
  696. }
  697. # ;;; Horizontal XOR - 4 x 128bits xored together
  698. sub VHPXORI4x128 {
  699. my $REG = $_[0]; # [in/out] ZMM with 4x128bits to xor; 128bit output
  700. my $TMP = $_[1]; # [clobbered] ZMM temporary register
  701. $code .= <<___;
  702. vextracti64x4 \$1,$REG,@{[YWORD($TMP)]}
  703. vpxorq @{[YWORD($TMP)]},@{[YWORD($REG)]},@{[YWORD($REG)]}
  704. vextracti32x4 \$1,@{[YWORD($REG)]},@{[XWORD($TMP)]}
  705. vpxorq @{[XWORD($TMP)]},@{[XWORD($REG)]},@{[XWORD($REG)]}
  706. ___
  707. }
  708. # ;;; AVX512 reduction macro
  709. sub VCLMUL_REDUCE {
  710. my $OUT = $_[0]; # [out] zmm/ymm/xmm: result (must not be $TMP1 or $HI128)
  711. my $POLY = $_[1]; # [in] zmm/ymm/xmm: polynomial
  712. my $HI128 = $_[2]; # [in] zmm/ymm/xmm: high 128b of hash to reduce
  713. my $LO128 = $_[3]; # [in] zmm/ymm/xmm: low 128b of hash to reduce
  714. my $TMP0 = $_[4]; # [in] zmm/ymm/xmm: temporary register
  715. my $TMP1 = $_[5]; # [in] zmm/ymm/xmm: temporary register
  716. $code .= <<___;
  717. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  718. # ;; first phase of the reduction
  719. vpclmulqdq \$0x01,$LO128,$POLY,$TMP0
  720. vpslldq \$8,$TMP0,$TMP0 # ; shift-L 2 DWs
  721. vpxorq $TMP0,$LO128,$TMP0 # ; first phase of the reduction complete
  722. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  723. # ;; second phase of the reduction
  724. vpclmulqdq \$0x00,$TMP0,$POLY,$TMP1
  725. vpsrldq \$4,$TMP1,$TMP1 # ; shift-R only 1-DW to obtain 2-DWs shift-R
  726. vpclmulqdq \$0x10,$TMP0,$POLY,$OUT
  727. vpslldq \$4,$OUT,$OUT # ; shift-L 1-DW to obtain result with no shifts
  728. vpternlogq \$0x96,$HI128,$TMP1,$OUT # ; OUT/GHASH = OUT xor TMP1 xor HI128
  729. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  730. ___
  731. }
  732. # ;; ===========================================================================
  733. # ;; schoolbook multiply of 16 blocks (16 x 16 bytes)
  734. # ;; - it is assumed that data read from $INPTR is already shuffled and
  735. # ;; $INPTR address is 64 byte aligned
  736. # ;; - there is an option to pass ready blocks through ZMM registers too.
  737. # ;; 4 extra parameters need to be passed in such case and 21st ($ZTMP9) argument can be empty
  738. sub GHASH_16 {
  739. my $TYPE = $_[0]; # [in] ghash type: start (xor hash), mid, end (same as mid; no reduction),
  740. # end_reduce (end with reduction), start_reduce
  741. my $GH = $_[1]; # [in/out] ZMM ghash sum: high 128-bits
  742. my $GM = $_[2]; # [in/out] ZMM ghash sum: middle 128-bits
  743. my $GL = $_[3]; # [in/out] ZMM ghash sum: low 128-bits
  744. my $INPTR = $_[4]; # [in] data input pointer
  745. my $INOFF = $_[5]; # [in] data input offset
  746. my $INDIS = $_[6]; # [in] data input displacement
  747. my $HKPTR = $_[7]; # [in] hash key pointer
  748. my $HKOFF = $_[8]; # [in] hash key offset (can be either numerical offset, or register containing offset)
  749. my $HKDIS = $_[9]; # [in] hash key displacement
  750. my $HASH = $_[10]; # [in/out] ZMM hash value in/out
  751. my $ZTMP0 = $_[11]; # [clobbered] temporary ZMM
  752. my $ZTMP1 = $_[12]; # [clobbered] temporary ZMM
  753. my $ZTMP2 = $_[13]; # [clobbered] temporary ZMM
  754. my $ZTMP3 = $_[14]; # [clobbered] temporary ZMM
  755. my $ZTMP4 = $_[15]; # [clobbered] temporary ZMM
  756. my $ZTMP5 = $_[16]; # [clobbered] temporary ZMM
  757. my $ZTMP6 = $_[17]; # [clobbered] temporary ZMM
  758. my $ZTMP7 = $_[18]; # [clobbered] temporary ZMM
  759. my $ZTMP8 = $_[19]; # [clobbered] temporary ZMM
  760. my $ZTMP9 = $_[20]; # [clobbered] temporary ZMM, can be empty if 4 extra parameters below are provided
  761. my $DAT0 = $_[21]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
  762. my $DAT1 = $_[22]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
  763. my $DAT2 = $_[23]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
  764. my $DAT3 = $_[24]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
  765. my $start_ghash = 0;
  766. my $do_reduction = 0;
  767. if ($TYPE eq "start") {
  768. $start_ghash = 1;
  769. }
  770. if ($TYPE eq "start_reduce") {
  771. $start_ghash = 1;
  772. $do_reduction = 1;
  773. }
  774. if ($TYPE eq "end_reduce") {
  775. $do_reduction = 1;
  776. }
  777. # ;; ghash blocks 0-3
  778. if (scalar(@_) == 21) {
  779. $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+0*64))]},$ZTMP9\n";
  780. } else {
  781. $ZTMP9 = $DAT0;
  782. }
  783. if ($start_ghash != 0) {
  784. $code .= "vpxorq $HASH,$ZTMP9,$ZTMP9\n";
  785. }
  786. $code .= <<___;
  787. vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+0*64))]},$ZTMP8
  788. vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0 # ; T0H = a1*b1
  789. vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1 # ; T0L = a0*b0
  790. vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP2 # ; T0M1 = a1*b0
  791. vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3 # ; T0M2 = a0*b1
  792. ___
  793. # ;; ghash blocks 4-7
  794. if (scalar(@_) == 21) {
  795. $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+1*64))]},$ZTMP9\n";
  796. } else {
  797. $ZTMP9 = $DAT1;
  798. }
  799. $code .= <<___;
  800. vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+1*64))]},$ZTMP8
  801. vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4 # ; T1H = a1*b1
  802. vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5 # ; T1L = a0*b0
  803. vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6 # ; T1M1 = a1*b0
  804. vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7 # ; T1M2 = a0*b1
  805. ___
  806. # ;; update sums
  807. if ($start_ghash != 0) {
  808. $code .= <<___;
  809. vpxorq $ZTMP6,$ZTMP2,$GM # ; GM = T0M1 + T1M1
  810. vpxorq $ZTMP4,$ZTMP0,$GH # ; GH = T0H + T1H
  811. vpxorq $ZTMP5,$ZTMP1,$GL # ; GL = T0L + T1L
  812. vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM = T0M2 + T1M1
  813. ___
  814. } else { # ;; mid, end, end_reduce
  815. $code .= <<___;
  816. vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM # ; GM += T0M1 + T1M1
  817. vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH # ; GH += T0H + T1H
  818. vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL # ; GL += T0L + T1L
  819. vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM += T0M2 + T1M1
  820. ___
  821. }
  822. # ;; ghash blocks 8-11
  823. if (scalar(@_) == 21) {
  824. $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+2*64))]},$ZTMP9\n";
  825. } else {
  826. $ZTMP9 = $DAT2;
  827. }
  828. $code .= <<___;
  829. vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+2*64))]},$ZTMP8
  830. vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0 # ; T0H = a1*b1
  831. vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1 # ; T0L = a0*b0
  832. vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP2 # ; T0M1 = a1*b0
  833. vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3 # ; T0M2 = a0*b1
  834. ___
  835. # ;; ghash blocks 12-15
  836. if (scalar(@_) == 21) {
  837. $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+3*64))]},$ZTMP9\n";
  838. } else {
  839. $ZTMP9 = $DAT3;
  840. }
  841. $code .= <<___;
  842. vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+3*64))]},$ZTMP8
  843. vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4 # ; T1H = a1*b1
  844. vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5 # ; T1L = a0*b0
  845. vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6 # ; T1M1 = a1*b0
  846. vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7 # ; T1M2 = a0*b1
  847. # ;; update sums
  848. vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM # ; GM += T0M1 + T1M1
  849. vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH # ; GH += T0H + T1H
  850. vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL # ; GL += T0L + T1L
  851. vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM += T0M2 + T1M1
  852. ___
  853. if ($do_reduction != 0) {
  854. $code .= <<___;
  855. # ;; integrate GM into GH and GL
  856. vpsrldq \$8,$GM,$ZTMP0
  857. vpslldq \$8,$GM,$ZTMP1
  858. vpxorq $ZTMP0,$GH,$GH
  859. vpxorq $ZTMP1,$GL,$GL
  860. ___
  861. # ;; add GH and GL 128-bit words horizontally
  862. &VHPXORI4x128($GH, $ZTMP0);
  863. &VHPXORI4x128($GL, $ZTMP1);
  864. # ;; reduction
  865. $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($ZTMP2)]}\n";
  866. &VCLMUL_REDUCE(&XWORD($HASH), &XWORD($ZTMP2), &XWORD($GH), &XWORD($GL), &XWORD($ZTMP0), &XWORD($ZTMP1));
  867. }
  868. }
  869. # ;; ===========================================================================
  870. # ;; GHASH 1 to 16 blocks of cipher text
  871. # ;; - performs reduction at the end
  872. # ;; - it doesn't load the data and it assumed it is already loaded and shuffled
  873. sub GHASH_1_TO_16 {
  874. my $GCM128_CTX = $_[0]; # [in] pointer to expanded keys
  875. my $GHASH = $_[1]; # [out] ghash output
  876. my $T0H = $_[2]; # [clobbered] temporary ZMM
  877. my $T0L = $_[3]; # [clobbered] temporary ZMM
  878. my $T0M1 = $_[4]; # [clobbered] temporary ZMM
  879. my $T0M2 = $_[5]; # [clobbered] temporary ZMM
  880. my $T1H = $_[6]; # [clobbered] temporary ZMM
  881. my $T1L = $_[7]; # [clobbered] temporary ZMM
  882. my $T1M1 = $_[8]; # [clobbered] temporary ZMM
  883. my $T1M2 = $_[9]; # [clobbered] temporary ZMM
  884. my $HK = $_[10]; # [clobbered] temporary ZMM
  885. my $AAD_HASH_IN = $_[11]; # [in] input hash value
  886. my @CIPHER_IN;
  887. $CIPHER_IN[0] = $_[12]; # [in] ZMM with cipher text blocks 0-3
  888. $CIPHER_IN[1] = $_[13]; # [in] ZMM with cipher text blocks 4-7
  889. $CIPHER_IN[2] = $_[14]; # [in] ZMM with cipher text blocks 8-11
  890. $CIPHER_IN[3] = $_[15]; # [in] ZMM with cipher text blocks 12-15
  891. my $NUM_BLOCKS = $_[16]; # [in] numerical value, number of blocks
  892. my $GH = $_[17]; # [in] ZMM with hi product part
  893. my $GM = $_[18]; # [in] ZMM with mid product part
  894. my $GL = $_[19]; # [in] ZMM with lo product part
  895. die "GHASH_1_TO_16: num_blocks is out of bounds = $NUM_BLOCKS\n" if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
  896. if (scalar(@_) == 17) {
  897. $code .= "vpxorq $AAD_HASH_IN,$CIPHER_IN[0],$CIPHER_IN[0]\n";
  898. }
  899. if ($NUM_BLOCKS == 16) {
  900. $code .= <<___;
  901. vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
  902. vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H # ; H = a1*b1
  903. vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L # ; L = a0*b0
  904. vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1 # ; M1 = a1*b0
  905. vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2 # ; M2 = a0*b1
  906. vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
  907. vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H # ; H = a1*b1
  908. vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L # ; L = a0*b0
  909. vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1 # ; M1 = a1*b0
  910. vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2 # ; M2 = a0*b1
  911. vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-2*4, $GCM128_CTX)]},$HK
  912. vpclmulqdq \$0x11,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; H = a1*b1
  913. vpclmulqdq \$0x00,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; L = a0*b0
  914. vpternlogq \$0x96,$T1H,$CIPHER_IN[0],$T0H
  915. vpternlogq \$0x96,$T1L,$CIPHER_IN[1],$T0L
  916. vpclmulqdq \$0x01,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; M1 = a1*b0
  917. vpclmulqdq \$0x10,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; M2 = a0*b1
  918. vpternlogq \$0x96,$T1M1,$CIPHER_IN[0],$T0M1
  919. vpternlogq \$0x96,$T1M2,$CIPHER_IN[1],$T0M2
  920. vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-3*4, $GCM128_CTX)]},$HK
  921. vpclmulqdq \$0x11,$HK,$CIPHER_IN[3],$T1H # ; H = a1*b1
  922. vpclmulqdq \$0x00,$HK,$CIPHER_IN[3],$T1L # ; L = a0*b0
  923. vpclmulqdq \$0x01,$HK,$CIPHER_IN[3],$T1M1 # ; M1 = a1*b0
  924. vpclmulqdq \$0x10,$HK,$CIPHER_IN[3],$T1M2 # ; M2 = a0*b1
  925. vpxorq $T1H,$T0H,$T1H
  926. vpxorq $T1L,$T0L,$T1L
  927. vpxorq $T1M1,$T0M1,$T1M1
  928. vpxorq $T1M2,$T0M2,$T1M2
  929. ___
  930. } elsif ($NUM_BLOCKS >= 12) {
  931. $code .= <<___;
  932. vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
  933. vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H # ; H = a1*b1
  934. vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L # ; L = a0*b0
  935. vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1 # ; M1 = a1*b0
  936. vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2 # ; M2 = a0*b1
  937. vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
  938. vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H # ; H = a1*b1
  939. vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L # ; L = a0*b0
  940. vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1 # ; M1 = a1*b0
  941. vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2 # ; M2 = a0*b1
  942. vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-2*4, $GCM128_CTX)]},$HK
  943. vpclmulqdq \$0x11,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; H = a1*b1
  944. vpclmulqdq \$0x00,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; L = a0*b0
  945. vpternlogq \$0x96,$T0H,$CIPHER_IN[0],$T1H
  946. vpternlogq \$0x96,$T0L,$CIPHER_IN[1],$T1L
  947. vpclmulqdq \$0x01,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; M1 = a1*b0
  948. vpclmulqdq \$0x10,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; M2 = a0*b1
  949. vpternlogq \$0x96,$T0M1,$CIPHER_IN[0],$T1M1
  950. vpternlogq \$0x96,$T0M2,$CIPHER_IN[1],$T1M2
  951. ___
  952. } elsif ($NUM_BLOCKS >= 8) {
  953. $code .= <<___;
  954. vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
  955. vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H # ; H = a1*b1
  956. vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L # ; L = a0*b0
  957. vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1 # ; M1 = a1*b0
  958. vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2 # ; M2 = a0*b1
  959. vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
  960. vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H # ; H = a1*b1
  961. vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L # ; L = a0*b0
  962. vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1 # ; M1 = a1*b0
  963. vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2 # ; M2 = a0*b1
  964. vpxorq $T1H,$T0H,$T1H
  965. vpxorq $T1L,$T0L,$T1L
  966. vpxorq $T1M1,$T0M1,$T1M1
  967. vpxorq $T1M2,$T0M2,$T1M2
  968. ___
  969. } elsif ($NUM_BLOCKS >= 4) {
  970. $code .= <<___;
  971. vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
  972. vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T1H # ; H = a1*b1
  973. vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T1L # ; L = a0*b0
  974. vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T1M1 # ; M1 = a1*b0
  975. vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T1M2 # ; M2 = a0*b1
  976. ___
  977. }
  978. # ;; T1H/L/M1/M2 - hold current product sums (provided $NUM_BLOCKS >= 4)
  979. my $blocks_left = ($NUM_BLOCKS % 4);
  980. if ($blocks_left > 0) {
  981. # ;; =====================================================
  982. # ;; There are 1, 2 or 3 blocks left to process.
  983. # ;; It may also be that they are the only blocks to process.
  984. # ;; Set hash key and register index position for the remaining 1 to 3 blocks
  985. my $reg_idx = ($NUM_BLOCKS / 4);
  986. my $REG_IN = $CIPHER_IN[$reg_idx];
  987. if ($blocks_left == 1) {
  988. $code .= <<___;
  989. vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[XWORD($HK)]}
  990. vpclmulqdq \$0x01,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0M1)]} # ; M1 = a1*b0
  991. vpclmulqdq \$0x10,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0M2)]} # ; M2 = a0*b1
  992. vpclmulqdq \$0x11,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0H)]} # ; H = a1*b1
  993. vpclmulqdq \$0x00,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0L)]} # ; L = a0*b0
  994. ___
  995. } elsif ($blocks_left == 2) {
  996. $code .= <<___;
  997. vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[YWORD($HK)]}
  998. vpclmulqdq \$0x01,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0M1)]} # ; M1 = a1*b0
  999. vpclmulqdq \$0x10,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0M2)]} # ; M2 = a0*b1
  1000. vpclmulqdq \$0x11,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0H)]} # ; H = a1*b1
  1001. vpclmulqdq \$0x00,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0L)]} # ; L = a0*b0
  1002. ___
  1003. } else { # ; blocks_left == 3
  1004. $code .= <<___;
  1005. vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[YWORD($HK)]}
  1006. vinserti64x2 \$2,@{[HashKeyByIdx($blocks_left-2, $GCM128_CTX)]},$HK,$HK
  1007. vpclmulqdq \$0x01,$HK,$REG_IN,$T0M1 # ; M1 = a1*b0
  1008. vpclmulqdq \$0x10,$HK,$REG_IN,$T0M2 # ; M2 = a0*b1
  1009. vpclmulqdq \$0x11,$HK,$REG_IN,$T0H # ; H = a1*b1
  1010. vpclmulqdq \$0x00,$HK,$REG_IN,$T0L # ; L = a0*b0
  1011. ___
  1012. }
  1013. if (scalar(@_) == 20) {
  1014. # ;; *** GH/GM/GL passed as arguments
  1015. if ($NUM_BLOCKS >= 4) {
  1016. $code .= <<___;
  1017. # ;; add ghash product sums from the first 4, 8 or 12 blocks
  1018. vpxorq $T1M1,$T0M1,$T0M1
  1019. vpternlogq \$0x96,$T1M2,$GM,$T0M2
  1020. vpternlogq \$0x96,$T1H,$GH,$T0H
  1021. vpternlogq \$0x96,$T1L,$GL,$T0L
  1022. ___
  1023. } else {
  1024. $code .= <<___;
  1025. vpxorq $GM,$T0M1,$T0M1
  1026. vpxorq $GH,$T0H,$T0H
  1027. vpxorq $GL,$T0L,$T0L
  1028. ___
  1029. }
  1030. } else {
  1031. # ;; *** GH/GM/GL NOT passed as arguments
  1032. if ($NUM_BLOCKS >= 4) {
  1033. $code .= <<___;
  1034. # ;; add ghash product sums from the first 4, 8 or 12 blocks
  1035. vpxorq $T1M1,$T0M1,$T0M1
  1036. vpxorq $T1M2,$T0M2,$T0M2
  1037. vpxorq $T1H,$T0H,$T0H
  1038. vpxorq $T1L,$T0L,$T0L
  1039. ___
  1040. }
  1041. }
  1042. $code .= <<___;
  1043. # ;; integrate TM into TH and TL
  1044. vpxorq $T0M2,$T0M1,$T0M1
  1045. vpsrldq \$8,$T0M1,$T1M1
  1046. vpslldq \$8,$T0M1,$T1M2
  1047. vpxorq $T1M1,$T0H,$T0H
  1048. vpxorq $T1M2,$T0L,$T0L
  1049. ___
  1050. } else {
  1051. # ;; =====================================================
  1052. # ;; number of blocks is 4, 8, 12 or 16
  1053. # ;; T1H/L/M1/M2 include product sums not T0H/L/M1/M2
  1054. if (scalar(@_) == 20) {
  1055. $code .= <<___;
  1056. # ;; *** GH/GM/GL passed as arguments
  1057. vpxorq $GM,$T1M1,$T1M1
  1058. vpxorq $GH,$T1H,$T1H
  1059. vpxorq $GL,$T1L,$T1L
  1060. ___
  1061. }
  1062. $code .= <<___;
  1063. # ;; integrate TM into TH and TL
  1064. vpxorq $T1M2,$T1M1,$T1M1
  1065. vpsrldq \$8,$T1M1,$T0M1
  1066. vpslldq \$8,$T1M1,$T0M2
  1067. vpxorq $T0M1,$T1H,$T0H
  1068. vpxorq $T0M2,$T1L,$T0L
  1069. ___
  1070. }
  1071. # ;; add TH and TL 128-bit words horizontally
  1072. &VHPXORI4x128($T0H, $T1M1);
  1073. &VHPXORI4x128($T0L, $T1M2);
  1074. # ;; reduction
  1075. $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($HK)]}\n";
  1076. &VCLMUL_REDUCE(
  1077. @{[XWORD($GHASH)]},
  1078. @{[XWORD($HK)]},
  1079. @{[XWORD($T0H)]},
  1080. @{[XWORD($T0L)]},
  1081. @{[XWORD($T0M1)]},
  1082. @{[XWORD($T0M2)]});
  1083. }
  1084. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1085. # ;; GHASH_MUL MACRO to implement: Data*HashKey mod (x^128 + x^127 + x^126 +x^121 + 1)
  1086. # ;; Input: A and B (128-bits each, bit-reflected)
  1087. # ;; Output: C = A*B*x mod poly, (i.e. >>1 )
  1088. # ;; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
  1089. # ;; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
  1090. # ;;
  1091. # ;; Refer to [3] for more details.
  1092. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1093. sub GHASH_MUL {
  1094. my $GH = $_[0]; #; [in/out] xmm/ymm/zmm with multiply operand(s) (128-bits)
  1095. my $HK = $_[1]; #; [in] xmm/ymm/zmm with hash key value(s) (128-bits)
  1096. my $T1 = $_[2]; #; [clobbered] xmm/ymm/zmm
  1097. my $T2 = $_[3]; #; [clobbered] xmm/ymm/zmm
  1098. my $T3 = $_[4]; #; [clobbered] xmm/ymm/zmm
  1099. $code .= <<___;
  1100. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1101. vpclmulqdq \$0x11,$HK,$GH,$T1 # ; $T1 = a1*b1
  1102. vpclmulqdq \$0x00,$HK,$GH,$T2 # ; $T2 = a0*b0
  1103. vpclmulqdq \$0x01,$HK,$GH,$T3 # ; $T3 = a1*b0
  1104. vpclmulqdq \$0x10,$HK,$GH,$GH # ; $GH = a0*b1
  1105. vpxorq $T3,$GH,$GH
  1106. vpsrldq \$8,$GH,$T3 # ; shift-R $GH 2 DWs
  1107. vpslldq \$8,$GH,$GH # ; shift-L $GH 2 DWs
  1108. vpxorq $T3,$T1,$T1
  1109. vpxorq $T2,$GH,$GH
  1110. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1111. # ;first phase of the reduction
  1112. vmovdqu64 POLY2(%rip),$T3
  1113. vpclmulqdq \$0x01,$GH,$T3,$T2
  1114. vpslldq \$8,$T2,$T2 # ; shift-L $T2 2 DWs
  1115. vpxorq $T2,$GH,$GH # ; first phase of the reduction complete
  1116. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1117. # ;second phase of the reduction
  1118. vpclmulqdq \$0x00,$GH,$T3,$T2
  1119. vpsrldq \$4,$T2,$T2 # ; shift-R only 1-DW to obtain 2-DWs shift-R
  1120. vpclmulqdq \$0x10,$GH,$T3,$GH
  1121. vpslldq \$4,$GH,$GH # ; Shift-L 1-DW to obtain result with no shifts
  1122. # ; second phase of the reduction complete, the result is in $GH
  1123. vpternlogq \$0x96,$T2,$T1,$GH # ; GH = GH xor T1 xor T2
  1124. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1125. ___
  1126. }
  1127. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1128. # ;;; PRECOMPUTE computes HashKey_i
  1129. sub PRECOMPUTE {
  1130. my $GCM128_CTX = $_[0]; #; [in/out] context pointer, hkeys content updated
  1131. my $HK = $_[1]; #; [in] xmm, hash key
  1132. my $T1 = $_[2]; #; [clobbered] xmm
  1133. my $T2 = $_[3]; #; [clobbered] xmm
  1134. my $T3 = $_[4]; #; [clobbered] xmm
  1135. my $T4 = $_[5]; #; [clobbered] xmm
  1136. my $T5 = $_[6]; #; [clobbered] xmm
  1137. my $T6 = $_[7]; #; [clobbered] xmm
  1138. my $ZT1 = &ZWORD($T1);
  1139. my $ZT2 = &ZWORD($T2);
  1140. my $ZT3 = &ZWORD($T3);
  1141. my $ZT4 = &ZWORD($T4);
  1142. my $ZT5 = &ZWORD($T5);
  1143. my $ZT6 = &ZWORD($T6);
  1144. my $YT1 = &YWORD($T1);
  1145. my $YT2 = &YWORD($T2);
  1146. my $YT3 = &YWORD($T3);
  1147. my $YT4 = &YWORD($T4);
  1148. my $YT5 = &YWORD($T5);
  1149. my $YT6 = &YWORD($T6);
  1150. $code .= <<___;
  1151. vshufi32x4 \$0x00,@{[YWORD($HK)]},@{[YWORD($HK)]},$YT5
  1152. vmovdqa $YT5,$YT4
  1153. ___
  1154. # ;; calculate HashKey^2<<1 mod poly
  1155. &GHASH_MUL($YT4, $YT5, $YT1, $YT2, $YT3);
  1156. $code .= <<___;
  1157. vmovdqu64 $T4,@{[HashKeyByIdx(2,$GCM128_CTX)]}
  1158. vinserti64x2 \$1,$HK,$YT4,$YT5
  1159. vmovdqa64 $YT5,$YT6 # ;; YT6 = HashKey | HashKey^2
  1160. ___
  1161. # ;; use 2x128-bit computation
  1162. # ;; calculate HashKey^4<<1 mod poly, HashKey^3<<1 mod poly
  1163. &GHASH_MUL($YT5, $YT4, $YT1, $YT2, $YT3); # ;; YT5 = HashKey^3 | HashKey^4
  1164. $code .= <<___;
  1165. vmovdqu64 $YT5,@{[HashKeyByIdx(4,$GCM128_CTX)]}
  1166. vinserti64x4 \$1,$YT6,$ZT5,$ZT5 # ;; ZT5 = YT6 | YT5
  1167. # ;; switch to 4x128-bit computations now
  1168. vshufi64x2 \$0x00,$ZT5,$ZT5,$ZT4 # ;; broadcast HashKey^4 across all ZT4
  1169. vmovdqa64 $ZT5,$ZT6 # ;; save HashKey^4 to HashKey^1 in ZT6
  1170. ___
  1171. # ;; calculate HashKey^5<<1 mod poly, HashKey^6<<1 mod poly, ... HashKey^8<<1 mod poly
  1172. &GHASH_MUL($ZT5, $ZT4, $ZT1, $ZT2, $ZT3);
  1173. $code .= <<___;
  1174. vmovdqu64 $ZT5,@{[HashKeyByIdx(8,$GCM128_CTX)]} # ;; HashKey^8 to HashKey^5 in ZT5 now
  1175. vshufi64x2 \$0x00,$ZT5,$ZT5,$ZT4 # ;; broadcast HashKey^8 across all ZT4
  1176. ___
  1177. # ;; calculate HashKey^9<<1 mod poly, HashKey^10<<1 mod poly, ... HashKey^16<<1 mod poly
  1178. # ;; use HashKey^8 as multiplier against ZT6 and ZT5 - this allows deeper ooo execution
  1179. # ;; compute HashKey^(12), HashKey^(11), ... HashKey^(9)
  1180. &GHASH_MUL($ZT6, $ZT4, $ZT1, $ZT2, $ZT3);
  1181. $code .= "vmovdqu64 $ZT6,@{[HashKeyByIdx(12,$GCM128_CTX)]}\n";
  1182. # ;; compute HashKey^(16), HashKey^(15), ... HashKey^(13)
  1183. &GHASH_MUL($ZT5, $ZT4, $ZT1, $ZT2, $ZT3);
  1184. $code .= "vmovdqu64 $ZT5,@{[HashKeyByIdx(16,$GCM128_CTX)]}\n";
  1185. # ; Hkeys 17..48 will be precomputed somewhere else as context can hold only 16 hkeys
  1186. }
  1187. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1188. # ;; READ_SMALL_DATA_INPUT
  1189. # ;; Packs xmm register with data when data input is less or equal to 16 bytes
  1190. # ;; Returns 0 if data has length 0
  1191. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1192. sub READ_SMALL_DATA_INPUT {
  1193. my $OUTPUT = $_[0]; # [out] xmm register
  1194. my $INPUT = $_[1]; # [in] buffer pointer to read from
  1195. my $LENGTH = $_[2]; # [in] number of bytes to read
  1196. my $TMP1 = $_[3]; # [clobbered]
  1197. my $TMP2 = $_[4]; # [clobbered]
  1198. my $MASK = $_[5]; # [out] k1 to k7 register to store the partial block mask
  1199. $code .= <<___;
  1200. mov \$16,@{[DWORD($TMP2)]}
  1201. lea byte_len_to_mask_table(%rip),$TMP1
  1202. cmp $TMP2,$LENGTH
  1203. cmovc $LENGTH,$TMP2
  1204. ___
  1205. if ($win64) {
  1206. $code .= <<___;
  1207. add $TMP2,$TMP1
  1208. add $TMP2,$TMP1
  1209. kmovw ($TMP1),$MASK
  1210. ___
  1211. } else {
  1212. $code .= "kmovw ($TMP1,$TMP2,2),$MASK\n";
  1213. }
  1214. $code .= "vmovdqu8 ($INPUT),${OUTPUT}{$MASK}{z}\n";
  1215. }
  1216. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1217. # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
  1218. # Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
  1219. # Output: The hash of the data (AAD_HASH).
  1220. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1221. sub CALC_AAD_HASH {
  1222. my $A_IN = $_[0]; # [in] AAD text pointer
  1223. my $A_LEN = $_[1]; # [in] AAD length
  1224. my $AAD_HASH = $_[2]; # [in/out] xmm ghash value
  1225. my $GCM128_CTX = $_[3]; # [in] pointer to context
  1226. my $ZT0 = $_[4]; # [clobbered] ZMM register
  1227. my $ZT1 = $_[5]; # [clobbered] ZMM register
  1228. my $ZT2 = $_[6]; # [clobbered] ZMM register
  1229. my $ZT3 = $_[7]; # [clobbered] ZMM register
  1230. my $ZT4 = $_[8]; # [clobbered] ZMM register
  1231. my $ZT5 = $_[9]; # [clobbered] ZMM register
  1232. my $ZT6 = $_[10]; # [clobbered] ZMM register
  1233. my $ZT7 = $_[11]; # [clobbered] ZMM register
  1234. my $ZT8 = $_[12]; # [clobbered] ZMM register
  1235. my $ZT9 = $_[13]; # [clobbered] ZMM register
  1236. my $ZT10 = $_[14]; # [clobbered] ZMM register
  1237. my $ZT11 = $_[15]; # [clobbered] ZMM register
  1238. my $ZT12 = $_[16]; # [clobbered] ZMM register
  1239. my $ZT13 = $_[17]; # [clobbered] ZMM register
  1240. my $ZT14 = $_[18]; # [clobbered] ZMM register
  1241. my $ZT15 = $_[19]; # [clobbered] ZMM register
  1242. my $ZT16 = $_[20]; # [clobbered] ZMM register
  1243. my $T1 = $_[21]; # [clobbered] GP register
  1244. my $T2 = $_[22]; # [clobbered] GP register
  1245. my $T3 = $_[23]; # [clobbered] GP register
  1246. my $MASKREG = $_[24]; # [clobbered] mask register
  1247. my $HKEYS_READY = "%rbx";
  1248. my $SHFMSK = $ZT13;
  1249. my $label_suffix = $label_count++;
  1250. $code .= <<___;
  1251. mov $A_IN,$T1 # ; T1 = AAD
  1252. mov $A_LEN,$T2 # ; T2 = aadLen
  1253. or $T2,$T2
  1254. jz .L_CALC_AAD_done_${label_suffix}
  1255. xor $HKEYS_READY,$HKEYS_READY
  1256. vmovdqa64 SHUF_MASK(%rip),$SHFMSK
  1257. .L_get_AAD_loop48x16_${label_suffix}:
  1258. cmp \$`(48*16)`,$T2
  1259. jl .L_exit_AAD_loop48x16_${label_suffix}
  1260. ___
  1261. $code .= <<___;
  1262. vmovdqu64 `64*0`($T1),$ZT1 # ; Blocks 0-3
  1263. vmovdqu64 `64*1`($T1),$ZT2 # ; Blocks 4-7
  1264. vmovdqu64 `64*2`($T1),$ZT3 # ; Blocks 8-11
  1265. vmovdqu64 `64*3`($T1),$ZT4 # ; Blocks 12-15
  1266. vpshufb $SHFMSK,$ZT1,$ZT1
  1267. vpshufb $SHFMSK,$ZT2,$ZT2
  1268. vpshufb $SHFMSK,$ZT3,$ZT3
  1269. vpshufb $SHFMSK,$ZT4,$ZT4
  1270. ___
  1271. &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZT0, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT14, "all");
  1272. $code .= "mov \$1,$HKEYS_READY\n";
  1273. &GHASH_16(
  1274. "start", $ZT5, $ZT6, $ZT7,
  1275. "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
  1276. &HashKeyOffsetByIdx(48, "frame"), 0, "@{[ZWORD($AAD_HASH)]}", $ZT0,
  1277. $ZT8, $ZT9, $ZT10, $ZT11,
  1278. $ZT12, $ZT14, $ZT15, $ZT16,
  1279. "NO_ZMM", $ZT1, $ZT2, $ZT3,
  1280. $ZT4);
  1281. $code .= <<___;
  1282. vmovdqu64 `16*16 + 64*0`($T1),$ZT1 # ; Blocks 16-19
  1283. vmovdqu64 `16*16 + 64*1`($T1),$ZT2 # ; Blocks 20-23
  1284. vmovdqu64 `16*16 + 64*2`($T1),$ZT3 # ; Blocks 24-27
  1285. vmovdqu64 `16*16 + 64*3`($T1),$ZT4 # ; Blocks 28-31
  1286. vpshufb $SHFMSK,$ZT1,$ZT1
  1287. vpshufb $SHFMSK,$ZT2,$ZT2
  1288. vpshufb $SHFMSK,$ZT3,$ZT3
  1289. vpshufb $SHFMSK,$ZT4,$ZT4
  1290. ___
  1291. &GHASH_16(
  1292. "mid", $ZT5, $ZT6, $ZT7,
  1293. "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
  1294. &HashKeyOffsetByIdx(32, "frame"), 0, "NO_HASH_IN_OUT", $ZT0,
  1295. $ZT8, $ZT9, $ZT10, $ZT11,
  1296. $ZT12, $ZT14, $ZT15, $ZT16,
  1297. "NO_ZMM", $ZT1, $ZT2, $ZT3,
  1298. $ZT4);
  1299. $code .= <<___;
  1300. vmovdqu64 `32*16 + 64*0`($T1),$ZT1 # ; Blocks 32-35
  1301. vmovdqu64 `32*16 + 64*1`($T1),$ZT2 # ; Blocks 36-39
  1302. vmovdqu64 `32*16 + 64*2`($T1),$ZT3 # ; Blocks 40-43
  1303. vmovdqu64 `32*16 + 64*3`($T1),$ZT4 # ; Blocks 44-47
  1304. vpshufb $SHFMSK,$ZT1,$ZT1
  1305. vpshufb $SHFMSK,$ZT2,$ZT2
  1306. vpshufb $SHFMSK,$ZT3,$ZT3
  1307. vpshufb $SHFMSK,$ZT4,$ZT4
  1308. ___
  1309. &GHASH_16(
  1310. "end_reduce", $ZT5, $ZT6, $ZT7,
  1311. "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
  1312. &HashKeyOffsetByIdx(16, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
  1313. $ZT8, $ZT9, $ZT10, $ZT11,
  1314. $ZT12, $ZT14, $ZT15, $ZT16,
  1315. "NO_ZMM", $ZT1, $ZT2, $ZT3,
  1316. $ZT4);
  1317. $code .= <<___;
  1318. sub \$`(48*16)`,$T2
  1319. je .L_CALC_AAD_done_${label_suffix}
  1320. add \$`(48*16)`,$T1
  1321. jmp .L_get_AAD_loop48x16_${label_suffix}
  1322. .L_exit_AAD_loop48x16_${label_suffix}:
  1323. # ; Less than 48x16 bytes remaining
  1324. cmp \$`(32*16)`,$T2
  1325. jl .L_less_than_32x16_${label_suffix}
  1326. ___
  1327. $code .= <<___;
  1328. # ; Get next 16 blocks
  1329. vmovdqu64 `64*0`($T1),$ZT1
  1330. vmovdqu64 `64*1`($T1),$ZT2
  1331. vmovdqu64 `64*2`($T1),$ZT3
  1332. vmovdqu64 `64*3`($T1),$ZT4
  1333. vpshufb $SHFMSK,$ZT1,$ZT1
  1334. vpshufb $SHFMSK,$ZT2,$ZT2
  1335. vpshufb $SHFMSK,$ZT3,$ZT3
  1336. vpshufb $SHFMSK,$ZT4,$ZT4
  1337. ___
  1338. &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZT0, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT14, "first32");
  1339. $code .= "mov \$1,$HKEYS_READY\n";
  1340. &GHASH_16(
  1341. "start", $ZT5, $ZT6, $ZT7,
  1342. "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
  1343. &HashKeyOffsetByIdx(32, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
  1344. $ZT8, $ZT9, $ZT10, $ZT11,
  1345. $ZT12, $ZT14, $ZT15, $ZT16,
  1346. "NO_ZMM", $ZT1, $ZT2, $ZT3,
  1347. $ZT4);
  1348. $code .= <<___;
  1349. vmovdqu64 `16*16 + 64*0`($T1),$ZT1
  1350. vmovdqu64 `16*16 + 64*1`($T1),$ZT2
  1351. vmovdqu64 `16*16 + 64*2`($T1),$ZT3
  1352. vmovdqu64 `16*16 + 64*3`($T1),$ZT4
  1353. vpshufb $SHFMSK,$ZT1,$ZT1
  1354. vpshufb $SHFMSK,$ZT2,$ZT2
  1355. vpshufb $SHFMSK,$ZT3,$ZT3
  1356. vpshufb $SHFMSK,$ZT4,$ZT4
  1357. ___
  1358. &GHASH_16(
  1359. "end_reduce", $ZT5, $ZT6, $ZT7,
  1360. "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
  1361. &HashKeyOffsetByIdx(16, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
  1362. $ZT8, $ZT9, $ZT10, $ZT11,
  1363. $ZT12, $ZT14, $ZT15, $ZT16,
  1364. "NO_ZMM", $ZT1, $ZT2, $ZT3,
  1365. $ZT4);
  1366. $code .= <<___;
  1367. sub \$`(32*16)`,$T2
  1368. je .L_CALC_AAD_done_${label_suffix}
  1369. add \$`(32*16)`,$T1
  1370. jmp .L_less_than_16x16_${label_suffix}
  1371. .L_less_than_32x16_${label_suffix}:
  1372. cmp \$`(16*16)`,$T2
  1373. jl .L_less_than_16x16_${label_suffix}
  1374. # ; Get next 16 blocks
  1375. vmovdqu64 `64*0`($T1),$ZT1
  1376. vmovdqu64 `64*1`($T1),$ZT2
  1377. vmovdqu64 `64*2`($T1),$ZT3
  1378. vmovdqu64 `64*3`($T1),$ZT4
  1379. vpshufb $SHFMSK,$ZT1,$ZT1
  1380. vpshufb $SHFMSK,$ZT2,$ZT2
  1381. vpshufb $SHFMSK,$ZT3,$ZT3
  1382. vpshufb $SHFMSK,$ZT4,$ZT4
  1383. ___
  1384. # ; This code path does not use more than 16 hkeys, so they can be taken from the context
  1385. # ; (not from the stack storage)
  1386. &GHASH_16(
  1387. "start_reduce", $ZT5, $ZT6, $ZT7,
  1388. "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", $GCM128_CTX,
  1389. &HashKeyOffsetByIdx(16, "context"), 0, &ZWORD($AAD_HASH), $ZT0,
  1390. $ZT8, $ZT9, $ZT10, $ZT11,
  1391. $ZT12, $ZT14, $ZT15, $ZT16,
  1392. "NO_ZMM", $ZT1, $ZT2, $ZT3,
  1393. $ZT4);
  1394. $code .= <<___;
  1395. sub \$`(16*16)`,$T2
  1396. je .L_CALC_AAD_done_${label_suffix}
  1397. add \$`(16*16)`,$T1
  1398. # ; Less than 16x16 bytes remaining
  1399. .L_less_than_16x16_${label_suffix}:
  1400. # ;; prep mask source address
  1401. lea byte64_len_to_mask_table(%rip),$T3
  1402. lea ($T3,$T2,8),$T3
  1403. # ;; calculate number of blocks to ghash (including partial bytes)
  1404. add \$15,@{[DWORD($T2)]}
  1405. shr \$4,@{[DWORD($T2)]}
  1406. cmp \$2,@{[DWORD($T2)]}
  1407. jb .L_AAD_blocks_1_${label_suffix}
  1408. je .L_AAD_blocks_2_${label_suffix}
  1409. cmp \$4,@{[DWORD($T2)]}
  1410. jb .L_AAD_blocks_3_${label_suffix}
  1411. je .L_AAD_blocks_4_${label_suffix}
  1412. cmp \$6,@{[DWORD($T2)]}
  1413. jb .L_AAD_blocks_5_${label_suffix}
  1414. je .L_AAD_blocks_6_${label_suffix}
  1415. cmp \$8,@{[DWORD($T2)]}
  1416. jb .L_AAD_blocks_7_${label_suffix}
  1417. je .L_AAD_blocks_8_${label_suffix}
  1418. cmp \$10,@{[DWORD($T2)]}
  1419. jb .L_AAD_blocks_9_${label_suffix}
  1420. je .L_AAD_blocks_10_${label_suffix}
  1421. cmp \$12,@{[DWORD($T2)]}
  1422. jb .L_AAD_blocks_11_${label_suffix}
  1423. je .L_AAD_blocks_12_${label_suffix}
  1424. cmp \$14,@{[DWORD($T2)]}
  1425. jb .L_AAD_blocks_13_${label_suffix}
  1426. je .L_AAD_blocks_14_${label_suffix}
  1427. cmp \$15,@{[DWORD($T2)]}
  1428. je .L_AAD_blocks_15_${label_suffix}
  1429. ___
  1430. # ;; fall through for 16 blocks
  1431. # ;; The flow of each of these cases is identical:
  1432. # ;; - load blocks plain text
  1433. # ;; - shuffle loaded blocks
  1434. # ;; - xor in current hash value into block 0
  1435. # ;; - perform up multiplications with ghash keys
  1436. # ;; - jump to reduction code
  1437. for (my $aad_blocks = 16; $aad_blocks > 0; $aad_blocks--) {
  1438. $code .= ".L_AAD_blocks_${aad_blocks}_${label_suffix}:\n";
  1439. if ($aad_blocks > 12) {
  1440. $code .= "sub \$`12*16*8`, $T3\n";
  1441. } elsif ($aad_blocks > 8) {
  1442. $code .= "sub \$`8*16*8`, $T3\n";
  1443. } elsif ($aad_blocks > 4) {
  1444. $code .= "sub \$`4*16*8`, $T3\n";
  1445. }
  1446. $code .= "kmovq ($T3),$MASKREG\n";
  1447. &ZMM_LOAD_MASKED_BLOCKS_0_16($aad_blocks, $T1, 0, $ZT1, $ZT2, $ZT3, $ZT4, $MASKREG);
  1448. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16($aad_blocks, "vpshufb", $ZT1, $ZT2, $ZT3, $ZT4,
  1449. $ZT1, $ZT2, $ZT3, $ZT4, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
  1450. &GHASH_1_TO_16($GCM128_CTX, &ZWORD($AAD_HASH),
  1451. $ZT0, $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, &ZWORD($AAD_HASH), $ZT1, $ZT2, $ZT3, $ZT4, $aad_blocks);
  1452. if ($aad_blocks > 1) {
  1453. # ;; fall through to CALC_AAD_done in 1 block case
  1454. $code .= "jmp .L_CALC_AAD_done_${label_suffix}\n";
  1455. }
  1456. }
  1457. $code .= ".L_CALC_AAD_done_${label_suffix}:\n";
  1458. # ;; result in AAD_HASH
  1459. }
  1460. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1461. # ;; PARTIAL_BLOCK
  1462. # ;; Handles encryption/decryption and the tag partial blocks between
  1463. # ;; update calls.
  1464. # ;; Requires the input data be at least 1 byte long.
  1465. # ;; Output:
  1466. # ;; A cipher/plain of the first partial block (CIPH_PLAIN_OUT),
  1467. # ;; AAD_HASH and updated GCM128_CTX
  1468. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  sub PARTIAL_BLOCK {
      # Emits the assembly that continues a partial (<16 byte) block left over
      # from a previous update call.
      #
      # The generated code loads the stored partial length from ($PBLOCK_LEN)
      # and skips everything when it is zero.  Otherwise it reads the new
      # input bytes, combines them with the saved encrypted counter block from
      # the context, folds the resulting cipher text into $AAD_HASH, and
      # writes the produced bytes to the output with a byte mask.  When the
      # block becomes complete the hash is multiplied by hash key 1 and the
      # stored partial length is reset to zero; otherwise the stored length is
      # increased by the input length.
      my (
          $GCM128_CTX,        # [in] key pointer
          $PBLOCK_LEN,        # [in] partial block length
          $CIPH_PLAIN_OUT,    # [in] output buffer
          $PLAIN_CIPH_IN,     # [in] input buffer
          $PLAIN_CIPH_LEN,    # [in] buffer length
          $DATA_OFFSET,       # [out] data offset (gets set)
          $AAD_HASH,          # [out] updated GHASH value
          $ENC_DEC,           # [in] cipher direction
          $GPTMP0,            # [clobbered] GP temporary register
          $GPTMP1,            # [clobbered] GP temporary register
          $GPTMP2,            # [clobbered] GP temporary register
          $ZTMP0,             # [clobbered] ZMM temporary register
          $ZTMP1,             # [clobbered] ZMM temporary register
          $ZTMP2,             # [clobbered] ZMM temporary register
          $ZTMP3,             # [clobbered] ZMM temporary register
          $ZTMP4,             # [clobbered] ZMM temporary register
          $ZTMP5,             # [clobbered] ZMM temporary register
          $ZTMP6,             # [clobbered] ZMM temporary register
          $ZTMP7,             # [clobbered] ZMM temporary register
          $MASKREG,           # [clobbered] mask temporary register
      ) = @_;

      # XMM-sized views of the ZMM temporaries.
      my ($XTMP0, $XTMP1, $XTMP2, $XTMP3, $XTMP4, $XTMP5, $XTMP6, $XTMP7) =
        map { &XWORD($_) }
        ($ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7);

      # The data offset register doubles as the partial-block length.
      my $LENGTH = $DATA_OFFSET;
      my ($IA0, $IA1, $IA2) = ($GPTMP1, $GPTMP2, $GPTMP0);

      my $label_suffix = $label_count++;
      my $decrypting   = $ENC_DEC eq "DEC";

      # Append every argument to the generated code as one line.
      my $emit = sub { $code .= "$_\n" for @_; };

      $emit->(
          "# ;; if no partial block present then LENGTH/DATA_OFFSET will be set to zero",
          "mov ($PBLOCK_LEN),$LENGTH",
          "or $LENGTH,$LENGTH",
          "je .L_partial_block_done_${label_suffix} # ;Leave Macro if no partial blocks",
      );

      &READ_SMALL_DATA_INPUT($XTMP0, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, $IA0, $IA2, $MASKREG);

      $emit->(
          "# ;; XTMP1 = my_ctx_data.partial_block_enc_key",
          "vmovdqu64 $CTX_OFFSET_PEncBlock($GCM128_CTX),$XTMP1",
          "vmovdqu64 @{[HashKeyByIdx(1,$GCM128_CTX)]},$XTMP2",
          "# ;; adjust the shuffle mask pointer to be able to shift right $LENGTH bytes",
          "# ;; (16 - $LENGTH) is the number of bytes in plaintext mod 16)",
          "lea SHIFT_MASK(%rip),$IA0",
          "add $LENGTH,$IA0",
          "vmovdqu64 ($IA0),$XTMP3 # ; shift right shuffle mask",
          "vpshufb $XTMP3,$XTMP1,$XTMP1",
      );

      # When decrypting, the input is the cipher text that has to be hashed,
      # so a copy is kept before it is XOR'ed with the key stream.
      $emit->(
          "# ;; keep copy of cipher text in $XTMP4",
          "vmovdqa64 $XTMP0,$XTMP4",
      ) if $decrypting;

      $emit->(
          "vpxorq $XTMP0,$XTMP1,$XTMP1 # ; Ciphertext XOR E(K, Yn)",
          "# ;; Set $IA1 to be the amount of data left in CIPH_PLAIN_IN after filling the block",
          "# ;; Determine if partial block is not being filled and shift mask accordingly",
      );

      # IA1 = input length + stored partial length; the win64 flavour uses a
      # separate mov/add pair instead of a single lea.
      $emit->(
          $win64
          ? ("mov $PLAIN_CIPH_LEN,$IA1", "add $LENGTH,$IA1")
          : ("lea ($PLAIN_CIPH_LEN, $LENGTH, 1),$IA1")
      );

      $emit->(
          "sub \$16,$IA1",
          "jge .L_no_extra_mask_${label_suffix}",
          "sub $IA1,$IA0",
          ".L_no_extra_mask_${label_suffix}:",
          "# ;; get the appropriate mask to mask out bottom $LENGTH bytes of $XTMP1",
          "# ;; - mask out bottom $LENGTH bytes of $XTMP1",
          "# ;; sizeof(SHIFT_MASK) == 16 bytes",
          "vmovdqu64 16($IA0),$XTMP0",
          "vpand $XTMP0,$XTMP1,$XTMP1",
      );

      # The register that is folded into the hash: the saved cipher text copy
      # (masked first) for decryption, the freshly produced block otherwise.
      my $hash_input = $XTMP1;
      if ($decrypting) {
          $hash_input = $XTMP4;
          $emit->("vpand $XTMP0,$XTMP4,$XTMP4");
      }
      $emit->(
          "vpshufb SHUF_MASK(%rip),$hash_input,$hash_input",
          "vpshufb $XTMP3,$hash_input,$hash_input",
          "vpxorq $hash_input,$AAD_HASH,$AAD_HASH",
          "cmp \$0,$IA1",
          "jl .L_partial_incomplete_${label_suffix}",
      );

      # ;; GHASH computation for the last <16 Byte block
      &GHASH_MUL($AAD_HASH, $XTMP2, $XTMP5, $XTMP6, $XTMP7);

      $emit->(
          "movq \$0, ($PBLOCK_LEN)",
          "# ;; Set $LENGTH to be the number of bytes to write out",
          "mov $LENGTH,$IA0",
          "mov \$16,$LENGTH",
          "sub $IA0,$LENGTH",
          "jmp .L_enc_dec_done_${label_suffix}",
          ".L_partial_incomplete_${label_suffix}:",
      );

      # Block still incomplete: grow the stored partial length by the input
      # length (again with a two-instruction form for win64).
      $emit->(
          $win64
          ? ("mov $PLAIN_CIPH_LEN,$IA0", "add $IA0,($PBLOCK_LEN)")
          : ("add $PLAIN_CIPH_LEN,($PBLOCK_LEN)")
      );

      $emit->(
          "mov $PLAIN_CIPH_LEN,$LENGTH",
          ".L_enc_dec_done_${label_suffix}:",
          "# ;; output encrypted Bytes",
          "lea byte_len_to_mask_table(%rip),$IA0",
          "kmovw ($IA0,$LENGTH,2),$MASKREG",
          "vmovdqu64 $AAD_HASH,$CTX_OFFSET_AadHash($GCM128_CTX)",
      );

      $emit->(
          "# ;; shuffle XTMP1 back to output as ciphertext",
          "vpshufb SHUF_MASK(%rip),$XTMP1,$XTMP1",
          "vpshufb $XTMP3,$XTMP1,$XTMP1",
      ) if $ENC_DEC eq "ENC";

      $emit->(
          "mov $CIPH_PLAIN_OUT,$IA0",
          "vmovdqu8 $XTMP1,($IA0){$MASKREG}",
          ".L_partial_block_done_${label_suffix}:",
      );
  }
  1609. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1610. # ;; Ciphers 1 to 16 blocks and prepares them for later GHASH compute operation
  sub INITIAL_BLOCKS_PARTIAL_CIPHER {
      # Emits the cipher half of the "initial blocks" sequence for 1 to 16
      # blocks, the last of which may be partial.
      #
      # The generated code prepares the counter blocks, fetches the byte mask
      # for the last ZMM register from byte64_len_to_mask_table, loads the
      # input with that mask, runs the AES rounds, stores the result with the
      # same mask and finally leaves the byte-reflected cipher text in
      # DAT0-DAT3 so that it can be hashed afterwards.
      my (
          $AES_KEYS,           # [in] key pointer
          $GCM128_CTX,         # [in] context pointer
          $CIPH_PLAIN_OUT,     # [in] text output pointer
          $PLAIN_CIPH_IN,      # [in] text input pointer
          $LENGTH,             # [in/clobbered] length in bytes
          $DATA_OFFSET,        # [in/out] current data offset (updated)
          $NUM_BLOCKS,         # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
          $CTR,                # [in/out] current counter value
          $ENC_DEC,            # [in] cipher direction (ENC/DEC)
          $DAT0,               # [out] ZMM with cipher text shuffled for GHASH
          $DAT1,               # [out] ZMM with cipher text shuffled for GHASH
          $DAT2,               # [out] ZMM with cipher text shuffled for GHASH
          $DAT3,               # [out] ZMM with cipher text shuffled for GHASH
          $LAST_CIPHER_BLK,    # [out] XMM to put ciphered counter block partially xor'ed with text
          $LAST_GHASH_BLK,     # [out] XMM to put last cipher text block shuffled for GHASH
          $CTR0,               # [clobbered] ZMM temporary
          $CTR1,               # [clobbered] ZMM temporary
          $CTR2,               # [clobbered] ZMM temporary
          $CTR3,               # [clobbered] ZMM temporary
          $ZT1,                # [clobbered] ZMM temporary
          $IA0,                # [clobbered] GP temporary
          $IA1,                # [clobbered] GP temporary
          $MASKREG,            # [clobbered] mask register
          $SHUFMASK,           # [out] ZMM loaded with BE/LE shuffle mask
      ) = @_;

      # Append every argument to the generated code as one line.
      my $emit = sub { $code .= "$_\n" for @_; };

      my @ctr_regs = ($CTR0, $CTR1, $CTR2, $CTR3);
      my @dat_regs = ($DAT0, $DAT1, $DAT2, $DAT3);

      # Index (0..3) of the ZMM register that holds the last block, and the
      # 1-based number of the first block stored in that register.
      my $quad =
          $NUM_BLOCKS <= 4  ? 0
        : $NUM_BLOCKS <= 8  ? 1
        : $NUM_BLOCKS <= 12 ? 2
        :                     3;
      my $quad_first = 4 * $quad + 1;
      my $last_ctr   = $ctr_regs[$quad];
      my $last_dat   = $dat_regs[$quad];

      # Load the byte-swap mask at the narrowest width that covers the blocks.
      my $shufmask_view =
          $NUM_BLOCKS == 1 ? &XWORD($SHUFMASK)
        : $NUM_BLOCKS == 2 ? &YWORD($SHUFMASK)
        :                    $SHUFMASK;
      $emit->("vmovdqa64 SHUF_MASK(%rip),$shufmask_view");

      # ;; prepare AES counter blocks
      if ($NUM_BLOCKS == 1) {
          my $ctr0_x = &XWORD($CTR0);
          $emit->("vpaddd ONE(%rip),$CTR,$ctr0_x");
      } elsif ($NUM_BLOCKS == 2) {
          my $ctr_y  = &YWORD($CTR);
          my $ctr0_y = &YWORD($CTR0);
          $emit->(
              "vshufi64x2 \$0,$ctr_y,$ctr_y,$ctr0_y",
              "vpaddd ddq_add_1234(%rip),$ctr0_y,$ctr0_y",
          );
      } else {
          my $ctr_z = &ZWORD($CTR);
          $emit->(
              "vshufi64x2 \$0,$ctr_z,$ctr_z,$ctr_z",
              "vpaddd ddq_add_1234(%rip),$ctr_z,$CTR0",
          );
          $emit->("vpaddd ddq_add_5678(%rip),$ctr_z,$CTR1") if $NUM_BLOCKS > 4;
          $emit->("vpaddd ddq_add_8888(%rip),$CTR0,$CTR2")  if $NUM_BLOCKS > 8;
          $emit->("vpaddd ddq_add_8888(%rip),$CTR1,$CTR3")  if $NUM_BLOCKS > 12;
      }

      # ;; get load/store mask
      # The mask table is indexed by the byte count that falls into the last
      # ZMM register, i.e. the length minus 64 bytes per preceding register.
      $emit->(
          "lea byte64_len_to_mask_table(%rip),$IA0",
          "mov $LENGTH,$IA1",
      );
      $emit->("sub \$`$quad*64`,$IA1") if $quad > 0;
      $emit->("kmovq ($IA0,$IA1,8),$MASKREG");

      # ;; extract new counter value
      $emit->("vextracti32x4 \$`($NUM_BLOCKS - $quad_first)`,$last_ctr,$CTR");

      # ;; shuffle the counters for AES rounds
      &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
          $NUM_BLOCKS, "vpshufb",
          @ctr_regs,
          @ctr_regs,
          ($SHUFMASK) x 4);

      # ;; load plain/cipher text
      &ZMM_LOAD_MASKED_BLOCKS_0_16($NUM_BLOCKS, $PLAIN_CIPH_IN, $DATA_OFFSET, @dat_regs, $MASKREG);

      # ;; AES rounds and XOR with plain/cipher text
      for my $round (0 .. $NROUNDS + 1) {
          $emit->("vbroadcastf64x2 `($round * 16)`($AES_KEYS),$ZT1");
          &ZMM_AESENC_ROUND_BLOCKS_0_16(@ctr_regs, $ZT1, $round, @dat_regs, $NUM_BLOCKS, $NROUNDS);
      }

      # ;; retrieve the last cipher counter block (partially XOR'ed with text)
      # ;; - this is needed for partial block cases
      $emit->("vextracti32x4 \$`($NUM_BLOCKS - $quad_first)`,$last_ctr,$LAST_CIPHER_BLK");

      # ;; write cipher/plain text back to output
      $emit->("mov $CIPH_PLAIN_OUT,$IA0");
      &ZMM_STORE_MASKED_BLOCKS_0_16($NUM_BLOCKS, $IA0, $DATA_OFFSET, @ctr_regs, $MASKREG);

      # ;; zero bytes outside the mask before hashing
      $emit->("vmovdqu8 $last_ctr,$last_ctr" . "{$MASKREG}{z}");

      # ;; Shuffle the cipher text blocks for hashing part
      # The cipher text is the loaded input (DAT0-DAT3) when decrypting and
      # the AES output (CTR0-CTR3) when encrypting; either way the shuffled
      # result is written to DAT0-DAT3.
      my @hash_src = $ENC_DEC eq "DEC" ? @dat_regs : @ctr_regs;
      &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
          $NUM_BLOCKS, "vpshufb",
          @dat_regs,
          @hash_src,
          ($SHUFMASK) x 4);

      # ;; Extract the last block for partials and multi_call cases
      $emit->("vextracti32x4 \$`($NUM_BLOCKS-$quad_first)`,$last_dat,$LAST_GHASH_BLK");
  }
  1751. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1752. # ;; Computes GHASH on 1 to 16 blocks
  sub INITIAL_BLOCKS_PARTIAL_GHASH {
      # Emits the GHASH half of the "initial blocks" sequence for 1 to 16
      # blocks whose byte-reflected cipher text is already in DAT0-DAT3.
      #
      # Two calling forms are supported and told apart by argument count:
      #   22 arguments - start a new GHASH computation;
      #   25 arguments - continue a GHASH computation, with the partial
      #                  products passed in GH/GM/GL.
      # Any other argument count is a caller error and aborts generation;
      # previously it silently produced code without the GHASH computation.
      #
      # The generated code hashes all blocks when the final block is a full
      # 16 bytes (and clears the stored partial length).  Otherwise it records
      # the remaining length and the last cipher counter block in the context,
      # hashes all but the final block, and XORs the final partial block into
      # the hash so that it gets multiplied later.
      my $argc = scalar(@_);
      die "INITIAL_BLOCKS_PARTIAL_GHASH: unexpected number of arguments = $argc\n"
        if ($argc != 22 && $argc != 25);

      my (
          $AES_KEYS,           # [in] key pointer
          $GCM128_CTX,         # [in] context pointer
          $LENGTH,             # [in/clobbered] length in bytes
          $NUM_BLOCKS,         # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
          $HASH_IN_OUT,        # [in/out] XMM ghash in/out value
          $ENC_DEC,            # [in] cipher direction (ENC/DEC)
          $DAT0,               # [in] ZMM with cipher text shuffled for GHASH
          $DAT1,               # [in] ZMM with cipher text shuffled for GHASH
          $DAT2,               # [in] ZMM with cipher text shuffled for GHASH
          $DAT3,               # [in] ZMM with cipher text shuffled for GHASH
          $LAST_CIPHER_BLK,    # [in] XMM with ciphered counter block partially xor'ed with text
          $LAST_GHASH_BLK,     # [in] XMM with last cipher text block shuffled for GHASH
          $ZT0,                # [clobbered] ZMM temporary
          $ZT1,                # [clobbered] ZMM temporary
          $ZT2,                # [clobbered] ZMM temporary
          $ZT3,                # [clobbered] ZMM temporary
          $ZT4,                # [clobbered] ZMM temporary
          $ZT5,                # [clobbered] ZMM temporary
          $ZT6,                # [clobbered] ZMM temporary
          $ZT7,                # [clobbered] ZMM temporary
          $ZT8,                # [clobbered] ZMM temporary
          $PBLOCK_LEN,         # [in] partial block length
          $GH,                 # [in] ZMM with hi product part (25-argument form only)
          $GM,                 # [in] ZMM with mid product part (25-argument form only)
          $GL,                 # [in] ZMM with lo product part (25-argument form only)
      ) = @_;

      my $continue_ghash = ($argc == 25);
      my $label_suffix   = $label_count++;

      # Append every argument to the generated code as one line.
      my $emit = sub { $code .= "$_\n" for @_; };

      # Arguments shared by every GHASH_1_TO_16 call below; the block count
      # and, when continuing, the partial products are appended per call.
      my @ghash_head = (
          $GCM128_CTX, $HASH_IN_OUT,
          $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, $ZT5, $ZT6, $ZT7, $ZT8,
          &ZWORD($HASH_IN_OUT),
          $DAT0, $DAT1, $DAT2, $DAT3,
      );
      my @products = $continue_ghash ? ($GH, $GM, $GL) : ();

      # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
      # ;;; - Hash all but the last partial block of data
      # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
      # Reduce the length to the size of the final block, which may be <16B.
      # The immediate is computed here so that the emitted instruction carries
      # a plain number instead of an arithmetic expression.
      if ($NUM_BLOCKS > 1) {
          my $leading_bytes = 16 * ($NUM_BLOCKS - 1);
          $emit->("sub \$$leading_bytes,$LENGTH");
      }

      if ($NUM_BLOCKS < 16) {
          $emit->(
              "# ;; NOTE: the 'jl' is always taken for num_initial_blocks = 16.",
              "# ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 256.",
              "cmp \$16,$LENGTH",
              "jl .L_small_initial_partial_block_${label_suffix}",
              "# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;",
              "# ;;; Handle a full length final block - encrypt and hash all blocks",
              "# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;",
              "sub \$16,$LENGTH",
              "movq \$0,($PBLOCK_LEN)",
          );

          # ;; Hash all of the data
          &GHASH_1_TO_16(@ghash_head, $NUM_BLOCKS, @products);
          $emit->("jmp .L_small_initial_compute_done_${label_suffix}");
      }

      $emit->(
          ".L_small_initial_partial_block_${label_suffix}:",
          "# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;",
          "# ;;; Handle ghash for a <16B final block",
          "# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;",
          "# ;; As it's an init / update / finalize series we need to leave the",
          "# ;; last block if it's less than a full block of data.",
          "mov $LENGTH,($PBLOCK_LEN)",
          "vmovdqu64 $LAST_CIPHER_BLK,$CTX_OFFSET_PEncBlock($GCM128_CTX)",
      );

      if ($NUM_BLOCKS > 1) {
          # Hash every block except the final partial one, then fall through
          # to the common tail (no jmp needed).
          &GHASH_1_TO_16(@ghash_head, $NUM_BLOCKS - 1, @products);
      } else {
          # The only block is partial, so there is nothing new to multiply.
          if ($continue_ghash) {
              # The partial products handed in still have to be reduced.
              $emit->(
                  "# ;; Reduction is required in this case.",
                  "# ;; Integrate GM into GH and GL.",
                  "vpsrldq \$8,$GM,$ZT0",
                  "vpslldq \$8,$GM,$ZT1",
                  "vpxorq $ZT0,$GH,$GH",
                  "vpxorq $ZT1,$GL,$GL",
              );

              # ;; Add GH and GL 128-bit words horizontally
              &VHPXORI4x128($GH, $ZT0);
              &VHPXORI4x128($GL, $ZT1);

              # ;; 256-bit to 128-bit reduction
              $emit->("vmovdqa64 POLY2(%rip),@{[XWORD($ZT0)]}");
              &VCLMUL_REDUCE(&XWORD($HASH_IN_OUT), &XWORD($ZT0), &XWORD($GH), &XWORD($GL), &XWORD($ZT1), &XWORD($ZT2));
          }
          $emit->(
              "# ;; Record that a reduction is not needed -",
              "# ;; In this case no hashes are computed because there",
              "# ;; is only one initial block and it is < 16B in length.",
              "# ;; We only need to check if a reduction is needed if",
              "# ;; initial_blocks == 1 and init/update/final is being used.",
              "# ;; In this case we may just have a partial block, and that",
              "# ;; gets hashed in finalize.",
              "# ;; The hash should end up in HASH_IN_OUT.",
              "# ;; The only way we should get here is if there is",
              "# ;; a partial block of data, so xor that into the hash.",
              "vpxorq $LAST_GHASH_BLK,$HASH_IN_OUT,$HASH_IN_OUT",
              "# ;; The result is in $HASH_IN_OUT",
              "jmp .L_after_reduction_${label_suffix}",
          );
      }

      # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
      # ;;; After GHASH reduction
      # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
      $emit->(".L_small_initial_compute_done_${label_suffix}:");

      # ;; If using init/update/finalize, we need to xor any partial block data
      # ;; into the hash.
      # With a single block the partial case has already jumped past this
      # point, so the XOR is only generated for two or more blocks.
      if ($NUM_BLOCKS > 1) {
          # With 16 blocks the full-block path above is not generated, so the
          # zero-length check is omitted and the XOR is unconditional.
          if ($NUM_BLOCKS != 16) {
              $emit->(
                  "# ;; NOTE: for $NUM_BLOCKS = 16, $LENGTH, stored in [PBlockLen] is never zero",
                  "or $LENGTH,$LENGTH",
                  "je .L_after_reduction_${label_suffix}",
              );
          }
          $emit->("vpxorq $LAST_GHASH_BLK,$HASH_IN_OUT,$HASH_IN_OUT");
      }

      $emit->(".L_after_reduction_${label_suffix}:");

      # ;; Final hash is now in HASH_IN_OUT
  }
  1889. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1890. # ;; INITIAL_BLOCKS_PARTIAL macro with support for a partial final block.
  1891. # ;; It may look similar to INITIAL_BLOCKS but its usage is different:
  1892. # ;; - first encrypts/decrypts required number of blocks and then
  1893. # ;; ghashes these blocks
  1894. # ;; - Small packets or left over data chunks (<256 bytes)
  1895. # ;; - Remaining data chunks below 256 bytes (multi buffer code)
  1896. # ;;
  1897. # ;; num_initial_blocks is expected to include the partial final block
  1898. # ;; in the count.
  sub INITIAL_BLOCKS_PARTIAL {
      # Emits the complete "initial blocks" sequence for 1 to 16 blocks (the
      # last one possibly partial) by generating the cipher part followed by
      # the GHASH part.  The GHASH part is invoked in its 22-argument form.
      my (
          $AES_KEYS,           # [in] key pointer
          $GCM128_CTX,         # [in] context pointer
          $CIPH_PLAIN_OUT,     # [in] text output pointer
          $PLAIN_CIPH_IN,      # [in] text input pointer
          $LENGTH,             # [in/clobbered] length in bytes
          $DATA_OFFSET,        # [in/out] current data offset (updated)
          $NUM_BLOCKS,         # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
          $CTR,                # [in/out] current counter value
          $HASH_IN_OUT,        # [in/out] XMM ghash in/out value
          $ENC_DEC,            # [in] cipher direction (ENC/DEC)
          $CTR0,               # [clobbered] ZMM temporary
          $CTR1,               # [clobbered] ZMM temporary
          $CTR2,               # [clobbered] ZMM temporary
          $CTR3,               # [clobbered] ZMM temporary
          $DAT0,               # [clobbered] ZMM temporary
          $DAT1,               # [clobbered] ZMM temporary
          $DAT2,               # [clobbered] ZMM temporary
          $DAT3,               # [clobbered] ZMM temporary
          $LAST_CIPHER_BLK,    # [clobbered] ZMM temporary
          $LAST_GHASH_BLK,     # [clobbered] ZMM temporary
          $ZT0,                # [clobbered] ZMM temporary
          $ZT1,                # [clobbered] ZMM temporary
          $ZT2,                # [clobbered] ZMM temporary
          $ZT3,                # [clobbered] ZMM temporary
          $ZT4,                # [clobbered] ZMM temporary
          $IA0,                # [clobbered] GP temporary
          $IA1,                # [clobbered] GP temporary
          $MASKREG,            # [clobbered] mask register
          $SHUFMASK,           # [clobbered] ZMM for BE/LE shuffle mask
          $PBLOCK_LEN,         # [in] partial block length
      ) = @_;

      # Register groups that are handed to both stages in the same order.
      my @counter_regs = ($CTR0, $CTR1, $CTR2, $CTR3);
      my @data_regs    = ($DAT0, $DAT1, $DAT2, $DAT3);

      # Stage 1: cipher the blocks and leave them shuffled in the data
      # registers; ZT0 serves as the single ZMM temporary of this stage.
      &INITIAL_BLOCKS_PARTIAL_CIPHER(
          $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,
          $LENGTH, $DATA_OFFSET, $NUM_BLOCKS, $CTR, $ENC_DEC,
          @data_regs,
          &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK),
          @counter_regs,
          $ZT0, $IA0, $IA1, $MASKREG, $SHUFMASK);

      # Stage 2: hash the blocks; the counter registers are passed on as
      # additional ZMM temporaries.
      &INITIAL_BLOCKS_PARTIAL_GHASH(
          $AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS, $HASH_IN_OUT, $ENC_DEC,
          @data_regs,
          &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK),
          @counter_regs,
          $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
          $PBLOCK_LEN);
  }
  1941. # ;; ===========================================================================
  1942. # ;; Stitched GHASH of 16 blocks (with reduction) with encryption of N blocks
  1943. # ;; followed with GHASH of the N blocks.
  1944. sub GHASH_16_ENCRYPT_N_GHASH_N {
  1945. my $AES_KEYS = $_[0]; # [in] key pointer
  1946. my $GCM128_CTX = $_[1]; # [in] context pointer
  1947. my $CIPH_PLAIN_OUT = $_[2]; # [in] pointer to output buffer
  1948. my $PLAIN_CIPH_IN = $_[3]; # [in] pointer to input buffer
  1949. my $DATA_OFFSET = $_[4]; # [in] data offset
  1950. my $LENGTH = $_[5]; # [in] data length
  1951. my $CTR_BE = $_[6]; # [in/out] ZMM counter blocks (last 4) in big-endian
  1952. my $CTR_CHECK = $_[7]; # [in/out] GP with 8-bit counter for overflow check
  1953. my $HASHKEY_OFFSET = $_[8]; # [in] numerical offset for the highest hash key
  1954. # (can be in form of register or numerical value)
  1955. my $GHASHIN_BLK_OFFSET = $_[9]; # [in] numerical offset for GHASH blocks in
  1956. my $SHFMSK = $_[10]; # [in] ZMM with byte swap mask for pshufb
  1957. my $B00_03 = $_[11]; # [clobbered] temporary ZMM
  1958. my $B04_07 = $_[12]; # [clobbered] temporary ZMM
  1959. my $B08_11 = $_[13]; # [clobbered] temporary ZMM
  1960. my $B12_15 = $_[14]; # [clobbered] temporary ZMM
  1961. my $GH1H_UNUSED = $_[15]; # [clobbered] temporary ZMM
  1962. my $GH1L = $_[16]; # [clobbered] temporary ZMM
  1963. my $GH1M = $_[17]; # [clobbered] temporary ZMM
  1964. my $GH1T = $_[18]; # [clobbered] temporary ZMM
  1965. my $GH2H = $_[19]; # [clobbered] temporary ZMM
  1966. my $GH2L = $_[20]; # [clobbered] temporary ZMM
  1967. my $GH2M = $_[21]; # [clobbered] temporary ZMM
  1968. my $GH2T = $_[22]; # [clobbered] temporary ZMM
  1969. my $GH3H = $_[23]; # [clobbered] temporary ZMM
  1970. my $GH3L = $_[24]; # [clobbered] temporary ZMM
  1971. my $GH3M = $_[25]; # [clobbered] temporary ZMM
  1972. my $GH3T = $_[26]; # [clobbered] temporary ZMM
  1973. my $AESKEY1 = $_[27]; # [clobbered] temporary ZMM
  1974. my $AESKEY2 = $_[28]; # [clobbered] temporary ZMM
  1975. my $GHKEY1 = $_[29]; # [clobbered] temporary ZMM
  1976. my $GHKEY2 = $_[30]; # [clobbered] temporary ZMM
  1977. my $GHDAT1 = $_[31]; # [clobbered] temporary ZMM
  1978. my $GHDAT2 = $_[32]; # [clobbered] temporary ZMM
  1979. my $ZT01 = $_[33]; # [clobbered] temporary ZMM
  1980. my $ADDBE_4x4 = $_[34]; # [in] ZMM with 4x128bits 4 in big-endian
  1981. my $ADDBE_1234 = $_[35]; # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian
  1982. my $GHASH_TYPE = $_[36]; # [in] "start", "start_reduce", "mid", "end_reduce"
  1983. my $TO_REDUCE_L = $_[37]; # [in] ZMM for low 4x128-bit GHASH sum
  1984. my $TO_REDUCE_H = $_[38]; # [in] ZMM for hi 4x128-bit GHASH sum
  1985. my $TO_REDUCE_M = $_[39]; # [in] ZMM for medium 4x128-bit GHASH sum
  1986. my $ENC_DEC = $_[40]; # [in] cipher direction
  1987. my $HASH_IN_OUT = $_[41]; # [in/out] XMM ghash in/out value
  1988. my $IA0 = $_[42]; # [clobbered] GP temporary
  1989. my $IA1 = $_[43]; # [clobbered] GP temporary
  1990. my $MASKREG = $_[44]; # [clobbered] mask register
  1991. my $NUM_BLOCKS = $_[45]; # [in] numerical value with number of blocks to be encrypted/ghashed (1 to 16)
  1992. my $PBLOCK_LEN = $_[46]; # [in] partial block length
  1993. die "GHASH_16_ENCRYPT_N_GHASH_N: num_blocks is out of bounds = $NUM_BLOCKS\n"
  1994. if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
  1995. my $label_suffix = $label_count++;
  1996. my $GH1H = $HASH_IN_OUT;
  1997. # ; this is to avoid additional move in do_reduction case
  1998. my $LAST_GHASH_BLK = $GH1L;
  1999. my $LAST_CIPHER_BLK = $GH1T;
  2000. my $RED_POLY = $GH2T;
  2001. my $RED_P1 = $GH2L;
  2002. my $RED_T1 = $GH2H;
  2003. my $RED_T2 = $GH2M;
  2004. my $DATA1 = $GH3H;
  2005. my $DATA2 = $GH3L;
  2006. my $DATA3 = $GH3M;
  2007. my $DATA4 = $GH3T;
  2008. # ;; do reduction after the 16 blocks ?
  2009. my $do_reduction = 0;
  2010. # ;; is 16 block chunk a start?
  2011. my $is_start = 0;
  2012. if ($GHASH_TYPE eq "start_reduce") {
  2013. $is_start = 1;
  2014. $do_reduction = 1;
  2015. }
  2016. if ($GHASH_TYPE eq "start") {
  2017. $is_start = 1;
  2018. }
  2019. if ($GHASH_TYPE eq "end_reduce") {
  2020. $do_reduction = 1;
  2021. }
  2022. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2023. # ;; - get load/store mask
  2024. # ;; - load plain/cipher text
  2025. # ;; get load/store mask
  2026. $code .= <<___;
  2027. lea byte64_len_to_mask_table(%rip),$IA0
  2028. mov $LENGTH,$IA1
  2029. ___
  2030. if ($NUM_BLOCKS > 12) {
  2031. $code .= "sub \$`3*64`,$IA1\n";
  2032. } elsif ($NUM_BLOCKS > 8) {
  2033. $code .= "sub \$`2*64`,$IA1\n";
  2034. } elsif ($NUM_BLOCKS > 4) {
  2035. $code .= "sub \$`1*64`,$IA1\n";
  2036. }
  2037. $code .= "kmovq ($IA0,$IA1,8),$MASKREG\n";
  2038. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2039. # ;; prepare counter blocks
  2040. $code .= <<___;
  2041. cmp \$`(256 - $NUM_BLOCKS)`,@{[DWORD($CTR_CHECK)]}
  2042. jae .L_16_blocks_overflow_${label_suffix}
  2043. ___
  2044. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2045. $NUM_BLOCKS, "vpaddd", $B00_03, $B04_07, $B08_11, $B12_15, $CTR_BE,
  2046. $B00_03, $B04_07, $B08_11, $ADDBE_1234, $ADDBE_4x4, $ADDBE_4x4, $ADDBE_4x4);
  2047. $code .= <<___;
  2048. jmp .L_16_blocks_ok_${label_suffix}
  2049. .L_16_blocks_overflow_${label_suffix}:
  2050. vpshufb $SHFMSK,$CTR_BE,$CTR_BE
  2051. vpaddd ddq_add_1234(%rip),$CTR_BE,$B00_03
  2052. ___
  2053. if ($NUM_BLOCKS > 4) {
  2054. $code .= <<___;
  2055. vmovdqa64 ddq_add_4444(%rip),$B12_15
  2056. vpaddd $B12_15,$B00_03,$B04_07
  2057. ___
  2058. }
  2059. if ($NUM_BLOCKS > 8) {
  2060. $code .= "vpaddd $B12_15,$B04_07,$B08_11\n";
  2061. }
  2062. if ($NUM_BLOCKS > 12) {
  2063. $code .= "vpaddd $B12_15,$B08_11,$B12_15\n";
  2064. }
  2065. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2066. $NUM_BLOCKS, "vpshufb", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2067. $B04_07, $B08_11, $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
  2068. $code .= <<___;
  2069. .L_16_blocks_ok_${label_suffix}:
  2070. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2071. # ;; - pre-load constants
  2072. # ;; - add current hash into the 1st block
  2073. vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1
  2074. ___
  2075. if ($is_start != 0) {
  2076. $code .= "vpxorq `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$HASH_IN_OUT,$GHDAT1\n";
  2077. } else {
  2078. $code .= "vmovdqa64 `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n";
  2079. }
  2080. $code .= "vmovdqu64 @{[EffectiveAddress(\"%rsp\",$HASHKEY_OFFSET,0*64)]},$GHKEY1\n";
  2081. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2082. # ;; save counter for the next round
  2083. # ;; increment counter overflow check register
  2084. if ($NUM_BLOCKS <= 4) {
  2085. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($CTR_BE)]}\n";
  2086. } elsif ($NUM_BLOCKS <= 8) {
  2087. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($CTR_BE)]}\n";
  2088. } elsif ($NUM_BLOCKS <= 12) {
  2089. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($CTR_BE)]}\n";
  2090. } else {
  2091. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($CTR_BE)]}\n";
  2092. }
  2093. $code .= "vshufi64x2 \$0b00000000,$CTR_BE,$CTR_BE,$CTR_BE\n";
  2094. $code .= <<___;
  2095. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2096. # ;; pre-load constants
  2097. vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2
  2098. vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,1*64)]},$GHKEY2
  2099. vmovdqa64 `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2
  2100. ___
  2101. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2102. # ;; stitch AES rounds with GHASH
  2103. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2104. # ;; AES round 0 - ARK
  2105. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2106. $NUM_BLOCKS, "vpxorq", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2107. $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  2108. $code .= "vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1\n";
  2109. $code .= <<___;
  2110. # ;;==================================================
  2111. # ;; GHASH 4 blocks (15 to 12)
  2112. vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH1H # ; a1*b1
  2113. vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH1L # ; a0*b0
  2114. vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH1M # ; a1*b0
  2115. vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH1T # ; a0*b1
  2116. vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,2*64)]},$GHKEY1
  2117. vmovdqa64 `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1
  2118. ___
  2119. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2120. # ;; AES round 1
  2121. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2122. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2123. $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  2124. $code .= "vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2\n";
  2125. $code .= <<___;
  2126. # ;; =================================================
  2127. # ;; GHASH 4 blocks (11 to 8)
  2128. vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
  2129. vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
  2130. vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
  2131. vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
  2132. vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,3*64)]},$GHKEY2
  2133. vmovdqa64 `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2
  2134. ___
  2135. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2136. # ;; AES round 2
  2137. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2138. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2139. $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  2140. $code .= "vbroadcastf64x2 `(16 * 4)`($AES_KEYS),$AESKEY1\n";
  2141. $code .= <<___;
  2142. # ;; =================================================
  2143. # ;; GHASH 4 blocks (7 to 4)
  2144. vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH3M # ; a0*b1
  2145. vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH3T # ; a1*b0
  2146. vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH3H # ; a1*b1
  2147. vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH3L # ; a0*b0
  2148. ___
  2149. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2150. # ;; AES rounds 3
  2151. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2152. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2153. $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  2154. $code .= "vbroadcastf64x2 `(16 * 5)`($AES_KEYS),$AESKEY2\n";
  2155. $code .= <<___;
  2156. # ;; =================================================
  2157. # ;; Gather (XOR) GHASH for 12 blocks
  2158. vpternlogq \$0x96,$GH3H,$GH2H,$GH1H
  2159. vpternlogq \$0x96,$GH3L,$GH2L,$GH1L
  2160. vpternlogq \$0x96,$GH3T,$GH2T,$GH1T
  2161. vpternlogq \$0x96,$GH3M,$GH2M,$GH1M
  2162. ___
  2163. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2164. # ;; AES rounds 4
  2165. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2166. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2167. $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  2168. $code .= "vbroadcastf64x2 `(16 * 6)`($AES_KEYS),$AESKEY1\n";
  2169. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2170. # ;; load plain/cipher text
  2171. &ZMM_LOAD_MASKED_BLOCKS_0_16($NUM_BLOCKS, $PLAIN_CIPH_IN, $DATA_OFFSET, $DATA1, $DATA2, $DATA3, $DATA4, $MASKREG);
  2172. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2173. # ;; AES rounds 5
  2174. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2175. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2176. $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  2177. $code .= "vbroadcastf64x2 `(16 * 7)`($AES_KEYS),$AESKEY2\n";
  2178. $code .= <<___;
  2179. # ;; =================================================
  2180. # ;; GHASH 4 blocks (3 to 0)
  2181. vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
  2182. vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
  2183. vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
  2184. vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
  2185. ___
  2186. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2187. # ;; AES round 6
  2188. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2189. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2190. $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  2191. $code .= "vbroadcastf64x2 `(16 * 8)`($AES_KEYS),$AESKEY1\n";
  2192. # ;; =================================================
  2193. # ;; gather GHASH in GH1L (low), GH1H (high), GH1M (mid)
  2194. # ;; - add GH2[MTLH] to GH1[MTLH]
  2195. $code .= "vpternlogq \$0x96,$GH2T,$GH1T,$GH1M\n";
  2196. if ($do_reduction != 0) {
  2197. if ($is_start != 0) {
  2198. $code .= "vpxorq $GH2M,$GH1M,$GH1M\n";
  2199. } else {
  2200. $code .= <<___;
  2201. vpternlogq \$0x96,$GH2H,$TO_REDUCE_H,$GH1H
  2202. vpternlogq \$0x96,$GH2L,$TO_REDUCE_L,$GH1L
  2203. vpternlogq \$0x96,$GH2M,$TO_REDUCE_M,$GH1M
  2204. ___
  2205. }
  2206. } else {
  2207. # ;; Update H/M/L hash sums if not carrying reduction
  2208. if ($is_start != 0) {
  2209. $code .= <<___;
  2210. vpxorq $GH2H,$GH1H,$TO_REDUCE_H
  2211. vpxorq $GH2L,$GH1L,$TO_REDUCE_L
  2212. vpxorq $GH2M,$GH1M,$TO_REDUCE_M
  2213. ___
  2214. } else {
  2215. $code .= <<___;
  2216. vpternlogq \$0x96,$GH2H,$GH1H,$TO_REDUCE_H
  2217. vpternlogq \$0x96,$GH2L,$GH1L,$TO_REDUCE_L
  2218. vpternlogq \$0x96,$GH2M,$GH1M,$TO_REDUCE_M
  2219. ___
  2220. }
  2221. }
  2222. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2223. # ;; AES round 7
  2224. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2225. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2226. $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  2227. $code .= "vbroadcastf64x2 `(16 * 9)`($AES_KEYS),$AESKEY2\n";
  2228. # ;; =================================================
  2229. # ;; prepare mid sum for adding to high & low
  2230. # ;; load polynomial constant for reduction
  2231. if ($do_reduction != 0) {
  2232. $code .= <<___;
  2233. vpsrldq \$8,$GH1M,$GH2M
  2234. vpslldq \$8,$GH1M,$GH1M
  2235. vmovdqa64 POLY2(%rip),@{[XWORD($RED_POLY)]}
  2236. ___
  2237. }
  2238. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2239. # ;; AES round 8
  2240. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2241. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2242. $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  2243. $code .= "vbroadcastf64x2 `(16 * 10)`($AES_KEYS),$AESKEY1\n";
  2244. # ;; =================================================
  2245. # ;; Add mid product to high and low
  2246. if ($do_reduction != 0) {
  2247. if ($is_start != 0) {
  2248. $code .= <<___;
  2249. vpternlogq \$0x96,$GH2M,$GH2H,$GH1H # ; TH = TH1 + TH2 + TM>>64
  2250. vpternlogq \$0x96,$GH1M,$GH2L,$GH1L # ; TL = TL1 + TL2 + TM<<64
  2251. ___
  2252. } else {
  2253. $code .= <<___;
  2254. vpxorq $GH2M,$GH1H,$GH1H # ; TH = TH1 + TM>>64
  2255. vpxorq $GH1M,$GH1L,$GH1L # ; TL = TL1 + TM<<64
  2256. ___
  2257. }
  2258. }
  2259. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2260. # ;; AES round 9
  2261. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2262. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2263. $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  2264. # ;; =================================================
  2265. # ;; horizontal xor of low and high 4x128
  2266. if ($do_reduction != 0) {
  2267. &VHPXORI4x128($GH1H, $GH2H);
  2268. &VHPXORI4x128($GH1L, $GH2L);
  2269. }
  2270. if (($NROUNDS >= 11)) {
  2271. $code .= "vbroadcastf64x2 `(16 * 11)`($AES_KEYS),$AESKEY2\n";
  2272. }
  2273. # ;; =================================================
  2274. # ;; first phase of reduction
  2275. if ($do_reduction != 0) {
  2276. $code .= <<___;
  2277. vpclmulqdq \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]}
  2278. vpslldq \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]} # ; shift-L 2 DWs
  2279. vpxorq @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]} # ; first phase of the reduct
  2280. ___
  2281. }
  2282. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2283. # ;; AES rounds up to 11 (AES192) or 13 (AES256)
  2284. # ;; AES128 is done
  2285. if (($NROUNDS >= 11)) {
  2286. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2287. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2288. $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  2289. $code .= "vbroadcastf64x2 `(16 * 12)`($AES_KEYS),$AESKEY1\n";
  2290. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2291. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2292. $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  2293. if (($NROUNDS == 13)) {
  2294. $code .= "vbroadcastf64x2 `(16 * 13)`($AES_KEYS),$AESKEY2\n";
  2295. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2296. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2297. $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  2298. $code .= "vbroadcastf64x2 `(16 * 14)`($AES_KEYS),$AESKEY1\n";
  2299. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2300. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2301. $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  2302. }
  2303. }
  2304. # ;; =================================================
  2305. # ;; second phase of the reduction
  2306. if ($do_reduction != 0) {
  2307. $code .= <<___;
  2308. vpclmulqdq \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]}
  2309. vpsrldq \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]} # ; shift-R 1-DW to obtain 2-DWs shift-R
  2310. vpclmulqdq \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]}
  2311. vpslldq \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]} # ; shift-L 1-DW for result without shifts
  2312. # ;; GH1H = GH1H + RED_T1 + RED_T2
  2313. vpternlogq \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]}
  2314. ___
  2315. }
  2316. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2317. # ;; the last AES round
  2318. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2319. $NUM_BLOCKS, "vaesenclast", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2320. $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  2321. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2322. # ;; XOR against plain/cipher text
  2323. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2324. $NUM_BLOCKS, "vpxorq", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2325. $B04_07, $B08_11, $B12_15, $DATA1, $DATA2, $DATA3, $DATA4);
  2326. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2327. # ;; retrieve the last cipher counter block (partially XOR'ed with text)
  2328. # ;; - this is needed for partial block cases
  2329. if ($NUM_BLOCKS <= 4) {
  2330. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($LAST_CIPHER_BLK)]}\n";
  2331. } elsif ($NUM_BLOCKS <= 8) {
  2332. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($LAST_CIPHER_BLK)]}\n";
  2333. } elsif ($NUM_BLOCKS <= 12) {
  2334. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($LAST_CIPHER_BLK)]}\n";
  2335. } else {
  2336. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($LAST_CIPHER_BLK)]}\n";
  2337. }
  2338. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2339. # ;; store cipher/plain text
  2340. $code .= "mov $CIPH_PLAIN_OUT,$IA0\n";
  2341. &ZMM_STORE_MASKED_BLOCKS_0_16($NUM_BLOCKS, $IA0, $DATA_OFFSET, $B00_03, $B04_07, $B08_11, $B12_15, $MASKREG);
  2342. # ;; =================================================
  2343. # ;; shuffle cipher text blocks for GHASH computation
  2344. if ($ENC_DEC eq "ENC") {
  2345. # ;; zero bytes outside the mask before hashing
  2346. if ($NUM_BLOCKS <= 4) {
  2347. $code .= "vmovdqu8 $B00_03,${B00_03}{$MASKREG}{z}\n";
  2348. } elsif ($NUM_BLOCKS <= 8) {
  2349. $code .= "vmovdqu8 $B04_07,${B04_07}{$MASKREG}{z}\n";
  2350. } elsif ($NUM_BLOCKS <= 12) {
  2351. $code .= "vmovdqu8 $B08_11,${B08_11}{$MASKREG}{z}\n";
  2352. } else {
  2353. $code .= "vmovdqu8 $B12_15,${B12_15}{$MASKREG}{z}\n";
  2354. }
  2355. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2356. $NUM_BLOCKS, "vpshufb", $DATA1, $DATA2, $DATA3, $DATA4, $B00_03,
  2357. $B04_07, $B08_11, $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
  2358. } else {
  2359. # ;; zero bytes outside the mask before hashing
  2360. if ($NUM_BLOCKS <= 4) {
  2361. $code .= "vmovdqu8 $DATA1,${DATA1}{$MASKREG}{z}\n";
  2362. } elsif ($NUM_BLOCKS <= 8) {
  2363. $code .= "vmovdqu8 $DATA2,${DATA2}{$MASKREG}{z}\n";
  2364. } elsif ($NUM_BLOCKS <= 12) {
  2365. $code .= "vmovdqu8 $DATA3,${DATA3}{$MASKREG}{z}\n";
  2366. } else {
  2367. $code .= "vmovdqu8 $DATA4,${DATA4}{$MASKREG}{z}\n";
  2368. }
  2369. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2370. $NUM_BLOCKS, "vpshufb", $DATA1, $DATA2, $DATA3, $DATA4, $DATA1,
  2371. $DATA2, $DATA3, $DATA4, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
  2372. }
  2373. # ;; =================================================
  2374. # ;; Extract the last block for partial / multi_call cases
  2375. if ($NUM_BLOCKS <= 4) {
  2376. $code .= "vextracti32x4 \$`($NUM_BLOCKS-1)`,$DATA1,@{[XWORD($LAST_GHASH_BLK)]}\n";
  2377. } elsif ($NUM_BLOCKS <= 8) {
  2378. $code .= "vextracti32x4 \$`($NUM_BLOCKS-5)`,$DATA2,@{[XWORD($LAST_GHASH_BLK)]}\n";
  2379. } elsif ($NUM_BLOCKS <= 12) {
  2380. $code .= "vextracti32x4 \$`($NUM_BLOCKS-9)`,$DATA3,@{[XWORD($LAST_GHASH_BLK)]}\n";
  2381. } else {
  2382. $code .= "vextracti32x4 \$`($NUM_BLOCKS-13)`,$DATA4,@{[XWORD($LAST_GHASH_BLK)]}\n";
  2383. }
  2384. if ($do_reduction != 0) {
  2385. # ;; GH1H holds reduced hash value
  2386. # ;; - normally do "vmovdqa64 &XWORD($GH1H), &XWORD($HASH_IN_OUT)"
  2387. # ;; - register rename trick obsoletes the above move
  2388. }
  2389. # ;; =================================================
  2390. # ;; GHASH last N blocks
  2391. # ;; - current hash value in HASH_IN_OUT or
  2392. # ;; product parts in TO_REDUCE_H/M/L
  2393. # ;; - DATA1-DATA4 include blocks for GHASH
  2394. if ($do_reduction == 0) {
  2395. &INITIAL_BLOCKS_PARTIAL_GHASH(
  2396. $AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS,
  2397. &XWORD($HASH_IN_OUT), $ENC_DEC, $DATA1, $DATA2,
  2398. $DATA3, $DATA4, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK),
  2399. $B00_03, $B04_07, $B08_11, $B12_15,
  2400. $GHDAT1, $GHDAT2, $AESKEY1, $AESKEY2,
  2401. $GHKEY1, $PBLOCK_LEN, $TO_REDUCE_H, $TO_REDUCE_M,
  2402. $TO_REDUCE_L);
  2403. } else {
  2404. &INITIAL_BLOCKS_PARTIAL_GHASH(
  2405. $AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS,
  2406. &XWORD($HASH_IN_OUT), $ENC_DEC, $DATA1, $DATA2,
  2407. $DATA3, $DATA4, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK),
  2408. $B00_03, $B04_07, $B08_11, $B12_15,
  2409. $GHDAT1, $GHDAT2, $AESKEY1, $AESKEY2,
  2410. $GHKEY1, $PBLOCK_LEN);
  2411. }
  2412. }
  2413. # ;; ===========================================================================
  2414. # ;; ===========================================================================
  2415. # ;; Stitched GHASH of 16 blocks (with reduction) with encryption of N blocks,
  2416. # ;; followed by GHASH of the N blocks.
  sub GCM_ENC_DEC_LAST {
    # ;; Emits the code handling the trailing blocks of a message.
    # ;;  - IA0 = (LENGTH + 15) >> 4 is computed as the count of blocks left
    # ;;  - a compare/branch tree on IA0 picks one of 16 code variants; each
    # ;;    variant is produced by GHASH_16_ENCRYPT_N_GHASH_N for a fixed count
    # ;;  - a zero count branches to a GHASH_16-only tail
    # ;;  - every variant ends with a jump to a common "done" label
    # ;; Any IA0 value above 15 is routed to the 16-block variant.
    # ;; NOTE(review): presumably callers never leave more than 16 blocks
    # ;; for this macro - confirm against the call sites.
    my (
      $AES_KEYS,              # [in] key pointer
      $GCM128_CTX,            # [in] context pointer
      $CIPH_PLAIN_OUT,        # [in] pointer to output buffer
      $PLAIN_CIPH_IN,         # [in] pointer to input buffer
      $DATA_OFFSET,           # [in] data offset
      $LENGTH,                # [in/clobbered] data length
      $CTR_BE,                # [in/out] ZMM counter blocks (last 4) in big-endian
      $CTR_CHECK,             # [in/out] GP with 8-bit counter for overflow check
      $HASHKEY_OFFSET,        # [in] offset for the highest hash key
                              #      (can be register or numerical offset)
      $GHASHIN_BLK_OFFSET,    # [in] numerical offset for GHASH blocks in
      $SHFMSK                 # [in] ZMM with byte swap mask for pshufb
    ) = @_[0 .. 10];

    # ;; [clobbered] 23 temporary ZMM registers (arguments 11 to 33)
    my @ZT = @_[11 .. 33];

    my (
      $ADDBE_4x4,             # [in] ZMM with 4x128bits 4 in big-endian
      $ADDBE_1234,            # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian
      $GHASH_TYPE,            # [in] "start", "start_reduce", "mid", "end_reduce"
      $TO_REDUCE_L,           # [in] ZMM for low 4x128-bit GHASH sum
      $TO_REDUCE_H,           # [in] ZMM for hi 4x128-bit GHASH sum
      $TO_REDUCE_M,           # [in] ZMM for medium 4x128-bit GHASH sum
      $ENC_DEC,               # [in] cipher direction
      $HASH_IN_OUT,           # [in/out] XMM ghash in/out value
      $IA0,                   # [clobbered] GP temporary
      $IA1,                   # [clobbered] GP temporary
      $MASKREG,               # [clobbered] mask register
      $PBLOCK_LEN             # [in] partial block length
    ) = @_[34 .. 45];

    my $label_suffix = $label_count++;

    # ;; name of the label for a given block count (or count range such as "7_1")
    my $blocks_label = sub {
      return ".L_last_num_blocks_is_" . $_[0] . "_" . $label_suffix;
    };

    # ;; "cmp" of the block count in IA0 against an immediate
    my $cmp_blocks = sub {
      return "cmp \$" . $_[0] . ",@{[DWORD($IA0)]}";
    };

    # ;; Block count calculation followed by the dispatch tree. The lines are
    # ;; emitted in this exact order, one per line of generated assembly.
    my @dispatch = (
      "mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}",
      "add \$15,@{[DWORD($IA0)]}",
      "shr \$4,@{[DWORD($IA0)]}",
      "je " . $blocks_label->(0),
      $cmp_blocks->(8),
      "je " . $blocks_label->(8),
      "jb " . $blocks_label->("7_1"),
      $cmp_blocks->(12),
      "je " . $blocks_label->(12),
      "jb " . $blocks_label->("11_9"),
      "# ;; 16, 15, 14 or 13",
      $cmp_blocks->(15),
      "je " . $blocks_label->(15),
      "ja " . $blocks_label->(16),
      $cmp_blocks->(14),
      "je " . $blocks_label->(14),
      "jmp " . $blocks_label->(13),
      $blocks_label->("11_9") . ":",
      "# ;; 11, 10 or 9",
      $cmp_blocks->(10),
      "je " . $blocks_label->(10),
      "ja " . $blocks_label->(11),
      "jmp " . $blocks_label->(9),
      $blocks_label->("7_1") . ":",
      $cmp_blocks->(4),
      "je " . $blocks_label->(4),
      "jb " . $blocks_label->("3_1"),
      "# ;; 7, 6 or 5",
      $cmp_blocks->(6),
      "ja " . $blocks_label->(7),
      "je " . $blocks_label->(6),
      "jmp " . $blocks_label->(5),
      $blocks_label->("3_1") . ":",
      "# ;; 3, 2 or 1",
      $cmp_blocks->(2),
      "ja " . $blocks_label->(3),
      "je " . $blocks_label->(2),
    );
    $code .= join("", map { $_ . "\n" } @dispatch);

    # ;; A count of 1 has no jump of its own in the tree above: it falls
    # ;; through, so the 1-block variant has to be the first one generated.
    foreach my $num_blocks (1 .. 16) {
      $code .= $blocks_label->($num_blocks) . ":\n";
      &GHASH_16_ENCRYPT_N_GHASH_N(
        $AES_KEYS,       $GCM128_CTX,  $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,
        $DATA_OFFSET,    $LENGTH,      $CTR_BE,         $CTR_CHECK,
        $HASHKEY_OFFSET, $GHASHIN_BLK_OFFSET, $SHFMSK,
        @ZT,
        $ADDBE_4x4,      $ADDBE_1234,  $GHASH_TYPE,
        $TO_REDUCE_L,    $TO_REDUCE_H, $TO_REDUCE_M,
        $ENC_DEC,        $HASH_IN_OUT, $IA0,            $IA1,
        $MASKREG,        $num_blocks,  $PBLOCK_LEN);
      $code .= "jmp .L_last_blocks_done_${label_suffix}\n";
    }

    $code .= $blocks_label->(0) . ":\n";

    # ;; With 0 blocks to cipher there are only 16 blocks for ghash and
    # ;; reduction, so the non-reducing types are promoted:
    # ;;  - "mid" becomes "end_reduce"
    # ;;  - "start" becomes "start_reduce"
    my %reducing_type = ("mid" => "end_reduce", "start" => "start_reduce");
    my $tail_type =
      exists $reducing_type{$GHASH_TYPE} ? $reducing_type{$GHASH_TYPE} : $GHASH_TYPE;

    &GHASH_16(
      $tail_type, $TO_REDUCE_H, $TO_REDUCE_M, $TO_REDUCE_L,
      "%rsp", $GHASHIN_BLK_OFFSET, 0,
      "%rsp", $HASHKEY_OFFSET, 0,
      $HASH_IN_OUT, @ZT[0 .. 9]);

    $code .= ".L_last_blocks_done_${label_suffix}:\n";
  }
  2538. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2539. # ;; Main GCM macro stitching cipher with GHASH
  2540. # ;; - operates on a single stream
  2541. # ;; - encrypts 16 blocks at a time
  2542. # ;; - ghash the 16 previously encrypted ciphertext blocks
  2543. # ;; - no partial block or multi_call handling here
  2544. sub GHASH_16_ENCRYPT_16_PARALLEL {
  2545. my $AES_KEYS = $_[0]; # [in] key pointer
  2546. my $CIPH_PLAIN_OUT = $_[1]; # [in] pointer to output buffer
  2547. my $PLAIN_CIPH_IN = $_[2]; # [in] pointer to input buffer
  2548. my $DATA_OFFSET = $_[3]; # [in] data offset
  2549. my $CTR_BE = $_[4]; # [in/out] ZMM counter blocks (last 4) in big-endian
  2550. my $CTR_CHECK = $_[5]; # [in/out] GP with 8-bit counter for overflow check
  2551. my $HASHKEY_OFFSET = $_[6]; # [in] numerical offset for the highest hash key (hash key index value)
  2552. my $AESOUT_BLK_OFFSET = $_[7]; # [in] numerical offset for AES-CTR out
  2553. my $GHASHIN_BLK_OFFSET = $_[8]; # [in] numerical offset for GHASH blocks in
  2554. my $SHFMSK = $_[9]; # [in] ZMM with byte swap mask for pshufb
  2555. my $ZT1 = $_[10]; # [clobbered] temporary ZMM (cipher)
  2556. my $ZT2 = $_[11]; # [clobbered] temporary ZMM (cipher)
  2557. my $ZT3 = $_[12]; # [clobbered] temporary ZMM (cipher)
  2558. my $ZT4 = $_[13]; # [clobbered] temporary ZMM (cipher)
  2559. my $ZT5 = $_[14]; # [clobbered/out] temporary ZMM or GHASH OUT (final_reduction)
  2560. my $ZT6 = $_[15]; # [clobbered] temporary ZMM (cipher)
  2561. my $ZT7 = $_[16]; # [clobbered] temporary ZMM (cipher)
  2562. my $ZT8 = $_[17]; # [clobbered] temporary ZMM (cipher)
  2563. my $ZT9 = $_[18]; # [clobbered] temporary ZMM (cipher)
  2564. my $ZT10 = $_[19]; # [clobbered] temporary ZMM (ghash)
  2565. my $ZT11 = $_[20]; # [clobbered] temporary ZMM (ghash)
  2566. my $ZT12 = $_[21]; # [clobbered] temporary ZMM (ghash)
  2567. my $ZT13 = $_[22]; # [clobbered] temporary ZMM (ghash)
  2568. my $ZT14 = $_[23]; # [clobbered] temporary ZMM (ghash)
  2569. my $ZT15 = $_[24]; # [clobbered] temporary ZMM (ghash)
  2570. my $ZT16 = $_[25]; # [clobbered] temporary ZMM (ghash)
  2571. my $ZT17 = $_[26]; # [clobbered] temporary ZMM (ghash)
  2572. my $ZT18 = $_[27]; # [clobbered] temporary ZMM (ghash)
  2573. my $ZT19 = $_[28]; # [clobbered] temporary ZMM
  2574. my $ZT20 = $_[29]; # [clobbered] temporary ZMM
  2575. my $ZT21 = $_[30]; # [clobbered] temporary ZMM
  2576. my $ZT22 = $_[31]; # [clobbered] temporary ZMM
  2577. my $ZT23 = $_[32]; # [clobbered] temporary ZMM
  2578. my $ADDBE_4x4 = $_[33]; # [in] ZMM with 4x128bits 4 in big-endian
  2579. my $ADDBE_1234 = $_[34]; # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian
  2580. my $TO_REDUCE_L = $_[35]; # [in/out] ZMM for low 4x128-bit GHASH sum
  2581. my $TO_REDUCE_H = $_[36]; # [in/out] ZMM for hi 4x128-bit GHASH sum
  2582. my $TO_REDUCE_M = $_[37]; # [in/out] ZMM for medium 4x128-bit GHASH sum
  2583. my $DO_REDUCTION = $_[38]; # [in] "no_reduction", "final_reduction", "first_time"
  2584. my $ENC_DEC = $_[39]; # [in] cipher direction
  2585. my $DATA_DISPL = $_[40]; # [in] fixed numerical data displacement/offset
  2586. my $GHASH_IN = $_[41]; # [in] current GHASH value or "no_ghash_in"
  2587. my $IA0 = $_[42]; # [clobbered] temporary GPR
  2588. my $B00_03 = $ZT1;
  2589. my $B04_07 = $ZT2;
  2590. my $B08_11 = $ZT3;
  2591. my $B12_15 = $ZT4;
  2592. my $GH1H = $ZT5;
  2593. # ; @note: do not change this mapping
  2594. my $GH1L = $ZT6;
  2595. my $GH1M = $ZT7;
  2596. my $GH1T = $ZT8;
  2597. my $GH2H = $ZT9;
  2598. my $GH2L = $ZT10;
  2599. my $GH2M = $ZT11;
  2600. my $GH2T = $ZT12;
  2601. my $RED_POLY = $GH2T;
  2602. my $RED_P1 = $GH2L;
  2603. my $RED_T1 = $GH2H;
  2604. my $RED_T2 = $GH2M;
  2605. my $GH3H = $ZT13;
  2606. my $GH3L = $ZT14;
  2607. my $GH3M = $ZT15;
  2608. my $GH3T = $ZT16;
  2609. my $DATA1 = $ZT13;
  2610. my $DATA2 = $ZT14;
  2611. my $DATA3 = $ZT15;
  2612. my $DATA4 = $ZT16;
  2613. my $AESKEY1 = $ZT17;
  2614. my $AESKEY2 = $ZT18;
  2615. my $GHKEY1 = $ZT19;
  2616. my $GHKEY2 = $ZT20;
  2617. my $GHDAT1 = $ZT21;
  2618. my $GHDAT2 = $ZT22;
  2619. my $label_suffix = $label_count++;
  2620. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2621. # ;; prepare counter blocks
  2622. $code .= <<___;
  2623. cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]}
  2624. jae .L_16_blocks_overflow_${label_suffix}
  2625. vpaddd $ADDBE_1234,$CTR_BE,$B00_03
  2626. vpaddd $ADDBE_4x4,$B00_03,$B04_07
  2627. vpaddd $ADDBE_4x4,$B04_07,$B08_11
  2628. vpaddd $ADDBE_4x4,$B08_11,$B12_15
  2629. jmp .L_16_blocks_ok_${label_suffix}
  2630. .L_16_blocks_overflow_${label_suffix}:
  2631. vpshufb $SHFMSK,$CTR_BE,$CTR_BE
  2632. vmovdqa64 ddq_add_4444(%rip),$B12_15
  2633. vpaddd ddq_add_1234(%rip),$CTR_BE,$B00_03
  2634. vpaddd $B12_15,$B00_03,$B04_07
  2635. vpaddd $B12_15,$B04_07,$B08_11
  2636. vpaddd $B12_15,$B08_11,$B12_15
  2637. vpshufb $SHFMSK,$B00_03,$B00_03
  2638. vpshufb $SHFMSK,$B04_07,$B04_07
  2639. vpshufb $SHFMSK,$B08_11,$B08_11
  2640. vpshufb $SHFMSK,$B12_15,$B12_15
  2641. .L_16_blocks_ok_${label_suffix}:
  2642. ___
  2643. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2644. # ;; pre-load constants
  2645. $code .= "vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1\n";
  2646. if ($GHASH_IN ne "no_ghash_in") {
  2647. $code .= "vpxorq `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHASH_IN,$GHDAT1\n";
  2648. } else {
  2649. $code .= "vmovdqa64 `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n";
  2650. }
  2651. $code .= <<___;
  2652. vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (0*4)),"%rsp")]},$GHKEY1
  2653. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2654. # ;; save counter for the next round
  2655. # ;; increment counter overflow check register
  2656. vshufi64x2 \$0b11111111,$B12_15,$B12_15,$CTR_BE
  2657. addb \$16,@{[BYTE($CTR_CHECK)]}
  2658. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2659. # ;; pre-load constants
  2660. vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2
  2661. vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (1*4)),"%rsp")]},$GHKEY2
  2662. vmovdqa64 `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2
  2663. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2664. # ;; stitch AES rounds with GHASH
  2665. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2666. # ;; AES round 0 - ARK
  2667. vpxorq $AESKEY1,$B00_03,$B00_03
  2668. vpxorq $AESKEY1,$B04_07,$B04_07
  2669. vpxorq $AESKEY1,$B08_11,$B08_11
  2670. vpxorq $AESKEY1,$B12_15,$B12_15
  2671. vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1
  2672. # ;;==================================================
  2673. # ;; GHASH 4 blocks (15 to 12)
  2674. vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH1H # ; a1*b1
  2675. vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH1L # ; a0*b0
  2676. vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH1M # ; a1*b0
  2677. vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH1T # ; a0*b1
  2678. vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (2*4)),"%rsp")]},$GHKEY1
  2679. vmovdqa64 `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1
  2680. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2681. # ;; AES round 1
  2682. vaesenc $AESKEY2,$B00_03,$B00_03
  2683. vaesenc $AESKEY2,$B04_07,$B04_07
  2684. vaesenc $AESKEY2,$B08_11,$B08_11
  2685. vaesenc $AESKEY2,$B12_15,$B12_15
  2686. vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2
  2687. # ;; =================================================
  2688. # ;; GHASH 4 blocks (11 to 8)
  2689. vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
  2690. vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
  2691. vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
  2692. vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
  2693. vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (3*4)),"%rsp")]},$GHKEY2
  2694. vmovdqa64 `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2
  2695. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2696. # ;; AES round 2
  2697. vaesenc $AESKEY1,$B00_03,$B00_03
  2698. vaesenc $AESKEY1,$B04_07,$B04_07
  2699. vaesenc $AESKEY1,$B08_11,$B08_11
  2700. vaesenc $AESKEY1,$B12_15,$B12_15
  2701. vbroadcastf64x2 `(16 * 4)`($AES_KEYS),$AESKEY1
  2702. # ;; =================================================
  2703. # ;; GHASH 4 blocks (7 to 4)
  2704. vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH3M # ; a0*b1
  2705. vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH3T # ; a1*b0
  2706. vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH3H # ; a1*b1
  2707. vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH3L # ; a0*b0
  2708. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2709. # ;; AES rounds 3
  2710. vaesenc $AESKEY2,$B00_03,$B00_03
  2711. vaesenc $AESKEY2,$B04_07,$B04_07
  2712. vaesenc $AESKEY2,$B08_11,$B08_11
  2713. vaesenc $AESKEY2,$B12_15,$B12_15
  2714. vbroadcastf64x2 `(16 * 5)`($AES_KEYS),$AESKEY2
  2715. # ;; =================================================
  2716. # ;; Gather (XOR) GHASH for 12 blocks
  2717. vpternlogq \$0x96,$GH3H,$GH2H,$GH1H
  2718. vpternlogq \$0x96,$GH3L,$GH2L,$GH1L
  2719. vpternlogq \$0x96,$GH3T,$GH2T,$GH1T
  2720. vpternlogq \$0x96,$GH3M,$GH2M,$GH1M
  2721. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2722. # ;; AES rounds 4
  2723. vaesenc $AESKEY1,$B00_03,$B00_03
  2724. vaesenc $AESKEY1,$B04_07,$B04_07
  2725. vaesenc $AESKEY1,$B08_11,$B08_11
  2726. vaesenc $AESKEY1,$B12_15,$B12_15
  2727. vbroadcastf64x2 `(16 * 6)`($AES_KEYS),$AESKEY1
  2728. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2729. # ;; load plain/cipher text (recycle GH3xx registers)
  2730. vmovdqu8 `$DATA_DISPL + (0 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA1
  2731. vmovdqu8 `$DATA_DISPL + (1 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA2
  2732. vmovdqu8 `$DATA_DISPL + (2 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA3
  2733. vmovdqu8 `$DATA_DISPL + (3 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA4
  2734. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2735. # ;; AES rounds 5
  2736. vaesenc $AESKEY2,$B00_03,$B00_03
  2737. vaesenc $AESKEY2,$B04_07,$B04_07
  2738. vaesenc $AESKEY2,$B08_11,$B08_11
  2739. vaesenc $AESKEY2,$B12_15,$B12_15
  2740. vbroadcastf64x2 `(16 * 7)`($AES_KEYS),$AESKEY2
  2741. # ;; =================================================
  2742. # ;; GHASH 4 blocks (3 to 0)
  2743. vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
  2744. vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
  2745. vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
  2746. vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
  2747. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2748. # ;; AES round 6
  2749. vaesenc $AESKEY1,$B00_03,$B00_03
  2750. vaesenc $AESKEY1,$B04_07,$B04_07
  2751. vaesenc $AESKEY1,$B08_11,$B08_11
  2752. vaesenc $AESKEY1,$B12_15,$B12_15
  2753. vbroadcastf64x2 `(16 * 8)`($AES_KEYS),$AESKEY1
  2754. ___
  2755. # ;; =================================================
  2756. # ;; gather GHASH in GH1L (low) and GH1H (high)
  2757. if ($DO_REDUCTION eq "first_time") {
  2758. $code .= <<___;
  2759. vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM
  2760. vpxorq $GH2M,$GH1M,$TO_REDUCE_M # ; TM
  2761. vpxorq $GH2H,$GH1H,$TO_REDUCE_H # ; TH
  2762. vpxorq $GH2L,$GH1L,$TO_REDUCE_L # ; TL
  2763. ___
  2764. }
  2765. if ($DO_REDUCTION eq "no_reduction") {
  2766. $code .= <<___;
  2767. vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM
  2768. vpternlogq \$0x96,$GH2M,$GH1M,$TO_REDUCE_M # ; TM
  2769. vpternlogq \$0x96,$GH2H,$GH1H,$TO_REDUCE_H # ; TH
  2770. vpternlogq \$0x96,$GH2L,$GH1L,$TO_REDUCE_L # ; TL
  2771. ___
  2772. }
  2773. if ($DO_REDUCTION eq "final_reduction") {
  2774. $code .= <<___;
  2775. # ;; phase 1: add mid products together
  2776. # ;; also load polynomial constant for reduction
  2777. vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM
  2778. vpternlogq \$0x96,$GH2M,$TO_REDUCE_M,$GH1M
  2779. vpsrldq \$8,$GH1M,$GH2M
  2780. vpslldq \$8,$GH1M,$GH1M
  2781. vmovdqa64 POLY2(%rip),@{[XWORD($RED_POLY)]}
  2782. ___
  2783. }
  2784. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2785. # ;; AES round 7
  2786. $code .= <<___;
  2787. vaesenc $AESKEY2,$B00_03,$B00_03
  2788. vaesenc $AESKEY2,$B04_07,$B04_07
  2789. vaesenc $AESKEY2,$B08_11,$B08_11
  2790. vaesenc $AESKEY2,$B12_15,$B12_15
  2791. vbroadcastf64x2 `(16 * 9)`($AES_KEYS),$AESKEY2
  2792. ___
  2793. # ;; =================================================
  2794. # ;; Add mid product to high and low
  2795. if ($DO_REDUCTION eq "final_reduction") {
  2796. $code .= <<___;
  2797. vpternlogq \$0x96,$GH2M,$GH2H,$GH1H # ; TH = TH1 + TH2 + TM>>64
  2798. vpxorq $TO_REDUCE_H,$GH1H,$GH1H
  2799. vpternlogq \$0x96,$GH1M,$GH2L,$GH1L # ; TL = TL1 + TL2 + TM<<64
  2800. vpxorq $TO_REDUCE_L,$GH1L,$GH1L
  2801. ___
  2802. }
  2803. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2804. # ;; AES round 8
  2805. $code .= <<___;
  2806. vaesenc $AESKEY1,$B00_03,$B00_03
  2807. vaesenc $AESKEY1,$B04_07,$B04_07
  2808. vaesenc $AESKEY1,$B08_11,$B08_11
  2809. vaesenc $AESKEY1,$B12_15,$B12_15
  2810. vbroadcastf64x2 `(16 * 10)`($AES_KEYS),$AESKEY1
  2811. ___
  2812. # ;; =================================================
  2813. # ;; horizontal xor of low and high 4x128
  2814. if ($DO_REDUCTION eq "final_reduction") {
  2815. &VHPXORI4x128($GH1H, $GH2H);
  2816. &VHPXORI4x128($GH1L, $GH2L);
  2817. }
  2818. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2819. # ;; AES round 9
  2820. $code .= <<___;
  2821. vaesenc $AESKEY2,$B00_03,$B00_03
  2822. vaesenc $AESKEY2,$B04_07,$B04_07
  2823. vaesenc $AESKEY2,$B08_11,$B08_11
  2824. vaesenc $AESKEY2,$B12_15,$B12_15
  2825. ___
  2826. if (($NROUNDS >= 11)) {
  2827. $code .= "vbroadcastf64x2 `(16 * 11)`($AES_KEYS),$AESKEY2\n";
  2828. }
  2829. # ;; =================================================
  2830. # ;; first phase of reduction
  2831. if ($DO_REDUCTION eq "final_reduction") {
  2832. $code .= <<___;
  2833. vpclmulqdq \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]}
  2834. vpslldq \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]} # ; shift-L 2 DWs
  2835. vpxorq @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]} # ; first phase of the reduct
  2836. ___
  2837. }
  2838. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2839. # ;; AES rounds up to 11 (AES192) or 13 (AES256)
  2840. # ;; AES128 is done
  2841. if (($NROUNDS >= 11)) {
  2842. $code .= <<___;
  2843. vaesenc $AESKEY1,$B00_03,$B00_03
  2844. vaesenc $AESKEY1,$B04_07,$B04_07
  2845. vaesenc $AESKEY1,$B08_11,$B08_11
  2846. vaesenc $AESKEY1,$B12_15,$B12_15
  2847. vbroadcastf64x2 `(16 * 12)`($AES_KEYS),$AESKEY1
  2848. vaesenc $AESKEY2,$B00_03,$B00_03
  2849. vaesenc $AESKEY2,$B04_07,$B04_07
  2850. vaesenc $AESKEY2,$B08_11,$B08_11
  2851. vaesenc $AESKEY2,$B12_15,$B12_15
  2852. ___
  2853. if (($NROUNDS == 13)) {
  2854. $code .= <<___;
  2855. vbroadcastf64x2 `(16 * 13)`($AES_KEYS),$AESKEY2
  2856. vaesenc $AESKEY1,$B00_03,$B00_03
  2857. vaesenc $AESKEY1,$B04_07,$B04_07
  2858. vaesenc $AESKEY1,$B08_11,$B08_11
  2859. vaesenc $AESKEY1,$B12_15,$B12_15
  2860. vbroadcastf64x2 `(16 * 14)`($AES_KEYS),$AESKEY1
  2861. vaesenc $AESKEY2,$B00_03,$B00_03
  2862. vaesenc $AESKEY2,$B04_07,$B04_07
  2863. vaesenc $AESKEY2,$B08_11,$B08_11
  2864. vaesenc $AESKEY2,$B12_15,$B12_15
  2865. ___
  2866. }
  2867. }
  2868. # ;; =================================================
  2869. # ;; second phase of the reduction
  2870. if ($DO_REDUCTION eq "final_reduction") {
  2871. $code .= <<___;
  2872. vpclmulqdq \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]}
  2873. vpsrldq \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]} # ; shift-R 1-DW to obtain 2-DWs shift-R
  2874. vpclmulqdq \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]}
  2875. vpslldq \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]} # ; shift-L 1-DW for result without shifts
  2876. # ;; GH1H = GH1H x RED_T1 x RED_T2
  2877. vpternlogq \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]}
  2878. ___
  2879. }
  2880. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2881. # ;; the last AES round
  2882. $code .= <<___;
  2883. vaesenclast $AESKEY1,$B00_03,$B00_03
  2884. vaesenclast $AESKEY1,$B04_07,$B04_07
  2885. vaesenclast $AESKEY1,$B08_11,$B08_11
  2886. vaesenclast $AESKEY1,$B12_15,$B12_15
  2887. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2888. # ;; XOR against plain/cipher text
  2889. vpxorq $DATA1,$B00_03,$B00_03
  2890. vpxorq $DATA2,$B04_07,$B04_07
  2891. vpxorq $DATA3,$B08_11,$B08_11
  2892. vpxorq $DATA4,$B12_15,$B12_15
  2893. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2894. # ;; store cipher/plain text
  2895. mov $CIPH_PLAIN_OUT,$IA0
  2896. vmovdqu8 $B00_03,`$DATA_DISPL + (0 * 64)`($IA0,$DATA_OFFSET,1)
  2897. vmovdqu8 $B04_07,`$DATA_DISPL + (1 * 64)`($IA0,$DATA_OFFSET,1)
  2898. vmovdqu8 $B08_11,`$DATA_DISPL + (2 * 64)`($IA0,$DATA_OFFSET,1)
  2899. vmovdqu8 $B12_15,`$DATA_DISPL + (3 * 64)`($IA0,$DATA_OFFSET,1)
  2900. ___
  2901. # ;; =================================================
  2902. # ;; shuffle cipher text blocks for GHASH computation
  2903. if ($ENC_DEC eq "ENC") {
  2904. $code .= <<___;
  2905. vpshufb $SHFMSK,$B00_03,$B00_03
  2906. vpshufb $SHFMSK,$B04_07,$B04_07
  2907. vpshufb $SHFMSK,$B08_11,$B08_11
  2908. vpshufb $SHFMSK,$B12_15,$B12_15
  2909. ___
  2910. } else {
  2911. $code .= <<___;
  2912. vpshufb $SHFMSK,$DATA1,$B00_03
  2913. vpshufb $SHFMSK,$DATA2,$B04_07
  2914. vpshufb $SHFMSK,$DATA3,$B08_11
  2915. vpshufb $SHFMSK,$DATA4,$B12_15
  2916. ___
  2917. }
  2918. # ;; =================================================
  2919. # ;; store shuffled cipher text for ghashing
  2920. $code .= <<___;
  2921. vmovdqa64 $B00_03,`$AESOUT_BLK_OFFSET + (0*64)`(%rsp)
  2922. vmovdqa64 $B04_07,`$AESOUT_BLK_OFFSET + (1*64)`(%rsp)
  2923. vmovdqa64 $B08_11,`$AESOUT_BLK_OFFSET + (2*64)`(%rsp)
  2924. vmovdqa64 $B12_15,`$AESOUT_BLK_OFFSET + (3*64)`(%rsp)
  2925. ___
  2926. }
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;;; ENCRYPT_SINGLE_BLOCK
  # ;;; Appends to $code the assembly that AES-encrypts the single block held
  # ;;; in $XMM0, in place.
  # ;;;
  # ;;; Arguments:
  # ;;;   $AES_KEY [in]        register pointing at the AES key structure
  # ;;;   $XMM0    [in/out]    XMM register holding the block
  # ;;;   $GPR1    [clobbered] GP register, receives the number of rounds
  # ;;;
  # ;;; The generated code reads a 32-bit value at byte offset 4*15*4 of the key
  # ;;; structure and branches on 9 / 11 / 13 to a fully unrolled sequence for
  # ;;; AES-128 / AES-192 / AES-256. Any other value jumps straight to the exit
  # ;;; label, leaving $XMM0 untouched.
  sub ENCRYPT_SINGLE_BLOCK {
    my ($AES_KEY, $XMM0, $GPR1) = @_;

    my $label_suffix = $label_count++;
    my $rounds_reg   = DWORD($GPR1);                     # 32-bit view of $GPR1
    my $exit_label   = ".Lexit_aes_${label_suffix}";

    $code .= <<___;
# ; load number of rounds from AES_KEY structure (offset in bytes is
# ; size of the |rd_key| buffer)
mov `4*15*4`($AES_KEY),$rounds_reg
___

    # Dispatch on the round count: [value stored in the key, label key size].
    foreach my $variant ([9, 128], [11, 192], [13, 256]) {
      my ($rounds, $bits) = @{$variant};
      $code .= "cmp \$${rounds},${rounds_reg}\n";
      $code .= "je .Laes_${bits}_${label_suffix}\n";
    }
    $code .= "jmp ${exit_label}\n";

    # One unrolled encryption sequence per key size known to %aes_rounds.
    foreach my $keylen (sort keys %aes_rounds) {
      my $nr = $aes_rounds{$keylen};

      $code .= ".align 32\n";
      $code .= ".Laes_${keylen}_${label_suffix}:\n";

      # Whitening with round key 0, then $nr full rounds.
      $code .= "vpxorq `16*0`($AES_KEY),$XMM0, $XMM0\n\n";
      foreach my $round (1 .. $nr) {
        $code .= "vaesenc `16*$round`($AES_KEY),$XMM0,$XMM0\n\n";
      }

      # Final round uses round key $nr + 1.
      $code .= "vaesenclast `16*($nr+1)`($AES_KEY),$XMM0,$XMM0\n";
      $code .= "jmp ${exit_label}\n";
    }

    $code .= "${exit_label}:\n\n";
  }
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;;; CALC_J0
  # ;;; Appends to $code the assembly computing the pre-counter block J0 for an
  # ;;; IV whose length is not 12 bytes:
  # ;;;   J0 = GHASH(IV || 0s+64 || len(IV)64)
  # ;;;   s  = 16 * RoundUp(len(IV)/16) - len(IV)
  # ;;;
  # ;;; Arguments (positional):
  # ;;;   [0]      $GCM128_CTX [in]        pointer to GCM context
  # ;;;   [1]      $IV         [in]        pointer to IV
  # ;;;   [2]      $IV_LEN     [in]        IV length (bytes; converted to bits below)
  # ;;;   [3]      $J0         [out]       XMM register to contain J0
  # ;;;   [4..20]  ZT0..ZT16   [clobbered] ZMM registers
  # ;;;   [21..23] T1..T3      [clobbered] GP registers
  # ;;;   [24]     $MASKREG    [clobbered] mask register
  sub CALC_J0 {
    my ($GCM128_CTX, $IV, $IV_LEN, $J0) = @_[0 .. 3];
    my @ZT                              = @_[4 .. 20];
    my ($T1, $T2, $T3, $MASKREG)        = @_[21 .. 24];

    # ;; Calculate GHASH of (IV || 0s), starting from a zeroed accumulator
    $code .= "vpxor $J0,$J0,$J0\n";
    &CALC_AAD_HASH($IV, $IV_LEN, $J0, $GCM128_CTX, @ZT, $T1, $T2, $T3, $MASKREG);

    # XMM views of the first four temporaries:
    #   ZT0 receives the hash key, ZT2 the length block, and all four are
    #   handed to GHASH_MUL.
    my ($zt0_x, $zt1_x, $zt2_x, $zt3_x) = map { XWORD($_) } @ZT[0 .. 3];
    my $hash_key_1 = HashKeyByIdx(1, $GCM128_CTX);

    # ;; Calculate GHASH of last 16-byte block (0 || len(IV)64)
    $code .= <<___;
mov $IV_LEN,$T1
shl \$3,$T1 # ; IV length in bits
vmovq $T1,$zt2_x
# ;; Might need shuffle of ZT2
vpxorq $J0,$zt2_x,$J0
vmovdqu64 $hash_key_1,$zt0_x
___
    &GHASH_MUL($J0, $zt0_x, $zt1_x, $zt2_x, $zt3_x);

    $code .= "vpshufb SHUF_MASK(%rip),$J0,$J0 # ; perform a 16Byte swap\n";
  }
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;;; GCM_INIT_IV performs an initialization of gcm128_ctx struct to prepare for
  # ;;; encoding/decoding.
  # ;;;
  # ;;; Generated code:
  # ;;;  - IV_LEN == 12: the counter block is the ONEf constant with its first
  # ;;;    12 bytes replaced by the IV (masked load, mask 0xfff).
  # ;;;  - otherwise:    the counter block is produced by CALC_J0.
  # ;;;  - the counter block is then encrypted with ENCRYPT_SINGLE_BLOCK and the
  # ;;;    result stored at CTX_OFFSET_EK0; the byte-shuffled counter itself is
  # ;;;    stored at CTX_OFFSET_CurCount.
  # ;;;
  # ;;; Arguments (positional):
  # ;;;   [0]     $AES_KEYS   [in]        AES key schedule
  # ;;;   [1]     $GCM128_CTX [in/out]    GCM context
  # ;;;   [2]     $IV         [in]        IV pointer
  # ;;;   [3]     $IV_LEN     [in]        IV length
  # ;;;   [4..6]  GPR1..GPR3  [clobbered] GP registers
  # ;;;   [7]     $MASKREG    [clobbered] mask register
  # ;;;   [8]     $CUR_COUNT  [out]       XMM with current counter
  # ;;;   [9..25] ZT0..ZT16   [clobbered] ZMM registers
  # ;;;
  # ;;; Branch targets are emitted as local (.L) labels carrying a unique
  # ;;; suffix, like every other macro in this file, so that they stay out of
  # ;;; the object's symbol table and the macro can be expanded more than once.
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  sub GCM_INIT_IV {
    my ($AES_KEYS, $GCM128_CTX, $IV, $IV_LEN) = @_[0 .. 3];
    my ($GPR1, $GPR2, $GPR3)                  = @_[4 .. 6];
    my $MASKREG                               = $_[7];
    my $CUR_COUNT                             = $_[8];
    my @ZT                                    = @_[9 .. 25];

    # XMM alias of the first temporary; it carries the block to be encrypted.
    my $ZT0x = $ZT[0];
    $ZT0x =~ s/zmm/xmm/;

    my $label_suffix = $label_count++;
    my $iv12_label   = ".L_iv_len_12_init_IV_${label_suffix}";
    my $join_label   = ".L_skip_iv_len_12_init_IV_${label_suffix}";

    $code .= <<___;
cmp \$12,$IV_LEN
je $iv12_label
___

    # ;; IV is different than 12 bytes
    &CALC_J0($GCM128_CTX, $IV, $IV_LEN, $CUR_COUNT, @ZT, $GPR1, $GPR2, $GPR3, $MASKREG);

    $code .= <<___;
jmp $join_label
${iv12_label}: # ;; IV is 12 bytes
# ;; read 12 IV bytes and pad with 0x00000001
vmovdqu8 ONEf(%rip),$CUR_COUNT
mov $IV,$GPR2
mov \$0x0000000000000fff,@{[DWORD($GPR1)]}
kmovq $GPR1,$MASKREG
vmovdqu8 ($GPR2),${CUR_COUNT}{$MASKREG} # ; ctr = IV | 0x1
${join_label}:
vmovdqu $CUR_COUNT,$ZT0x
___

    &ENCRYPT_SINGLE_BLOCK($AES_KEYS, "$ZT0x", "$GPR1");    # ; E(K, Y0)

    $code .= <<___;
vmovdqu $ZT0x,`$CTX_OFFSET_EK0`($GCM128_CTX) # ; save EK0 for finalization stage
# ;; store IV as counter in LE format
vpshufb SHUF_MASK(%rip),$CUR_COUNT,$CUR_COUNT
vmovdqu $CUR_COUNT,`$CTX_OFFSET_CurCount`($GCM128_CTX) # ; save current counter Yi
___
  }
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;;; GCM_UPDATE_AAD
  # ;;; Appends to $code the assembly that loads the running hash from the
  # ;;; context (offset CTX_OFFSET_AadHash), updates it over the given AAD
  # ;;; buffer via CALC_AAD_HASH and writes it back to the same context slot.
  # ;;;
  # ;;; Arguments (positional):
  # ;;;   [0]     $GCM128_CTX [in]        GCM context pointer
  # ;;;   [1]     $A_IN       [in]        AAD pointer
  # ;;;   [2]     $A_LEN      [in]        AAD length in bytes
  # ;;;   [3..5]  GPR1..GPR3  [clobbered] GP registers
  # ;;;   [6]     $MASKREG    [clobbered] mask register
  # ;;;   [7]     $AAD_HASH   [out]       XMM for AAD_HASH value
  # ;;;   [8..24] ZT0..ZT16   [clobbered] ZMM registers
  sub GCM_UPDATE_AAD {
    my ($GCM128_CTX, $A_IN, $A_LEN) = @_[0 .. 2];
    my @gp_tmp                      = @_[3 .. 5];     # GPR1, GPR2, GPR3
    my ($MASKREG, $AAD_HASH)        = @_[6, 7];
    my @zmm_tmp                     = @_[8 .. 24];    # ZT0 .. ZT16

    # Memory operand of the hash value kept in the context.
    my $hash_slot = "${CTX_OFFSET_AadHash}(${GCM128_CTX})";

    # ; load current hash
    $code .= "vmovdqu64 ${hash_slot},${AAD_HASH}\n";

    &CALC_AAD_HASH($A_IN, $A_LEN, $AAD_HASH, $GCM128_CTX, @zmm_tmp, @gp_tmp, $MASKREG);

    # ; store updated hash
    $code .= "vmovdqu64 ${AAD_HASH},${hash_slot}\n";
  }
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;;; Cipher and ghash of payloads shorter than 256 bytes
  # ;;; - number of blocks in the message comes as argument
  # ;;; - depending on the number of blocks an optimized variant of
  # ;;; INITIAL_BLOCKS_PARTIAL is invoked
  # ;;;
  # ;;; The generated code is a compare/branch tree on $NUM_BLOCKS followed by
  # ;;; 16 expansions of INITIAL_BLOCKS_PARTIAL (one per block count), each
  # ;;; ending at a common "encrypted" label.
  # ;;;
  # ;;; Arguments (positional):
  # ;;;   [0]      $AES_KEYS       [in]        key pointer
  # ;;;   [1]      $GCM128_CTX     [in]        context pointer
  # ;;;   [2]      $CIPH_PLAIN_OUT [in]        output buffer
  # ;;;   [3]      $PLAIN_CIPH_IN  [in]        input buffer
  # ;;;   [4]      $PLAIN_CIPH_LEN [in]        buffer length (not referenced here)
  # ;;;   [5]      $ENC_DEC        [in]        cipher direction
  # ;;;   [6]      $DATA_OFFSET    [in]        data offset
  # ;;;   [7]      $LENGTH         [in]        data length
  # ;;;   [8]      $NUM_BLOCKS     [in]        number of blocks to process 1 to 16
  # ;;;   [9]      $CTR            [in/out]    XMM counter block
  # ;;;   [10]     $HASH_IN_OUT    [in/out]    XMM GHASH value
  # ;;;   [11..25] ZTMP0..ZTMP14   [clobbered] ZMM registers
  # ;;;   [26..27] $IA0, $IA1      [clobbered] GP registers
  # ;;;   [28]     $MASKREG        [clobbered] mask register
  # ;;;   [29]     $SHUFMASK       [in]        ZMM with BE/LE shuffle mask
  # ;;;   [30]     $PBLOCK_LEN     [in]        partial block length
  sub GCM_ENC_DEC_SMALL {
    my ($AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN,
        $ENC_DEC, $DATA_OFFSET, $LENGTH, $NUM_BLOCKS, $CTR, $HASH_IN_OUT) = @_[0 .. 10];
    my @ztmp = @_[11 .. 25];    # ZTMP0 .. ZTMP14
    my ($IA0, $IA1, $MASKREG, $SHUFMASK, $PBLOCK_LEN) = @_[26 .. 30];

    my $label_suffix = $label_count++;
    my $done_label   = ".L_small_initial_blocks_encrypted_${label_suffix}";

    # Binary-ish search over the block count: split at 8, then 12 / 4, then
    # resolve the remaining candidates with equality checks.
    $code .= <<___;
cmp \$8,$NUM_BLOCKS
je .L_small_initial_num_blocks_is_8_${label_suffix}
jl .L_small_initial_num_blocks_is_7_1_${label_suffix}
cmp \$12,$NUM_BLOCKS
je .L_small_initial_num_blocks_is_12_${label_suffix}
jl .L_small_initial_num_blocks_is_11_9_${label_suffix}
# ;; 16, 15, 14 or 13
cmp \$16,$NUM_BLOCKS
je .L_small_initial_num_blocks_is_16_${label_suffix}
cmp \$15,$NUM_BLOCKS
je .L_small_initial_num_blocks_is_15_${label_suffix}
cmp \$14,$NUM_BLOCKS
je .L_small_initial_num_blocks_is_14_${label_suffix}
jmp .L_small_initial_num_blocks_is_13_${label_suffix}
.L_small_initial_num_blocks_is_11_9_${label_suffix}:
# ;; 11, 10 or 9
cmp \$11,$NUM_BLOCKS
je .L_small_initial_num_blocks_is_11_${label_suffix}
cmp \$10,$NUM_BLOCKS
je .L_small_initial_num_blocks_is_10_${label_suffix}
jmp .L_small_initial_num_blocks_is_9_${label_suffix}
.L_small_initial_num_blocks_is_7_1_${label_suffix}:
cmp \$4,$NUM_BLOCKS
je .L_small_initial_num_blocks_is_4_${label_suffix}
jl .L_small_initial_num_blocks_is_3_1_${label_suffix}
# ;; 7, 6 or 5
cmp \$7,$NUM_BLOCKS
je .L_small_initial_num_blocks_is_7_${label_suffix}
cmp \$6,$NUM_BLOCKS
je .L_small_initial_num_blocks_is_6_${label_suffix}
jmp .L_small_initial_num_blocks_is_5_${label_suffix}
.L_small_initial_num_blocks_is_3_1_${label_suffix}:
# ;; 3, 2 or 1
cmp \$3,$NUM_BLOCKS
je .L_small_initial_num_blocks_is_3_${label_suffix}
cmp \$2,$NUM_BLOCKS
je .L_small_initial_num_blocks_is_2_${label_suffix}
# ;; for $NUM_BLOCKS == 1, just fall through and no 'jmp' needed
# ;; Generation of different block size variants
# ;; - one block size has to be the first one
___

    # Emit the 1-block variant first (the dispatch tree falls through into
    # it) and the 16-block variant last (it falls through to the done label).
    foreach my $num_blocks (1 .. 16) {
      $code .= ".L_small_initial_num_blocks_is_${num_blocks}_${label_suffix}:\n";

      &INITIAL_BLOCKS_PARTIAL(
        $AES_KEYS,    $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,
        $LENGTH,      $DATA_OFFSET, $num_blocks,    $CTR,
        $HASH_IN_OUT, $ENC_DEC,     @ztmp,          $IA0,
        $IA1,         $MASKREG,     $SHUFMASK,      $PBLOCK_LEN);

      $code .= "jmp ${done_label}\n" unless $num_blocks == 16;
    }

    $code .= "${done_label}:\n";
  }
  3195. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3196. # ; GCM_ENC_DEC Encrypts/Decrypts given data. Assumes that the passed gcm128_context
  3197. # ; struct has been initialized by GCM_INIT_IV
  3198. # ; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA.
  3199. # ; Clobbers rax, r10-r15, and zmm0-zmm31, k1
  3200. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3201. sub GCM_ENC_DEC {
  3202. my $AES_KEYS = $_[0]; # [in] AES Key schedule
  3203. my $GCM128_CTX = $_[1]; # [in] context pointer
  3204. my $PBLOCK_LEN = $_[2]; # [in] length of partial block at the moment of previous update
  3205. my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer pointer
  3206. my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length
  3207. my $CIPH_PLAIN_OUT = $_[5]; # [in] output buffer pointer
  3208. my $ENC_DEC = $_[6]; # [in] cipher direction
  3209. my $IA0 = "%r10";
  3210. my $IA1 = "%r12";
  3211. my $IA2 = "%r13";
  3212. my $IA3 = "%r15";
  3213. my $IA4 = "%r11";
  3214. my $IA5 = "%rax";
  3215. my $IA6 = "%rbx";
  3216. my $IA7 = "%r14";
  3217. my $LENGTH = $win64 ? $IA2 : $PLAIN_CIPH_LEN;
  3218. my $CTR_CHECK = $IA3;
  3219. my $DATA_OFFSET = $IA4;
  3220. my $HASHK_PTR = $IA6;
  3221. my $HKEYS_READY = $IA7;
  3222. my $CTR_BLOCKz = "%zmm2";
  3223. my $CTR_BLOCKx = "%xmm2";
  3224. # ; hardcoded in GCM_INIT
  3225. my $AAD_HASHz = "%zmm14";
  3226. my $AAD_HASHx = "%xmm14";
  3227. # ; hardcoded in GCM_COMPLETE
  3228. my $ZTMP0 = "%zmm0";
  3229. my $ZTMP1 = "%zmm3";
  3230. my $ZTMP2 = "%zmm4";
  3231. my $ZTMP3 = "%zmm5";
  3232. my $ZTMP4 = "%zmm6";
  3233. my $ZTMP5 = "%zmm7";
  3234. my $ZTMP6 = "%zmm10";
  3235. my $ZTMP7 = "%zmm11";
  3236. my $ZTMP8 = "%zmm12";
  3237. my $ZTMP9 = "%zmm13";
  3238. my $ZTMP10 = "%zmm15";
  3239. my $ZTMP11 = "%zmm16";
  3240. my $ZTMP12 = "%zmm17";
  3241. my $ZTMP13 = "%zmm19";
  3242. my $ZTMP14 = "%zmm20";
  3243. my $ZTMP15 = "%zmm21";
  3244. my $ZTMP16 = "%zmm30";
  3245. my $ZTMP17 = "%zmm31";
  3246. my $ZTMP18 = "%zmm1";
  3247. my $ZTMP19 = "%zmm18";
  3248. my $ZTMP20 = "%zmm8";
  3249. my $ZTMP21 = "%zmm22";
  3250. my $ZTMP22 = "%zmm23";
  3251. my $GH = "%zmm24";
  3252. my $GL = "%zmm25";
  3253. my $GM = "%zmm26";
  3254. my $SHUF_MASK = "%zmm29";
  3255. # ; Unused in the small packet path
  3256. my $ADDBE_4x4 = "%zmm27";
  3257. my $ADDBE_1234 = "%zmm28";
  3258. my $MASKREG = "%k1";
  3259. my $label_suffix = $label_count++;
  3260. # ;; reduction every 48 blocks, depth 32 blocks
  3261. # ;; @note 48 blocks is the maximum capacity of the stack frame
  3262. my $big_loop_nblocks = 48;
  3263. my $big_loop_depth = 32;
  3264. # ;;; Macro flow depending on packet size
  3265. # ;;; - LENGTH <= 16 blocks
  3266. # ;;; - cipher followed by hashing (reduction)
  3267. # ;;; - 16 blocks < LENGTH < 32 blocks
  3268. # ;;; - cipher 16 blocks
  3269. # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
  3270. # ;;; - 32 blocks < LENGTH < 48 blocks
  3271. # ;;; - cipher 2 x 16 blocks
  3272. # ;;; - hash 16 blocks
  3273. # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
  3274. # ;;; - LENGTH >= 48 blocks
  3275. # ;;; - cipher 2 x 16 blocks
  3276. # ;;; - while (data_to_cipher >= 48 blocks):
  3277. # ;;; - cipher 16 blocks & hash 16 blocks
  3278. # ;;; - cipher 16 blocks & hash 16 blocks
  3279. # ;;; - cipher 16 blocks & hash 16 blocks (reduction)
  3280. # ;;; - if (data_to_cipher >= 32 blocks):
  3281. # ;;; - cipher 16 blocks & hash 16 blocks
  3282. # ;;; - cipher 16 blocks & hash 16 blocks
  3283. # ;;; - hash 16 blocks (reduction)
  3284. # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
  3285. # ;;; - elif (data_to_cipher >= 16 blocks):
  3286. # ;;; - cipher 16 blocks & hash 16 blocks
  3287. # ;;; - hash 16 blocks
  3288. # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
  3289. # ;;; - else:
  3290. # ;;; - hash 16 blocks
  3291. # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
  3292. if ($win64) {
  3293. $code .= "cmpq \$0,$PLAIN_CIPH_LEN\n";
  3294. } else {
  3295. $code .= "or $PLAIN_CIPH_LEN,$PLAIN_CIPH_LEN\n";
  3296. }
  3297. $code .= "je .L_enc_dec_done_${label_suffix}\n";
  # Length value from context `$CTX_OFFSET_InLen`($GCM128_CTX) is updated in
  # 'providers/implementations/ciphers/cipher_aes_gcm_hw_vaes_avx512.inc'
  3300. $code .= "xor $HKEYS_READY, $HKEYS_READY\n";
  3301. $code .= "vmovdqu64 `$CTX_OFFSET_AadHash`($GCM128_CTX),$AAD_HASHx\n";
  3302. # ;; Used for the update flow - if there was a previous partial
  3303. # ;; block fill the remaining bytes here.
  3304. &PARTIAL_BLOCK(
  3305. $GCM128_CTX, $PBLOCK_LEN, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN,
  3306. $DATA_OFFSET, $AAD_HASHx, $ENC_DEC, $IA0, $IA1,
  3307. $IA2, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3,
  3308. $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $MASKREG);
  3309. $code .= "vmovdqu64 `$CTX_OFFSET_CurCount`($GCM128_CTX),$CTR_BLOCKx\n";
  3310. # ;; Save the amount of data left to process in $LENGTH
  3311. # ;; NOTE: PLAIN_CIPH_LEN is a register on linux;
  3312. if ($win64) {
  3313. $code .= "mov $PLAIN_CIPH_LEN,$LENGTH\n";
  3314. }
  3315. # ;; There may be no more data if it was consumed in the partial block.
  3316. $code .= <<___;
  3317. sub $DATA_OFFSET,$LENGTH
  3318. je .L_enc_dec_done_${label_suffix}
  3319. ___
  3320. $code .= <<___;
  3321. cmp \$`(16 * 16)`,$LENGTH
  3322. jbe .L_message_below_equal_16_blocks_${label_suffix}
  3323. vmovdqa64 SHUF_MASK(%rip),$SHUF_MASK
  3324. vmovdqa64 ddq_addbe_4444(%rip),$ADDBE_4x4
  3325. vmovdqa64 ddq_addbe_1234(%rip),$ADDBE_1234
  3326. # ;; start the pipeline
  3327. # ;; - 32 blocks aes-ctr
  3328. # ;; - 16 blocks ghash + aes-ctr
  3329. # ;; set up CTR_CHECK
  3330. vmovd $CTR_BLOCKx,@{[DWORD($CTR_CHECK)]}
  3331. and \$255,@{[DWORD($CTR_CHECK)]}
  3332. # ;; in LE format after init, convert to BE
  3333. vshufi64x2 \$0,$CTR_BLOCKz,$CTR_BLOCKz,$CTR_BLOCKz
  3334. vpshufb $SHUF_MASK,$CTR_BLOCKz,$CTR_BLOCKz
  3335. ___
  3336. # ;; ==== AES-CTR - first 16 blocks
  3337. my $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  3338. my $data_in_out_offset = 0;
  3339. &INITIAL_BLOCKS_16(
  3340. $PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS, $DATA_OFFSET, "no_ghash", $CTR_BLOCKz,
  3341. $CTR_CHECK, $ADDBE_4x4, $ADDBE_1234, $ZTMP0, $ZTMP1, $ZTMP2,
  3342. $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8,
  3343. $SHUF_MASK, $ENC_DEC, $aesout_offset, $data_in_out_offset, $IA0);
  3344. &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
  3345. "first16");
  3346. $code .= <<___;
  3347. cmp \$`(32 * 16)`,$LENGTH
  3348. jb .L_message_below_32_blocks_${label_suffix}
  3349. ___
  3350. # ;; ==== AES-CTR - next 16 blocks
  3351. $aesout_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
  3352. $data_in_out_offset = (16 * 16);
  3353. &INITIAL_BLOCKS_16(
  3354. $PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS, $DATA_OFFSET, "no_ghash", $CTR_BLOCKz,
  3355. $CTR_CHECK, $ADDBE_4x4, $ADDBE_1234, $ZTMP0, $ZTMP1, $ZTMP2,
  3356. $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8,
  3357. $SHUF_MASK, $ENC_DEC, $aesout_offset, $data_in_out_offset, $IA0);
  3358. &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
  3359. "last32");
  3360. $code .= "mov \$1,$HKEYS_READY\n";
  3361. $code .= <<___;
  3362. add \$`(32 * 16)`,$DATA_OFFSET
  3363. sub \$`(32 * 16)`,$LENGTH
  3364. cmp \$`($big_loop_nblocks * 16)`,$LENGTH
  3365. jb .L_no_more_big_nblocks_${label_suffix}
  3366. ___
  3367. # ;; ====
  3368. # ;; ==== AES-CTR + GHASH - 48 blocks loop
  3369. # ;; ====
  3370. $code .= ".L_encrypt_big_nblocks_${label_suffix}:\n";
  3371. # ;; ==== AES-CTR + GHASH - 16 blocks, start
  3372. $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
  3373. $data_in_out_offset = (0 * 16);
  3374. my $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  3375. &GHASH_16_ENCRYPT_16_PARALLEL(
  3376. $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
  3377. 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
  3378. $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
  3379. $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
  3380. $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
  3381. $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
  3382. $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
  3383. $IA0);
  3384. # ;; ==== AES-CTR + GHASH - 16 blocks, no reduction
  3385. $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  3386. $data_in_out_offset = (16 * 16);
  3387. $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
  3388. &GHASH_16_ENCRYPT_16_PARALLEL(
  3389. $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
  3390. 32, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
  3391. $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
  3392. $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
  3393. $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
  3394. $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
  3395. $GH, $GM, "no_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
  3396. $IA0);
  3397. # ;; ==== AES-CTR + GHASH - 16 blocks, reduction
  3398. $aesout_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
  3399. $data_in_out_offset = (32 * 16);
  3400. $ghashin_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
  3401. &GHASH_16_ENCRYPT_16_PARALLEL(
  3402. $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
  3403. 16, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
  3404. $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
  3405. $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
  3406. $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
  3407. $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
  3408. $GH, $GM, "final_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
  3409. $IA0);
  3410. # ;; === xor cipher block 0 with GHASH (ZT4)
  3411. $code .= <<___;
  3412. vmovdqa64 $ZTMP4,$AAD_HASHz
  3413. add \$`($big_loop_nblocks * 16)`,$DATA_OFFSET
  3414. sub \$`($big_loop_nblocks * 16)`,$LENGTH
  3415. cmp \$`($big_loop_nblocks * 16)`,$LENGTH
  3416. jae .L_encrypt_big_nblocks_${label_suffix}
  3417. .L_no_more_big_nblocks_${label_suffix}:
  3418. cmp \$`(32 * 16)`,$LENGTH
  3419. jae .L_encrypt_32_blocks_${label_suffix}
  3420. cmp \$`(16 * 16)`,$LENGTH
  3421. jae .L_encrypt_16_blocks_${label_suffix}
  3422. ___
  3423. # ;; =====================================================
  3424. # ;; =====================================================
  3425. # ;; ==== GHASH 1 x 16 blocks
  3426. # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
  3427. # ;; ==== then GHASH N blocks
  3428. $code .= ".L_encrypt_0_blocks_ghash_32_${label_suffix}:\n";
  3429. # ;; calculate offset to the right hash key
  3430. $code .= <<___;
  3431. mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}
  3432. and \$~15,@{[DWORD($IA0)]}
  3433. mov \$`@{[HashKeyOffsetByIdx(32,"frame")]}`,@{[DWORD($HASHK_PTR)]}
  3434. sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
  3435. ___
  3436. # ;; ==== GHASH 32 blocks and follow with reduction
  3437. &GHASH_16("start", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (0 * 16),
  3438. "%rsp", $HASHK_PTR, 0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);
  3439. # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
  3440. $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
  3441. $code .= "add \$`(16 * 16)`,@{[DWORD($HASHK_PTR)]}\n";
  3442. &GCM_ENC_DEC_LAST(
  3443. $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
  3444. $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
  3445. $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
  3446. $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
  3447. $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
  3448. $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
  3449. "mid", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
  3450. $IA0, $IA5, $MASKREG, $PBLOCK_LEN);
  3451. $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
  3452. $code .= "jmp .L_ghash_done_${label_suffix}\n";
  3453. # ;; =====================================================
  3454. # ;; =====================================================
  3455. # ;; ==== GHASH & encrypt 1 x 16 blocks
  3456. # ;; ==== GHASH & encrypt 1 x 16 blocks
  3457. # ;; ==== GHASH 1 x 16 blocks (reduction)
  3458. # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
  3459. # ;; ==== then GHASH N blocks
  3460. $code .= ".L_encrypt_32_blocks_${label_suffix}:\n";
  3461. # ;; ==== AES-CTR + GHASH - 16 blocks, start
  3462. $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
  3463. $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  3464. $data_in_out_offset = (0 * 16);
  3465. &GHASH_16_ENCRYPT_16_PARALLEL(
  3466. $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
  3467. 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
  3468. $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
  3469. $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
  3470. $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
  3471. $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
  3472. $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
  3473. $IA0);
  3474. # ;; ==== AES-CTR + GHASH - 16 blocks, no reduction
  3475. $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  3476. $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
  3477. $data_in_out_offset = (16 * 16);
  3478. &GHASH_16_ENCRYPT_16_PARALLEL(
  3479. $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
  3480. 32, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
  3481. $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
  3482. $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
  3483. $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
  3484. $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
  3485. $GH, $GM, "no_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
  3486. $IA0);
  3487. # ;; ==== GHASH 16 blocks with reduction
  3488. &GHASH_16(
  3489. "end_reduce", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (32 * 16),
  3490. "%rsp", &HashKeyOffsetByIdx(16, "frame"),
  3491. 0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);
  3492. # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
  3493. $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  3494. $code .= <<___;
  3495. sub \$`(32 * 16)`,$LENGTH
  3496. add \$`(32 * 16)`,$DATA_OFFSET
  3497. ___
  3498. # ;; calculate offset to the right hash key
  3499. $code .= "mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n";
  3500. $code .= <<___;
  3501. and \$~15,@{[DWORD($IA0)]}
  3502. mov \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]}
  3503. sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
  3504. ___
  3505. &GCM_ENC_DEC_LAST(
  3506. $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
  3507. $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
  3508. $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
  3509. $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
  3510. $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
  3511. $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
  3512. "start", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
  3513. $IA0, $IA5, $MASKREG, $PBLOCK_LEN);
  3514. $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
  3515. $code .= "jmp .L_ghash_done_${label_suffix}\n";
  3516. # ;; =====================================================
  3517. # ;; =====================================================
  3518. # ;; ==== GHASH & encrypt 16 blocks (done before)
  3519. # ;; ==== GHASH 1 x 16 blocks
  3520. # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
  3521. # ;; ==== then GHASH N blocks
  3522. $code .= ".L_encrypt_16_blocks_${label_suffix}:\n";
  3523. # ;; ==== AES-CTR + GHASH - 16 blocks, start
  3524. $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
  3525. $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  3526. $data_in_out_offset = (0 * 16);
  3527. &GHASH_16_ENCRYPT_16_PARALLEL(
  3528. $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
  3529. 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
  3530. $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
  3531. $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
  3532. $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
  3533. $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
  3534. $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
  3535. $IA0);
  3536. # ;; ==== GHASH 1 x 16 blocks
  3537. &GHASH_16(
  3538. "mid", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (16 * 16),
  3539. "%rsp", &HashKeyOffsetByIdx(32, "frame"),
  3540. 0, "no_hash_input", $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);
  3541. # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
  3542. $ghashin_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
  3543. $code .= <<___;
  3544. sub \$`(16 * 16)`,$LENGTH
  3545. add \$`(16 * 16)`,$DATA_OFFSET
  3546. ___
  3547. &GCM_ENC_DEC_LAST(
  3548. $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,
  3549. $DATA_OFFSET, $LENGTH, $CTR_BLOCKz, $CTR_CHECK,
  3550. &HashKeyOffsetByIdx(16, "frame"), $ghashin_offset, $SHUF_MASK, $ZTMP0,
  3551. $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4,
  3552. $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8,
  3553. $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
  3554. $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16,
  3555. $ZTMP17, $ZTMP18, $ZTMP19, $ZTMP20,
  3556. $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
  3557. "end_reduce", $GL, $GH, $GM,
  3558. $ENC_DEC, $AAD_HASHz, $IA0, $IA5,
  3559. $MASKREG, $PBLOCK_LEN);
  3560. $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
  3561. $code .= <<___;
  3562. jmp .L_ghash_done_${label_suffix}
  3563. .L_message_below_32_blocks_${label_suffix}:
  3564. # ;; 32 > number of blocks > 16
  3565. sub \$`(16 * 16)`,$LENGTH
  3566. add \$`(16 * 16)`,$DATA_OFFSET
  3567. ___
  3568. $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  3569. # ;; calculate offset to the right hash key
  3570. $code .= "mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n";
  3571. &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
  3572. "mid16");
  3573. $code .= "mov \$1,$HKEYS_READY\n";
  3574. $code .= <<___;
  3575. and \$~15,@{[DWORD($IA0)]}
  3576. mov \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]}
  3577. sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
  3578. ___
  3579. &GCM_ENC_DEC_LAST(
  3580. $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
  3581. $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
  3582. $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
  3583. $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
  3584. $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
  3585. $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
  3586. "start", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
  3587. $IA0, $IA5, $MASKREG, $PBLOCK_LEN);
  3588. $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
  3589. $code .= <<___;
  3590. jmp .L_ghash_done_${label_suffix}
  3591. .L_message_below_equal_16_blocks_${label_suffix}:
  3592. # ;; Determine how many blocks to process
  3593. # ;; - process one additional block if there is a partial block
  3594. mov @{[DWORD($LENGTH)]},@{[DWORD($IA1)]}
  3595. add \$15,@{[DWORD($IA1)]}
  3596. shr \$4, @{[DWORD($IA1)]} # ; $IA1 can be in the range from 0 to 16
  3597. ___
  3598. &GCM_ENC_DEC_SMALL(
  3599. $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, $ENC_DEC,
  3600. $DATA_OFFSET, $LENGTH, $IA1, $CTR_BLOCKx, $AAD_HASHx, $ZTMP0,
  3601. $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
  3602. $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
  3603. $ZTMP13, $ZTMP14, $IA0, $IA3, $MASKREG, $SHUF_MASK,
  3604. $PBLOCK_LEN);
  3605. # ;; fall through to exit
  3606. $code .= ".L_ghash_done_${label_suffix}:\n";
  3607. # ;; save the last counter block
  3608. $code .= "vmovdqu64 $CTR_BLOCKx,`$CTX_OFFSET_CurCount`($GCM128_CTX)\n";
  3609. $code .= <<___;
  3610. vmovdqu64 $AAD_HASHx,`$CTX_OFFSET_AadHash`($GCM128_CTX)
  3611. .L_enc_dec_done_${label_suffix}:
  3612. ___
  3613. }
  # ;;; ===========================================================================
  # ;;; INITIAL_BLOCKS_16
  # ;;;
  # ;;; Appends to $code the instructions that AES-CTR encrypt/decrypt one batch
  # ;;; of 16 blocks and park the byte-shuffled cipher text on the stack frame.
  # ;;;
  # ;;; Emitted sequence:
  # ;;;   1. Build four ZMMs of counter blocks from $CTR. When the byte view of
  # ;;;      $CTR_CHECK is >= 256 - 16 the overflow path is taken: shuffle $CTR,
  # ;;;      add the ddq_add_1234 / ddq_add_4444 constants, shuffle back.
  # ;;;   2. Broadcast the top 128-bit lane of the last counter ZMM into $CTR
  # ;;;      and add 16 to the byte view of $CTR_CHECK.
  # ;;;   3. Load 4 x 64 bytes from $IN, run AES round keys 0 .. $NROUNDS+1.
  # ;;;   4. XOR the key stream with the loaded text and store it through $OUT.
  # ;;;   5. Byte-shuffle the cipher text (the loaded input when $ENC_DEC is
  # ;;;      "DEC", the freshly produced output otherwise), XOR $GHASH into the
  # ;;;      first ZMM unless $GHASH is "no_ghash", and store all four ZMMs at
  # ;;;      $BLK_OFFSET(%rsp).
  # ;;;
  # ;;; Uses the globals $code, $NROUNDS and $label_count (one label id is consumed).
  sub INITIAL_BLOCKS_16 {
    my (
      $IN,             # [in] input buffer
      $OUT,            # [in] output buffer
      $AES_KEYS,       # [in] pointer to expanded keys
      $DATA_OFFSET,    # [in] data offset
      $GHASH,          # [in] ZMM with AAD (low 128 bits), or the string "no_ghash"
      $CTR,            # [in/out] ZMM with CTR BE blocks 4x128 bits (rewritten in step 2)
      $CTR_CHECK,      # [in/out] GPR with counter overflow check
      $ADDBE_4x4,      # [in] ZMM 4x128bits with value 4 (big endian)
      $ADDBE_1234,     # [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian)
      $T0, $T1, $T2, $T3,    # [clobbered] temporary ZMM registers (loaded text)
      $T4,                   # [clobbered] temporary ZMM register (broadcast round key)
      $T5, $T6, $T7, $T8,    # [clobbered] temporary ZMM registers (blocks 0-3 .. 12-15)
      $SHUF_MASK,      # [in] ZMM with BE/LE shuffle mask
      $ENC_DEC,        # [in] ENC (encrypt) or DEC (decrypt) selector
      $BLK_OFFSET,     # [in] stack frame offset to ciphered blocks
      $DATA_DISPL,     # [in] fixed numerical data displacement/offset
      $IA0             # [clobbered] temporary GP register
    ) = @_;

    my @text   = ($T0, $T1, $T2, $T3);    # 64 bytes of input text each
    my @blocks = ($T5, $T6, $T7, $T8);    # counter blocks 0-3, 4-7, 8-11, 12-15
    my $rkey   = $T4;                     # current AES round key
    my $tail   = $blocks[3];              # blocks 12-15
    my $id     = $label_count++;          # makes the local labels unique

    # Emit "<insn> <src>,<blk>,<blk>" for each of the four block registers in order.
    my $for_each_block = sub {
      my ($insn, $src) = @_;
      foreach my $blk (@blocks) {
        $code .= "        $insn $src,$blk,$blk\n";
      }
    };

    # Step 1: counter blocks (fast path, then the overflow path up to its adds).
    $code .= <<___;
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; prepare counter blocks
        cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]}
        jae .L_next_16_overflow_${id}
        vpaddd $ADDBE_1234,$CTR,$blocks[0]
        vpaddd $ADDBE_4x4,$blocks[0],$blocks[1]
        vpaddd $ADDBE_4x4,$blocks[1],$blocks[2]
        vpaddd $ADDBE_4x4,$blocks[2],$blocks[3]
        jmp .L_next_16_ok_${id}
.L_next_16_overflow_${id}:
        vpshufb $SHUF_MASK,$CTR,$CTR
        vmovdqa64 ddq_add_4444(%rip),$tail
        vpaddd ddq_add_1234(%rip),$CTR,$blocks[0]
        vpaddd $tail,$blocks[0],$blocks[1]
        vpaddd $tail,$blocks[1],$blocks[2]
        vpaddd $tail,$blocks[2],$tail
___
    # Overflow path only: shuffle the four results back.
    $for_each_block->("vpshufb", $SHUF_MASK);

    # Step 2: both paths join; refresh $CTR and the overflow tracker.
    $code .= <<___;
.L_next_16_ok_${id}:
        vshufi64x2 \$0b11111111,$tail,$tail,$CTR
        addb \$16,@{[BYTE($CTR_CHECK)]}
        # ;; === load 16 blocks of data
___

    # Step 3: load the text, then run the AES rounds.
    foreach my $i (0 .. $#text) {
      $code .= "        vmovdqu8 `$DATA_DISPL + (64*$i)`($IN,$DATA_OFFSET,1),$text[$i]\n";
    }

    $code .= <<___;
        # ;; move to AES encryption rounds
        vbroadcastf64x2 `(16*0)`($AES_KEYS),$rkey
___
    $for_each_block->("vpxorq", $rkey);

    foreach my $round (1 .. $NROUNDS) {
      $code .= "        vbroadcastf64x2 `(16*$round)`($AES_KEYS),$rkey\n";
      $for_each_block->("vaesenc", $rkey);
    }

    $code .= "        vbroadcastf64x2 `(16*($NROUNDS+1))`($AES_KEYS),$rkey\n";
    $for_each_block->("vaesenclast", $rkey);

    # Step 4: combine with the text and write the result out.
    $code .= "        # ;; xor against text\n";
    foreach my $i (0 .. $#blocks) {
      $code .= "        vpxorq $text[$i],$blocks[$i],$blocks[$i]\n";
    }

    $code .= <<___;
        # ;; store
        mov $OUT, $IA0
___
    foreach my $i (0 .. $#blocks) {
      $code .= "        vmovdqu8 $blocks[$i],`$DATA_DISPL + (64*$i)`($IA0,$DATA_OFFSET,1)\n";
    }

    # Step 5: the GHASH input is always the cipher text, byte-shuffled.
    if ($ENC_DEC eq "DEC") {
      $code .= "        # ;; decryption - cipher text needs to go to GHASH phase\n";
      foreach my $i (0 .. $#blocks) {
        $code .= "        vpshufb $SHUF_MASK,$text[$i],$blocks[$i]\n";
      }
    } else {
      $code .= "        # ;; encryption\n";
      $for_each_block->("vpshufb", $SHUF_MASK);
    }

    if ($GHASH ne "no_ghash") {
      $code .= <<___;
        # ;; === xor cipher block 0 with GHASH for the next GHASH round
        vpxorq $GHASH,$blocks[0],$blocks[0]
___
    }

    # Save the 16 shuffled blocks on the stack frame.
    my $spill = "";
    foreach my $i (0 .. $#blocks) {
      $spill .= "        vmovdqa64 $blocks[$i],`$BLK_OFFSET + ($i * 64)`(%rsp)\n";
    }
    $code .= $spill;
  }
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ; GCM_COMPLETE Finishes ghash calculation
  # ;
  # ; Appends to $code the instructions that:
  # ;   - load hash key #1, E(K,Y0) and the running hash from the context,
  # ;   - run one extra GHASH_MUL when $PBLOCK_LEN is non-zero,
  # ;   - fold len(A)||len(C) (converted from bytes to bits) into the hash and
  # ;     multiply once more,
  # ;   - byte-swap the hash, XOR it with E(K,Y0) and store the result at the
  # ;     AadHash offset of the context.
  # ;
  # ; Arguments:
  # ;   $_[0]  register holding the gcm128 context pointer
  # ;   $_[1]  operand holding the partial block length
  # ;
  # ; Consumes one value of the global $label_count.
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  sub GCM_COMPLETE {
    my ($GCM128_CTX, $PBLOCK_LEN) = @_;

    my $id = $label_count++;

    # Fixed register assignment used by the emitted code.
    my $HKEY = "%xmm2";                         # hash key #1
    my $EK0  = "%xmm3";                         # E(K,Y0), later the final result
    my $HASH = "%xmm4";                         # running GHASH value
    my $LENS = "%xmm5";                         # len(A)||len(C)
    my @MUL_TMP = ("%xmm0", "%xmm16", "%xmm17");    # GHASH_MUL temporaries

    $code .= <<___;
        vmovdqu @{[HashKeyByIdx(1,$GCM128_CTX)]},$HKEY
        vmovdqu $CTX_OFFSET_EK0($GCM128_CTX),$EK0 # ; xmm3 = E(K,Y0)
        vmovdqu `$CTX_OFFSET_AadHash`($GCM128_CTX),$HASH
        # ;; Process the final partial block.
        cmp \$0,$PBLOCK_LEN
        je .L_partial_done_${id}
___

    # ;GHASH computation for the last <16 Byte block
    &GHASH_MUL($HASH, $HKEY, @MUL_TMP);

    $code .= <<___;
.L_partial_done_${id}:
        vmovq `$CTX_OFFSET_InLen`($GCM128_CTX), $LENS
        vpinsrq \$1, `$CTX_OFFSET_AadLen`($GCM128_CTX), $LENS, $LENS # ; xmm5 = len(A)||len(C)
        vpsllq \$3, $LENS, $LENS # ; convert bytes into bits
        vpxor $LENS,$HASH,$HASH
___

    # Multiply in the length block.
    &GHASH_MUL($HASH, $HKEY, @MUL_TMP);

    $code .= <<___;
        vpshufb SHUF_MASK(%rip),$HASH,$HASH # ; perform a 16Byte swap
        vpxor $HASH,$EK0,$EK0
.L_return_T_${id}:
        vmovdqu $EK0,`$CTX_OFFSET_AadHash`($GCM128_CTX)
___
  }
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;;; Functions definitions
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  $code .= ".text\n";

  {
    # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    # ;void ossl_aes_gcm_init_avx512 /
    # ;       (const void *aes_keys,
    # ;        void *gcm128ctx)
    # ;
    # ; Precomputes hashkey table for GHASH optimization.
    # ; Leaf function (does not allocate stack space, does not use non-volatile registers).
    # ;
    # ; Emitted flow: encrypt an all-zero block to obtain the hash key, byte-swap
    # ;  it, compute HashKey<<1 mod poly, store it as hash key #1 and let
    # ;  PRECOMPUTE() emit the code for the remaining table entries.
    # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    my $HKEY = "%xmm16";    # register carrying the hash key throughout

    $code .= <<___;
.globl ossl_aes_gcm_init_avx512
.type ossl_aes_gcm_init_avx512,\@abi-omnipotent
.align 32
ossl_aes_gcm_init_avx512:
.cfi_startproc
        endbranch
___

    if ($CHECK_FUNCTION_ARGUMENTS) {
      # Both pointer arguments get the same NULL guard.
      foreach my $check ([ "aes_keys", $arg1 ], [ "gcm128ctx", $arg2 ]) {
        my ($what, $reg) = @$check;
        $code .= <<___;
        # ;; Check $what != NULL
        test $reg,$reg
        jz .Labort_init
___
      }
    }

    # Zero the block, then encrypt it with the expanded key.
    $code .= "vpxorq $HKEY,$HKEY,$HKEY\n";
    &ENCRYPT_SINGLE_BLOCK("$arg1", $HKEY, "%rax");    # ; xmm16 = HashKey

    $code .= <<___;
        vpshufb SHUF_MASK(%rip),$HKEY,$HKEY
        # ;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey ;;;
        vmovdqa64 $HKEY,%xmm2
        vpsllq \$1,$HKEY,$HKEY
        vpsrlq \$63,%xmm2,%xmm2
        vmovdqa %xmm2,%xmm1
        vpslldq \$8,%xmm2,%xmm2
        vpsrldq \$8,%xmm1,%xmm1
        vporq %xmm2,$HKEY,$HKEY
        # ;reduction
        vpshufd \$0b00100100,%xmm1,%xmm2
        vpcmpeqd TWOONE(%rip),%xmm2,%xmm2
        vpand POLY(%rip),%xmm2,%xmm2
        vpxorq %xmm2,$HKEY,$HKEY # ; xmm16 holds the HashKey<<1 mod poly
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vmovdqu64 $HKEY,@{[HashKeyByIdx(1,$arg2)]} # ; store HashKey<<1 mod poly
___

    # Remaining hash keys; %xmm0 .. %xmm5 are handed over as extra registers.
    &PRECOMPUTE("$arg2", $HKEY, map("%xmm$_", 0 .. 5));

    if (!$CLEAR_SCRATCH_REGISTERS) {
      $code .= "vzeroupper\n";
    } else {
      &clear_scratch_gps_asm();
      &clear_scratch_zmms_asm();
    }

    $code .= <<___;
.Labort_init:
        ret
.cfi_endproc
.size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512
___
  }
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;void ossl_aes_gcm_setiv_avx512
  # ;       (const void *aes_keys,
  # ;        void *gcm128ctx,
  # ;        const unsigned char *iv,
  # ;        size_t ivlen)
  # ;
  # ; Computes E(K,Y0) for finalization, updates current counter Yi in gcm128_context structure.
  # ;
  # ; Generator layout: entry labels, optional argument checks (emitted before
  # ;  PROLOG(), so a failed check goes straight to the final ret), PROLOG(),
  # ;  GCM_INIT_IV(), EPILOG(), exit label + ret.
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  $code .= <<___;
.globl ossl_aes_gcm_setiv_avx512
.type ossl_aes_gcm_setiv_avx512,\@abi-omnipotent
.align 32
ossl_aes_gcm_setiv_avx512:
.cfi_startproc
.Lsetiv_seh_begin:
        endbranch
___

  if ($CHECK_FUNCTION_ARGUMENTS) {
    # Every argument must be non-zero; each gets the same test/jz guard.
    foreach my $check (
      [ $arg1, "aes_keys != NULL" ],
      [ $arg2, "gcm128ctx != NULL" ],
      [ $arg3, "iv != NULL" ],
      [ $arg4, "ivlen != 0" ])
    {
      my ($reg, $condition) = @$check;
      $code .= <<___;
        # ;; Check $condition
        test $reg,$reg
        jz .Labort_setiv
___
    }
  }

  # ; NOTE: code before PROLOG() must not modify any registers
  &PROLOG(
    1,    # allocate stack space for hkeys
    0,    # do not allocate stack space for AES blocks
    "setiv");

  # The four function arguments followed by the working registers, in the
  # positional order GCM_INIT_IV() expects.
  &GCM_INIT_IV(
    "$arg1", "$arg2", "$arg3", "$arg4",
    "%r10", "%r11", "%r12",
    "%k1",
    "%xmm2",
    map("%zmm$_", 1, 11, 3 .. 10, 12, 13, 15 .. 19));

  &EPILOG(
    1,    # hkeys were allocated
    $arg4);

  $code .= <<___;
.Labort_setiv:
        ret
.Lsetiv_seh_end:
.cfi_endproc
.size ossl_aes_gcm_setiv_avx512, .-ossl_aes_gcm_setiv_avx512
___
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;void ossl_aes_gcm_update_aad_avx512
  # ;       (unsigned char *gcm128ctx,
  # ;        const unsigned char *aad,
  # ;        size_t aadlen)
  # ;
  # ; Updates AAD hash in gcm128_context structure.
  # ;
  # ; Generator layout: entry labels, optional argument checks (emitted before
  # ;  PROLOG(), so a failed check goes straight to the final ret), PROLOG(),
  # ;  GCM_UPDATE_AAD(), EPILOG(), exit label + ret.
  # ; The SEH labels of this function use the "ghash" tag.
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  $code .= <<___;
.globl ossl_aes_gcm_update_aad_avx512
.type ossl_aes_gcm_update_aad_avx512,\@abi-omnipotent
.align 32
ossl_aes_gcm_update_aad_avx512:
.cfi_startproc
.Lghash_seh_begin:
        endbranch
___

  if ($CHECK_FUNCTION_ARGUMENTS) {
    # Every argument must be non-zero; each gets the same test/jz guard.
    foreach my $check (
      [ $arg1, "gcm128ctx != NULL" ],
      [ $arg2, "aad != NULL" ],
      [ $arg3, "aadlen != 0" ])
    {
      my ($reg, $condition) = @$check;
      $code .= <<___;
        # ;; Check $condition
        test $reg,$reg
        jz .Lexit_update_aad
___
    }
  }

  # ; NOTE: code before PROLOG() must not modify any registers
  &PROLOG(
    1,    # allocate stack space for hkeys,
    0,    # do not allocate stack space for AES blocks
    "ghash");

  # The three function arguments followed by the working registers, in the
  # positional order GCM_UPDATE_AAD() expects.
  &GCM_UPDATE_AAD(
    "$arg1", "$arg2", "$arg3",
    "%r10", "%r11", "%r12",
    "%k1",
    "%xmm14",
    map("%zmm$_", 1, 11, 3 .. 10, 12, 13, 15 .. 19));

  &EPILOG(
    1,    # hkeys were allocated
    $arg3);

  $code .= <<___;
.Lexit_update_aad:
        ret
.Lghash_seh_end:
.cfi_endproc
.size ossl_aes_gcm_update_aad_avx512, .-ossl_aes_gcm_update_aad_avx512
___
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;void ossl_aes_gcm_encrypt_avx512
  # ;       (const void* aes_keys,
  # ;        void *gcm128ctx,
  # ;        unsigned int *pblocklen,
  # ;        const unsigned char *in,
  # ;        size_t len,
  # ;        unsigned char *out);
  # ;
  # ; Performs encryption of data |in| of len |len|, and stores the output in |out|.
  # ; Stores encrypted partial block (if any) in gcm128ctx and its length in |pblocklen|.
  # ;
  # ; Generator layout: PROLOG() comes first, so every early exit (failed
  # ;  argument check, unsupported round count) jumps to .Lexit_gcm_encrypt and
  # ;  runs EPILOG(). The round count read from the key structure selects one
  # ;  of the per-key-size bodies generated from %aes_rounds.
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  $code .= <<___;
.globl ossl_aes_gcm_encrypt_avx512
.type ossl_aes_gcm_encrypt_avx512,\@abi-omnipotent
.align 32
ossl_aes_gcm_encrypt_avx512:
.cfi_startproc
.Lencrypt_seh_begin:
        endbranch
___

  # ; NOTE: code before PROLOG() must not modify any registers
  &PROLOG(
    1,    # allocate stack space for hkeys
    1,    # allocate stack space for AES blocks
    "encrypt");

  if ($CHECK_FUNCTION_ARGUMENTS) {
    # Each entry is [ flag-setting instruction, asm comment ]; a zero result
    # leaves through the common exit.
    foreach my $check (
      [ "test $arg1,$arg1", "Check aes_keys != NULL" ],
      [ "test $arg2,$arg2", "Check gcm128ctx != NULL" ],
      [ "test $arg3,$arg3", "Check pblocklen != NULL" ],
      [ "test $arg4,$arg4", "Check in != NULL" ],
      [ "cmp \$0,$arg5",    "Check if len != 0" ],
      [ "cmp \$0,$arg6",    "Check out != NULL" ])
    {
      my ($insn, $note) = @$check;
      $code .= <<___;
        # ;; $note
        $insn
        jz .Lexit_gcm_encrypt
___
    }
  }

  $code .= <<___;
        # ; load number of rounds from AES_KEY structure (offset in bytes is
        # ; size of the |rd_key| buffer)
        mov `4*15*4`($arg1),%eax
___

  # Dispatch on the round count: [ value stored in the key, key size label ].
  foreach my $variant ([ 9, 128 ], [ 11, 192 ], [ 13, 256 ]) {
    my ($rounds, $bits) = @$variant;
    $code .= <<___;
        cmp \$${rounds},%eax
        je .Laes_gcm_encrypt_${bits}_avx512
___
  }

  # No match: clear %eax and leave.
  $code .= <<___;
        xor %eax,%eax
        jmp .Lexit_gcm_encrypt
___

  # One complete body per key size; GCM_ENC_DEC() reads the global $NROUNDS.
  foreach my $bits (sort keys %aes_rounds) {
    $NROUNDS = $aes_rounds{$bits};
    $code .= <<___;
.align 32
.Laes_gcm_encrypt_${bits}_avx512:
___
    &GCM_ENC_DEC("$arg1", "$arg2", "$arg3", "$arg4", "$arg5", "$arg6", "ENC");
    $code .= "jmp .Lexit_gcm_encrypt\n";
  }

  $code .= ".Lexit_gcm_encrypt:\n";
  &EPILOG(1, $arg5);
  $code .= <<___;
        ret
.Lencrypt_seh_end:
.cfi_endproc
.size ossl_aes_gcm_encrypt_avx512, .-ossl_aes_gcm_encrypt_avx512
___
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;void ossl_aes_gcm_decrypt_avx512
  # ;       (const void* keys,
  # ;        void *gcm128ctx,
  # ;        unsigned int *pblocklen,
  # ;        const unsigned char *in,
  # ;        size_t len,
  # ;        unsigned char *out);
  # ;
  # ; Performs decryption of data |in| of len |len|, and stores the output in |out|.
  # ; Stores decrypted partial block (if any) in gcm128ctx and its length in |pblocklen|.
  # ;
  # ; Generator layout: PROLOG() comes first, so every early exit (failed
  # ;  argument check, unsupported round count) jumps to .Lexit_gcm_decrypt and
  # ;  runs EPILOG(). The round count read from the key structure selects one
  # ;  of the per-key-size bodies generated from %aes_rounds.
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  $code .= <<___;
.globl ossl_aes_gcm_decrypt_avx512
.type ossl_aes_gcm_decrypt_avx512,\@abi-omnipotent
.align 32
ossl_aes_gcm_decrypt_avx512:
.cfi_startproc
.Ldecrypt_seh_begin:
        endbranch
___

  # ; NOTE: code before PROLOG() must not modify any registers
  &PROLOG(
    1,    # allocate stack space for hkeys
    1,    # allocate stack space for AES blocks
    "decrypt");

  if ($CHECK_FUNCTION_ARGUMENTS) {
    # Each entry is [ flag-setting instruction, asm comment ]; a zero result
    # leaves through the common exit.
    foreach my $check (
      [ "test $arg1,$arg1", "Check keys != NULL" ],
      [ "test $arg2,$arg2", "Check gcm128ctx != NULL" ],
      [ "test $arg3,$arg3", "Check pblocklen != NULL" ],
      [ "test $arg4,$arg4", "Check in != NULL" ],
      [ "cmp \$0,$arg5",    "Check if len != 0" ],
      [ "cmp \$0,$arg6",    "Check out != NULL" ])
    {
      my ($insn, $note) = @$check;
      $code .= <<___;
        # ;; $note
        $insn
        jz .Lexit_gcm_decrypt
___
    }
  }

  $code .= <<___;
        # ; load number of rounds from AES_KEY structure (offset in bytes is
        # ; size of the |rd_key| buffer)
        mov `4*15*4`($arg1),%eax
___

  # Dispatch on the round count: [ value stored in the key, key size label ].
  foreach my $variant ([ 9, 128 ], [ 11, 192 ], [ 13, 256 ]) {
    my ($rounds, $bits) = @$variant;
    $code .= <<___;
        cmp \$${rounds},%eax
        je .Laes_gcm_decrypt_${bits}_avx512
___
  }

  # No match: clear %eax and leave.
  $code .= <<___;
        xor %eax,%eax
        jmp .Lexit_gcm_decrypt
___

  # One complete body per key size; GCM_ENC_DEC() reads the global $NROUNDS.
  foreach my $bits (sort keys %aes_rounds) {
    $NROUNDS = $aes_rounds{$bits};
    $code .= <<___;
.align 32
.Laes_gcm_decrypt_${bits}_avx512:
___
    &GCM_ENC_DEC("$arg1", "$arg2", "$arg3", "$arg4", "$arg5", "$arg6", "DEC");
    $code .= "jmp .Lexit_gcm_decrypt\n";
  }

  $code .= ".Lexit_gcm_decrypt:\n";
  &EPILOG(1, $arg5);
  $code .= <<___;
        ret
.Ldecrypt_seh_end:
.cfi_endproc
.size ossl_aes_gcm_decrypt_avx512, .-ossl_aes_gcm_decrypt_avx512
___
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;void ossl_aes_gcm_finalize_avx512
  # ;       (void *gcm128ctx,
  # ;       unsigned int pblocklen);
  # ;
  # ; Finalizes encryption / decryption
  # ; Leaf function (does not allocate stack space, does not use non-volatile registers).
  # ;
  # ; The whole body is produced by GCM_COMPLETE(); only the context pointer is
  # ;  checked (when argument checks are enabled).
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  $code .= <<___;
.globl ossl_aes_gcm_finalize_avx512
.type ossl_aes_gcm_finalize_avx512,\@abi-omnipotent
.align 32
ossl_aes_gcm_finalize_avx512:
.cfi_startproc
        endbranch
___

  # Optional guard; skipped entirely when argument checking is disabled.
  $code .= <<___ if ($CHECK_FUNCTION_ARGUMENTS);
        # ;; Check gcm128ctx != NULL
        test $arg1,$arg1
        jz .Labort_finalize
___

  &GCM_COMPLETE("$arg1", "$arg2");

  $code .= <<___;
.Labort_finalize:
        ret
.cfi_endproc
.size ossl_aes_gcm_finalize_avx512, .-ossl_aes_gcm_finalize_avx512
___
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;void ossl_gcm_gmult_avx512(u64 Xi[2],
  # ;                           const void* gcm128ctx)
  # ;
  # ; Leaf function (does not allocate stack space, does not use non-volatile registers).
  # ;
  # ; Emitted flow: load Xi into %xmm1 and hash key #1 into %xmm2, let
  # ;  GHASH_MUL() emit the multiplication, store %xmm1 back to Xi, then either
  # ;  clear the scratch registers or issue vzeroupper.
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  $code .= <<___;
.globl ossl_gcm_gmult_avx512
.hidden ossl_gcm_gmult_avx512
.type ossl_gcm_gmult_avx512,\@abi-omnipotent
.align 32
ossl_gcm_gmult_avx512:
.cfi_startproc
        endbranch
___

  if ($CHECK_FUNCTION_ARGUMENTS) {
    # Both pointer arguments get the same NULL guard.
    foreach my $check ([ "Xi", $arg1 ], [ "gcm128ctx", $arg2 ]) {
      my ($what, $reg) = @$check;
      $code .= <<___;
        # ;; Check $what != NULL
        test $reg,$reg
        jz .Labort_gmult
___
    }
  }

  # Operands: %xmm1 = Xi, %xmm2 = hash key #1.
  $code .= "vmovdqu64 ($arg1),%xmm1\n" . "vmovdqu64 @{[HashKeyByIdx(1,$arg2)]},%xmm2\n";

  # %xmm1 .. %xmm5 in order: the two operands followed by three more registers.
  &GHASH_MUL(map("%xmm$_", 1 .. 5));

  $code .= "vmovdqu64 %xmm1,($arg1)\n";

  if (!$CLEAR_SCRATCH_REGISTERS) {
    $code .= "vzeroupper\n";
  } else {
    &clear_scratch_gps_asm();
    &clear_scratch_zmms_asm();
  }

  $code .= <<___;
.Labort_gmult:
        ret
.cfi_endproc
.size ossl_gcm_gmult_avx512, .-ossl_gcm_gmult_avx512
___
  if ($win64) {

    # Add unwind metadata for SEH.
    # See https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-160
    #
    # For each function that went through PROLOG() this emits a .pdata entry
    # (begin / end / info RVAs) and an .xdata record listing the prolog's
    # unwind codes.

    # Unwind operation codes used below.
    my %UWOP = (
      PUSH_NONVOL => 0,
      ALLOC_LARGE => 1,
      SET_FPREG   => 3,
      SAVE_XMM128 => 8);

    # Register numbers follow list position: rax = 0 ... rdi = 7, r8 = 8 ... r15 = 15.
    my @gpr_order = (qw(rax rcx rdx rbx rsp rbp rsi rdi), map("r$_", 8 .. 15));
    my %reg_number;
    @reg_number{@gpr_order} = (0 .. $#gpr_order);

    # Label tags of the functions that carry SEH markers.
    my @seh_funcs = ("setiv", "ghash", "encrypt", "decrypt");

    # .pdata: one (begin, end, info) triple per function.
    $code .= ".section .pdata\n.align 4\n";
    foreach my $fn (@seh_funcs) {
      foreach my $edge ("begin", "end", "info") {
        $code .= ".rva .L${fn}_seh_${edge}\n";
      }
    }
    $code .= ".section .xdata\n";

    # .xdata: one unwind-info record per function.
    foreach my $fn (@seh_funcs) {
      my $begin = ".L${fn}_seh_begin";

      # Low nibble: frame register (rbp); high nibble: frame offset / 16.
      my $frame_info = $reg_number{rbp} | (($XMM_STORAGE / 16) << 4);

      $code .= <<___;
.align 8
.L${fn}_seh_info:
.byte 1 # version 1, no flags
.byte .L${fn}_seh_prolog_end-$begin
.byte 31 # num_slots = 1*8 + 2 + 1 + 2*10
# FR = rbp; Offset from RSP = $XMM_STORAGE scaled on 16
.byte $frame_info
___

      # Metadata for %xmm15-%xmm6
      # Occupy 2 slots each
      foreach my $xmm (reverse 6 .. 15) {
        my $save_op   = $UWOP{SAVE_XMM128} | ($xmm << 4);
        my $save_slot = $xmm - 6;    # Scaled-by-16 stack offset
        $code .= <<___;
.byte .L${fn}_seh_save_xmm${xmm}-$begin
.byte $save_op
.value $save_slot
___
      }

      $code .= <<___;
# Frame pointer (occupy 1 slot)
.byte .L${fn}_seh_setfp-$begin
.byte $UWOP{SET_FPREG}
# Occupy 2 slots, as stack allocation < 512K, but > 128 bytes
.byte .L${fn}_seh_allocstack_xmm-$begin
.byte $UWOP{ALLOC_LARGE}
.value `($XMM_STORAGE + 8) / 8`
___

      # Metadata for GPR regs
      # Occupy 1 slot each
      foreach my $gpr (qw(rsi rdi r15 r14 r13 r12 rbp rbx)) {
        my $push_op = $UWOP{PUSH_NONVOL} | ($reg_number{$gpr} << 4);
        $code .= <<___;
.byte .L${fn}_seh_push_${gpr}-$begin
.byte $push_op
___
      }
    }
  }
  # Append the .data section holding the constant tables (POLY, POLY2, TWOONE,
  # SHUF_MASK, SHIFT_MASK, ALL_F, ZERO, ONE, ONEf, the ddq_add* increments and
  # the two length-to-mask tables).  Repetitive rows are generated rather than
  # spelled out; the emitted text is the same, line for line.
  {
    # Hex literal with the low $bits bits set, zero-padded to $digits digits.
    # Assembled from strings, so no wide integer arithmetic is involved.
    my $low_bits_hex = sub {
      my ($bits, $digits) = @_;
      my $hex = ('', '1', '3', '7')[$bits % 4] . ('f' x int($bits / 4));
      return '0x' . ('0' x ($digits - length($hex))) . $hex;
    };

    # Pack @values into "$directive v1, v2, ..." lines, $per_line values per
    # line; the last line takes whatever is left over.
    my $directive_lines = sub {
      my ($directive, $per_line, @values) = @_;
      my @lines;
      while (@values) {
        push @lines, "$directive " . join(', ', splice(@values, 0, $per_line));
      }
      return @lines;
    };

    # A 64-byte aligned label followed by one .quad line per row.
    my $quad_table = sub {
      my ($label, @rows) = @_;
      return ('.align 64', "${label}:", map(".quad $_", @rows));
    };

    my $zero64 = '0x0000000000000000';

    # Low qword = $_[0], high qword = 0.
    my $add_row = sub { return sprintf('0x%016x, %s', $_[0], $zero64); };

    # Low qword = 0, top byte of the high qword = $_[0].
    my $addbe_row = sub { return sprintf('%s, 0x%02x%s', $zero64, $_[0], '0' x 14); };

    my @section = (
      '.data',
      '.align 16',
      'POLY: .quad 0x0000000000000001, 0xC200000000000000',
      $quad_table->('POLY2', ('0x00000001C2000000, 0xC200000000000000') x 4),
      '.align 16',
      'TWOONE: .quad 0x0000000000000001, 0x0000000100000000',
      '# ;;; Order of these constants should not change.',
      '# ;;; More specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F',
      $quad_table->('SHUF_MASK', ('0x08090A0B0C0D0E0F, 0x0001020304050607') x 4),
      '.align 16',
      'SHIFT_MASK:',
      '.quad 0x0706050403020100, 0x0f0e0d0c0b0a0908',
      'ALL_F:',
      '.quad 0xffffffffffffffff, 0xffffffffffffffff',
      'ZERO:',
      '.quad 0x0000000000000000, 0x0000000000000000',
      '.align 16',
      'ONE:',
      '.quad 0x0000000000000001, 0x0000000000000000',
      '.align 16',
      'ONEf:',
      '.quad 0x0000000000000000, 0x0100000000000000',
      $quad_table->('ddq_add_1234',   map { $add_row->($_) } (1 .. 4)),
      $quad_table->('ddq_add_5678',   map { $add_row->($_) } (5 .. 8)),
      $quad_table->('ddq_add_4444',   ($add_row->(4)) x 4),
      $quad_table->('ddq_add_8888',   ($add_row->(8)) x 4),
      $quad_table->('ddq_addbe_1234', map { $addbe_row->($_) } (1 .. 4)),
      $quad_table->('ddq_addbe_4444', ($addbe_row->(4)) x 4),

      # Entry n (0..16) is a 16-bit word with the low n bits set.
      '.align 64',
      'byte_len_to_mask_table:',
      $directive_lines->('.value', 4, map { $low_bits_hex->($_, 4) } (0 .. 16)),

      # Entry n (0..64) is a 64-bit word with the low n bits set.
      '.align 64',
      'byte64_len_to_mask_table:',
      $directive_lines->('.quad', 2, map { $low_bits_hex->($_, 16) } (0 .. 64)),
    );

    $code .= join("\n", @section) . "\n";
  }
  4353. } else {
  # Fallback for old assembler
  #
  # Emits two things: ossl_vaes_vpclmulqdq_capable, which clears %eax and
  # returns, and a single stub body shared by all the AES-GCM entry point
  # labels, consisting of the bytes 0x0f,0x0b (ud2) followed by ret.  Only the
  # first entry point gets .type/.size directives; the others are plain labels
  # on the same address.
  {
    my $probe = 'ossl_vaes_vpclmulqdq_capable';
    my @stubs = qw(
      ossl_aes_gcm_init_avx512
      ossl_aes_gcm_setiv_avx512
      ossl_aes_gcm_update_aad_avx512
      ossl_aes_gcm_encrypt_avx512
      ossl_aes_gcm_decrypt_avx512
      ossl_aes_gcm_finalize_avx512
      ossl_gcm_gmult_avx512
    );
    my $first = $stubs[0];

    my @lines = (
      '.text',
      ".globl ${probe}",
      ".type ${probe},\@abi-omnipotent",
      "${probe}:",
      'xor %eax,%eax',
      'ret',
      ".size ${probe}, .-${probe}",
      map(".globl $_", @stubs),
      ".type ${first},\@abi-omnipotent",
      map("${_}:", @stubs),
      '.byte 0x0f,0x0b # ud2',
      'ret',
      ".size ${first}, .-${first}",
    );

    $code .= join("\n", @lines) . "\n";
  }
  4382. }
  # Final pass: replace every `...` span in the generated text with the result
  # of evaluating its contents as a Perl expression, then write the text out.
  # The close is checked so that a failed write does not go unnoticed.
  $code =~ s{`([^`]*)`}{eval $1}gem;
  print $code;
  close(STDOUT) or die "error closing STDOUT: $!";