/* poly1305_asm
 *
 * Copyright (C) 2006-2020 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */
#ifndef HAVE_INTEL_AVX1
#define HAVE_INTEL_AVX1
#endif /* HAVE_INTEL_AVX1 */
#ifndef NO_AVX2_SUPPORT
#define HAVE_INTEL_AVX2
#endif /* NO_AVX2_SUPPORT */
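/* Poly1305 computes a MAC over a 130-bit accumulator h: for every 16-byte
 * message block m, h = (h + m + 2^128) * r (mod 2^130 - 5), and the tag is
 * the low 128 bits of h + pad.  From the loads and stores below, the
 * context in %rdi appears to be laid out as: r at offset 0, h at 24, pad
 * at 48, the four-way AVX2 accumulator at 64, 26-bit powers of r at 224,
 * multiples of r at 352 and 408, buffered message bytes at 480 and the
 * buffered byte count at 608, with flag bytes at 616/617 tracking block
 * padding and whether the four-way state is live.
 */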
#ifdef HAVE_INTEL_AVX1
#ifndef __APPLE__
.text
.globl poly1305_setkey_avx
.type poly1305_setkey_avx,@function
.align 16
poly1305_setkey_avx:
#else
.section __TEXT,__text
.globl _poly1305_setkey_avx
.p2align 4
_poly1305_setkey_avx:
#endif /* __APPLE__ */
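        # Clamp r as the Poly1305 spec (RFC 8439) requires: the masks
        # clear the top four bits of key bytes 3, 7, 11 and 15 and the
        # bottom two bits of bytes 4, 8 and 12, i.e.
        # r &= 0x0ffffffc0ffffffc0ffffffc0fffffff.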
        movabsq $0xffffffc0fffffff, %r10
        movabsq $0xffffffc0ffffffc, %r11
        movq (%rsi), %rdx
        movq 8(%rsi), %rax
        movq 16(%rsi), %rcx
        movq 24(%rsi), %r8
        andq %r10, %rdx
        andq %r11, %rax
        movq %rdx, %r10
        movq %rax, %r11
        xorq %r9, %r9
        movq %rdx, (%rdi)
        movq %rax, 8(%rdi)
        movq %r9, 24(%rdi)
        movq %r9, 32(%rdi)
        movq %r9, 40(%rdi)
        movq %rcx, 48(%rdi)
        movq %r8, 56(%rdi)
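        # Build lookup tables of i*r[0] at 352+8*i and i*r[1] at
        # 408+8*i for i = 0..6, so the block functions can add r*h[2]
        # with a single indexed load instead of another multiply.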
        movq %r9, 352(%rdi)
        movq %r9, 408(%rdi)
        movq %rdx, 360(%rdi)
        movq %rax, 416(%rdi)
        addq %rdx, %r10
        addq %rax, %r11
        movq %r10, 368(%rdi)
        movq %r11, 424(%rdi)
        addq %rdx, %r10
        addq %rax, %r11
        movq %r10, 376(%rdi)
        movq %r11, 432(%rdi)
        addq %rdx, %r10
        addq %rax, %r11
        movq %r10, 384(%rdi)
        movq %r11, 440(%rdi)
        addq %rdx, %r10
        addq %rax, %r11
        movq %r10, 392(%rdi)
        movq %r11, 448(%rdi)
        addq %rdx, %r10
        addq %rax, %r11
        movq %r10, 400(%rdi)
        movq %r11, 456(%rdi)
        movq %r9, 608(%rdi)
        movb $0x01, 616(%rdi)
        repz retq
#ifndef __APPLE__
.size poly1305_setkey_avx,.-poly1305_setkey_avx
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_block_avx
.type poly1305_block_avx,@function
.align 16
poly1305_block_avx:
#else
.section __TEXT,__text
.globl _poly1305_block_avx
.p2align 4
_poly1305_block_avx:
#endif /* __APPLE__ */
        pushq %r15
        pushq %rbx
        pushq %r12
        pushq %r13
        pushq %r14
        movq (%rdi), %r15
        movq 8(%rdi), %rbx
        movq 24(%rdi), %r8
        movq 32(%rdi), %r9
        movq 40(%rdi), %r10
        xorq %r14, %r14
        movb 616(%rdi), %r14b
        # h += m
        movq (%rsi), %r11
        movq 8(%rsi), %r12
        addq %r11, %r8
        adcq %r12, %r9
        movq %rbx, %rax
        adcq %r14, %r10
        # r[1] * h[0] => rdx, rax ==> t2, t1
        mulq %r8
        movq %rax, %r12
        movq %rdx, %r13
        # r[0] * h[1] => rdx, rax ++> t2, t1
        movq %r15, %rax
        mulq %r9
        addq %rax, %r12
        movq %r15, %rax
        adcq %rdx, %r13
        # r[0] * h[0] => rdx, rax ==> t4, t0
        mulq %r8
        movq %rax, %r11
        movq %rdx, %r8
        # r[1] * h[1] => rdx, rax =+> t3, t2
        movq %rbx, %rax
        mulq %r9
        # r[0] * h[2] +> t2
        addq 352(%rdi,%r10,8), %r13
        movq %rdx, %r14
        addq %r8, %r12
        adcq %rax, %r13
        # r[1] * h[2] +> t3
        adcq 408(%rdi,%r10,8), %r14
        # r * h in r14, r13, r12, r11
        # h = (r * h) mod 2^130 - 5
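        # 2^130 = 5 (mod p), so the bits of the product above 2^130
        # (call them c) are folded back in as 5*c = 4*c + c: the
        # masked add below contributes 4*c and the shrdq/shrq pair
        # contributes c.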
        movq %r13, %r10
        andq $-4, %r13
        andq $3, %r10
        addq %r13, %r11
        movq %r13, %r8
        adcq %r14, %r12
        adcq $0x00, %r10
        shrdq $2, %r14, %r8
        shrq $2, %r14
        addq %r11, %r8
        adcq %r14, %r12
        movq %r12, %r9
        adcq $0x00, %r10
        # h in r10, r9, r8
        # Store h to ctx
        movq %r8, 24(%rdi)
        movq %r9, 32(%rdi)
        movq %r10, 40(%rdi)
        popq %r14
        popq %r13
        popq %r12
        popq %rbx
        popq %r15
        repz retq
#ifndef __APPLE__
.size poly1305_block_avx,.-poly1305_block_avx
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_blocks_avx
.type poly1305_blocks_avx,@function
.align 16
poly1305_blocks_avx:
#else
.section __TEXT,__text
.globl _poly1305_blocks_avx
.p2align 4
_poly1305_blocks_avx:
#endif /* __APPLE__ */
        pushq %r15
        pushq %rbx
        pushq %r12
        pushq %r13
        pushq %r14
        movq %rdx, %rcx
        movq (%rdi), %r15
        movq 8(%rdi), %rbx
        movq 24(%rdi), %r8
        movq 32(%rdi), %r9
        movq 40(%rdi), %r10
L_poly1305_avx_blocks_start:
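        # Same multiply and reduce as poly1305_block_avx, one 16-byte
        # block per iteration.  The 2^128 padding bit of each block
        # appears to be folded in via the table index: the r*h[2]
        # loads below start at 360 and 416, one entry past the tables'
        # base, adding (h[2] + 1) * r rather than h[2] * r.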
        # h += m
        movq (%rsi), %r11
        movq 8(%rsi), %r12
        addq %r11, %r8
        adcq %r12, %r9
        movq %rbx, %rax
        adcq $0x00, %r10
        # r[1] * h[0] => rdx, rax ==> t2, t1
        mulq %r8
        movq %rax, %r12
        movq %rdx, %r13
        # r[0] * h[1] => rdx, rax ++> t2, t1
        movq %r15, %rax
        mulq %r9
        addq %rax, %r12
        movq %r15, %rax
        adcq %rdx, %r13
        # r[0] * h[0] => rdx, rax ==> t4, t0
        mulq %r8
        movq %rax, %r11
        movq %rdx, %r8
        # r[1] * h[1] => rdx, rax =+> t3, t2
        movq %rbx, %rax
        mulq %r9
        # r[0] * h[2] +> t2
        addq 360(%rdi,%r10,8), %r13
        movq %rdx, %r14
        addq %r8, %r12
        adcq %rax, %r13
        # r[1] * h[2] +> t3
        adcq 416(%rdi,%r10,8), %r14
        # r * h in r14, r13, r12, r11
        # h = (r * h) mod 2^130 - 5
        movq %r13, %r10
        andq $-4, %r13
        andq $3, %r10
        addq %r13, %r11
        movq %r13, %r8
        adcq %r14, %r12
        adcq $0x00, %r10
        shrdq $2, %r14, %r8
        shrq $2, %r14
        addq %r11, %r8
        adcq %r14, %r12
        movq %r12, %r9
        adcq $0x00, %r10
        # h in r10, r9, r8
        # Next block from message
        addq $16, %rsi
        subq $16, %rcx
        jg L_poly1305_avx_blocks_start
        # Store h to ctx
        movq %r8, 24(%rdi)
        movq %r9, 32(%rdi)
        movq %r10, 40(%rdi)
        popq %r14
        popq %r13
        popq %r12
        popq %rbx
        popq %r15
        repz retq
#ifndef __APPLE__
.size poly1305_blocks_avx,.-poly1305_blocks_avx
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_final_avx
.type poly1305_final_avx,@function
.align 16
poly1305_final_avx:
#else
.section __TEXT,__text
.globl _poly1305_final_avx
.p2align 4
_poly1305_final_avx:
#endif /* __APPLE__ */
        pushq %rbx
        pushq %r12
        movq %rsi, %rbx
        movq 608(%rdi), %rax
        testq %rax, %rax
        je L_poly1305_avx_final_no_more
        movb $0x01, 480(%rdi,%rax,1)
        jmp L_poly1305_avx_final_cmp_rem
L_poly1305_avx_final_zero_rem:
        movb $0x00, 480(%rdi,%rax,1)
L_poly1305_avx_final_cmp_rem:
        incb %al
        cmpq $16, %rax
        jl L_poly1305_avx_final_zero_rem
        movb $0x00, 616(%rdi)
        leaq 480(%rdi), %rsi
#ifndef __APPLE__
        callq poly1305_block_avx@plt
#else
        callq _poly1305_block_avx
#endif /* __APPLE__ */
L_poly1305_avx_final_no_more:
        movq 24(%rdi), %rax
        movq 32(%rdi), %rdx
        movq 40(%rdi), %rcx
        movq 48(%rdi), %r11
        movq 56(%rdi), %r12
        # h %= p
        # h = (h + pad) mod 2^130 - 5
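        # Fold the bits of h above 2^130 back in multiplied by 5, then
        # compute h + 5: if the top limb of h + 5 reaches 4 the sum
        # carried past 2^130, meaning h >= p, so the cmoves below pick
        # the incremented low limbs (h + 5 - 2^130 = h - p).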
        movq %rcx, %r8
        andq $3, %rcx
        shrq $2, %r8
        # Multiply by 5
        leaq 0(%r8,%r8,4), %r8
        addq %r8, %rax
        adcq $0x00, %rdx
        adcq $0x00, %rcx
        # Fixup when h is between (1 << 130) - 5 and (1 << 130) - 1
        movq %rax, %r8
        movq %rdx, %r9
        movq %rcx, %r10
        addq $5, %r8
        adcq $0x00, %r9
        adcq $0x00, %r10
        cmpq $4, %r10
        cmoveq %r8, %rax
        cmoveq %r9, %rdx
        # h += pad
        addq %r11, %rax
        adcq %r12, %rdx
        movq %rax, (%rbx)
        movq %rdx, 8(%rbx)
        # Zero out r
        movq $0x00, (%rdi)
        movq $0x00, 8(%rdi)
        # Zero out h
        movq $0x00, 24(%rdi)
        movq $0x00, 32(%rdi)
        movq $0x00, 40(%rdi)
        # Zero out pad
        movq $0x00, 48(%rdi)
        movq $0x00, 56(%rdi)
        popq %r12
        popq %rbx
        repz retq
#ifndef __APPLE__
.size poly1305_final_avx,.-poly1305_final_avx
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX1 */
#ifdef HAVE_INTEL_AVX2
#ifndef __APPLE__
.text
.globl poly1305_calc_powers_avx2
.type poly1305_calc_powers_avx2,@function
.align 16
poly1305_calc_powers_avx2:
#else
.section __TEXT,__text
.globl _poly1305_calc_powers_avx2
.p2align 4
_poly1305_calc_powers_avx2:
#endif /* __APPLE__ */
        pushq %r12
        pushq %r13
        pushq %r14
        pushq %r15
        pushq %rbx
        pushq %rbp
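        # Compute r^2, r^3 and r^4 mod 2^130 - 5 and store r and each
        # power as five 26-bit limbs in 32-bit words at offsets 224,
        # 256, 288 and 320, ready for the four-way AVX2 block function.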
        movq (%rdi), %rcx
        movq 8(%rdi), %r8
        xorq %r9, %r9
        # Convert to 26 bits in 32
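        # The 130-bit value is split into limbs of bits 0-25, 26-51,
        # 52-77, 78-103 and 104-129; each limb is masked with 2^26 - 1
        # and widened to a 32-bit word.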
        movq %rcx, %rax
        movq %rcx, %rdx
        movq %rcx, %rsi
        movq %r8, %rbx
        movq %r8, %rbp
        shrq $26, %rdx
        shrdq $52, %r8, %rsi
        shrq $14, %rbx
        shrdq $40, %r9, %rbp
        andq $0x3ffffff, %rax
        andq $0x3ffffff, %rdx
        andq $0x3ffffff, %rsi
        andq $0x3ffffff, %rbx
        andq $0x3ffffff, %rbp
        movl %eax, 224(%rdi)
        movl %edx, 228(%rdi)
        movl %esi, 232(%rdi)
        movl %ebx, 236(%rdi)
        movl %ebp, 240(%rdi)
        movl $0x00, 244(%rdi)
        # Square 128-bit
        movq %r8, %rax
        mulq %rcx
        xorq %r13, %r13
        movq %rax, %r11
        movq %rdx, %r12
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        movq %rcx, %rax
        mulq %rax
        movq %rax, %r10
        movq %rdx, %r15
        movq %r8, %rax
        mulq %rax
        addq %r15, %r11
        adcq %rax, %r12
        adcq %rdx, %r13
        # Reduce 256-bit to 130-bit
        movq %r12, %rax
        movq %r13, %rdx
        andq $-4, %rax
        andq $3, %r12
        addq %rax, %r10
        adcq %rdx, %r11
        adcq $0x00, %r12
        shrdq $2, %rdx, %rax
        shrq $2, %rdx
        addq %rax, %r10
        adcq %rdx, %r11
        adcq $0x00, %r12
        movq %r12, %rax
        shrq $2, %rax
        leaq 0(%rax,%rax,4), %rax
        andq $3, %r12
        addq %rax, %r10
        adcq $0x00, %r11
        adcq $0x00, %r12
        # Convert to 26 bits in 32
        movq %r10, %rax
        movq %r10, %rdx
        movq %r10, %rsi
        movq %r11, %rbx
        movq %r11, %rbp
        shrq $26, %rdx
        shrdq $52, %r11, %rsi
        shrq $14, %rbx
        shrdq $40, %r12, %rbp
        andq $0x3ffffff, %rax
        andq $0x3ffffff, %rdx
        andq $0x3ffffff, %rsi
        andq $0x3ffffff, %rbx
        andq $0x3ffffff, %rbp
        movl %eax, 256(%rdi)
        movl %edx, 260(%rdi)
        movl %esi, 264(%rdi)
        movl %ebx, 268(%rdi)
        movl %ebp, 272(%rdi)
        movl $0x00, 276(%rdi)
        # Multiply 128-bit by 130-bit
        # r1[0] * r2[0]
        movq %rcx, %rax
        mulq %r10
        movq %rax, %r13
        movq %rdx, %r14
        # r1[0] * r2[1]
        movq %rcx, %rax
        mulq %r11
        movq $0x00, %r15
        addq %rax, %r14
        adcq %rdx, %r15
        # r1[1] * r2[0]
        movq %r8, %rax
        mulq %r10
        movq $0x00, %rsi
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rsi
        # r1[0] * r2[2]
        movq %rcx, %rax
        mulq %r12
        addq %rax, %r15
        adcq %rdx, %rsi
        # r1[1] * r2[1]
        movq %r8, %rax
        mulq %r11
        movq $0x00, %rbx
        addq %rax, %r15
        adcq %rdx, %rsi
        adcq $0x00, %rbx
        # r1[1] * r2[2]
        movq %r8, %rax
        mulq %r12
        addq %rax, %rsi
        adcq %rdx, %rbx
        # Reduce 260-bit to 130-bit
        movq %r15, %rax
        movq %rsi, %rdx
        movq %rbx, %rbx
        andq $-4, %rax
        andq $3, %r15
        addq %rax, %r13
        adcq %rdx, %r14
        adcq %rbx, %r15
        shrdq $2, %rdx, %rax
        shrdq $2, %rbx, %rdx
        shrq $2, %rbx
        addq %rax, %r13
        adcq %rdx, %r14
        adcq %rbx, %r15
        movq %r15, %rax
        andq $3, %r15
        shrq $2, %rax
        leaq 0(%rax,%rax,4), %rax
        addq %rax, %r13
        adcq $0x00, %r14
        adcq $0x00, %r15
        # Convert to 26 bits in 32
        movq %r13, %rax
        movq %r13, %rdx
        movq %r13, %rsi
        movq %r14, %rbx
        movq %r14, %rbp
        shrq $26, %rdx
        shrdq $52, %r14, %rsi
        shrq $14, %rbx
        shrdq $40, %r15, %rbp
        andq $0x3ffffff, %rax
        andq $0x3ffffff, %rdx
        andq $0x3ffffff, %rsi
        andq $0x3ffffff, %rbx
        andq $0x3ffffff, %rbp
        movl %eax, 288(%rdi)
        movl %edx, 292(%rdi)
        movl %esi, 296(%rdi)
        movl %ebx, 300(%rdi)
        movl %ebp, 304(%rdi)
        movl $0x00, 308(%rdi)
        # Square 130-bit
        movq %r11, %rax
        mulq %r10
        xorq %r13, %r13
        movq %rax, %r8
        movq %rdx, %r9
        addq %rax, %r8
        adcq %rdx, %r9
        adcq $0x00, %r13
        movq %r10, %rax
        mulq %rax
        movq %rax, %rcx
        movq %rdx, %r15
        movq %r11, %rax
        mulq %rax
        addq %r15, %r8
        adcq %rax, %r9
        adcq %rdx, %r13
        movq %r12, %rax
        mulq %rax
        movq %rax, %r14
        movq %r12, %rax
        mulq %r10
        addq %rax, %r9
        adcq %rdx, %r13
        adcq $0x00, %r14
        addq %rax, %r9
        adcq %rdx, %r13
        adcq $0x00, %r14
        movq %r12, %rax
        mulq %r11
        addq %rax, %r13
        adcq %rdx, %r14
        addq %rax, %r13
        adcq %rdx, %r14
        # Reduce 260-bit to 130-bit
        movq %r9, %rax
        movq %r13, %rdx
        movq %r14, %r15
        andq $-4, %rax
        andq $3, %r9
        addq %rax, %rcx
        adcq %rdx, %r8
        adcq %r15, %r9
        shrdq $2, %rdx, %rax
        shrdq $2, %r15, %rdx
        shrq $2, %r15
        addq %rax, %rcx
        adcq %rdx, %r8
        adcq %r15, %r9
        movq %r9, %rax
        andq $3, %r9
        shrq $2, %rax
        leaq 0(%rax,%rax,4), %rax
        addq %rax, %rcx
        adcq $0x00, %r8
        adcq $0x00, %r9
        # Convert to 26 bits in 32
        movq %rcx, %rax
        movq %rcx, %rdx
        movq %rcx, %rsi
        movq %r8, %rbx
        movq %r8, %rbp
        shrq $26, %rdx
        shrdq $52, %r8, %rsi
        shrq $14, %rbx
        shrdq $40, %r9, %rbp
        andq $0x3ffffff, %rax
        andq $0x3ffffff, %rdx
        andq $0x3ffffff, %rsi
        andq $0x3ffffff, %rbx
        andq $0x3ffffff, %rbp
        movl %eax, 320(%rdi)
        movl %edx, 324(%rdi)
        movl %esi, 328(%rdi)
        movl %ebx, 332(%rdi)
        movl %ebp, 336(%rdi)
        movl $0x00, 340(%rdi)
        popq %rbp
        popq %rbx
        popq %r15
        popq %r14
        popq %r13
        popq %r12
        repz retq
#ifndef __APPLE__
.size poly1305_calc_powers_avx2,.-poly1305_calc_powers_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_setkey_avx2
.type poly1305_setkey_avx2,@function
.align 16
poly1305_setkey_avx2:
#else
.section __TEXT,__text
.globl _poly1305_setkey_avx2
.p2align 4
_poly1305_setkey_avx2:
#endif /* __APPLE__ */
#ifndef __APPLE__
        callq poly1305_setkey_avx@plt
#else
        callq _poly1305_setkey_avx
#endif /* __APPLE__ */
        vpxor %ymm0, %ymm0, %ymm0
        vmovdqu %ymm0, 64(%rdi)
        vmovdqu %ymm0, 96(%rdi)
        vmovdqu %ymm0, 128(%rdi)
        vmovdqu %ymm0, 160(%rdi)
        vmovdqu %ymm0, 192(%rdi)
        movq $0x00, 608(%rdi)
        movw $0x00, 616(%rdi)
        repz retq
#ifndef __APPLE__
.size poly1305_setkey_avx2,.-poly1305_setkey_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 32
#else
.p2align 5
#endif /* __APPLE__ */
L_poly1305_avx2_blocks_mask:
.quad 0x3ffffff, 0x3ffffff
.quad 0x3ffffff, 0x3ffffff
#ifndef __APPLE__
.data
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 32
#else
.p2align 5
#endif /* __APPLE__ */
L_poly1305_avx2_blocks_hibit:
.quad 0x1000000, 0x1000000
.quad 0x1000000, 0x1000000
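# L_poly1305_avx2_blocks_mask holds the 26-bit limb mask 2^26 - 1 in each
# 64-bit lane.  L_poly1305_avx2_blocks_hibit holds 1 << 24, which at limb 4
# (radix position 104) represents the 2^128 padding bit of a full block.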
#ifndef __APPLE__
.text
.globl poly1305_blocks_avx2
.type poly1305_blocks_avx2,@function
.align 16
poly1305_blocks_avx2:
#else
.section __TEXT,__text
.globl _poly1305_blocks_avx2
.p2align 4
_poly1305_blocks_avx2:
#endif /* __APPLE__ */
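        # Process the message 64 bytes at a time as four interleaved
        # blocks held in the 64-bit lanes of the YMM registers.  With
        # r^4, r^3, r^2 and r precomputed, each loop iteration
        # multiplies every accumulator lane by r^4, and the final call
        # folds the four lanes together with one multiply by the
        # decreasing powers of r.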
        pushq %r12
        pushq %rbx
        subq $0x140, %rsp
        movq %rsp, %rcx
        andq $-32, %rcx
        addq $32, %rcx
        vpxor %ymm15, %ymm15, %ymm15
        movq %rcx, %rbx
        leaq 64(%rdi), %rax
        addq $0xa0, %rbx
        cmpw $0x00, 616(%rdi)
        jne L_poly1305_avx2_blocks_begin_h
        # Load the message data
        vmovdqu (%rsi), %ymm0
        vmovdqu 32(%rsi), %ymm1
        vperm2i128 $32, %ymm1, %ymm0, %ymm2
        vperm2i128 $49, %ymm1, %ymm0, %ymm0
        vpunpckldq %ymm0, %ymm2, %ymm1
        vpunpckhdq %ymm0, %ymm2, %ymm3
        vpunpckldq %ymm15, %ymm1, %ymm0
        vpunpckhdq %ymm15, %ymm1, %ymm1
        vpunpckldq %ymm15, %ymm3, %ymm2
        vpunpckhdq %ymm15, %ymm3, %ymm3
        vmovdqu L_poly1305_avx2_blocks_hibit(%rip), %ymm4
        vpsllq $6, %ymm1, %ymm1
        vpsllq $12, %ymm2, %ymm2
        vpsllq $18, %ymm3, %ymm3
        vmovdqu L_poly1305_avx2_blocks_mask(%rip), %ymm14
        # Reduce, in place, the message data
        vpsrlq $26, %ymm0, %ymm10
        vpsrlq $26, %ymm3, %ymm11
        vpand %ymm14, %ymm0, %ymm0
        vpand %ymm14, %ymm3, %ymm3
        vpaddq %ymm1, %ymm10, %ymm1
        vpaddq %ymm4, %ymm11, %ymm4
        vpsrlq $26, %ymm1, %ymm10
        vpsrlq $26, %ymm4, %ymm11
        vpand %ymm14, %ymm1, %ymm1
        vpand %ymm14, %ymm4, %ymm4
        vpaddq %ymm2, %ymm10, %ymm2
        vpslld $2, %ymm11, %ymm12
        vpaddd %ymm12, %ymm11, %ymm12
        vpsrlq $26, %ymm2, %ymm10
        vpaddq %ymm0, %ymm12, %ymm0
        vpsrlq $26, %ymm0, %ymm11
        vpand %ymm14, %ymm2, %ymm2
        vpand %ymm14, %ymm0, %ymm0
        vpaddq %ymm3, %ymm10, %ymm3
        vpaddq %ymm1, %ymm11, %ymm1
        vpsrlq $26, %ymm3, %ymm10
        vpand %ymm14, %ymm3, %ymm3
        vpaddq %ymm4, %ymm10, %ymm4
        addq $0x40, %rsi
        subq $0x40, %rdx
        jz L_poly1305_avx2_blocks_store
        jmp L_poly1305_avx2_blocks_load_r4
L_poly1305_avx2_blocks_begin_h:
        # Load the H values.
        vmovdqu (%rax), %ymm0
        vmovdqu 32(%rax), %ymm1
        vmovdqu 64(%rax), %ymm2
        vmovdqu 96(%rax), %ymm3
        vmovdqu 128(%rax), %ymm4
        # Check if there is a power of r to load - otherwise use r^4.
        cmpb $0x00, 616(%rdi)
        je L_poly1305_avx2_blocks_load_r4
        # Load the 4 powers of r - r^4, r^3, r^2, r^1.
        vmovdqu 224(%rdi), %ymm8
        vmovdqu 256(%rdi), %ymm7
        vmovdqu 288(%rdi), %ymm6
        vmovdqu 320(%rdi), %ymm5
        vpermq $0xd8, %ymm5, %ymm5
        vpermq $0xd8, %ymm6, %ymm6
        vpermq $0xd8, %ymm7, %ymm7
        vpermq $0xd8, %ymm8, %ymm8
        vpunpcklqdq %ymm6, %ymm5, %ymm10
        vpunpckhqdq %ymm6, %ymm5, %ymm11
        vpunpcklqdq %ymm8, %ymm7, %ymm12
        vpunpckhqdq %ymm8, %ymm7, %ymm13
        vperm2i128 $32, %ymm12, %ymm10, %ymm5
        vperm2i128 $49, %ymm12, %ymm10, %ymm7
        vperm2i128 $32, %ymm13, %ymm11, %ymm9
        vpsrlq $32, %ymm5, %ymm6
        vpsrlq $32, %ymm7, %ymm8
        jmp L_poly1305_avx2_blocks_mul_5
L_poly1305_avx2_blocks_load_r4:
        # Load r^4 into all four positions.
        vmovdqu 320(%rdi), %ymm13
        vpermq $0x00, %ymm13, %ymm5
        vpsrlq $32, %ymm13, %ymm14
        vpermq $0x55, %ymm13, %ymm7
        vpermq $0xaa, %ymm13, %ymm9
        vpermq $0x00, %ymm14, %ymm6
        vpermq $0x55, %ymm14, %ymm8
L_poly1305_avx2_blocks_mul_5:
        # Multiply the top four 26-bit limbs of each multiplier by 5
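        # ymm5-ymm9 now hold limbs 0-4 of the multiplier(s), one value
        # per 64-bit lane.  Precomputing 5*limb lets the multiply fold
        # limb products that wrap past 2^130 straight back into the
        # low limbs, since 2^130 = 5 (mod p).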
        vpslld $2, %ymm6, %ymm10
        vpslld $2, %ymm7, %ymm11
        vpslld $2, %ymm8, %ymm12
        vpslld $2, %ymm9, %ymm13
        vpaddq %ymm10, %ymm6, %ymm10
        vpaddq %ymm11, %ymm7, %ymm11
        vpaddq %ymm12, %ymm8, %ymm12
        vpaddq %ymm13, %ymm9, %ymm13
        # Store powers of r and multiples of 5 for use in multiply.
        vmovdqa %ymm10, (%rbx)
        vmovdqa %ymm11, 32(%rbx)
        vmovdqa %ymm12, 64(%rbx)
        vmovdqa %ymm13, 96(%rbx)
        vmovdqa %ymm5, (%rcx)
        vmovdqa %ymm6, 32(%rcx)
        vmovdqa %ymm7, 64(%rcx)
        vmovdqa %ymm8, 96(%rcx)
        vmovdqa %ymm9, 128(%rcx)
        vmovdqu L_poly1305_avx2_blocks_mask(%rip), %ymm14
        # If not finished then loop over data
        cmpb $0x01, 616(%rdi)
        jne L_poly1305_avx2_blocks_start
        # Do last multiply, reduce, add the four H together and move to
        # 32-bit registers
        vpmuludq (%rbx), %ymm4, %ymm5
        vpmuludq 32(%rbx), %ymm3, %ymm10
        vpmuludq 32(%rbx), %ymm4, %ymm6
        vpmuludq 64(%rbx), %ymm3, %ymm11
        vpmuludq 64(%rbx), %ymm4, %ymm7
        vpaddq %ymm5, %ymm10, %ymm5
        vpmuludq 64(%rbx), %ymm2, %ymm12
        vpmuludq 96(%rbx), %ymm4, %ymm8
        vpaddq %ymm6, %ymm11, %ymm6
        vpmuludq 96(%rbx), %ymm1, %ymm13
        vpmuludq 96(%rbx), %ymm2, %ymm10
        vpaddq %ymm5, %ymm12, %ymm5
        vpmuludq 96(%rbx), %ymm3, %ymm11
        vpmuludq (%rcx), %ymm3, %ymm12
        vpaddq %ymm5, %ymm13, %ymm5
        vpmuludq (%rcx), %ymm4, %ymm9
        vpaddq %ymm6, %ymm10, %ymm6
        vpmuludq (%rcx), %ymm0, %ymm13
        vpaddq %ymm7, %ymm11, %ymm7
        vpmuludq (%rcx), %ymm1, %ymm10
        vpaddq %ymm8, %ymm12, %ymm8
        vpmuludq (%rcx), %ymm2, %ymm11
        vpmuludq 32(%rcx), %ymm2, %ymm12
        vpaddq %ymm5, %ymm13, %ymm5
        vpmuludq 32(%rcx), %ymm3, %ymm13
        vpaddq %ymm6, %ymm10, %ymm6
        vpmuludq 32(%rcx), %ymm0, %ymm10
        vpaddq %ymm7, %ymm11, %ymm7
        vpmuludq 32(%rcx), %ymm1, %ymm11
        vpaddq %ymm8, %ymm12, %ymm8
        vpmuludq 64(%rcx), %ymm1, %ymm12
        vpaddq %ymm9, %ymm13, %ymm9
        vpmuludq 64(%rcx), %ymm2, %ymm13
        vpaddq %ymm6, %ymm10, %ymm6
        vpmuludq 64(%rcx), %ymm0, %ymm10
        vpaddq %ymm7, %ymm11, %ymm7
        vpmuludq 96(%rcx), %ymm0, %ymm11
        vpaddq %ymm8, %ymm12, %ymm8
        vpmuludq 96(%rcx), %ymm1, %ymm12
        vpaddq %ymm9, %ymm13, %ymm9
        vpaddq %ymm7, %ymm10, %ymm7
        vpmuludq 128(%rcx), %ymm0, %ymm13
        vpaddq %ymm8, %ymm11, %ymm8
        vpaddq %ymm9, %ymm12, %ymm9
        vpaddq %ymm9, %ymm13, %ymm9
        vpsrlq $26, %ymm5, %ymm10
        vpsrlq $26, %ymm8, %ymm11
        vpand %ymm14, %ymm5, %ymm5
        vpand %ymm14, %ymm8, %ymm8
        vpaddq %ymm6, %ymm10, %ymm6
        vpaddq %ymm9, %ymm11, %ymm9
        vpsrlq $26, %ymm6, %ymm10
        vpsrlq $26, %ymm9, %ymm11
        vpand %ymm14, %ymm6, %ymm1
        vpand %ymm14, %ymm9, %ymm4
        vpaddq %ymm7, %ymm10, %ymm7
        vpslld $2, %ymm11, %ymm12
        vpaddd %ymm12, %ymm11, %ymm12
        vpsrlq $26, %ymm7, %ymm10
        vpaddq %ymm5, %ymm12, %ymm5
        vpsrlq $26, %ymm5, %ymm11
        vpand %ymm14, %ymm7, %ymm2
        vpand %ymm14, %ymm5, %ymm0
        vpaddq %ymm8, %ymm10, %ymm8
        vpaddq %ymm1, %ymm11, %ymm1
        vpsrlq $26, %ymm8, %ymm10
        vpand %ymm14, %ymm8, %ymm3
        vpaddq %ymm4, %ymm10, %ymm4
        vpsrldq $8, %ymm0, %ymm5
        vpsrldq $8, %ymm1, %ymm6
        vpsrldq $8, %ymm2, %ymm7
        vpsrldq $8, %ymm3, %ymm8
        vpsrldq $8, %ymm4, %ymm9
        vpaddq %ymm0, %ymm5, %ymm0
        vpaddq %ymm1, %ymm6, %ymm1
        vpaddq %ymm2, %ymm7, %ymm2
        vpaddq %ymm3, %ymm8, %ymm3
        vpaddq %ymm4, %ymm9, %ymm4
        vpermq $2, %ymm0, %ymm5
        vpermq $2, %ymm1, %ymm6
        vpermq $2, %ymm2, %ymm7
        vpermq $2, %ymm3, %ymm8
        vpermq $2, %ymm4, %ymm9
        vpaddq %ymm0, %ymm5, %ymm0
        vpaddq %ymm1, %ymm6, %ymm1
        vpaddq %ymm2, %ymm7, %ymm2
        vpaddq %ymm3, %ymm8, %ymm3
        vpaddq %ymm4, %ymm9, %ymm4
        vmovd %xmm0, %r8d
        vmovd %xmm1, %r9d
        vmovd %xmm2, %r10d
        vmovd %xmm3, %r11d
        vmovd %xmm4, %r12d
        jmp L_poly1305_avx2_blocks_end_calc
L_poly1305_avx2_blocks_start:
        vmovdqu (%rsi), %ymm5
        vmovdqu 32(%rsi), %ymm6
        vperm2i128 $32, %ymm6, %ymm5, %ymm7
        vperm2i128 $49, %ymm6, %ymm5, %ymm5
        vpunpckldq %ymm5, %ymm7, %ymm6
        vpunpckhdq %ymm5, %ymm7, %ymm8
        vpunpckldq %ymm15, %ymm6, %ymm5
        vpunpckhdq %ymm15, %ymm6, %ymm6
        vpunpckldq %ymm15, %ymm8, %ymm7
        vpunpckhdq %ymm15, %ymm8, %ymm8
        vmovdqu L_poly1305_avx2_blocks_hibit(%rip), %ymm9
        vpsllq $6, %ymm6, %ymm6
        vpsllq $12, %ymm7, %ymm7
        vpsllq $18, %ymm8, %ymm8
        vpmuludq (%rbx), %ymm4, %ymm10
        vpaddq %ymm5, %ymm10, %ymm5
        vpmuludq 32(%rbx), %ymm3, %ymm10
        vpmuludq 32(%rbx), %ymm4, %ymm11
        vpaddq %ymm6, %ymm11, %ymm6
        vpmuludq 64(%rbx), %ymm3, %ymm11
        vpmuludq 64(%rbx), %ymm4, %ymm12
        vpaddq %ymm7, %ymm12, %ymm7
        vpaddq %ymm5, %ymm10, %ymm5
        vpmuludq 64(%rbx), %ymm2, %ymm12
        vpmuludq 96(%rbx), %ymm4, %ymm13
        vpaddq %ymm8, %ymm13, %ymm8
        vpaddq %ymm6, %ymm11, %ymm6
        vpmuludq 96(%rbx), %ymm1, %ymm13
        vpmuludq 96(%rbx), %ymm2, %ymm10
        vpaddq %ymm5, %ymm12, %ymm5
        vpmuludq 96(%rbx), %ymm3, %ymm11
        vpmuludq (%rcx), %ymm3, %ymm12
        vpaddq %ymm5, %ymm13, %ymm5
        vpmuludq (%rcx), %ymm4, %ymm13
        vpaddq %ymm9, %ymm13, %ymm9
        vpaddq %ymm6, %ymm10, %ymm6
        vpmuludq (%rcx), %ymm0, %ymm13
        vpaddq %ymm7, %ymm11, %ymm7
        vpmuludq (%rcx), %ymm1, %ymm10
        vpaddq %ymm8, %ymm12, %ymm8
        vpmuludq (%rcx), %ymm2, %ymm11
        vpmuludq 32(%rcx), %ymm2, %ymm12
        vpaddq %ymm5, %ymm13, %ymm5
        vpmuludq 32(%rcx), %ymm3, %ymm13
        vpaddq %ymm6, %ymm10, %ymm6
        vpmuludq 32(%rcx), %ymm0, %ymm10
        vpaddq %ymm7, %ymm11, %ymm7
        vpmuludq 32(%rcx), %ymm1, %ymm11
        vpaddq %ymm8, %ymm12, %ymm8
        vpmuludq 64(%rcx), %ymm1, %ymm12
        vpaddq %ymm9, %ymm13, %ymm9
        vpmuludq 64(%rcx), %ymm2, %ymm13
        vpaddq %ymm6, %ymm10, %ymm6
        vpmuludq 64(%rcx), %ymm0, %ymm10
        vpaddq %ymm7, %ymm11, %ymm7
        vpmuludq 96(%rcx), %ymm0, %ymm11
        vpaddq %ymm8, %ymm12, %ymm8
        vpmuludq 96(%rcx), %ymm1, %ymm12
        vpaddq %ymm9, %ymm13, %ymm9
        vpaddq %ymm7, %ymm10, %ymm7
        vpmuludq 128(%rcx), %ymm0, %ymm13
        vpaddq %ymm8, %ymm11, %ymm8
        vpaddq %ymm9, %ymm12, %ymm9
        vpaddq %ymm9, %ymm13, %ymm9
        vpsrlq $26, %ymm5, %ymm10
        vpsrlq $26, %ymm8, %ymm11
        vpand %ymm14, %ymm5, %ymm5
        vpand %ymm14, %ymm8, %ymm8
        vpaddq %ymm6, %ymm10, %ymm6
        vpaddq %ymm9, %ymm11, %ymm9
        vpsrlq $26, %ymm6, %ymm10
        vpsrlq $26, %ymm9, %ymm11
        vpand %ymm14, %ymm6, %ymm1
        vpand %ymm14, %ymm9, %ymm4
        vpaddq %ymm7, %ymm10, %ymm7
        vpslld $2, %ymm11, %ymm12
        vpaddd %ymm12, %ymm11, %ymm12
        vpsrlq $26, %ymm7, %ymm10
        vpaddq %ymm5, %ymm12, %ymm5
        vpsrlq $26, %ymm5, %ymm11
        vpand %ymm14, %ymm7, %ymm2
        vpand %ymm14, %ymm5, %ymm0
        vpaddq %ymm8, %ymm10, %ymm8
        vpaddq %ymm1, %ymm11, %ymm1
        vpsrlq $26, %ymm8, %ymm10
        vpand %ymm14, %ymm8, %ymm3
        vpaddq %ymm4, %ymm10, %ymm4
        addq $0x40, %rsi
        subq $0x40, %rdx
        jnz L_poly1305_avx2_blocks_start
L_poly1305_avx2_blocks_store:
        # Store four H values - state
        vmovdqu %ymm0, (%rax)
        vmovdqu %ymm1, 32(%rax)
        vmovdqu %ymm2, 64(%rax)
        vmovdqu %ymm3, 96(%rax)
        vmovdqu %ymm4, 128(%rax)
L_poly1305_avx2_blocks_end_calc:
        cmpb $0x00, 616(%rdi)
        je L_poly1305_avx2_blocks_complete
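        # Finished: recombine the five 26-bit limbs in r8d-r12d into
        # the three 64-bit words of h at 24-40(%rdi), folding the bits
        # above 2^130 back in multiplied by 5.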
        movq %r8, %rax
        movq %r10, %rdx
        movq %r12, %rcx
        shrq $12, %rdx
        shrq $24, %rcx
        shlq $26, %r9
        shlq $52, %r10
        shlq $14, %r11
        shlq $40, %r12
        addq %r9, %rax
        adcq %r10, %rax
        adcq %r11, %rdx
        adcq %r12, %rdx
        adcq $0x00, %rcx
        movq %rcx, %r8
        andq $3, %rcx
        shrq $2, %r8
        leaq 0(%r8,%r8,4), %r8
        addq %r8, %rax
        adcq $0x00, %rdx
        adcq $0x00, %rcx
        movq %rax, 24(%rdi)
        movq %rdx, 32(%rdi)
        movq %rcx, 40(%rdi)
L_poly1305_avx2_blocks_complete:
        movb $0x01, 617(%rdi)
        addq $0x140, %rsp
        popq %rbx
        popq %r12
        repz retq
#ifndef __APPLE__
.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_final_avx2
.type poly1305_final_avx2,@function
.align 16
poly1305_final_avx2:
#else
.section __TEXT,__text
.globl _poly1305_final_avx2
.p2align 4
_poly1305_final_avx2:
#endif /* __APPLE__ */
        movb $0x01, 616(%rdi)
        movb 617(%rdi), %cl
        cmpb $0x00, %cl
        je L_poly1305_avx2_final_done_blocks_X4
        pushq %rsi
        movq $0x40, %rdx
        xorq %rsi, %rsi
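        # With the finished flag just set and a length of 64, this
        # call appears to only perform the last four-way multiply and
        # fold the four accumulator lanes into the 64-bit h; the
        # message pointer is never dereferenced on that path, which is
        # why a zero pointer is passed here.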
#ifndef __APPLE__
        callq poly1305_blocks_avx2@plt
#else
        callq _poly1305_blocks_avx2
#endif /* __APPLE__ */
        popq %rsi
L_poly1305_avx2_final_done_blocks_X4:
        movq 608(%rdi), %rax
        movq %rax, %rcx
        andq $-16, %rcx
        cmpb $0x00, %cl
        je L_poly1305_avx2_final_done_blocks
        pushq %rcx
        pushq %rax
        pushq %rsi
        movq %rcx, %rdx
        leaq 480(%rdi), %rsi
#ifndef __APPLE__
        callq poly1305_blocks_avx@plt
#else
        callq _poly1305_blocks_avx
#endif /* __APPLE__ */
        popq %rsi
        popq %rax
        popq %rcx
L_poly1305_avx2_final_done_blocks:
        subq %rcx, 608(%rdi)
        xorq %rdx, %rdx
        jmp L_poly1305_avx2_final_cmp_copy
L_poly1305_avx2_final_start_copy:
        movb 480(%rdi,%rcx,1), %r8b
        movb %r8b, 480(%rdi,%rdx,1)
        incb %cl
        incb %dl
L_poly1305_avx2_final_cmp_copy:
        cmp %rcx, %rax
        jne L_poly1305_avx2_final_start_copy
#ifndef __APPLE__
        callq poly1305_final_avx@plt
#else
        callq _poly1305_final_avx
#endif /* __APPLE__ */
        vpxor %ymm0, %ymm0, %ymm0
        vmovdqu %ymm0, 64(%rdi)
        vmovdqu %ymm0, 96(%rdi)
        vmovdqu %ymm0, 128(%rdi)
        vmovdqu %ymm0, 160(%rdi)
        vmovdqu %ymm0, 192(%rdi)
        vmovdqu %ymm0, 224(%rdi)
        vmovdqu %ymm0, 256(%rdi)
        vmovdqu %ymm0, 288(%rdi)
        vmovdqu %ymm0, 320(%rdi)
        movq $0x00, 608(%rdi)
        movw $0x00, 616(%rdi)
        repz retq
#ifndef __APPLE__
.size poly1305_final_avx2,.-poly1305_final_avx2
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif