123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398 |
- #!/usr/bin/env perl
- # ====================================================================
- # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
- # project. The module is, however, dual licensed under OpenSSL and
- # CRYPTOGAMS licenses depending on where you obtain it. For further
- # details see http://www.openssl.org/~appro/cryptogams/.
- # ====================================================================
- # September 2011
- #
- # Assembler helpers for Padlock engine. See also e_padlock-x86.pl for
- # details.
- # Command line: [flavour] [output]. A single argument that contains a dot
- # is taken to be the output file name, not a flavour.
- $flavour = shift;
- $output = shift;
- if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
- # Win64 argument order is selected by a [nm]asm/mingw64 flavour or by an
- # output file name ending in .asm.
- $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
- # Locate the x86_64-xlate.pl translator: first next to this script, then in
- # ../../crypto/perlasm relative to it.
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
- die "can't locate x86_64-xlate.pl";
- # Everything printed to STDOUT from here on is piped through the translator.
- # The interpreter and translator paths are quoted so that directories with
- # spaces survive the shell, the output name is only passed when one was
- # given, and a failure to start the pipe is fatal instead of silent.
- $xlate_cmd="\"$^X\" \"$xlate\" $flavour";
- $xlate_cmd.=" \"$output\"" if (defined($output) && length($output));
- open STDOUT,"| $xlate_cmd" or die "can't call $xlate: $!";
- $code=".text\n";
- $PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20
- # Register roles used by the generated code. $ctx/$out/$inp/$len coincide
- # with the Unix argument registers for (out, inp, ctx, len).
- $ctx="%rdx";
- $out="%rdi";
- $inp="%rsi";
- $len="%rcx";
- $chunk="%rbx";
- # Raw argument registers for the routines that read their arguments
- # directly in whichever calling convention is in effect.
- ($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
- ("%rdi","%rsi","%rdx","%rcx"); # Unix order
- # Start of the fixed helper routines (those that do not depend on the
- # cipher mode).
- #
- # padlock_capability takes no arguments. It returns 0 in %eax unless the
- # CPUID vendor string is "CentaurHauls" and leaf 0xC0000001 is reported as
- # available; otherwise it returns %edx of leaf 0xC0000001 with bit 4 set.
- $code.=<<___;
- .globl padlock_capability
- .type padlock_capability,\@abi-omnipotent
- .align 16
- padlock_capability:
- mov %rbx,%r8 # cpuid overwrites rbx, keep a copy
- xor %eax,%eax
- cpuid # leaf 0: vendor string in ebx:edx:ecx
- xor %eax,%eax # result is 0 on every .Lnoluck exit below
- cmp \$`"0x".unpack("H*",'tneC')`,%ebx # "Cent"
- jne .Lnoluck
- cmp \$`"0x".unpack("H*",'Hrua')`,%edx # "aurH"
- jne .Lnoluck
- cmp \$`"0x".unpack("H*",'slua')`,%ecx # "auls"
- jne .Lnoluck
- mov \$0xC0000000,%eax
- cpuid # eax is compared against the leaf we need
- mov %eax,%edx
- xor %eax,%eax
- cmp \$0xC0000001,%edx
- jb .Lnoluck # leaf 0xC0000001 not available
- mov \$0xC0000001,%eax
- cpuid
- mov %edx,%eax # feature word becomes the return value
- and \$0xffffffef,%eax
- or \$0x10,%eax # set Nano bit#4
- .Lnoluck:
- mov %r8,%rbx # restore callee-saved rbx
- ret
- .size padlock_capability,.-padlock_capability
- # padlock_key_bswap: byte-swaps, in place, every 32-bit word of the key
- # schedule that the first argument points to.
- # Assumes the argument is an OpenSSL AES_KEY whose round count is stored
- # at byte offset 240, right after the schedule -- TODO confirm against
- # the caller. The schedule holds rounds+1 round keys of four words each,
- # so that many words are swapped, not just as many words as rounds.
- .globl padlock_key_bswap
- .type padlock_key_bswap,\@abi-omnipotent,0
- .align 16
- padlock_key_bswap:
- mov 240($arg1),%edx # number of rounds
- inc %edx # rounds+1 round keys ...
- shl \$2,%edx # ... of four 32-bit words each
- .Lbswap_loop:
- mov ($arg1),%eax
- bswap %eax
- mov %eax,($arg1) # store the swapped word back in place
- lea 4($arg1),$arg1 # next word; lea leaves the flags alone
- sub \$1,%edx
- jnz .Lbswap_loop
- ret
- .size padlock_key_bswap,.-padlock_key_bswap
- # padlock_verify_context: public wrapper around _padlock_verify_ctx.
- # Moves the first argument into the ctx register, pushes the flags
- # register for the helper to inspect, and passes the address of
- # .Lpadlock_saved_context in rax.
- .globl padlock_verify_context
- .type padlock_verify_context,\@abi-omnipotent
- .align 16
- padlock_verify_context:
- mov $arg1,$ctx
- pushf # flags word read by the helper at 8(rsp)
- lea .Lpadlock_saved_context(%rip),%rax
- call _padlock_verify_ctx
- lea 8(%rsp),%rsp # drop the pushed flags without popping them
- ret
- .size padlock_verify_context,.-padlock_verify_context
- # _padlock_verify_ctx: internal helper, not exported.
- # In: rax = address of the saved-context slot, ctx register = context
- # about to be used, and a flags word pushed by the caller immediately
- # before the call (so it sits just above the return address).
- # If bit 30 of that flags word is set and the slot holds a different
- # context, the flags register is pushed and popped again; presumably
- # this makes the unit reload its key -- confirm against PadLock docs.
- # The slot is always updated to the current context. Clobbers r8.
- .type _padlock_verify_ctx,\@abi-omnipotent
- .align 16
- _padlock_verify_ctx:
- mov 8(%rsp),%r8 # flags word pushed by the caller
- bt \$30,%r8
- jnc .Lverified # bit 30 clear: nothing to do
- cmp (%rax),$ctx
- je .Lverified # same context as last time
- pushf
- popf
- .Lverified:
- mov $ctx,(%rax) # remember the context for the next call
- ret
- .size _padlock_verify_ctx,.-_padlock_verify_ctx
- # padlock_reload_key: takes no arguments; pushes the flags register and
- # pops it straight back. Going by the name, this rewrite of the flags
- # is what makes the unit reload its key -- confirm against PadLock docs.
- .globl padlock_reload_key
- .type padlock_reload_key,\@abi-omnipotent
- .align 16
- padlock_reload_key:
- pushf
- popf
- ret
- .size padlock_reload_key,.-padlock_reload_key
- # padlock_aes_block: declared with three arguments, which land in the
- # out, inp and ctx registers. Runs "rep xcryptecb" with a count of one
- # block, taking the control word from ctx+16 and the key from ctx+32.
- # rbx is preserved via r8.
- .globl padlock_aes_block
- .type padlock_aes_block,\@function,3
- .align 16
- padlock_aes_block:
- mov %rbx,%r8 # rbx is callee-saved but needed for the key
- mov \$1,$len # exactly one block
- lea 32($ctx),%rbx # key
- lea 16($ctx),$ctx # control word
- .byte 0xf3,0x0f,0xa7,0xc8 # rep xcryptecb
- mov %r8,%rbx
- ret
- .size padlock_aes_block,.-padlock_aes_block
- # padlock_xstore: declared with two arguments. The low 32 bits of the
- # second argument are copied to edx before the xstore instruction is
- # issued; the first argument stays in rdi, presumably as the output
- # address -- confirm against PadLock docs. Whatever xstore leaves in
- # eax is what the caller sees as the return value.
- .globl padlock_xstore
- .type padlock_xstore,\@function,2
- .align 16
- padlock_xstore:
- mov %esi,%edx # second argument goes to edx
- .byte 0x0f,0xa7,0xc0 # xstore
- ret
- .size padlock_xstore,.-padlock_xstore
- # padlock_sha1_oneshot: declared with three arguments, presumably
- # (ctx, inp, len). Issues "rep xsha1" with rax = 0 and the third
- # argument as the count in rcx; the first two arguments stay in rdi/rsi.
- # NOTE(review): the instruction works directly on the caller's buffer in
- # rdi; any alignment or scratch-space requirement of xsha1 is left to
- # the caller -- confirm against PadLock docs.
- .globl padlock_sha1_oneshot
- .type padlock_sha1_oneshot,\@function,3
- .align 16
- padlock_sha1_oneshot:
- xor %rax,%rax # rax = 0 selects the one-shot variant
- mov %rdx,%rcx # third argument is the count
- .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1
- ret
- .size padlock_sha1_oneshot,.-padlock_sha1_oneshot
- # padlock_sha1_blocks: same register setup as padlock_sha1_oneshot
- # except that rax is -1 instead of 0; going by the name, the third
- # argument then counts whole blocks -- confirm against PadLock docs.
- .globl padlock_sha1_blocks
- .type padlock_sha1_blocks,\@function,3
- .align 16
- padlock_sha1_blocks:
- mov \$-1,%rax # rax = -1 selects the block-wise variant
- mov %rdx,%rcx # third argument is the count
- .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1
- ret
- .size padlock_sha1_blocks,.-padlock_sha1_blocks
- # padlock_sha256_oneshot: declared with three arguments, presumably
- # (ctx, inp, len). Issues "rep xsha256" with rax = 0 and the third
- # argument as the count in rcx; the first two arguments stay in rdi/rsi.
- .globl padlock_sha256_oneshot
- .type padlock_sha256_oneshot,\@function,3
- .align 16
- padlock_sha256_oneshot:
- xor %rax,%rax # rax = 0 selects the one-shot variant
- mov %rdx,%rcx # third argument is the count
- .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256
- ret
- .size padlock_sha256_oneshot,.-padlock_sha256_oneshot
- # padlock_sha256_blocks: same register setup as padlock_sha256_oneshot
- # except that rax is -1 instead of 0; going by the name, the third
- # argument then counts whole blocks -- confirm against PadLock docs.
- .globl padlock_sha256_blocks
- .type padlock_sha256_blocks,\@function,3
- .align 16
- padlock_sha256_blocks:
- mov \$-1,%rax # rax = -1 selects the block-wise variant
- mov %rdx,%rcx # third argument is the count
- .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256
- ret
- .size padlock_sha256_blocks,.-padlock_sha256_blocks
- # padlock_sha512_blocks: declared with three arguments, presumably
- # (ctx, inp, len). Copies the third argument to rcx and issues
- # "rep xsha512". Unlike the SHA-1/SHA-256 helpers it does not set rax.
- .globl padlock_sha512_blocks
- .type padlock_sha512_blocks,\@function,3
- .align 16
- padlock_sha512_blocks:
- mov %rdx,%rcx # third argument is the count
- .byte 0xf3,0x0f,0xa6,0xe0 # rep xsha512
- ret
- .size padlock_sha512_blocks,.-padlock_sha512_blocks
- ___
- # generate_mode(MODE, OPCODE)
- #
- # Appends to the global $code the assembly source of padlock_MODE_encrypt.
- # MODE is spliced into the symbol and label names; OPCODE becomes the last
- # byte of the "rep xcrypt*" instruction. Extra code is emitted when MODE is
- # "ctr32" (handling of the 32-bit counter in the last four IV bytes), and
- # the IV copy-back is emitted for every MODE that does not match /ecb|ctr/.
- #
- # The generated routine returns 0 without doing any work when ctx is not
- # 16-byte aligned or len is not a multiple of 16, and 1 otherwise. If the
- # align bit of the control word is set, or both buffers are 16-byte aligned,
- # the data is processed directly. Otherwise it is processed in pieces of at
- # most $PADLOCK_CHUNK bytes; misaligned output goes through a buffer
- # carved out of the stack, which is zeroed before returning.
- sub generate_mode {
- my ($mode,$opcode) = @_;
- # int padlock_$mode_encrypt(void *out, const void *inp,
- # struct padlock_cipher_data *ctx, size_t len);
- $code.=<<___;
- .globl padlock_${mode}_encrypt
- .type padlock_${mode}_encrypt,\@function,4
- .align 16
- padlock_${mode}_encrypt:
- push %rbp # callee-saved; anchors the stack on the chunked path
- push %rbx # callee-saved; used as chunk size and as key pointer
- xor %eax,%eax # return value if we abort
- test \$15,$ctx # ctx must be 16-byte aligned
- jnz .L${mode}_abort
- test \$15,$len # len must be a multiple of 16
- jnz .L${mode}_abort
- lea .Lpadlock_saved_context(%rip),%rax
- pushf # flags word for _padlock_verify_ctx, dropped at exit
- cld # the rep movsq/stosq copies below must run forward
- call _padlock_verify_ctx
- lea 16($ctx),$ctx # control word
- xor %eax,%eax
- xor %ebx,%ebx
- testl \$`1<<5`,($ctx) # align bit in control word
- jnz .L${mode}_aligned
- test \$0x0f,$out
- setz %al # !out_misaligned
- test \$0x0f,$inp
- setz %bl # !inp_misaligned
- test %ebx,%eax
- jnz .L${mode}_aligned # both buffers aligned
- neg %rax # out_misaligned?0:-1
- mov \$$PADLOCK_CHUNK,$chunk
- not %rax # out_misaligned?-1:0
- lea (%rsp),%rbp # remember the stack pointer before carving the buffer
- cmp $chunk,$len
- cmovc $len,$chunk # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
- and $chunk,%rax # out_misaligned?chunk:0
- mov $len,$chunk
- neg %rax
- and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK
- lea (%rax,%rbp),%rsp # reserve the bounce buffer (none if out is aligned)
- ___
- # ctr32 only: shorten the first piece so that it ends where the low bits of
- # the counter wrap around.
- $code.=<<___ if ($mode eq "ctr32");
- mov -4($ctx),%eax # pull 32-bit counter
- bswap %eax # counter is kept big-endian in the IV
- neg %eax
- and \$`$PADLOCK_CHUNK/16-1`,%eax # blocks left before the wrap
- jz .L${mode}_loop
- shl \$4,%eax # blocks -> bytes
- cmp %rax,$len
- cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK
- ___
- # Chunked loop, entered only when some buffer is misaligned. An initial
- # chunk of 0 (len a multiple of PADLOCK_CHUNK) makes the first pass a no-op;
- # every later pass uses PADLOCK_CHUNK, clamped to the remaining length.
- $code.=<<___;
- jmp .L${mode}_loop
- .align 16
- .L${mode}_loop:
- cmp $len,$chunk # ctr32 artefact
- cmova $len,$chunk # ctr32 artefact
- mov $out,%r8 # save parameters
- mov $inp,%r9
- mov $len,%r10
- mov $chunk,$len
- mov $chunk,%r11 # chunk register is reused for the key pointer below
- test \$0x0f,$out # out_misaligned
- cmovnz %rsp,$out # misaligned output is produced in the stack buffer
- test \$0x0f,$inp # inp_misaligned
- jz .L${mode}_inp_aligned
- shr \$3,$len # bytes -> quadwords
- .byte 0xf3,0x48,0xa5 # rep movsq
- sub $chunk,$out # rewind to the start of the copy
- mov $chunk,$len
- mov $out,$inp # process the copied input in place
- .L${mode}_inp_aligned:
- lea -16($ctx),%rax # ivp
- lea 16($ctx),%rbx # key
- shr \$4,$len # bytes -> 16-byte blocks
- .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
- ___
- # Modes other than ecb/ctr: copy the 16 bytes rax points at after the
- # instruction back into the IV slot of the context.
- $code.=<<___ if ($mode !~ /ecb|ctr/);
- movdqa (%rax),%xmm0
- movdqa %xmm0,-16($ctx) # copy [or refresh] iv
- ___
- # ctr32 only: when the low 16 bits of the counter have wrapped to zero,
- # add one to its upper 16 bits. Presumably the instruction itself advances
- # only the low 16 bits -- confirm against PadLock docs.
- $code.=<<___ if ($mode eq "ctr32");
- mov -4($ctx),%eax # pull 32-bit counter
- test \$0xffff0000,%eax # raw load: upper half holds the counter's low 16 bits
- jnz .L${mode}_no_corr
- bswap %eax
- add \$0x10000,%eax # carry into the upper 16 bits
- bswap %eax
- mov %eax,-4($ctx)
- .L${mode}_no_corr:
- ___
- # Copy the result out of the stack buffer if needed, advance the pointers
- # and loop; when done, wipe the stack buffer and release it.
- $code.=<<___;
- mov %r8,$out # restore parameters
- mov %r11,$chunk
- test \$0x0f,$out
- jz .L${mode}_out_aligned
- mov $chunk,$len
- shr \$3,$len # bytes -> quadwords
- lea (%rsp),$inp # source is the stack buffer
- .byte 0xf3,0x48,0xa5 # rep movsq
- sub $chunk,$out # rewind; advanced again just below
- .L${mode}_out_aligned:
- mov %r9,$inp
- mov %r10,$len
- add $chunk,$out
- add $chunk,$inp
- sub $chunk,$len # sets the flags tested by jnz below
- mov \$$PADLOCK_CHUNK,$chunk # mov leaves the flags alone
- jnz .L${mode}_loop
- test \$0x0f,$out
- jz .L${mode}_done # no stack buffer was used
- mov %rbp,$len
- mov %rsp,$out
- sub %rsp,$len # size of the stack buffer in bytes
- xor %rax,%rax
- shr \$3,$len
- .byte 0xf3,0x48,0xab # rep stosq
- .L${mode}_done:
- lea (%rbp),%rsp # release the stack buffer
- jmp .L${mode}_exit
- .align 16
- .L${mode}_aligned:
- ___
- # Aligned path, ctr32 only: process at most 2^16 blocks per pass and never
- # let a pass cross a 2^16 boundary of the counter.
- $code.=<<___ if ($mode eq "ctr32");
- mov -4($ctx),%eax # pull 32-bit counter
- mov \$`16*0x10000`,$chunk # 2^16 blocks, in bytes
- bswap %eax
- cmp $len,$chunk
- cmova $len,$chunk
- neg %eax
- and \$0xffff,%eax # blocks left before the low 16 bits wrap
- jz .L${mode}_aligned_loop
- shl \$4,%eax # blocks -> bytes
- cmp %rax,$len
- cmova %rax,$chunk # don't let counter cross 2^16
- jmp .L${mode}_aligned_loop
- .align 16
- .L${mode}_aligned_loop:
- cmp $len,$chunk
- cmova $len,$chunk
- mov $len,%r10 # save parameters
- mov $chunk,$len
- mov $chunk,%r11
- ___
- # Aligned path, all modes: hand the data to the instruction directly. For
- # modes other than ctr32 this is a single pass over the whole length.
- $code.=<<___;
- lea -16($ctx),%rax # ivp
- lea 16($ctx),%rbx # key
- shr \$4,$len # len/=AES_BLOCK_SIZE
- .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
- ___
- $code.=<<___ if ($mode !~ /ecb|ctr/);
- movdqa (%rax),%xmm0
- movdqa %xmm0,-16($ctx) # copy [or refresh] iv
- ___
- # NOTE(review): the upper 16 bits of the counter are bumped after every
- # pass here, including a final pass that stops short of a 2^16 boundary --
- # confirm that callers do not rely on the counter value left in the context.
- $code.=<<___ if ($mode eq "ctr32");
- mov -4($ctx),%eax # pull 32-bit counter
- bswap %eax
- add \$0x10000,%eax # carry into the upper 16 bits
- bswap %eax
- mov %eax,-4($ctx)
- mov %r11,$chunk # restore parameters
- mov %r10,$len
- sub $chunk,$len # sets the flags tested by jnz below
- mov \$`16*0x10000`,$chunk
- jnz .L${mode}_aligned_loop
- ___
- # Common exit: success drops the flags word pushed before the verify call;
- # the abort path is reached before that push and skips the adjustment.
- $code.=<<___;
- .L${mode}_exit:
- mov \$1,%eax # success
- lea 8(%rsp),%rsp # drop the pushed flags
- .L${mode}_abort:
- pop %rbx
- pop %rbp
- ret
- .size padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
- ___
- }
- # Emit one padlock_<mode>_encrypt routine per cipher mode. The second
- # argument is the last opcode byte of that mode's "rep xcrypt*" instruction.
- generate_mode("ecb",0xc8);
- generate_mode("cbc",0xd0);
- generate_mode("cfb",0xe0);
- generate_mode("ofb",0xe8);
- generate_mode("ctr32",0xd8); # all 64-bit CPUs have working CTR...
- # Identification string, then the data slot in which the verify helper
- # remembers the context it saw last.
- $code.=<<___;
- .asciz "VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
- .align 16
- .data
- .align 8
- .Lpadlock_saved_context:
- .quad 0
- ___
- # Replace every backquoted expression in the generated text by the result
- # of evaluating it as Perl.
- $code =~ s/\`([^\`]*)\`/eval($1)/gem;
- print $code;
- # STDOUT is a pipe to the translator, so close reports both write errors
- # and a failing translator; do not let either pass unnoticed.
- close STDOUT or die "error closing STDOUT: $!";
|