123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441 |
- #! /usr/bin/env perl
- # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
- #
- # Licensed under the Apache License 2.0 (the "License"). You may not use
- # this file except in compliance with the License. You can obtain a copy
- # in the file LICENSE in the source distribution or at
- # https://www.openssl.org/source/license.html
- # ====================================================================
- # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
- # project. The module is, however, dual licensed under OpenSSL and
- # CRYPTOGAMS licenses depending on where you obtain it. For further
- # details see http://www.openssl.org/~appro/cryptogams/.
- # ====================================================================
- # Poly1305 hash for MIPS64.
- #
- # May 2016
- #
- # Numbers are cycles per processed byte with poly1305_blocks alone.
- #
- # IALU/gcc
- # R1x000 5.64/+120% (big-endian)
- # Octeon II 3.80/+280% (little-endian)
- ######################################################################
- # There is a number of MIPS ABI in use, O32 and N32/64 are most
- # widely used. Then there is a new contender: NUBI. It appears that if
- # one picks the latter, it's possible to arrange code in ABI neutral
- # manner. Therefore let's stick to NUBI register layout:
- #
- ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
- ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
- ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
- ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
- #
- # The return value is placed in $a0. Following coding rules facilitate
- # interoperability:
- #
- # - never ever touch $tp, "thread pointer", former $gp [o32 can be
- # excluded from the rule, because it's specified volatile];
- # - copy return value to $t0, former $v0 [or to $a0 if you're adapting
- # old code];
- # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
- #
- # For reference here is register layout for N32/64 MIPS ABIs:
- #
- # ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
- # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
- # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
- # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
- # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
- #
- # <appro@openssl.org>
- #
- ######################################################################
- # $output is the last argument if it looks like a file (it has an extension)
- # $flavour is the first argument if it doesn't look like a file
- $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
- # supported flavours are o32,n32,64,nubi32,nubi64, default is o32
- $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : "o32";
- die "MIPS64 only" unless ($flavour =~ /64|n32/i);
- $v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
- $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
- ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
- ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
- $code.=<<___;
- #include "mips_arch.h"
- #ifdef MIPSEB
- # define MSB 0
- # define LSB 7
- #else
- # define MSB 7
- # define LSB 0
- #endif
- .text
- .set noat
- .set noreorder
- .align 5
- .globl poly1305_init
- .ent poly1305_init
- poly1305_init:
- .frame $sp,0,$ra
- .set reorder
- sd $zero,0($ctx)
- sd $zero,8($ctx)
- sd $zero,16($ctx)
- beqz $inp,.Lno_key
- #if defined(_MIPS_ARCH_MIPS64R6)
- ld $in0,0($inp)
- ld $in1,8($inp)
- #else
- ldl $in0,0+MSB($inp)
- ldl $in1,8+MSB($inp)
- ldr $in0,0+LSB($inp)
- ldr $in1,8+LSB($inp)
- #endif
- #ifdef MIPSEB
- # if defined(_MIPS_ARCH_MIPS64R2)
- dsbh $in0,$in0 # byte swap
- dsbh $in1,$in1
- dshd $in0,$in0
- dshd $in1,$in1
- # else
- ori $tmp0,$zero,0xFF
- dsll $tmp2,$tmp0,32
- or $tmp0,$tmp2 # 0x000000FF000000FF
- and $tmp1,$in0,$tmp0 # byte swap
- and $tmp3,$in1,$tmp0
- dsrl $tmp2,$in0,24
- dsrl $tmp4,$in1,24
- dsll $tmp1,24
- dsll $tmp3,24
- and $tmp2,$tmp0
- and $tmp4,$tmp0
- dsll $tmp0,8 # 0x0000FF000000FF00
- or $tmp1,$tmp2
- or $tmp3,$tmp4
- and $tmp2,$in0,$tmp0
- and $tmp4,$in1,$tmp0
- dsrl $in0,8
- dsrl $in1,8
- dsll $tmp2,8
- dsll $tmp4,8
- and $in0,$tmp0
- and $in1,$tmp0
- or $tmp1,$tmp2
- or $tmp3,$tmp4
- or $in0,$tmp1
- or $in1,$tmp3
- dsrl $tmp1,$in0,32
- dsrl $tmp3,$in1,32
- dsll $in0,32
- dsll $in1,32
- or $in0,$tmp1
- or $in1,$tmp3
- # endif
- #endif
- li $tmp0,1
- dsll $tmp0,32
- daddiu $tmp0,-63
- dsll $tmp0,28
- daddiu $tmp0,-1 # 0ffffffc0fffffff
- and $in0,$tmp0
- daddiu $tmp0,-3 # 0ffffffc0ffffffc
- and $in1,$tmp0
- sd $in0,24($ctx)
- dsrl $tmp0,$in1,2
- sd $in1,32($ctx)
- daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
- sd $tmp0,40($ctx)
- .Lno_key:
- li $v0,0 # return 0
- jr $ra
- .end poly1305_init
- ___
- {
- my ($h0,$h1,$h2,$r0,$r1,$s1,$d0,$d1,$d2) =
- ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
- $code.=<<___;
- .align 5
- .globl poly1305_blocks
- .ent poly1305_blocks
- poly1305_blocks:
- .set noreorder
- dsrl $len,4 # number of complete blocks
- bnez $len,poly1305_blocks_internal
- nop
- jr $ra
- nop
- .end poly1305_blocks
- .align 5
- .ent poly1305_blocks_internal
- poly1305_blocks_internal:
- .frame $sp,6*8,$ra
- .mask $SAVED_REGS_MASK,-8
- .set noreorder
- dsubu $sp,6*8
- sd $s5,40($sp)
- sd $s4,32($sp)
- ___
- $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
- sd $s3,24($sp)
- sd $s2,16($sp)
- sd $s1,8($sp)
- sd $s0,0($sp)
- ___
- $code.=<<___;
- .set reorder
- ld $h0,0($ctx) # load hash value
- ld $h1,8($ctx)
- ld $h2,16($ctx)
- ld $r0,24($ctx) # load key
- ld $r1,32($ctx)
- ld $s1,40($ctx)
- .Loop:
- #if defined(_MIPS_ARCH_MIPS64R6)
- ld $in0,0($inp) # load input
- ld $in1,8($inp)
- #else
- ldl $in0,0+MSB($inp) # load input
- ldl $in1,8+MSB($inp)
- ldr $in0,0+LSB($inp)
- ldr $in1,8+LSB($inp)
- #endif
- daddiu $len,-1
- daddiu $inp,16
- #ifdef MIPSEB
- # if defined(_MIPS_ARCH_MIPS64R2)
- dsbh $in0,$in0 # byte swap
- dsbh $in1,$in1
- dshd $in0,$in0
- dshd $in1,$in1
- # else
- ori $tmp0,$zero,0xFF
- dsll $tmp2,$tmp0,32
- or $tmp0,$tmp2 # 0x000000FF000000FF
- and $tmp1,$in0,$tmp0 # byte swap
- and $tmp3,$in1,$tmp0
- dsrl $tmp2,$in0,24
- dsrl $tmp4,$in1,24
- dsll $tmp1,24
- dsll $tmp3,24
- and $tmp2,$tmp0
- and $tmp4,$tmp0
- dsll $tmp0,8 # 0x0000FF000000FF00
- or $tmp1,$tmp2
- or $tmp3,$tmp4
- and $tmp2,$in0,$tmp0
- and $tmp4,$in1,$tmp0
- dsrl $in0,8
- dsrl $in1,8
- dsll $tmp2,8
- dsll $tmp4,8
- and $in0,$tmp0
- and $in1,$tmp0
- or $tmp1,$tmp2
- or $tmp3,$tmp4
- or $in0,$tmp1
- or $in1,$tmp3
- dsrl $tmp1,$in0,32
- dsrl $tmp3,$in1,32
- dsll $in0,32
- dsll $in1,32
- or $in0,$tmp1
- or $in1,$tmp3
- # endif
- #endif
- daddu $h0,$in0 # accumulate input
- daddu $h1,$in1
- sltu $tmp0,$h0,$in0
- sltu $tmp1,$h1,$in1
- daddu $h1,$tmp0
- dmultu ($r0,$h0) # h0*r0
- daddu $h2,$padbit
- sltu $tmp0,$h1,$tmp0
- mflo ($d0,$r0,$h0)
- mfhi ($d1,$r0,$h0)
- dmultu ($s1,$h1) # h1*5*r1
- daddu $tmp0,$tmp1
- daddu $h2,$tmp0
- mflo ($tmp0,$s1,$h1)
- mfhi ($tmp1,$s1,$h1)
- dmultu ($r1,$h0) # h0*r1
- daddu $d0,$tmp0
- daddu $d1,$tmp1
- mflo ($tmp2,$r1,$h0)
- mfhi ($d2,$r1,$h0)
- sltu $tmp0,$d0,$tmp0
- daddu $d1,$tmp0
- dmultu ($r0,$h1) # h1*r0
- daddu $d1,$tmp2
- sltu $tmp2,$d1,$tmp2
- mflo ($tmp0,$r0,$h1)
- mfhi ($tmp1,$r0,$h1)
- daddu $d2,$tmp2
- dmultu ($s1,$h2) # h2*5*r1
- daddu $d1,$tmp0
- daddu $d2,$tmp1
- mflo ($tmp2,$s1,$h2)
- dmultu ($r0,$h2) # h2*r0
- sltu $tmp0,$d1,$tmp0
- daddu $d2,$tmp0
- mflo ($tmp3,$r0,$h2)
- daddu $d1,$tmp2
- daddu $d2,$tmp3
- sltu $tmp2,$d1,$tmp2
- daddu $d2,$tmp2
- li $tmp0,-4 # final reduction
- and $tmp0,$d2
- dsrl $tmp1,$d2,2
- andi $h2,$d2,3
- daddu $tmp0,$tmp1
- daddu $h0,$d0,$tmp0
- sltu $tmp0,$h0,$tmp0
- daddu $h1,$d1,$tmp0
- sltu $tmp0,$h1,$tmp0
- daddu $h2,$h2,$tmp0
- bnez $len,.Loop
- sd $h0,0($ctx) # store hash value
- sd $h1,8($ctx)
- sd $h2,16($ctx)
- .set noreorder
- ld $s5,40($sp) # epilogue
- ld $s4,32($sp)
- ___
- $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
- ld $s3,24($sp)
- ld $s2,16($sp)
- ld $s1,8($sp)
- ld $s0,0($sp)
- ___
- $code.=<<___;
- jr $ra
- daddu $sp,6*8
- .end poly1305_blocks_internal
- ___
- }
- {
- my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
- $code.=<<___;
- .align 5
- .globl poly1305_emit
- .ent poly1305_emit
- poly1305_emit:
- .frame $sp,0,$ra
- .set reorder
- ld $tmp0,0($ctx)
- ld $tmp1,8($ctx)
- ld $tmp2,16($ctx)
- daddiu $in0,$tmp0,5 # compare to modulus
- sltiu $tmp3,$in0,5
- daddu $in1,$tmp1,$tmp3
- sltu $tmp3,$in1,$tmp3
- daddu $tmp2,$tmp2,$tmp3
- dsrl $tmp2,2 # see if it carried/borrowed
- dsubu $tmp2,$zero,$tmp2
- nor $tmp3,$zero,$tmp2
- and $in0,$tmp2
- and $tmp0,$tmp3
- and $in1,$tmp2
- and $tmp1,$tmp3
- or $in0,$tmp0
- or $in1,$tmp1
- lwu $tmp0,0($nonce) # load nonce
- lwu $tmp1,4($nonce)
- lwu $tmp2,8($nonce)
- lwu $tmp3,12($nonce)
- dsll $tmp1,32
- dsll $tmp3,32
- or $tmp0,$tmp1
- or $tmp2,$tmp3
- daddu $in0,$tmp0 # accumulate nonce
- daddu $in1,$tmp2
- sltu $tmp0,$in0,$tmp0
- daddu $in1,$tmp0
- dsrl $tmp0,$in0,8 # write mac value
- dsrl $tmp1,$in0,16
- dsrl $tmp2,$in0,24
- sb $in0,0($mac)
- dsrl $tmp3,$in0,32
- sb $tmp0,1($mac)
- dsrl $tmp0,$in0,40
- sb $tmp1,2($mac)
- dsrl $tmp1,$in0,48
- sb $tmp2,3($mac)
- dsrl $tmp2,$in0,56
- sb $tmp3,4($mac)
- dsrl $tmp3,$in1,8
- sb $tmp0,5($mac)
- dsrl $tmp0,$in1,16
- sb $tmp1,6($mac)
- dsrl $tmp1,$in1,24
- sb $tmp2,7($mac)
- sb $in1,8($mac)
- dsrl $tmp2,$in1,32
- sb $tmp3,9($mac)
- dsrl $tmp3,$in1,40
- sb $tmp0,10($mac)
- dsrl $tmp0,$in1,48
- sb $tmp1,11($mac)
- dsrl $tmp1,$in1,56
- sb $tmp2,12($mac)
- sb $tmp3,13($mac)
- sb $tmp0,14($mac)
- sb $tmp1,15($mac)
- jr $ra
- .end poly1305_emit
- .rdata
- .asciiz "Poly1305 for MIPS64, CRYPTOGAMS by <appro\@openssl.org>"
- .align 2
- ___
- }
- $output and open STDOUT,">$output";
- print $code;
- close STDOUT or die "error closing STDOUT: $!";
|