12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743 |
- #! /usr/bin/env perl
- # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
- #
- # Licensed under the Apache License 2.0 (the "License"). You may not use
- # this file except in compliance with the License. You can obtain a copy
- # in the file LICENSE in the source distribution or at
- # https://www.openssl.org/source/license.html
- #
- # ====================================================================
- # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
- # project. The module is, however, dual licensed under OpenSSL and
- # CRYPTOGAMS licenses depending on where you obtain it. For further
- # details see http://www.openssl.org/~appro/cryptogams/.
- # ====================================================================
- #
- # This module implements Poly1305 hash for PowerPC FPU.
- #
- # June 2015
- #
- # Numbers are cycles per processed byte with poly1305_blocks alone,
- # and improvement coefficients relative to gcc-generated code.
- #
- # Freescale e300 9.78/+30%
- # PPC74x0 6.92/+50%
- # PPC970 6.03/+80%
- # POWER7 3.50/+30%
- # POWER8 3.75/+10%
- # Command line is "[flavour] [output]"; either argument may be absent.
- # The trailing argument is taken as $output when it ends in an extension
- # (a dot followed by word characters). After that, the leading argument
- # is taken as $flavour when it contains no dot at all.
- $output = undef;
- $output = pop @ARGV if (@ARGV && $ARGV[-1] =~ m|\.\w+$|);
- $flavour = undef;
- $flavour = shift @ARGV if (@ARGV && $ARGV[0] !~ m|\.|);
- # Choose the pointer size, the offset of the link-register save slot and
- # the width-specific mnemonics from the flavour string. "64" is tested
- # before "32"; a flavour matching neither is rejected.
- if ($flavour =~ /64/) {
-     ($SIZE_T, $UCMP, $STU, $POP, $PUSH) = (8, "cmpld", "stdu", "ld", "std");
-     $LRSAVE = 2*$SIZE_T;
- } elsif ($flavour =~ /32/) {
-     ($SIZE_T, $UCMP, $STU, $POP, $PUSH) = (4, "cmplw", "stwu", "lwz", "stw");
-     $LRSAVE = $SIZE_T;
- } else {
-     die "nonsense $flavour";
- }
- # A flavour ending in "le" is little-endian. $LITTLE_ENDIAN is 4 (not 1)
- # because it is XOR-ed into byte offsets to pick the other 32-bit half of
- # an 8-byte slot. $LWXLE is the load that yields a little-endian word:
- # plain lwzx on little-endian, byte-reversing lwbrx otherwise.
- if ($flavour =~ /le$/) {
-     $LITTLE_ENDIAN = 4;
-     $LWXLE = "lwzx";
- } else {
-     $LITTLE_ENDIAN = 0;
-     $LWXLE = "lwbrx";
- }
- # Directory part of this script's path (including the trailing separator);
- # left undefined when $0 contains no directory separator.
- ($dir) = $0 =~ m/(.*[\/\\])[^\/\\]+$/;
- # Look for the translator next to this script first, then in the shared
- # perlasm directory two levels up.
- for my $candidate ("${dir}ppc-xlate.pl", "${dir}../../perlasm/ppc-xlate.pl") {
-     next unless -f $candidate;
-     $xlate = $candidate;
-     last;
- }
- die "can't locate ppc-xlate.pl" unless defined $xlate;
- # Everything printed to STDOUT from here on is piped through the
- # translator, which receives the flavour and the output file name.
- my $pipeline = "| $^X $xlate $flavour \"$output\"";
- open(STDOUT, $pipeline)
-     or die "can't call $xlate: $!";
- # Frame used by poly1305_blocks_fpu: $LOCALS bytes at the bottom, then a
- # 6*8-byte scratch area (input "template" doubles and the saved FPSCR),
- # then 18*8 bytes in which f14-f31 are preserved.
- $LOCALS = 6*$SIZE_T;
- $FRAME  = $LOCALS + 6*8 + 18*8;
- my $sp = "r1";
- # Argument registers r3-r6.
- my ($ctx, $inp, $len, $padbit) = map { "r$_" } 3..6;
- # Integer temporaries r7-r12; note that $i3 shares r6 with $padbit.
- my ($in0, $in1, $in2, $in3, $i1, $i2, $i3) = map { "r$_" } (7..12, 6);
- # All 32 floating-point registers, handed out in order f0..f31.
- my @fpr = map { "f$_" } 0..31;
- my ($h0lo, $h0hi, $h1lo, $h1hi, $h2lo, $h2hi, $h3lo, $h3hi,
-     $two0, $two32, $two64, $two96, $two130, $five_two130,
-     $r0lo, $r0hi, $r1lo, $r1hi, $r2lo, $r2hi,
-     $s2lo, $s2hi, $s3lo, $s3hi,
-     $c0lo, $c0hi, $c1lo, $c1hi, $c2lo, $c2hi, $c3lo, $c3hi) = @fpr;
- # borrowings: these names alias the carry registers above, so the code
- # using them must not keep both meanings live at the same time
- my ($r3lo, $r3hi, $s1lo, $s1hi) = ($c0lo, $c0hi, $c1lo, $c1hi);
- my ($x0, $x1, $x2, $x3) = ($c2lo, $c2hi, $c3lo, $c3hi);
- my ($y0, $y1, $y2, $y3) = ($c3lo, $c3hi, $c1lo, $c1hi);
- # ----------------------------------------------------------------------
- # The heredoc below emits two routines. Context offsets are given in
- # 8-byte slots, i.e. slot N is at 8*N($ctx).
- #
- # poly1305_init_fpu ($ctx = context, $inp = key pointer)
- #   * Calls LPICmeup, which returns the address of the constant table
- #     in $len, and stores the first four constants into slots 0-3 as the
- #     initial ("biased 0") hash value.
- #   * If the key pointer is zero it skips straight to Lno_key.
- #   * Otherwise the key is loaded as four 32-bit words with $LWXLE,
- #     masked with 0x0fffffff / 0x0ffffffc, and written into the low word
- #     of four doubles pre-filled with the bias constants (the "template").
- #     Subtracting the bias then yields r0..r3 as doubles.
- #   * s1..s3 are r1..r3 multiplied by the 5/2^130 constant. Every r and s
- #     value is split into a hi and a lo part by adding and subtracting a
- #     larger constant while FPSCR holds the value from table slot 13.
- #   * Results: slots 4-11 hold r0..r3 as lo/hi pairs, slots 12-17 hold
- #     s1..s3 as lo/hi pairs, slots 18-23 hold a copy of the first six
- #     constants. Slots 12-15 are used as scratch on the way (s1..s3 and
- #     the caller's FPSCR) before they receive their final values.
- #   * The caller's FPSCR is restored and r3 is cleared before returning.
- #
- # poly1305_blocks_fpu ($ctx, $inp = input, $len = byte count, $padbit)
- #   * Processes $len>>4 sixteen-byte blocks; returns immediately through
- #     Labort when that count is zero.
- #   * Saves f14-f31 in the frame, builds the FPSCR value 1 in the local
- #     slot at $LOCALS+8*4 and reuses that slot to keep the caller's FPSCR,
- #     which is restored before f14-f31 are reloaded on exit.
- #   * $padbit is OR-ed into the high word of the fourth input double
- #     before r6 is reused as the index register $i3.
- #   * Input words for the next block are loaded and stored into the
- #     "template" doubles while the current block is being multiplied
- #     (marked "modulo-scheduled" below). The addic/addze/slwi./sub
- #     sequence is the "conditional rewind" of $inp that goes with this
- #     look-ahead.
- #   * $r3lo/$r3hi/$s1lo/$s1hi share registers with the carry temporaries,
- #     which is why they are reloaded from the context at the end of Loop.
- #   * On exit the hash is re-biased and stored back to slots 0-3.
- # ----------------------------------------------------------------------
- $code.=<<___;
- .machine "any"
- .text
- .globl .poly1305_init_fpu
- .align 6
- .poly1305_init_fpu:
- $STU $sp,-$LOCALS($sp) # minimal frame
- mflr $padbit
- $PUSH $padbit,`$LOCALS+$LRSAVE`($sp)
- bl LPICmeup
- xor r0,r0,r0
- mtlr $padbit # restore lr
- lfd $two0,8*0($len) # load constants
- lfd $two32,8*1($len)
- lfd $two64,8*2($len)
- lfd $two96,8*3($len)
- lfd $two130,8*4($len)
- lfd $five_two130,8*5($len)
- stfd $two0,8*0($ctx) # initial hash value, biased 0
- stfd $two32,8*1($ctx)
- stfd $two64,8*2($ctx)
- stfd $two96,8*3($ctx)
- $UCMP $inp,r0
- beq- Lno_key
- lfd $h3lo,8*13($len) # new fpscr
- mffs $h3hi # old fpscr
- stfd $two0,8*4($ctx) # key "template"
- stfd $two32,8*5($ctx)
- stfd $two64,8*6($ctx)
- stfd $two96,8*7($ctx)
- li $in1,4
- li $in2,8
- li $in3,12
- $LWXLE $in0,0,$inp # load key
- $LWXLE $in1,$in1,$inp
- $LWXLE $in2,$in2,$inp
- $LWXLE $in3,$in3,$inp
- lis $i1,0xf000 # 0xf0000000
- ori $i2,$i1,3 # 0xf0000003
- andc $in0,$in0,$i1 # &=0x0fffffff
- andc $in1,$in1,$i2 # &=0x0ffffffc
- andc $in2,$in2,$i2
- andc $in3,$in3,$i2
- stw $in0,`8*4+(4^$LITTLE_ENDIAN)`($ctx) # fill "template"
- stw $in1,`8*5+(4^$LITTLE_ENDIAN)`($ctx)
- stw $in2,`8*6+(4^$LITTLE_ENDIAN)`($ctx)
- stw $in3,`8*7+(4^$LITTLE_ENDIAN)`($ctx)
- mtfsf 255,$h3lo # fpscr
- stfd $two0,8*18($ctx) # copy constants to context
- stfd $two32,8*19($ctx)
- stfd $two64,8*20($ctx)
- stfd $two96,8*21($ctx)
- stfd $two130,8*22($ctx)
- stfd $five_two130,8*23($ctx)
- lfd $h0lo,8*4($ctx) # load [biased] key
- lfd $h1lo,8*5($ctx)
- lfd $h2lo,8*6($ctx)
- lfd $h3lo,8*7($ctx)
- fsub $h0lo,$h0lo,$two0 # r0
- fsub $h1lo,$h1lo,$two32 # r1
- fsub $h2lo,$h2lo,$two64 # r2
- fsub $h3lo,$h3lo,$two96 # r3
- lfd $two0,8*6($len) # more constants
- lfd $two32,8*7($len)
- lfd $two64,8*8($len)
- lfd $two96,8*9($len)
- fmul $h1hi,$h1lo,$five_two130 # s1
- fmul $h2hi,$h2lo,$five_two130 # s2
- stfd $h3hi,8*15($ctx) # borrow slot for original fpscr
- fmul $h3hi,$h3lo,$five_two130 # s3
- fadd $h0hi,$h0lo,$two0
- stfd $h1hi,8*12($ctx) # put aside for now
- fadd $h1hi,$h1lo,$two32
- stfd $h2hi,8*13($ctx)
- fadd $h2hi,$h2lo,$two64
- stfd $h3hi,8*14($ctx)
- fadd $h3hi,$h3lo,$two96
- fsub $h0hi,$h0hi,$two0
- fsub $h1hi,$h1hi,$two32
- fsub $h2hi,$h2hi,$two64
- fsub $h3hi,$h3hi,$two96
- lfd $two0,8*10($len) # more constants
- lfd $two32,8*11($len)
- lfd $two64,8*12($len)
- fsub $h0lo,$h0lo,$h0hi
- fsub $h1lo,$h1lo,$h1hi
- fsub $h2lo,$h2lo,$h2hi
- fsub $h3lo,$h3lo,$h3hi
- stfd $h0hi,8*5($ctx) # r0hi
- stfd $h1hi,8*7($ctx) # r1hi
- stfd $h2hi,8*9($ctx) # r2hi
- stfd $h3hi,8*11($ctx) # r3hi
- stfd $h0lo,8*4($ctx) # r0lo
- stfd $h1lo,8*6($ctx) # r1lo
- stfd $h2lo,8*8($ctx) # r2lo
- stfd $h3lo,8*10($ctx) # r3lo
- lfd $h1lo,8*12($ctx) # s1
- lfd $h2lo,8*13($ctx) # s2
- lfd $h3lo,8*14($ctx) # s3
- lfd $h0lo,8*15($ctx) # pull original fpscr
- fadd $h1hi,$h1lo,$two0
- fadd $h2hi,$h2lo,$two32
- fadd $h3hi,$h3lo,$two64
- fsub $h1hi,$h1hi,$two0
- fsub $h2hi,$h2hi,$two32
- fsub $h3hi,$h3hi,$two64
- fsub $h1lo,$h1lo,$h1hi
- fsub $h2lo,$h2lo,$h2hi
- fsub $h3lo,$h3lo,$h3hi
- stfd $h1hi,8*13($ctx) # s1hi
- stfd $h2hi,8*15($ctx) # s2hi
- stfd $h3hi,8*17($ctx) # s3hi
- stfd $h1lo,8*12($ctx) # s1lo
- stfd $h2lo,8*14($ctx) # s2lo
- stfd $h3lo,8*16($ctx) # s3lo
- mtfsf 255,$h0lo # restore fpscr
- Lno_key:
- xor r3,r3,r3
- addi $sp,$sp,$LOCALS
- blr
- .long 0
- .byte 0,12,4,1,0x80,0,2,0
- .size .poly1305_init_fpu,.-.poly1305_init_fpu
- .globl .poly1305_blocks_fpu
- .align 4
- .poly1305_blocks_fpu:
- srwi. $len,$len,4
- beq- Labort
- $STU $sp,-$FRAME($sp)
- mflr r0
- stfd f14,`$FRAME-8*18`($sp)
- stfd f15,`$FRAME-8*17`($sp)
- stfd f16,`$FRAME-8*16`($sp)
- stfd f17,`$FRAME-8*15`($sp)
- stfd f18,`$FRAME-8*14`($sp)
- stfd f19,`$FRAME-8*13`($sp)
- stfd f20,`$FRAME-8*12`($sp)
- stfd f21,`$FRAME-8*11`($sp)
- stfd f22,`$FRAME-8*10`($sp)
- stfd f23,`$FRAME-8*9`($sp)
- stfd f24,`$FRAME-8*8`($sp)
- stfd f25,`$FRAME-8*7`($sp)
- stfd f26,`$FRAME-8*6`($sp)
- stfd f27,`$FRAME-8*5`($sp)
- stfd f28,`$FRAME-8*4`($sp)
- stfd f29,`$FRAME-8*3`($sp)
- stfd f30,`$FRAME-8*2`($sp)
- stfd f31,`$FRAME-8*1`($sp)
- $PUSH r0,`$FRAME+$LRSAVE`($sp)
- xor r0,r0,r0
- li $in3,1
- mtctr $len
- neg $len,$len
- stw r0,`$LOCALS+8*4+(0^$LITTLE_ENDIAN)`($sp)
- stw $in3,`$LOCALS+8*4+(4^$LITTLE_ENDIAN)`($sp)
- lfd $two0,8*18($ctx) # load constants
- lfd $two32,8*19($ctx)
- lfd $two64,8*20($ctx)
- lfd $two96,8*21($ctx)
- lfd $two130,8*22($ctx)
- lfd $five_two130,8*23($ctx)
- lfd $h0lo,8*0($ctx) # load [biased] hash value
- lfd $h1lo,8*1($ctx)
- lfd $h2lo,8*2($ctx)
- lfd $h3lo,8*3($ctx)
- stfd $two0,`$LOCALS+8*0`($sp) # input "template"
- oris $in3,$padbit,`(1023+52+96)<<4`
- stfd $two32,`$LOCALS+8*1`($sp)
- stfd $two64,`$LOCALS+8*2`($sp)
- stw $in3,`$LOCALS+8*3+(0^$LITTLE_ENDIAN)`($sp)
- li $i1,4
- li $i2,8
- li $i3,12
- $LWXLE $in0,0,$inp # load input
- $LWXLE $in1,$i1,$inp
- $LWXLE $in2,$i2,$inp
- $LWXLE $in3,$i3,$inp
- addi $inp,$inp,16
- stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template"
- stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
- stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
- stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
- mffs $x0 # original fpscr
- lfd $x1,`$LOCALS+8*4`($sp) # new fpscr
- lfd $r0lo,8*4($ctx) # load key
- lfd $r0hi,8*5($ctx)
- lfd $r1lo,8*6($ctx)
- lfd $r1hi,8*7($ctx)
- lfd $r2lo,8*8($ctx)
- lfd $r2hi,8*9($ctx)
- lfd $r3lo,8*10($ctx)
- lfd $r3hi,8*11($ctx)
- lfd $s1lo,8*12($ctx)
- lfd $s1hi,8*13($ctx)
- lfd $s2lo,8*14($ctx)
- lfd $s2hi,8*15($ctx)
- lfd $s3lo,8*16($ctx)
- lfd $s3hi,8*17($ctx)
- stfd $x0,`$LOCALS+8*4`($sp) # save original fpscr
- mtfsf 255,$x1
- addic $len,$len,1
- addze r0,r0
- slwi. r0,r0,4
- sub $inp,$inp,r0 # conditional rewind
- lfd $x0,`$LOCALS+8*0`($sp)
- lfd $x1,`$LOCALS+8*1`($sp)
- lfd $x2,`$LOCALS+8*2`($sp)
- lfd $x3,`$LOCALS+8*3`($sp)
- fsub $h0lo,$h0lo,$two0 # de-bias hash value
- $LWXLE $in0,0,$inp # modulo-scheduled input load
- fsub $h1lo,$h1lo,$two32
- $LWXLE $in1,$i1,$inp
- fsub $h2lo,$h2lo,$two64
- $LWXLE $in2,$i2,$inp
- fsub $h3lo,$h3lo,$two96
- $LWXLE $in3,$i3,$inp
- fsub $x0,$x0,$two0 # de-bias input
- addi $inp,$inp,16
- fsub $x1,$x1,$two32
- fsub $x2,$x2,$two64
- fsub $x3,$x3,$two96
- fadd $x0,$x0,$h0lo # accumulate input
- stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)
- fadd $x1,$x1,$h1lo
- stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
- fadd $x2,$x2,$h2lo
- stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
- fadd $x3,$x3,$h3lo
- stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
- b Lentry
- .align 4
- Loop:
- fsub $y0,$y0,$two0 # de-bias input
- addic $len,$len,1
- fsub $y1,$y1,$two32
- addze r0,r0
- fsub $y2,$y2,$two64
- slwi. r0,r0,4
- fsub $y3,$y3,$two96
- sub $inp,$inp,r0 # conditional rewind
- fadd $h0lo,$h0lo,$y0 # accumulate input
- fadd $h0hi,$h0hi,$y1
- fadd $h2lo,$h2lo,$y2
- fadd $h2hi,$h2hi,$y3
- ######################################### base 2^48 -> base 2^32
- fadd $c1lo,$h1lo,$two64
- $LWXLE $in0,0,$inp # modulo-scheduled input load
- fadd $c1hi,$h1hi,$two64
- $LWXLE $in1,$i1,$inp
- fadd $c3lo,$h3lo,$two130
- $LWXLE $in2,$i2,$inp
- fadd $c3hi,$h3hi,$two130
- $LWXLE $in3,$i3,$inp
- fadd $c0lo,$h0lo,$two32
- addi $inp,$inp,16
- fadd $c0hi,$h0hi,$two32
- fadd $c2lo,$h2lo,$two96
- fadd $c2hi,$h2hi,$two96
- fsub $c1lo,$c1lo,$two64
- stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template"
- fsub $c1hi,$c1hi,$two64
- stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
- fsub $c3lo,$c3lo,$two130
- stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
- fsub $c3hi,$c3hi,$two130
- stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
- fsub $c0lo,$c0lo,$two32
- fsub $c0hi,$c0hi,$two32
- fsub $c2lo,$c2lo,$two96
- fsub $c2hi,$c2hi,$two96
- fsub $h1lo,$h1lo,$c1lo
- fsub $h1hi,$h1hi,$c1hi
- fsub $h3lo,$h3lo,$c3lo
- fsub $h3hi,$h3hi,$c3hi
- fsub $h2lo,$h2lo,$c2lo
- fsub $h2hi,$h2hi,$c2hi
- fsub $h0lo,$h0lo,$c0lo
- fsub $h0hi,$h0hi,$c0hi
- fadd $h1lo,$h1lo,$c0lo
- fadd $h1hi,$h1hi,$c0hi
- fadd $h3lo,$h3lo,$c2lo
- fadd $h3hi,$h3hi,$c2hi
- fadd $h2lo,$h2lo,$c1lo
- fadd $h2hi,$h2hi,$c1hi
- fmadd $h0lo,$c3lo,$five_two130,$h0lo
- fmadd $h0hi,$c3hi,$five_two130,$h0hi
- fadd $x1,$h1lo,$h1hi
- lfd $s1lo,8*12($ctx) # reload constants
- fadd $x3,$h3lo,$h3hi
- lfd $s1hi,8*13($ctx)
- fadd $x2,$h2lo,$h2hi
- lfd $r3lo,8*10($ctx)
- fadd $x0,$h0lo,$h0hi
- lfd $r3hi,8*11($ctx)
- Lentry:
- fmul $h0lo,$s3lo,$x1
- fmul $h0hi,$s3hi,$x1
- fmul $h2lo,$r1lo,$x1
- fmul $h2hi,$r1hi,$x1
- fmul $h1lo,$r0lo,$x1
- fmul $h1hi,$r0hi,$x1
- fmul $h3lo,$r2lo,$x1
- fmul $h3hi,$r2hi,$x1
- fmadd $h0lo,$s1lo,$x3,$h0lo
- fmadd $h0hi,$s1hi,$x3,$h0hi
- fmadd $h2lo,$s3lo,$x3,$h2lo
- fmadd $h2hi,$s3hi,$x3,$h2hi
- fmadd $h1lo,$s2lo,$x3,$h1lo
- fmadd $h1hi,$s2hi,$x3,$h1hi
- fmadd $h3lo,$r0lo,$x3,$h3lo
- fmadd $h3hi,$r0hi,$x3,$h3hi
- fmadd $h0lo,$s2lo,$x2,$h0lo
- fmadd $h0hi,$s2hi,$x2,$h0hi
- fmadd $h2lo,$r0lo,$x2,$h2lo
- fmadd $h2hi,$r0hi,$x2,$h2hi
- fmadd $h1lo,$s3lo,$x2,$h1lo
- fmadd $h1hi,$s3hi,$x2,$h1hi
- fmadd $h3lo,$r1lo,$x2,$h3lo
- fmadd $h3hi,$r1hi,$x2,$h3hi
- fmadd $h0lo,$r0lo,$x0,$h0lo
- lfd $y0,`$LOCALS+8*0`($sp) # load [biased] input
- fmadd $h0hi,$r0hi,$x0,$h0hi
- lfd $y1,`$LOCALS+8*1`($sp)
- fmadd $h2lo,$r2lo,$x0,$h2lo
- lfd $y2,`$LOCALS+8*2`($sp)
- fmadd $h2hi,$r2hi,$x0,$h2hi
- lfd $y3,`$LOCALS+8*3`($sp)
- fmadd $h1lo,$r1lo,$x0,$h1lo
- fmadd $h1hi,$r1hi,$x0,$h1hi
- fmadd $h3lo,$r3lo,$x0,$h3lo
- fmadd $h3hi,$r3hi,$x0,$h3hi
- bdnz Loop
- ######################################### base 2^48 -> base 2^32
- fadd $c0lo,$h0lo,$two32
- fadd $c0hi,$h0hi,$two32
- fadd $c2lo,$h2lo,$two96
- fadd $c2hi,$h2hi,$two96
- fadd $c1lo,$h1lo,$two64
- fadd $c1hi,$h1hi,$two64
- fadd $c3lo,$h3lo,$two130
- fadd $c3hi,$h3hi,$two130
- fsub $c0lo,$c0lo,$two32
- fsub $c0hi,$c0hi,$two32
- fsub $c2lo,$c2lo,$two96
- fsub $c2hi,$c2hi,$two96
- fsub $c1lo,$c1lo,$two64
- fsub $c1hi,$c1hi,$two64
- fsub $c3lo,$c3lo,$two130
- fsub $c3hi,$c3hi,$two130
- fsub $h1lo,$h1lo,$c1lo
- fsub $h1hi,$h1hi,$c1hi
- fsub $h3lo,$h3lo,$c3lo
- fsub $h3hi,$h3hi,$c3hi
- fsub $h2lo,$h2lo,$c2lo
- fsub $h2hi,$h2hi,$c2hi
- fsub $h0lo,$h0lo,$c0lo
- fsub $h0hi,$h0hi,$c0hi
- fadd $h1lo,$h1lo,$c0lo
- fadd $h1hi,$h1hi,$c0hi
- fadd $h3lo,$h3lo,$c2lo
- fadd $h3hi,$h3hi,$c2hi
- fadd $h2lo,$h2lo,$c1lo
- fadd $h2hi,$h2hi,$c1hi
- fmadd $h0lo,$c3lo,$five_two130,$h0lo
- fmadd $h0hi,$c3hi,$five_two130,$h0hi
- fadd $x1,$h1lo,$h1hi
- fadd $x3,$h3lo,$h3hi
- fadd $x2,$h2lo,$h2hi
- fadd $x0,$h0lo,$h0hi
- lfd $h0lo,`$LOCALS+8*4`($sp) # pull saved fpscr
- fadd $x1,$x1,$two32 # bias
- fadd $x3,$x3,$two96
- fadd $x2,$x2,$two64
- fadd $x0,$x0,$two0
- stfd $x1,8*1($ctx) # store [biased] hash value
- stfd $x3,8*3($ctx)
- stfd $x2,8*2($ctx)
- stfd $x0,8*0($ctx)
- mtfsf 255,$h0lo # restore original fpscr
- lfd f14,`$FRAME-8*18`($sp)
- lfd f15,`$FRAME-8*17`($sp)
- lfd f16,`$FRAME-8*16`($sp)
- lfd f17,`$FRAME-8*15`($sp)
- lfd f18,`$FRAME-8*14`($sp)
- lfd f19,`$FRAME-8*13`($sp)
- lfd f20,`$FRAME-8*12`($sp)
- lfd f21,`$FRAME-8*11`($sp)
- lfd f22,`$FRAME-8*10`($sp)
- lfd f23,`$FRAME-8*9`($sp)
- lfd f24,`$FRAME-8*8`($sp)
- lfd f25,`$FRAME-8*7`($sp)
- lfd f26,`$FRAME-8*6`($sp)
- lfd f27,`$FRAME-8*5`($sp)
- lfd f28,`$FRAME-8*4`($sp)
- lfd f29,`$FRAME-8*3`($sp)
- lfd f30,`$FRAME-8*2`($sp)
- lfd f31,`$FRAME-8*1`($sp)
- addi $sp,$sp,$FRAME
- Labort:
- blr
- .long 0
- .byte 0,12,4,1,0x80,0,4,0
- .size .poly1305_blocks_fpu,.-.poly1305_blocks_fpu
- ___
- # ----------------------------------------------------------------------
- # poly1305_emit_fpu ($ctx = context, $mac = output, $nonce = nonce)
- #   Integer-only finalisation. Reads the four biased hash doubles from
- #   context slots 0-3 as pairs of 32-bit words, strips the exponent bits,
- #   finishes the reduction, adds the nonce and writes the 16-byte tag to
- #   $mac. The code is assembled from several heredocs so that the 32-bit
- #   and 64-bit flavours and both endiannesses share prologue and epilogue.
- # ----------------------------------------------------------------------
- {
- # $mac and $nonce reuse the second and third argument registers.
- my ($mac,$nonce)=($inp,$len);
- # h0..h4 live in r7-r11; d0..d3 live in r28-r31, which are saved and
- # restored around the body.
- my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3
- ) = map("r$_",(7..11,28..31));
- my $mask = "r0";
- # Block-local frame size; shadows the file-level $FRAME inside this block.
- my $FRAME = (6+4)*$SIZE_T;
- # Prologue and common part: $hN receives the word at offset
- # 4^$LITTLE_ENDIAN of slot N and $dN the word at offset 0^$LITTLE_ENDIAN,
- # whose top 12 bits are then cleared ("mask exponent"). The bits of $d3
- # above bit 1 are folded back in as x + (x>>2), per the "can be partially
- # reduced... so reduce" comments, while the low two bits become $h4.
- $code.=<<___;
- .globl .poly1305_emit_fpu
- .align 4
- .poly1305_emit_fpu:
- $STU $sp,-$FRAME($sp)
- mflr r0
- $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
- $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
- $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
- $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
- $PUSH r0,`$FRAME+$LRSAVE`($sp)
- lwz $d0,`8*0+(0^$LITTLE_ENDIAN)`($ctx) # load hash
- lwz $h0,`8*0+(4^$LITTLE_ENDIAN)`($ctx)
- lwz $d1,`8*1+(0^$LITTLE_ENDIAN)`($ctx)
- lwz $h1,`8*1+(4^$LITTLE_ENDIAN)`($ctx)
- lwz $d2,`8*2+(0^$LITTLE_ENDIAN)`($ctx)
- lwz $h2,`8*2+(4^$LITTLE_ENDIAN)`($ctx)
- lwz $d3,`8*3+(0^$LITTLE_ENDIAN)`($ctx)
- lwz $h3,`8*3+(4^$LITTLE_ENDIAN)`($ctx)
- lis $mask,0xfff0
- andc $d0,$d0,$mask # mask exponent
- andc $d1,$d1,$mask
- andc $d2,$d2,$mask
- andc $d3,$d3,$mask # can be partially reduced...
- li $mask,3
- srwi $padbit,$d3,2 # ... so reduce
- and $h4,$d3,$mask
- andc $d3,$d3,$mask
- add $d3,$d3,$padbit
- ___
- # 32-bit flavour: the overlapping words are summed with an addc/adde
- # carry chain, h+5 is computed into d0..d3 and $mask, and the bits of
- # $mask from bit 2 upwards decide which of the two values is kept.
- # The nonce is then added word by word with carry propagation.
- if ($SIZE_T==4) {
- $code.=<<___;
- addc $h0,$h0,$d3
- adde $h1,$h1,$d0
- adde $h2,$h2,$d1
- adde $h3,$h3,$d2
- addze $h4,$h4
- addic $d0,$h0,5 # compare to modulus
- addze $d1,$h1
- addze $d2,$h2
- addze $d3,$h3
- addze $mask,$h4
- srwi $mask,$mask,2 # did it carry/borrow?
- neg $mask,$mask
- srawi $mask,$mask,31 # mask
- andc $h0,$h0,$mask
- and $d0,$d0,$mask
- andc $h1,$h1,$mask
- and $d1,$d1,$mask
- or $h0,$h0,$d0
- lwz $d0,0($nonce) # load nonce
- andc $h2,$h2,$mask
- and $d2,$d2,$mask
- or $h1,$h1,$d1
- lwz $d1,4($nonce)
- andc $h3,$h3,$mask
- and $d3,$d3,$mask
- or $h2,$h2,$d2
- lwz $d2,8($nonce)
- or $h3,$h3,$d3
- lwz $d3,12($nonce)
- addc $h0,$h0,$d0 # accumulate nonce
- adde $h1,$h1,$d1
- adde $h2,$h2,$d2
- adde $h3,$h3,$d3
- ___
- } else {
- # 64-bit flavour: carries are propagated explicitly with srdi/add, the
- # four words are packed into two 64-bit values ($h0 and $h2) with insrdi,
- # and the same add-5-and-select step is done on the packed values.
- $code.=<<___;
- add $h0,$h0,$d3
- add $h1,$h1,$d0
- add $h2,$h2,$d1
- add $h3,$h3,$d2
- srdi $d0,$h0,32
- add $h1,$h1,$d0
- srdi $d1,$h1,32
- add $h2,$h2,$d1
- srdi $d2,$h2,32
- add $h3,$h3,$d2
- srdi $d3,$h3,32
- add $h4,$h4,$d3
- insrdi $h0,$h1,32,0
- insrdi $h2,$h3,32,0
- addic $d0,$h0,5 # compare to modulus
- addze $d1,$h2
- addze $d2,$h4
- srdi $mask,$d2,2 # did it carry/borrow?
- neg $mask,$mask
- sradi $mask,$mask,63 # mask
- ld $d2,0($nonce) # load nonce
- ld $d3,8($nonce)
- andc $h0,$h0,$mask
- and $d0,$d0,$mask
- andc $h2,$h2,$mask
- and $d1,$d1,$mask
- or $h0,$h0,$d0
- or $h2,$h2,$d1
- ___
- # Big-endian only: swap the 32-bit halves of each 64-bit nonce load.
- $code.=<<___ if (!$LITTLE_ENDIAN);
- rotldi $d2,$d2,32 # flip nonce words
- rotldi $d3,$d3,32
- ___
- # Add the nonce and split the two 64-bit results back into four words.
- $code.=<<___;
- addc $h0,$h0,$d2 # accumulate nonce
- adde $h2,$h2,$d3
- srdi $h1,$h0,32
- srdi $h3,$h2,32
- ___
- }
- # Little-endian: plain word stores write the tag.
- $code.=<<___ if ($LITTLE_ENDIAN);
- stw $h0,0($mac) # write result
- stw $h1,4($mac)
- stw $h2,8($mac)
- stw $h3,12($mac)
- ___
- # Big-endian: byte-reversing indexed stores (stwbrx) write the tag;
- # $d1..$d3 are reused as the index registers 4, 8 and 12.
- $code.=<<___ if (!$LITTLE_ENDIAN);
- li $d1,4
- stwbrx $h0,0,$mac # write result
- li $d2,8
- stwbrx $h1,$d1,$mac
- li $d3,12
- stwbrx $h2,$d2,$mac
- stwbrx $h3,$d3,$mac
- ___
- # Epilogue: restore r28-r31 and release the frame.
- $code.=<<___;
- $POP r28,`$FRAME-$SIZE_T*4`($sp)
- $POP r29,`$FRAME-$SIZE_T*3`($sp)
- $POP r30,`$FRAME-$SIZE_T*2`($sp)
- $POP r31,`$FRAME-$SIZE_T*1`($sp)
- addi $sp,$sp,$FRAME
- blr
- .long 0
- .byte 0,12,4,1,0x80,4,3,0
- .size .poly1305_emit_fpu,.-.poly1305_emit_fpu
- ___
- }
- # Ugly hack here, because PPC assembler syntax seems to vary too
- # much from platform to platform...
- #
- # LPICmeup returns, in $len, the address of the constant table that
- # follows it. "bcl 20,31,$+4" places the address of the next instruction
- # in LR; that instruction sits 8 bytes after the LPICmeup label, and the
- # .space directive pads the code plus its trailer to 64 bytes, so adding
- # 64-8 yields the first .quad. The caller's LR is carried through r0.
- #
- # Table layout in 8-byte entries: 0-5 are the bias constants and 5/2^130
- # that poly1305_init_fpu copies into the context, 6-12 are the constants
- # it uses to split values into hi/lo parts, and 13 is the FPSCR image.
- $code.=<<___;
- .align 6
- LPICmeup:
- mflr r0
- bcl 20,31,\$+4
- mflr $len # vvvvvv "distance" between . and 1st data entry
- addi $len,$len,`64-8` # borrow $len
- mtlr r0
- blr
- .long 0
- .byte 0,12,0x14,0,0,0,0,0
- .space `64-9*4`
- .quad 0x4330000000000000 # 2^(52+0)
- .quad 0x4530000000000000 # 2^(52+32)
- .quad 0x4730000000000000 # 2^(52+64)
- .quad 0x4930000000000000 # 2^(52+96)
- .quad 0x4b50000000000000 # 2^(52+130)
- .quad 0x37f4000000000000 # 5/2^130
- .quad 0x4430000000000000 # 2^(52+16+0)
- .quad 0x4630000000000000 # 2^(52+16+32)
- .quad 0x4830000000000000 # 2^(52+16+64)
- .quad 0x4a30000000000000 # 2^(52+16+96)
- .quad 0x3e30000000000000 # 2^(52+16+0-96)
- .quad 0x4030000000000000 # 2^(52+16+32-96)
- .quad 0x4230000000000000 # 2^(52+16+64-96)
- .quad 0x0000000000000001 # fpscr: truncate, no exceptions
- .asciz "Poly1305 for PPC FPU, CRYPTOGAMS by <appro\@openssl.org>"
- .align 4
- ___
- # Replace every `expr` in the generated text with the value Perl computes
- # for expr (offsets such as `$FRAME-8*18` were interpolated to plain
- # arithmetic when the heredocs were built), then send the result down the
- # translator pipe.
- $code =~ s/\`([^\`]*)\`/eval $1/gem;
- print $code;
- # Closing the pipe waits for the translator; report a failed close.
- unless (close STDOUT) {
-     die "error closing STDOUT: $!";
- }
|