#! /usr/bin/env perl # Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # This module implements Poly1305 hash for PowerPC. # # June 2015 # # Numbers are cycles per processed byte with poly1305_blocks alone, # and improvement coefficients relative to gcc-generated code. # # -m32 -m64 # # Freescale e300 14.8/+80% - # PPC74x0 7.60/+60% - # PPC970 7.00/+114% 3.51/+205% # POWER7 3.75/+260% 1.93/+100% # POWER8 - 2.03/+200% # POWER9 - 2.00/+150% # # Do we need floating-point implementation for PPC? Results presented # in poly1305_ieee754.c are tricky to compare to, because they are for # compiler-generated code. On the other hand it's known that floating- # point performance can be dominated by FPU latency, which means that # there is limit even for ideally optimized (and even vectorized) code. # And this limit is estimated to be higher than above -m64 results. Or # in other words floating-point implementation can be meaningful to # consider only in 32-bit application context. We probably have to # recognize that 32-bit builds are getting less popular on high-end # systems and therefore tend to target embedded ones, which might not # even have FPU... # # On side note, Power ISA 2.07 enables vector base 2^26 implementation, # and POWER8 might have capacity to break 1.0 cycle per byte barrier... 
# Usage: perl poly1305-ppc.pl <flavour> [<output>]
#
# <flavour> selects the ABI: it must contain "64" or "32", and a trailing
# "le" selects little-endian code paths.  <output> is handed through to
# ppc-xlate.pl unchanged.  The generated assembly is accumulated in $code
# and written to STDOUT, which is re-opened as a pipe into ppc-xlate.pl.

$flavour = shift;

# Pick word size, link-register save offset and the width-specific
# mnemonics used for frame set-up in the templates below.
if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$UCMP	="cmplw";
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }

# Define endianness based on flavour
# i.e.: linux64le
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

# Locate ppc-xlate.pl either next to this script or in ../../perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Take the optional output argument up front and use low-precedence "or":
# with "||" the die was attached to the (always true) command string, so
# a failed pipe open was silently ignored.
$output = shift;
$output = "" unless defined $output;
open STDOUT,"| $^X $xlate $flavour $output"
    or die "can't call $xlate: $!";

$FRAME=24*$SIZE_T;

$sp="r1";
my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
my ($mac,$nonce)=($inp,$len);	# poly1305_emit reuses the 2nd/3rd argument registers
my $mask = "r0";

$code=<<___;
.machine	"any"
.text
___
							if ($flavour =~ /64/) {
###############################################################################
# base 2^64 implementation

my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(7..12,27..31));

$code.=<<___;
.globl	.poly1305_init_int
.align	4
.poly1305_init_int:
	xor	r0,r0,r0
	std	r0,0($ctx)		# zero hash value
	std	r0,8($ctx)
	std	r0,16($ctx)

	$UCMP	$inp,r0
	beq-	Lno_key
___
$code.=<<___	if ($LITTLE_ENDIAN);
	ld	$d0,0($inp)		# load key material
	ld	$d1,8($inp)
___
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$h0,4
	lwbrx	$d0,0,$inp		# load key material
	li	$d1,8
	lwbrx	$h0,$h0,$inp
	li	$h1,12
	lwbrx	$d1,$d1,$inp
	lwbrx	$h1,$h1,$inp
	insrdi	$d0,$h0,32,0
	insrdi	$d1,$h1,32,0
___
$code.=<<___;
	lis	$h1,0xfff		# 0x0fff0000
	ori	$h1,$h1,0xfffc		# 0x0ffffffc
	insrdi	$h1,$h1,32,0		# 0x0ffffffc0ffffffc
	ori	$h0,$h1,3		# 0x0ffffffc0fffffff

	and	$d0,$d0,$h0
	and	$d1,$d1,$h1

	std	$d0,32($ctx)		# store key
	std	$d1,40($ctx)

Lno_key:
	xor	r3,r3,r3
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
.size	.poly1305_init_int,.-.poly1305_init_int

.globl	.poly1305_blocks
.align	4
.poly1305_blocks:
	srdi.	$len,$len,4
	beq-	Labort

	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	ld	$r0,32($ctx)		# load key
	ld	$r1,40($ctx)

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	srdi	$s1,$r1,2
	mtctr	$len
	add	$s1,$s1,$r1		# s1 = r1 + r1>>2
	li	$mask,3
	b	Loop

.align	4
Loop:
___
$code.=<<___	if ($LITTLE_ENDIAN);
	ld	$t0,0($inp)		# load input
	ld	$t1,8($inp)
___
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$d0,4
	lwbrx	$t0,0,$inp		# load input
	li	$t1,8
	lwbrx	$d0,$d0,$inp
	li	$d1,12
	lwbrx	$t1,$t1,$inp
	lwbrx	$d1,$d1,$inp
	insrdi	$t0,$d0,32,0
	insrdi	$t1,$d1,32,0
___
$code.=<<___;
	addi	$inp,$inp,16

	addc	$h0,$h0,$t0		# accumulate input
	adde	$h1,$h1,$t1

	mulld	$d0,$h0,$r0		# h0*r0
	mulhdu	$d1,$h0,$r0
	adde	$h2,$h2,$padbit

	mulld	$t0,$h1,$s1		# h1*5*r1
	mulhdu	$t1,$h1,$s1
	addc	$d0,$d0,$t0
	adde	$d1,$d1,$t1

	mulld	$t0,$h0,$r1		# h0*r1
	mulhdu	$d2,$h0,$r1
	addc	$d1,$d1,$t0
	addze	$d2,$d2

	mulld	$t0,$h1,$r0		# h1*r0
	mulhdu	$t1,$h1,$r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	mulld	$t0,$h2,$s1		# h2*5*r1
	mulld	$t1,$h2,$r0		# h2*r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	andc	$t0,$d2,$mask		# final reduction step
	and	$h2,$d2,$mask
	srdi	$t1,$t0,2
	add	$t0,$t0,$t1
	addc	$h0,$d0,$t0
	addze	$h1,$d1
	addze	$h2,$h2

	bdnz	Loop

	std	$h0,0($ctx)		# store hash value
	std	$h1,8($ctx)
	std	$h2,16($ctx)

	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
Labort:
	blr
	.long	0
	.byte	0,12,4,1,0x80,5,4,0
.size	.poly1305_blocks,.-.poly1305_blocks

.globl	.poly1305_emit
.align	4
.poly1305_emit:
	ld	$h0,0($ctx)		# load hash
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)
	ld	$padbit,0($nonce)	# load nonce
	ld	$nonce,8($nonce)

	addic	$d0,$h0,5		# compare to modulus
	addze	$d1,$h1
	addze	$d2,$h2

	srdi	$mask,$d2,2		# did it carry/borrow?
	neg	$mask,$mask

	andc	$h0,$h0,$mask
	and	$d0,$d0,$mask
	andc	$h1,$h1,$mask
	and	$d1,$d1,$mask
	or	$h0,$h0,$d0
	or	$h1,$h1,$d1
___
$code.=<<___	if (!$LITTLE_ENDIAN);
	rotldi	$padbit,$padbit,32	# flip nonce words
	rotldi	$nonce,$nonce,32
___
$code.=<<___;
	addc	$h0,$h0,$padbit		# accumulate nonce
	adde	$h1,$h1,$nonce
___
$code.=<<___	if ($LITTLE_ENDIAN);
	std	$h0,0($mac)		# write result
	std	$h1,8($mac)
___
$code.=<<___	if (!$LITTLE_ENDIAN);
	extrdi	r0,$h0,32,0
	li	$d0,4
	stwbrx	$h0,0,$mac		# write result
	extrdi	$h0,$h1,32,0
	li	$d1,8
	stwbrx	r0,$d0,$mac
	li	$d2,12
	stwbrx	$h1,$d1,$mac
	stwbrx	$h0,$d2,$mac
___
$code.=<<___;
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
.size	.poly1305_emit,.-.poly1305_emit
___
							} else {
###############################################################################
# base 2^32 implementation

my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $s1,$s2,$s3,
    $t0,$t1,$t2,$t3, $D0,$D1,$D2,$D3, $d0,$d1,$d2,$d3
   ) = map("r$_",(7..12,14..31));

$code.=<<___;
.globl	.poly1305_init_int
.align	4
.poly1305_init_int:
	xor	r0,r0,r0
	stw	r0,0($ctx)		# zero hash value
	stw	r0,4($ctx)
	stw	r0,8($ctx)
	stw	r0,12($ctx)
	stw	r0,16($ctx)

	$UCMP	$inp,r0
	beq-	Lno_key
___
# The little-endian key load uses "lwz", matching every other 32-bit load
# in this file; the "lw" previously emitted here is not a PowerPC mnemonic.
$code.=<<___	if ($LITTLE_ENDIAN);
	lwz	$h0,0($inp)		# load key material
	lwz	$h1,4($inp)
	lwz	$h2,8($inp)
	lwz	$h3,12($inp)
___
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$h1,4
	lwbrx	$h0,0,$inp		# load key material
	li	$h2,8
	lwbrx	$h1,$h1,$inp
	li	$h3,12
	lwbrx	$h2,$h2,$inp
	lwbrx	$h3,$h3,$inp
___
$code.=<<___;
	lis	$mask,0xf000		# 0xf0000000
	li	$r0,-4
	andc	$r0,$r0,$mask		# 0x0ffffffc

	andc	$h0,$h0,$mask
	and	$h1,$h1,$r0
	and	$h2,$h2,$r0
	and	$h3,$h3,$r0

	stw	$h0,32($ctx)		# store key
	stw	$h1,36($ctx)
	stw	$h2,40($ctx)
	stw	$h3,44($ctx)

Lno_key:
	xor	r3,r3,r3
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
.size	.poly1305_init_int,.-.poly1305_init_int

.globl	.poly1305_blocks
.align	4
.poly1305_blocks:
	srwi.	$len,$len,4
	beq-	Labort

	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	lwz	$r0,32($ctx)		# load key
	lwz	$r1,36($ctx)
	lwz	$r2,40($ctx)
	lwz	$r3,44($ctx)

	lwz	$h0,0($ctx)		# load hash value
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)

	srwi	$s1,$r1,2
	srwi	$s2,$r2,2
	srwi	$s3,$r3,2
	add	$s1,$s1,$r1		# si = ri + ri>>2
	add	$s2,$s2,$r2
	add	$s3,$s3,$r3
	mtctr	$len
	li	$mask,3
	b	Loop

.align	4
Loop:
___
$code.=<<___	if ($LITTLE_ENDIAN);
	lwz	$d0,0($inp)		# load input
	lwz	$d1,4($inp)
	lwz	$d2,8($inp)
	lwz	$d3,12($inp)
___
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$d1,4
	lwbrx	$d0,0,$inp		# load input
	li	$d2,8
	lwbrx	$d1,$d1,$inp
	li	$d3,12
	lwbrx	$d2,$d2,$inp
	lwbrx	$d3,$d3,$inp
___
$code.=<<___;
	addi	$inp,$inp,16

	addc	$h0,$h0,$d0		# accumulate input
	adde	$h1,$h1,$d1
	adde	$h2,$h2,$d2

	mullw	$d0,$h0,$r0		# h0*r0
	mulhwu	$D0,$h0,$r0

	mullw	$d1,$h0,$r1		# h0*r1
	mulhwu	$D1,$h0,$r1

	mullw	$d2,$h0,$r2		# h0*r2
	mulhwu	$D2,$h0,$r2

	adde	$h3,$h3,$d3
	adde	$h4,$h4,$padbit

	mullw	$d3,$h0,$r3		# h0*r3
	mulhwu	$D3,$h0,$r3

	mullw	$t0,$h1,$s3		# h1*s3
	mulhwu	$t1,$h1,$s3

	mullw	$t2,$h1,$r0		# h1*r0
	mulhwu	$t3,$h1,$r0
	addc	$d0,$d0,$t0
	adde	$D0,$D0,$t1

	mullw	$t0,$h1,$r1		# h1*r1
	mulhwu	$t1,$h1,$r1
	addc	$d1,$d1,$t2
	adde	$D1,$D1,$t3

	mullw	$t2,$h1,$r2		# h1*r2
	mulhwu	$t3,$h1,$r2
	addc	$d2,$d2,$t0
	adde	$D2,$D2,$t1

	mullw	$t0,$h2,$s2		# h2*s2
	mulhwu	$t1,$h2,$s2
	addc	$d3,$d3,$t2
	adde	$D3,$D3,$t3

	mullw	$t2,$h2,$s3		# h2*s3
	mulhwu	$t3,$h2,$s3
	addc	$d0,$d0,$t0
	adde	$D0,$D0,$t1

	mullw	$t0,$h2,$r0		# h2*r0
	mulhwu	$t1,$h2,$r0
	addc	$d1,$d1,$t2
	adde	$D1,$D1,$t3

	mullw	$t2,$h2,$r1		# h2*r1
	mulhwu	$t3,$h2,$r1
	addc	$d2,$d2,$t0
	adde	$D2,$D2,$t1

	mullw	$t0,$h3,$s1		# h3*s1
	mulhwu	$t1,$h3,$s1
	addc	$d3,$d3,$t2
	adde	$D3,$D3,$t3

	mullw	$t2,$h3,$s2		# h3*s2
	mulhwu	$t3,$h3,$s2
	addc	$d0,$d0,$t0
	adde	$D0,$D0,$t1

	mullw	$t0,$h3,$s3		# h3*s3
	mulhwu	$t1,$h3,$s3
	addc	$d1,$d1,$t2
	adde	$D1,$D1,$t3

	mullw	$t2,$h3,$r0		# h3*r0
	mulhwu	$t3,$h3,$r0
	addc	$d2,$d2,$t0
	adde	$D2,$D2,$t1

	mullw	$t0,$h4,$s1		# h4*s1
	addc	$d3,$d3,$t2
	adde	$D3,$D3,$t3
	addc	$d1,$d1,$t0

	mullw	$t1,$h4,$s2		# h4*s2
	addze	$D1,$D1
	addc	$d2,$d2,$t1
	addze	$D2,$D2

	mullw	$t2,$h4,$s3		# h4*s3
	addc	$d3,$d3,$t2
	addze	$D3,$D3

	mullw	$h4,$h4,$r0		# h4*r0

	addc	$h1,$d1,$D0
	adde	$h2,$d2,$D1
	adde	$h3,$d3,$D2
	adde	$h4,$h4,$D3

	andc	$D0,$h4,$mask		# final reduction step
	and	$h4,$h4,$mask
	srwi	$D1,$D0,2
	add	$D0,$D0,$D1
	addc	$h0,$d0,$D0
	addze	$h1,$h1
	addze	$h2,$h2
	addze	$h3,$h3
	addze	$h4,$h4

	bdnz	Loop

	stw	$h0,0($ctx)		# store hash value
	stw	$h1,4($ctx)
	stw	$h2,8($ctx)
	stw	$h3,12($ctx)
	stw	$h4,16($ctx)

	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
Labort:
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,4,0
.size	.poly1305_blocks,.-.poly1305_blocks

.globl	.poly1305_emit
.align	4
.poly1305_emit:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	lwz	$h0,0($ctx)		# load hash
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)

	addic	$d0,$h0,5		# compare to modulus
	addze	$d1,$h1
	addze	$d2,$h2
	addze	$d3,$h3
	addze	$mask,$h4

	srwi	$mask,$mask,2		# did it carry/borrow?
	neg	$mask,$mask

	andc	$h0,$h0,$mask
	and	$d0,$d0,$mask
	andc	$h1,$h1,$mask
	and	$d1,$d1,$mask
	or	$h0,$h0,$d0
	lwz	$d0,0($nonce)		# load nonce
	andc	$h2,$h2,$mask
	and	$d2,$d2,$mask
	or	$h1,$h1,$d1
	lwz	$d1,4($nonce)
	andc	$h3,$h3,$mask
	and	$d3,$d3,$mask
	or	$h2,$h2,$d2
	lwz	$d2,8($nonce)
	or	$h3,$h3,$d3
	lwz	$d3,12($nonce)

	addc	$h0,$h0,$d0		# accumulate nonce
	adde	$h1,$h1,$d1
	adde	$h2,$h2,$d2
	adde	$h3,$h3,$d3
___
$code.=<<___	if ($LITTLE_ENDIAN);
	stw	$h0,0($mac)		# write result
	stw	$h1,4($mac)
	stw	$h2,8($mac)
	stw	$h3,12($mac)
___
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$d1,4
	stwbrx	$h0,0,$mac		# write result
	li	$d2,8
	stwbrx	$h1,$d1,$mac
	li	$d3,12
	stwbrx	$h2,$d2,$mac
	stwbrx	$h3,$d3,$mac
___
$code.=<<___;
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,4,3,0
.size	.poly1305_emit,.-.poly1305_emit
___
							}
$code.=<<___;
.asciz	"Poly1305 for PPC, CRYPTOGAMS by "
___

# Evaluate the `...` arithmetic placeholders (frame offsets) left in the
# templates, then emit the result through the translator pipe.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;

# Closing a pipe waits for the child; checking it surfaces translator
# failures and buffered write errors instead of silently truncating output.
close STDOUT or die "error closing STDOUT: $!";