- #! /usr/bin/env perl
- # Copyright 2022-2024 The OpenSSL Project Authors. All Rights Reserved.
- #
- # Licensed under the Apache License 2.0 (the "License"). You may not use
- # this file except in compliance with the License. You can obtain a copy
- # in the file LICENSE in the source distribution or at
- # https://www.openssl.org/source/license.html
- #
- # This module implements SM4 with ASIMD and AESE on AARCH64
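- # The SM4 S-box is evaluated through the AES S-box: MaskV first permutes
- # the bytes to cancel the ShiftRows step folded into AESE, TAHMat/TALMat
- # apply an input affine map, AESE with an all-zero round key then performs
- # pure SubBytes, and ATAHMat/ATALMat apply the output affine map, i.e.
- #   SM4_SBOX(x) = ATA( AES_SBOX( TA(x) ) )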
- #
- # Dec 2022
- #
- # $output is the last argument if it looks like a file (it has an extension)
- # $flavour is the first argument if it doesn't look like a file
- $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
- $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
- open OUT,"| \"$^X\" $xlate $flavour \"$output\""
- or die "can't call $xlate: $!";
- *STDOUT=*OUT;
- $prefix="vpsm4_ex";
- my @vtmp=map("v$_",(0..3));
- my @qtmp=map("q$_",(0..3));
- my @data=map("v$_",(4..7));
- my @datax=map("v$_",(8..11));
- my ($rk0,$rk1)=("v12","v13");
- my ($rka,$rkb)=("v14","v15");
- my @vtmpx=map("v$_",(12..15));
- my ($vtmp4,$vtmp5)=("v24","v25");
- my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31");
- my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31");
- my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
- my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
- my ($xtmp1,$xtmp2)=("x8","x9");
- my ($ptr,$counter)=("x10","w11");
- my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
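- # rev32: byte-swap each 32-bit lane so data loaded little-endian becomes
- # big-endian words; on __AARCH64EB__ builds only a mov (or nothing) is
- # emitted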
- sub rev32() {
- my $dst = shift;
- my $src = shift;
- if ($src and ("$src" ne "$dst")) {
- $code.=<<___;
- #ifndef __AARCH64EB__
- rev32 $dst.16b,$src.16b
- #else
- mov $dst.16b,$src.16b
- #endif
- ___
- } else {
- $code.=<<___;
- #ifndef __AARCH64EB__
- rev32 $dst.16b,$dst.16b
- #endif
- ___
- }
- }
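- # rev32_armeb: the big-endian-only counterpart of rev32; swaps 32-bit
- # lanes only when __AARCH64EB__ is defined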
- sub rev32_armeb() {
- my $dst = shift;
- my $src = shift;
- if ($src and ("$src" ne "$dst")) {
- $code.=<<___;
- #ifdef __AARCH64EB__
- rev32 $dst.16b,$src.16b
- #else
- mov $dst.16b,$src.16b
- #endif
- ___
- } else {
- $code.=<<___;
- #ifdef __AARCH64EB__
- rev32 $dst.16b,$dst.16b
- #endif
- ___
- }
- }
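- # rbit: reverse the bits within each byte, emitted only for the
- # GB-standard XTS flavour ($std eq "_gb"), which processes tweaks
- # bit-reversed; otherwise a plain mov (or nothing)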
- sub rbit() {
- my $dst = shift;
- my $src = shift;
- my $std = shift;
- if ($src and ("$src" ne "$dst")) {
- if ($std eq "_gb") {
- $code.=<<___;
- rbit $dst.16b,$src.16b
- ___
- } else {
- $code.=<<___;
- mov $dst.16b,$src.16b
- ___
- }
- } else {
- if ($std eq "_gb") {
- $code.=<<___;
- rbit $dst.16b,$src.16b
- ___
- }
- }
- }
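- # transpose: 4x4 transpose of 32-bit words across four vectors via
- # zip1/zip2, switching between block-per-register and word-sliced layouts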
- sub transpose() {
- my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
- $code.=<<___;
- zip1 $vt0.4s,$dat0.4s,$dat1.4s
- zip2 $vt1.4s,$dat0.4s,$dat1.4s
- zip1 $vt2.4s,$dat2.4s,$dat3.4s
- zip2 $vt3.4s,$dat2.4s,$dat3.4s
- zip1 $dat0.2d,$vt0.2d,$vt2.2d
- zip2 $dat1.2d,$vt0.2d,$vt2.2d
- zip1 $dat2.2d,$vt1.2d,$vt3.2d
- zip2 $dat3.2d,$vt1.2d,$vt3.2d
- ___
- }
- # mul_matrix: GF(2) matrix multiplication Mat*x, done nibble-wise with tbl:
- #   Mat*x = (higherMat * (x >> 4)) ^ (lowerMat * (x & 0x0f))
- sub mul_matrix() {
- my $x = shift;
- my $higherMat = shift;
- my $lowerMat = shift;
- my $tmp = shift;
- $code.=<<___;
- ushr $tmp.16b, $x.16b, 4
- and $x.16b, $x.16b, $ANDMaskV.16b
- tbl $x.16b, {$lowerMat.16b}, $x.16b
- tbl $tmp.16b, {$higherMat.16b}, $tmp.16b
- eor $x.16b, $x.16b, $tmp.16b
- ___
- }
- # sbox operation for 4 lanes of words
- sub sbox() {
- my $dat = shift;
- $code.=<<___;
- // optimize sbox using AESE instruction
- tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b
- ___
- &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
- $code.=<<___;
- eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
- aese @vtmp[0].16b,@vtmp[1].16b
- ___
- &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4);
- $code.=<<___;
- mov $dat.16b,@vtmp[0].16b
- // linear transformation
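- // L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24);
- // each ushr(32-n)+sli(n) pair below forms one 32-bit rotate-left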
- ushr @vtmp[0].4s,$dat.4s,32-2
- ushr @vtmp[1].4s,$dat.4s,32-10
- ushr @vtmp[2].4s,$dat.4s,32-18
- ushr @vtmp[3].4s,$dat.4s,32-24
- sli @vtmp[0].4s,$dat.4s,2
- sli @vtmp[1].4s,$dat.4s,10
- sli @vtmp[2].4s,$dat.4s,18
- sli @vtmp[3].4s,$dat.4s,24
- eor $vtmp4.16b,@vtmp[0].16b,$dat.16b
- eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
- eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b
- eor $dat.16b,$dat.16b,$vtmp4.16b
- ___
- }
- # sbox operation for 8 lanes of words
- sub sbox_double() {
- my $dat = shift;
- my $datx = shift;
- $code.=<<___;
- // optimize sbox using AESE instruction
- tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b
- tbl @vtmp[1].16b, {$datx.16b}, $MaskV.16b
- ___
- &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
- &mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4);
- $code.=<<___;
- eor $vtmp5.16b, $vtmp5.16b, $vtmp5.16b
- aese @vtmp[0].16b,$vtmp5.16b
- aese @vtmp[1].16b,$vtmp5.16b
- ___
- &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4);
- &mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4);
- $code.=<<___;
- mov $dat.16b,@vtmp[0].16b
- mov $datx.16b,@vtmp[1].16b
- // linear transformation
- ushr @vtmp[0].4s,$dat.4s,32-2
- ushr $vtmp5.4s,$datx.4s,32-2
- ushr @vtmp[1].4s,$dat.4s,32-10
- ushr @vtmp[2].4s,$dat.4s,32-18
- ushr @vtmp[3].4s,$dat.4s,32-24
- sli @vtmp[0].4s,$dat.4s,2
- sli $vtmp5.4s,$datx.4s,2
- sli @vtmp[1].4s,$dat.4s,10
- sli @vtmp[2].4s,$dat.4s,18
- sli @vtmp[3].4s,$dat.4s,24
- eor $vtmp4.16b,@vtmp[0].16b,$dat.16b
- eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
- eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b
- eor $dat.16b,$dat.16b,$vtmp4.16b
- ushr @vtmp[1].4s,$datx.4s,32-10
- ushr @vtmp[2].4s,$datx.4s,32-18
- ushr @vtmp[3].4s,$datx.4s,32-24
- sli @vtmp[1].4s,$datx.4s,10
- sli @vtmp[2].4s,$datx.4s,18
- sli @vtmp[3].4s,$datx.4s,24
- eor $vtmp4.16b,$vtmp5.16b,$datx.16b
- eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
- eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
- eor $datx.16b,$datx.16b,$vtmp4.16b
- ___
- }
- # sbox operation for a single word
- sub sbox_1word () {
- my $word = shift;
- $code.=<<___;
- mov @vtmp[3].s[0],$word
- // optimize sbox using AESE instruction
- tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b
- ___
- &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
- $code.=<<___;
- eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
- aese @vtmp[0].16b,@vtmp[1].16b
- ___
- &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
- $code.=<<___;
- mov $wtmp0,@vtmp[0].s[0]
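- // same L transform in scalar form: ror #32-n is a rotate-left by n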
- eor $word,$wtmp0,$wtmp0,ror #32-2
- eor $word,$word,$wtmp0,ror #32-10
- eor $word,$word,$wtmp0,ror #32-18
- eor $word,$word,$wtmp0,ror #32-24
- ___
- }
- # sm4 for one block of data, in scalar registers word0/word1/word2/word3
- sub sm4_1blk () {
- my $kptr = shift;
- $code.=<<___;
- ldp $wtmp0,$wtmp1,[$kptr],8
- // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
- eor $tmpw,$word2,$word3
- eor $wtmp2,$wtmp0,$word1
- eor $tmpw,$tmpw,$wtmp2
- ___
- &sbox_1word($tmpw);
- $code.=<<___;
- eor $word0,$word0,$tmpw
- // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
- eor $tmpw,$word2,$word3
- eor $wtmp2,$word0,$wtmp1
- eor $tmpw,$tmpw,$wtmp2
- ___
- &sbox_1word($tmpw);
- $code.=<<___;
- ldp $wtmp0,$wtmp1,[$kptr],8
- eor $word1,$word1,$tmpw
- // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
- eor $tmpw,$word0,$word1
- eor $wtmp2,$wtmp0,$word3
- eor $tmpw,$tmpw,$wtmp2
- ___
- &sbox_1word($tmpw);
- $code.=<<___;
- eor $word2,$word2,$tmpw
- // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
- eor $tmpw,$word0,$word1
- eor $wtmp2,$word2,$wtmp1
- eor $tmpw,$tmpw,$wtmp2
- ___
- &sbox_1word($tmpw);
- $code.=<<___;
- eor $word3,$word3,$tmpw
- ___
- }
- # sm4 for 4 lanes of data, in neon registers data0/data1/data2/data3
- sub sm4_4blks () {
- my $kptr = shift;
- $code.=<<___;
- ldp $wtmp0,$wtmp1,[$kptr],8
- dup $rk0.4s,$wtmp0
- dup $rk1.4s,$wtmp1
- // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
- eor $rka.16b,@data[2].16b,@data[3].16b
- eor $rk0.16b,@data[1].16b,$rk0.16b
- eor $rk0.16b,$rka.16b,$rk0.16b
- ___
- &sbox($rk0);
- $code.=<<___;
- eor @data[0].16b,@data[0].16b,$rk0.16b
- // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
- eor $rka.16b,$rka.16b,@data[0].16b
- eor $rk1.16b,$rka.16b,$rk1.16b
- ___
- &sbox($rk1);
- $code.=<<___;
- ldp $wtmp0,$wtmp1,[$kptr],8
- eor @data[1].16b,@data[1].16b,$rk1.16b
- dup $rk0.4s,$wtmp0
- dup $rk1.4s,$wtmp1
- // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
- eor $rka.16b,@data[0].16b,@data[1].16b
- eor $rk0.16b,@data[3].16b,$rk0.16b
- eor $rk0.16b,$rka.16b,$rk0.16b
- ___
- &sbox($rk0);
- $code.=<<___;
- eor @data[2].16b,@data[2].16b,$rk0.16b
- // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
- eor $rka.16b,$rka.16b,@data[2].16b
- eor $rk1.16b,$rka.16b,$rk1.16b
- ___
- &sbox($rk1);
- $code.=<<___;
- eor @data[3].16b,@data[3].16b,$rk1.16b
- ___
- }
- # sm4 for 8 lanes of data, in neon registers
- # data0/data1/data2/data3 datax0/datax1/datax2/datax3
- sub sm4_8blks () {
- my $kptr = shift;
- $code.=<<___;
- ldp $wtmp0,$wtmp1,[$kptr],8
- // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
- dup $rk0.4s,$wtmp0
- eor $rka.16b,@data[2].16b,@data[3].16b
- eor $rkb.16b,@datax[2].16b,@datax[3].16b
- eor @vtmp[0].16b,@data[1].16b,$rk0.16b
- eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
- eor $rk0.16b,$rka.16b,@vtmp[0].16b
- eor $rk1.16b,$rkb.16b,@vtmp[1].16b
- ___
- &sbox_double($rk0,$rk1);
- $code.=<<___;
- eor @data[0].16b,@data[0].16b,$rk0.16b
- eor @datax[0].16b,@datax[0].16b,$rk1.16b
- // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
- dup $rk1.4s,$wtmp1
- eor $rka.16b,$rka.16b,@data[0].16b
- eor $rkb.16b,$rkb.16b,@datax[0].16b
- eor $rk0.16b,$rka.16b,$rk1.16b
- eor $rk1.16b,$rkb.16b,$rk1.16b
- ___
- &sbox_double($rk0,$rk1);
- $code.=<<___;
- ldp $wtmp0,$wtmp1,[$kptr],8
- eor @data[1].16b,@data[1].16b,$rk0.16b
- eor @datax[1].16b,@datax[1].16b,$rk1.16b
- // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
- dup $rk0.4s,$wtmp0
- eor $rka.16b,@data[0].16b,@data[1].16b
- eor $rkb.16b,@datax[0].16b,@datax[1].16b
- eor @vtmp[0].16b,@data[3].16b,$rk0.16b
- eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
- eor $rk0.16b,$rka.16b,@vtmp[0].16b
- eor $rk1.16b,$rkb.16b,@vtmp[1].16b
- ___
- &sbox_double($rk0,$rk1);
- $code.=<<___;
- eor @data[2].16b,@data[2].16b,$rk0.16b
- eor @datax[2].16b,@datax[2].16b,$rk1.16b
- // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
- dup $rk1.4s,$wtmp1
- eor $rka.16b,$rka.16b,@data[2].16b
- eor $rkb.16b,$rkb.16b,@datax[2].16b
- eor $rk0.16b,$rka.16b,$rk1.16b
- eor $rk1.16b,$rkb.16b,$rk1.16b
- ___
- &sbox_double($rk0,$rk1);
- $code.=<<___;
- eor @data[3].16b,@data[3].16b,$rk0.16b
- eor @datax[3].16b,@datax[3].16b,$rk1.16b
- ___
- }
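- # encrypt_1blk_norev: all 32 rounds (8 iterations of 4) on one block in
- # word0..word3; the trailing mov sequence applies the final reverse
- # R(A0,A1,A2,A3)=(A3,A2,A1,A0) required by the SM4 specification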
- sub encrypt_1blk_norev() {
- my $dat = shift;
- $code.=<<___;
- mov $ptr,$rks
- mov $counter,#8
- mov $word0,$dat.s[0]
- mov $word1,$dat.s[1]
- mov $word2,$dat.s[2]
- mov $word3,$dat.s[3]
- 10:
- ___
- &sm4_1blk($ptr);
- $code.=<<___;
- subs $counter,$counter,#1
- b.ne 10b
- mov $dat.s[0],$word3
- mov $dat.s[1],$word2
- mov $dat.s[2],$word1
- mov $dat.s[3],$word0
- ___
- }
- sub encrypt_1blk() {
- my $dat = shift;
- &encrypt_1blk_norev($dat);
- &rev32($dat,$dat);
- }
- sub encrypt_4blks() {
- $code.=<<___;
- mov $ptr,$rks
- mov $counter,#8
- 10:
- ___
- &sm4_4blks($ptr);
- $code.=<<___;
- subs $counter,$counter,#1
- b.ne 10b
- ___
- &rev32(@vtmp[3],@data[0]);
- &rev32(@vtmp[2],@data[1]);
- &rev32(@vtmp[1],@data[2]);
- &rev32(@vtmp[0],@data[3]);
- }
- sub encrypt_8blks() {
- $code.=<<___;
- mov $ptr,$rks
- mov $counter,#8
- 10:
- ___
- &sm4_8blks($ptr);
- $code.=<<___;
- subs $counter,$counter,#1
- b.ne 10b
- ___
- &rev32(@vtmp[3],@data[0]);
- &rev32(@vtmp[2],@data[1]);
- &rev32(@vtmp[1],@data[2]);
- &rev32(@vtmp[0],@data[3]);
- &rev32(@data[3],@datax[0]);
- &rev32(@data[2],@datax[1]);
- &rev32(@data[1],@datax[2]);
- &rev32(@data[0],@datax[3]);
- }
- sub load_sbox () {
- my $data = shift;
- $code.=<<___;
- ldr $MaskQ, .Lsbox_magic
- ldr $TAHMatQ, .Lsbox_magic+16
- ldr $TALMatQ, .Lsbox_magic+32
- ldr $ATAHMatQ, .Lsbox_magic+48
- ldr $ATALMatQ, .Lsbox_magic+64
- ldr $ANDMaskQ, .Lsbox_magic+80
- ___
- }
- sub mov_reg_to_vec() {
- my $src0 = shift;
- my $src1 = shift;
- my $desv = shift;
- $code.=<<___;
- mov $desv.d[0],$src0
- mov $desv.d[1],$src1
- ___
- &rev32_armeb($desv,$desv);
- }
- sub mov_vec_to_reg() {
- my $srcv = shift;
- my $des0 = shift;
- my $des1 = shift;
- $code.=<<___;
- mov $des0,$srcv.d[0]
- mov $des1,$srcv.d[1]
- ___
- }
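- # compute_tweak: multiply the 128-bit tweak (held in two GPRs) by x in
- # GF(2^128) mod x^128 + x^7 + x^2 + x + 1; the 0x87 constant is folded
- # in when the shifted-out top bit is set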
- sub compute_tweak() {
- my $src0 = shift;
- my $src1 = shift;
- my $des0 = shift;
- my $des1 = shift;
- $code.=<<___;
- mov $wtmp0,0x87
- extr $xtmp2,$src1,$src1,#32
- extr $des1,$src1,$src0,#63
- and $wtmp1,$wtmp0,$wtmp2,asr#31
- eor $des0,$xtmp1,$src0,lsl#1
- ___
- }
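- # compute_tweak_vec: the same doubling on a NEON register: shl doubles
- # every byte, ext+ushr gathers each byte's carry bit, and the multiply
- # by .Lxts_magic propagates carries (by 0x01) and reduces the top one
- # (by 0x87)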
- sub compute_tweak_vec() {
- my $src = shift;
- my $des = shift;
- my $std = shift;
- &rbit(@vtmp[2],$src,$std);
- $code.=<<___;
- ldr @qtmp[0], .Lxts_magic
- shl $des.16b, @vtmp[2].16b, #1
- ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
- ushr @vtmp[1].16b, @vtmp[1].16b, #7
- mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
- eor $des.16b, $des.16b, @vtmp[1].16b
- ___
- &rbit($des,$des,$std);
- }
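- # Constants: .Lck holds the 32 SM4 CK round constants and .Lfk the FK
- # family key; .Lsbox_magic packs MaskV, the four affine matrices and the
- # nibble mask for the AESE-based S-box; .Lxts_magic is the GF(2^128)
- # reduction multiplier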
- $code=<<___;
- #include "arm_arch.h"
- .arch armv8-a+crypto
- .text
- .type _${prefix}_consts,%object
- .align 7
- _${prefix}_consts:
- .Lck:
- .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
- .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
- .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
- .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
- .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
- .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
- .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
- .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
- .Lfk:
- .quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
- .Lshuffles:
- .quad 0x0B0A090807060504,0x030201000F0E0D0C
- .Lxts_magic:
- .quad 0x0101010101010187,0x0101010101010101
- .Lsbox_magic:
- .quad 0x0b0e0104070a0d00,0x0306090c0f020508
- .quad 0x62185a2042387a00,0x22581a6002783a40
- .quad 0x15df62a89e54e923,0xc10bb67c4a803df7
- .quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead
- .quad 0x6404462679195b3b,0xe383c1a1fe9edcbc
- .quad 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f
- .size _${prefix}_consts,.-_${prefix}_consts
- ___
- {{{
- my ($key,$keys,$enc)=("x0","x1","w2");
- my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
- my ($vkey,$vfk,$vmap)=("v5","v6","v7");
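- # _set_key: expand the 128-bit user key into 32 round keys; encryption
- # keys are stored forwards, decryption keys ($enc == 0) backwards from
- # offset 124 so decryption can walk the schedule in reverse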
- $code.=<<___;
- .type _${prefix}_set_key,%function
- .align 4
- _${prefix}_set_key:
- AARCH64_VALID_CALL_TARGET
- ld1 {$vkey.4s},[$key]
- ___
- &load_sbox();
- &rev32($vkey,$vkey);
- $code.=<<___;
- adr $pointer,.Lshuffles
- ld1 {$vmap.2d},[$pointer]
- adr $pointer,.Lfk
- ld1 {$vfk.2d},[$pointer]
- eor $vkey.16b,$vkey.16b,$vfk.16b
- mov $schedules,#32
- adr $pointer,.Lck
- movi @vtmp[0].16b,#64
- cbnz $enc,1f
- add $keys,$keys,124
- 1:
- mov $wtmp,$vkey.s[1]
- ldr $roundkey,[$pointer],#4
- eor $roundkey,$roundkey,$wtmp
- mov $wtmp,$vkey.s[2]
- eor $roundkey,$roundkey,$wtmp
- mov $wtmp,$vkey.s[3]
- eor $roundkey,$roundkey,$wtmp
- // optimize sbox using AESE instruction
- mov @data[0].s[0],$roundkey
- tbl @vtmp[0].16b, {@data[0].16b}, $MaskV.16b
- ___
- &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
- $code.=<<___;
- eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
- aese @vtmp[0].16b,@vtmp[1].16b
- ___
- &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
- $code.=<<___;
- mov $wtmp,@vtmp[0].s[0]
- eor $roundkey,$wtmp,$wtmp,ror #19
- eor $roundkey,$roundkey,$wtmp,ror #9
- mov $wtmp,$vkey.s[0]
- eor $roundkey,$roundkey,$wtmp
- mov $vkey.s[0],$roundkey
- cbz $enc,2f
- str $roundkey,[$keys],#4
- b 3f
- 2:
- str $roundkey,[$keys],#-4
- 3:
- tbl $vkey.16b,{$vkey.16b},$vmap.16b
- subs $schedules,$schedules,#1
- b.ne 1b
- ret
- .size _${prefix}_set_key,.-_${prefix}_set_key
- ___
- }}}
- {{{
- $code.=<<___;
- .type _${prefix}_enc_4blks,%function
- .align 4
- _${prefix}_enc_4blks:
- AARCH64_VALID_CALL_TARGET
- ___
- &encrypt_4blks();
- $code.=<<___;
- ret
- .size _${prefix}_enc_4blks,.-_${prefix}_enc_4blks
- ___
- }}}
- {{{
- $code.=<<___;
- .type _${prefix}_enc_8blks,%function
- .align 4
- _${prefix}_enc_8blks:
- AARCH64_VALID_CALL_TARGET
- ___
- &encrypt_8blks();
- $code.=<<___;
- ret
- .size _${prefix}_enc_8blks,.-_${prefix}_enc_8blks
- ___
- }}}
- {{{
- my ($key,$keys)=("x0","x1");
- $code.=<<___;
- .globl ${prefix}_set_encrypt_key
- .type ${prefix}_set_encrypt_key,%function
- .align 5
- ${prefix}_set_encrypt_key:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-16]!
- mov w2,1
- bl _${prefix}_set_key
- ldp x29,x30,[sp],#16
- AARCH64_VALIDATE_LINK_REGISTER
- ret
- .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
- ___
- }}}
- {{{
- my ($key,$keys)=("x0","x1");
- $code.=<<___;
- .globl ${prefix}_set_decrypt_key
- .type ${prefix}_set_decrypt_key,%function
- .align 5
- ${prefix}_set_decrypt_key:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-16]!
- mov w2,0
- bl _${prefix}_set_key
- ldp x29,x30,[sp],#16
- AARCH64_VALIDATE_LINK_REGISTER
- ret
- .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
- ___
- }}}
- {{{
- sub gen_block () {
- my $dir = shift;
- my ($inp,$outp,$rk)=map("x$_",(0..2));
- $code.=<<___;
- .globl ${prefix}_${dir}crypt
- .type ${prefix}_${dir}crypt,%function
- .align 5
- ${prefix}_${dir}crypt:
- AARCH64_VALID_CALL_TARGET
- ld1 {@data[0].4s},[$inp]
- ___
- &load_sbox();
- &rev32(@data[0],@data[0]);
- $code.=<<___;
- mov $rks,$rk
- ___
- &encrypt_1blk(@data[0]);
- $code.=<<___;
- st1 {@data[0].4s},[$outp]
- ret
- .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
- ___
- }
- &gen_block("en");
- &gen_block("de");
- }}}
- {{{
- $code.=<<___;
- .globl ${prefix}_ecb_encrypt
- .type ${prefix}_ecb_encrypt,%function
- .align 5
- ${prefix}_ecb_encrypt:
- AARCH64_SIGN_LINK_REGISTER
- // convert length into blocks
- lsr x2,x2,4
- stp d8,d9,[sp,#-80]!
- stp d10,d11,[sp,#16]
- stp d12,d13,[sp,#32]
- stp d14,d15,[sp,#48]
- stp x29,x30,[sp,#64]
- ___
- &load_sbox();
- $code.=<<___;
- .Lecb_8_blocks_process:
- cmp $blocks,#8
- b.lt .Lecb_4_blocks_process
- ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
- ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
- ___
- &rev32(@data[0],@data[0]);
- &rev32(@data[1],@data[1]);
- &rev32(@data[2],@data[2]);
- &rev32(@data[3],@data[3]);
- &rev32(@datax[0],@datax[0]);
- &rev32(@datax[1],@datax[1]);
- &rev32(@datax[2],@datax[2]);
- &rev32(@datax[3],@datax[3]);
- $code.=<<___;
- bl _${prefix}_enc_8blks
- st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
- st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
- subs $blocks,$blocks,#8
- b.gt .Lecb_8_blocks_process
- b 100f
- .Lecb_4_blocks_process:
- cmp $blocks,#4
- b.lt 1f
- ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
- ___
- &rev32(@data[0],@data[0]);
- &rev32(@data[1],@data[1]);
- &rev32(@data[2],@data[2]);
- &rev32(@data[3],@data[3]);
- $code.=<<___;
- bl _${prefix}_enc_4blks
- st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
- sub $blocks,$blocks,#4
- 1:
- // process last block
- cmp $blocks,#1
- b.lt 100f
- b.gt 1f
- ld1 {@data[0].4s},[$inp]
- ___
- &rev32(@data[0],@data[0]);
- &encrypt_1blk(@data[0]);
- $code.=<<___;
- st1 {@data[0].4s},[$outp]
- b 100f
- 1: // process last 2 blocks
- ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
- ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
- cmp $blocks,#2
- b.gt 1f
- ___
- &rev32(@data[0],@data[0]);
- &rev32(@data[1],@data[1]);
- &rev32(@data[2],@data[2]);
- &rev32(@data[3],@data[3]);
- $code.=<<___;
- bl _${prefix}_enc_4blks
- st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
- st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
- b 100f
- 1: // process last 3 blocks
- ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
- ___
- &rev32(@data[0],@data[0]);
- &rev32(@data[1],@data[1]);
- &rev32(@data[2],@data[2]);
- &rev32(@data[3],@data[3]);
- $code.=<<___;
- bl _${prefix}_enc_4blks
- st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
- st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
- st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
- 100:
- ldp d10,d11,[sp,#16]
- ldp d12,d13,[sp,#32]
- ldp d14,d15,[sp,#48]
- ldp x29,x30,[sp,#64]
- ldp d8,d9,[sp],#80
- AARCH64_VALIDATE_LINK_REGISTER
- ret
- .size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
- ___
- }}}
- {{{
- my ($len,$ivp,$enc)=("x2","x4","w5");
- my $ivec0=("v3");
- my $ivec1=("v15");
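- # CBC encryption chains each block through the previous ciphertext and
- # so runs one block at a time; decryption has no such dependency and is
- # processed 8 (or 4) blocks in parallel below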
- $code.=<<___;
- .globl ${prefix}_cbc_encrypt
- .type ${prefix}_cbc_encrypt,%function
- .align 5
- ${prefix}_cbc_encrypt:
- AARCH64_VALID_CALL_TARGET
- lsr $len,$len,4
- ___
- &load_sbox();
- $code.=<<___;
- cbz $enc,.Ldec
- ld1 {$ivec0.4s},[$ivp]
- .Lcbc_4_blocks_enc:
- cmp $blocks,#4
- b.lt 1f
- ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
- eor @data[0].16b,@data[0].16b,$ivec0.16b
- ___
- &rev32(@data[1],@data[1]);
- &rev32(@data[0],@data[0]);
- &rev32(@data[2],@data[2]);
- &rev32(@data[3],@data[3]);
- &encrypt_1blk_norev(@data[0]);
- $code.=<<___;
- eor @data[1].16b,@data[1].16b,@data[0].16b
- ___
- &encrypt_1blk_norev(@data[1]);
- &rev32(@data[0],@data[0]);
- $code.=<<___;
- eor @data[2].16b,@data[2].16b,@data[1].16b
- ___
- &encrypt_1blk_norev(@data[2]);
- &rev32(@data[1],@data[1]);
- $code.=<<___;
- eor @data[3].16b,@data[3].16b,@data[2].16b
- ___
- &encrypt_1blk_norev(@data[3]);
- &rev32(@data[2],@data[2]);
- &rev32(@data[3],@data[3]);
- $code.=<<___;
- orr $ivec0.16b,@data[3].16b,@data[3].16b
- st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
- subs $blocks,$blocks,#4
- b.ne .Lcbc_4_blocks_enc
- b 2f
- 1:
- subs $blocks,$blocks,#1
- b.lt 2f
- ld1 {@data[0].4s},[$inp],#16
- eor $ivec0.16b,$ivec0.16b,@data[0].16b
- ___
- &rev32($ivec0,$ivec0);
- &encrypt_1blk($ivec0);
- $code.=<<___;
- st1 {$ivec0.4s},[$outp],#16
- b 1b
- 2:
- // save back IV
- st1 {$ivec0.4s},[$ivp]
- ret
- .Ldec:
- // decryption mode starts
- AARCH64_SIGN_LINK_REGISTER
- stp d8,d9,[sp,#-80]!
- stp d10,d11,[sp,#16]
- stp d12,d13,[sp,#32]
- stp d14,d15,[sp,#48]
- stp x29,x30,[sp,#64]
- .Lcbc_8_blocks_dec:
- cmp $blocks,#8
- b.lt 1f
- ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
- add $ptr,$inp,#64
- ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
- ___
- &rev32(@data[0],@data[0]);
- &rev32(@data[1],@data[1]);
- &rev32(@data[2],@data[2]);
- &rev32(@data[3],@data[3]);
- &rev32(@datax[0],@datax[0]);
- &rev32(@datax[1],@datax[1]);
- &rev32(@datax[2],@datax[2]);
- &rev32(@datax[3],@datax[3]);
- $code.=<<___;
- bl _${prefix}_enc_8blks
- ___
- &transpose(@vtmp,@datax);
- &transpose(@data,@datax);
- $code.=<<___;
- ld1 {$ivec1.4s},[$ivp]
- ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
- // note: ivec1 and vtmpx[3] reuse the same register, so care must
- // be taken to avoid a conflict
- eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
- ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
- eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
- eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
- eor @vtmp[3].16b,@vtmp[3].16b,@datax[2].16b
- // save back IV
- st1 {@vtmpx[3].4s}, [$ivp]
- eor @data[0].16b,@data[0].16b,@datax[3].16b
- eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
- eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
- eor @data[3].16b,@data[3].16b,@vtmpx[2].16b
- st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
- st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
- subs $blocks,$blocks,#8
- b.gt .Lcbc_8_blocks_dec
- b.eq 100f
- 1:
- ld1 {$ivec1.4s},[$ivp]
- .Lcbc_4_blocks_dec:
- cmp $blocks,#4
- b.lt 1f
- ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
- ___
- &rev32(@data[0],@data[0]);
- &rev32(@data[1],@data[1]);
- &rev32(@data[2],@data[2]);
- &rev32(@data[3],@data[3]);
- $code.=<<___;
- bl _${prefix}_enc_4blks
- ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
- ___
- &transpose(@vtmp,@datax);
- $code.=<<___;
- eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
- eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
- orr $ivec1.16b,@data[3].16b,@data[3].16b
- eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
- eor @vtmp[3].16b,@vtmp[3].16b,@data[2].16b
- st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
- subs $blocks,$blocks,#4
- b.gt .Lcbc_4_blocks_dec
- // save back IV
- st1 {@data[3].4s}, [$ivp]
- b 100f
- 1: // last block
- subs $blocks,$blocks,#1
- b.lt 100f
- b.gt 1f
- ld1 {@data[0].4s},[$inp],#16
- // save back IV
- st1 {@data[0].4s}, [$ivp]
- ___
- &rev32(@datax[0],@data[0]);
- &encrypt_1blk(@datax[0]);
- $code.=<<___;
- eor @datax[0].16b,@datax[0].16b,$ivec1.16b
- st1 {@datax[0].4s},[$outp],#16
- b 100f
- 1: // last two blocks
- ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
- add $ptr,$inp,#16
- ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
- subs $blocks,$blocks,1
- b.gt 1f
- ___
- &rev32(@data[0],@data[0]);
- &rev32(@data[1],@data[1]);
- &rev32(@data[2],@data[2]);
- &rev32(@data[3],@data[3]);
- $code.=<<___;
- bl _${prefix}_enc_4blks
- ld1 {@data[0].4s,@data[1].4s},[$inp],#32
- ___
- &transpose(@vtmp,@datax);
- $code.=<<___;
- eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
- eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
- st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
- // save back IV
- st1 {@data[1].4s}, [$ivp]
- b 100f
- 1: // last 3 blocks
- ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
- ___
- &rev32(@data[0],@data[0]);
- &rev32(@data[1],@data[1]);
- &rev32(@data[2],@data[2]);
- &rev32(@data[3],@data[3]);
- $code.=<<___;
- bl _${prefix}_enc_4blks
- ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
- ___
- &transpose(@vtmp,@datax);
- $code.=<<___;
- eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
- eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
- eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
- st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
- // save back IV
- st1 {@data[2].4s}, [$ivp]
- 100:
- ldp d10,d11,[sp,#16]
- ldp d12,d13,[sp,#32]
- ldp d14,d15,[sp,#48]
- ldp x29,x30,[sp,#64]
- ldp d8,d9,[sp],#80
- AARCH64_VALIDATE_LINK_REGISTER
- ret
- .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
- ___
- }}}
- {{{
- my ($ivp)=("x4");
- my ($ctr)=("w5");
- my $ivec=("v3");
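- # Per the ctr32 convention only the low 32-bit counter word (IV word 3,
- # kept in $ctr) is incremented; IV words 0-2 stay fixed in word0..word2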
- $code.=<<___;
- .globl ${prefix}_ctr32_encrypt_blocks
- .type ${prefix}_ctr32_encrypt_blocks,%function
- .align 5
- ${prefix}_ctr32_encrypt_blocks:
- AARCH64_VALID_CALL_TARGET
- ld1 {$ivec.4s},[$ivp]
- ___
- &rev32($ivec,$ivec);
- &load_sbox();
- $code.=<<___;
- cmp $blocks,#1
- b.ne 1f
- // fast path for a single block, avoiding the
- // context-saving overhead
- ___
- &encrypt_1blk($ivec);
- $code.=<<___;
- ld1 {@data[0].4s},[$inp]
- eor @data[0].16b,@data[0].16b,$ivec.16b
- st1 {@data[0].4s},[$outp]
- ret
- 1:
- AARCH64_SIGN_LINK_REGISTER
- stp d8,d9,[sp,#-80]!
- stp d10,d11,[sp,#16]
- stp d12,d13,[sp,#32]
- stp d14,d15,[sp,#48]
- stp x29,x30,[sp,#64]
- mov $word0,$ivec.s[0]
- mov $word1,$ivec.s[1]
- mov $word2,$ivec.s[2]
- mov $ctr,$ivec.s[3]
- .Lctr32_4_blocks_process:
- cmp $blocks,#4
- b.lt 1f
- dup @data[0].4s,$word0
- dup @data[1].4s,$word1
- dup @data[2].4s,$word2
- mov @data[3].s[0],$ctr
- add $ctr,$ctr,#1
- mov @data[3].s[1],$ctr
- add $ctr,$ctr,#1
- mov @data[3].s[2],$ctr
- add $ctr,$ctr,#1
- mov @data[3].s[3],$ctr
- add $ctr,$ctr,#1
- cmp $blocks,#8
- b.ge .Lctr32_8_blocks_process
- bl _${prefix}_enc_4blks
- ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
- eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
- eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
- eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
- eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
- st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
- subs $blocks,$blocks,#4
- b.ne .Lctr32_4_blocks_process
- b 100f
- .Lctr32_8_blocks_process:
- dup @datax[0].4s,$word0
- dup @datax[1].4s,$word1
- dup @datax[2].4s,$word2
- mov @datax[3].s[0],$ctr
- add $ctr,$ctr,#1
- mov @datax[3].s[1],$ctr
- add $ctr,$ctr,#1
- mov @datax[3].s[2],$ctr
- add $ctr,$ctr,#1
- mov @datax[3].s[3],$ctr
- add $ctr,$ctr,#1
- bl _${prefix}_enc_8blks
- ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
- ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
- eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
- eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
- eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
- eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
- eor @data[0].16b,@data[0].16b,@datax[0].16b
- eor @data[1].16b,@data[1].16b,@datax[1].16b
- eor @data[2].16b,@data[2].16b,@datax[2].16b
- eor @data[3].16b,@data[3].16b,@datax[3].16b
- st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
- st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
- subs $blocks,$blocks,#8
- b.ne .Lctr32_4_blocks_process
- b 100f
- 1: // last block processing
- subs $blocks,$blocks,#1
- b.lt 100f
- b.gt 1f
- mov $ivec.s[0],$word0
- mov $ivec.s[1],$word1
- mov $ivec.s[2],$word2
- mov $ivec.s[3],$ctr
- ___
- &encrypt_1blk($ivec);
- $code.=<<___;
- ld1 {@data[0].4s},[$inp]
- eor @data[0].16b,@data[0].16b,$ivec.16b
- st1 {@data[0].4s},[$outp]
- b 100f
- 1: // last 2 blocks processing
- dup @data[0].4s,$word0
- dup @data[1].4s,$word1
- dup @data[2].4s,$word2
- mov @data[3].s[0],$ctr
- add $ctr,$ctr,#1
- mov @data[3].s[1],$ctr
- subs $blocks,$blocks,#1
- b.ne 1f
- bl _${prefix}_enc_4blks
- ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
- ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
- eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
- eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
- eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
- eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
- st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
- st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
- b 100f
- 1: // last 3 blocks processing
- add $ctr,$ctr,#1
- mov @data[3].s[2],$ctr
- bl _${prefix}_enc_4blks
- ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
- ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
- ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
- eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
- eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
- eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
- eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
- st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
- st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
- st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
- 100:
- ldp d10,d11,[sp,#16]
- ldp d12,d13,[sp,#32]
- ldp d14,d15,[sp,#48]
- ldp x29,x30,[sp,#64]
- ldp d8,d9,[sp],#80
- AARCH64_VALIDATE_LINK_REGISTER
- ret
- .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
- ___
- }}}
- {{{
- my ($blocks,$len)=("x2","x2");
- my $ivp=("x5");
- my @twx=map("x$_",(12..27));
- my ($rks1,$rks2)=("x26","x27");
- my $lastBlk=("x26");
- my $enc=("w28");
- my $remain=("x29");
- my @tweak=map("v$_",(16..23));
- my $lastTweak=("v25");
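- # XTS: the IV is first encrypted with the tweak key (rks2) to form
- # tweak 0; successive tweaks are doubled in GF(2^128), kept in GPR
- # pairs (@twx) for the bulk loop.  The "_gb" flavour (GB-standard XTS)
- # processes tweaks bit-reversed via rbit.  A trailing partial block is
- # handled with ciphertext stealing in the .loop byte swap below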
- sub gen_xts_cipher() {
- my $std = shift;
- $code.=<<___;
- .globl ${prefix}_xts_encrypt${std}
- .type ${prefix}_xts_encrypt${std},%function
- .align 5
- ${prefix}_xts_encrypt${std}:
- AARCH64_SIGN_LINK_REGISTER
- stp x15, x16, [sp, #-0x10]!
- stp x17, x18, [sp, #-0x10]!
- stp x19, x20, [sp, #-0x10]!
- stp x21, x22, [sp, #-0x10]!
- stp x23, x24, [sp, #-0x10]!
- stp x25, x26, [sp, #-0x10]!
- stp x27, x28, [sp, #-0x10]!
- stp x29, x30, [sp, #-0x10]!
- stp d8, d9, [sp, #-0x10]!
- stp d10, d11, [sp, #-0x10]!
- stp d12, d13, [sp, #-0x10]!
- stp d14, d15, [sp, #-0x10]!
- mov $rks1,x3
- mov $rks2,x4
- mov $enc,w6
- ld1 {@tweak[0].4s}, [$ivp]
- mov $rks,$rks2
- ___
- &load_sbox();
- &rev32(@tweak[0],@tweak[0]);
- &encrypt_1blk(@tweak[0]);
- $code.=<<___;
- mov $rks,$rks1
- and $remain,$len,#0x0F
- // convert length into blocks
- lsr $blocks,$len,4
- cmp $blocks,#1
- b.lt .return${std}
- cmp $remain,0
- // If the encryption/decryption length is a multiple of 16,
- // all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
- b.eq .xts_encrypt_blocks${std}
- // If the encryption/decryption length is not a multiple of 16,
- // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
- // and the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
- subs $blocks,$blocks,#1
- b.eq .only_2blks_tweak${std}
- .xts_encrypt_blocks${std}:
- ___
- &rbit(@tweak[0],@tweak[0],$std);
- &rev32_armeb(@tweak[0],@tweak[0]);
- &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
- &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
- &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
- &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
- &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
- &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
- &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
- &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
- $code.=<<___;
- .Lxts_8_blocks_process${std}:
- cmp $blocks,#8
- ___
- &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
- &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
- &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
- &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
- &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
- &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
- &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
- &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
- &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]);
- &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
- &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]);
- &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
- &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]);
- &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
- &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]);
- &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
- $code.=<<___;
- b.lt .Lxts_4_blocks_process${std}
- ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
- ___
- &rbit(@tweak[0],@tweak[0],$std);
- &rbit(@tweak[1],@tweak[1],$std);
- &rbit(@tweak[2],@tweak[2],$std);
- &rbit(@tweak[3],@tweak[3],$std);
- $code.=<<___;
- eor @data[0].16b, @data[0].16b, @tweak[0].16b
- eor @data[1].16b, @data[1].16b, @tweak[1].16b
- eor @data[2].16b, @data[2].16b, @tweak[2].16b
- eor @data[3].16b, @data[3].16b, @tweak[3].16b
- ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
- ___
- &rbit(@tweak[4],@tweak[4],$std);
- &rbit(@tweak[5],@tweak[5],$std);
- &rbit(@tweak[6],@tweak[6],$std);
- &rbit(@tweak[7],@tweak[7],$std);
- $code.=<<___;
- eor @datax[0].16b, @datax[0].16b, @tweak[4].16b
- eor @datax[1].16b, @datax[1].16b, @tweak[5].16b
- eor @datax[2].16b, @datax[2].16b, @tweak[6].16b
- eor @datax[3].16b, @datax[3].16b, @tweak[7].16b
- ___
- &rev32(@data[0],@data[0]);
- &rev32(@data[1],@data[1]);
- &rev32(@data[2],@data[2]);
- &rev32(@data[3],@data[3]);
- &rev32(@datax[0],@datax[0]);
- &rev32(@datax[1],@datax[1]);
- &rev32(@datax[2],@datax[2]);
- &rev32(@datax[3],@datax[3]);
- &transpose(@data,@vtmp);
- &transpose(@datax,@vtmp);
- $code.=<<___;
- bl _${prefix}_enc_8blks
- ___
- &transpose(@vtmp,@datax);
- &transpose(@data,@datax);
- $code.=<<___;
- eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
- eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
- eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
- eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
- eor @data[0].16b, @data[0].16b, @tweak[4].16b
- eor @data[1].16b, @data[1].16b, @tweak[5].16b
- eor @data[2].16b, @data[2].16b, @tweak[6].16b
- eor @data[3].16b, @data[3].16b, @tweak[7].16b
- // save the last tweak
- mov $lastTweak.16b,@tweak[7].16b
- st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
- st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
- subs $blocks,$blocks,#8
- b.gt .Lxts_8_blocks_process${std}
- b 100f
- .Lxts_4_blocks_process${std}:
- cmp $blocks,#4
- b.lt 1f
- ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
- ___
- &rbit(@tweak[0],@tweak[0],$std);
- &rbit(@tweak[1],@tweak[1],$std);
- &rbit(@tweak[2],@tweak[2],$std);
- &rbit(@tweak[3],@tweak[3],$std);
- $code.=<<___;
- eor @data[0].16b, @data[0].16b, @tweak[0].16b
- eor @data[1].16b, @data[1].16b, @tweak[1].16b
- eor @data[2].16b, @data[2].16b, @tweak[2].16b
- eor @data[3].16b, @data[3].16b, @tweak[3].16b
- ___
- &rev32(@data[0],@data[0]);
- &rev32(@data[1],@data[1]);
- &rev32(@data[2],@data[2]);
- &rev32(@data[3],@data[3]);
- &transpose(@data,@vtmp);
- $code.=<<___;
- bl _${prefix}_enc_4blks
- ___
- &transpose(@vtmp,@data);
- $code.=<<___;
- eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
- eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
- eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
- eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
- st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
- sub $blocks,$blocks,#4
- mov @tweak[0].16b,@tweak[4].16b
- mov @tweak[1].16b,@tweak[5].16b
- mov @tweak[2].16b,@tweak[6].16b
- // save the last tweak
- mov $lastTweak.16b,@tweak[3].16b
- 1:
- // process last block
- cmp $blocks,#1
- b.lt 100f
- b.gt 1f
- ld1 {@data[0].4s},[$inp],#16
- ___
- &rbit(@tweak[0],@tweak[0],$std);
- $code.=<<___;
- eor @data[0].16b, @data[0].16b, @tweak[0].16b
- ___
- &rev32(@data[0],@data[0]);
- &encrypt_1blk(@data[0]);
- $code.=<<___;
- eor @data[0].16b, @data[0].16b, @tweak[0].16b
- st1 {@data[0].4s},[$outp],#16
- // save the last tweak
- mov $lastTweak.16b,@tweak[0].16b
- b 100f
- 1: // process last 2 blocks
- cmp $blocks,#2
- b.gt 1f
- ld1 {@data[0].4s,@data[1].4s},[$inp],#32
- ___
- &rbit(@tweak[0],@tweak[0],$std);
- &rbit(@tweak[1],@tweak[1],$std);
- $code.=<<___;
- eor @data[0].16b, @data[0].16b, @tweak[0].16b
- eor @data[1].16b, @data[1].16b, @tweak[1].16b
- ___
- &rev32(@data[0],@data[0]);
- &rev32(@data[1],@data[1]);
- &transpose(@data,@vtmp);
- $code.=<<___;
- bl _${prefix}_enc_4blks
- ___
- &transpose(@vtmp,@data);
- $code.=<<___;
- eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
- eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
- st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
- // save the last tweak
- mov $lastTweak.16b,@tweak[1].16b
- b 100f
- 1: // process last 3 blocks
- ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
- ___
- &rbit(@tweak[0],@tweak[0],$std);
- &rbit(@tweak[1],@tweak[1],$std);
- &rbit(@tweak[2],@tweak[2],$std);
- $code.=<<___;
- eor @data[0].16b, @data[0].16b, @tweak[0].16b
- eor @data[1].16b, @data[1].16b, @tweak[1].16b
- eor @data[2].16b, @data[2].16b, @tweak[2].16b
- ___
- &rev32(@data[0],@data[0]);
- &rev32(@data[1],@data[1]);
- &rev32(@data[2],@data[2]);
- &transpose(@data,@vtmp);
- $code.=<<___;
- bl _${prefix}_enc_4blks
- ___
- &transpose(@vtmp,@data);
- $code.=<<___;
- eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
- eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
- eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
- st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
- // save the last tweak
- mov $lastTweak.16b,@tweak[2].16b
- 100:
- cmp $remain,0
- b.eq .return${std}
- // This branch calculates the last two tweaks, used when the
- // encryption/decryption length is larger than 32
- .last_2blks_tweak${std}:
- ___
- &rev32_armeb($lastTweak,$lastTweak);
- &compute_tweak_vec($lastTweak,@tweak[1],$std);
- &compute_tweak_vec(@tweak[1],@tweak[2],$std);
- $code.=<<___;
- b .check_dec${std}
- // This branch calculates the last two tweaks, used when the
- // encryption/decryption length is exactly 32 and only two tweaks
- // are needed
- .only_2blks_tweak${std}:
- mov @tweak[1].16b,@tweak[0].16b
- ___
- &rev32_armeb(@tweak[1],@tweak[1]);
- &compute_tweak_vec(@tweak[1],@tweak[2],$std);
- $code.=<<___;
- b .check_dec${std}
- // Determine whether encryption or decryption is required.
- // The last two tweaks need to be swapped for decryption.
- .check_dec${std}:
- // encryption:1 decryption:0
- cmp $enc,1
- b.eq .process_last_2blks${std}
- mov @vtmp[0].16b,@tweak[1].16b
- mov @tweak[1].16b,@tweak[2].16b
- mov @tweak[2].16b,@vtmp[0].16b
- .process_last_2blks${std}:
- ___
- &rev32_armeb(@tweak[1],@tweak[1]);
- &rev32_armeb(@tweak[2],@tweak[2]);
- $code.=<<___;
- ld1 {@data[0].4s},[$inp],#16
- eor @data[0].16b, @data[0].16b, @tweak[1].16b
- ___
- &rev32(@data[0],@data[0]);
- &encrypt_1blk(@data[0]);
- $code.=<<___;
- eor @data[0].16b, @data[0].16b, @tweak[1].16b
- st1 {@data[0].4s},[$outp],#16
- sub $lastBlk,$outp,16
- .loop${std}:
- subs $remain,$remain,1
- ldrb $wtmp0,[$lastBlk,$remain]
- ldrb $wtmp1,[$inp,$remain]
- strb $wtmp1,[$lastBlk,$remain]
- strb $wtmp0,[$outp,$remain]
- b.gt .loop${std}
- ld1 {@data[0].4s}, [$lastBlk]
- eor @data[0].16b, @data[0].16b, @tweak[2].16b
- ___
- &rev32(@data[0],@data[0]);
- &encrypt_1blk(@data[0]);
- $code.=<<___;
- eor @data[0].16b, @data[0].16b, @tweak[2].16b
- st1 {@data[0].4s}, [$lastBlk]
- .return${std}:
- ldp d14, d15, [sp], #0x10
- ldp d12, d13, [sp], #0x10
- ldp d10, d11, [sp], #0x10
- ldp d8, d9, [sp], #0x10
- ldp x29, x30, [sp], #0x10
- ldp x27, x28, [sp], #0x10
- ldp x25, x26, [sp], #0x10
- ldp x23, x24, [sp], #0x10
- ldp x21, x22, [sp], #0x10
- ldp x19, x20, [sp], #0x10
- ldp x17, x18, [sp], #0x10
- ldp x15, x16, [sp], #0x10
- AARCH64_VALIDATE_LINK_REGISTER
- ret
- .size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
- ___
- } # end of gen_xts_cipher
- &gen_xts_cipher("_gb");
- &gen_xts_cipher("");
- }}}
- ########################################
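- # Copy the leading '#' comment block of this script (the license header)
- # to the output with '#' rewritten to '//' so it remains a valid comment
- # in the generated assembly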
- open SELF,$0;
- while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/\/\// and !/^$/);
- print;
- }
- close SELF;
- foreach(split("\n",$code)) {
- s/\`([^\`]*)\`/eval($1)/ge;
- print $_,"\n";
- }
- close STDOUT or die "error closing STDOUT: $!";