vpaes-ppc.pl

#!/usr/bin/env perl

######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
######################################################################

# CBC encrypt/decrypt performance in cycles per byte processed with
# 128-bit key.
#
#		aes-ppc.pl		this
# G4e		35.5/52.1/(23.8)	11.9(*)/15.4
# POWER6	42.7/54.3/(28.2)	63.0/92.8(**)
# POWER7	32.3/42.9/(18.4)	18.5/23.3
#
# (*)	This is ~10% worse than reported in the paper. The reason is
#	twofold. First, this module doesn't make any assumptions about
#	key schedule (or data, for that matter) alignment and handles
#	it in-line. Secondly, being transliterated from vpaes-x86_64.pl,
#	it relies on "nested inversion", which is better suited for
#	Intel CPUs.
# (**)	Inadequate POWER6 performance is due to astronomic AltiVec
#	latency, 9 cycles per simple logical operation.
$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
	$UCMP	="cmpld";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
} else { die "nonsense $flavour"; }

$sp="r1";
$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
$code.=<<___;
.machine	"any"

.text

.align	7	# totally strategic alignment
_vpaes_consts:
Lk_mc_forward:	# mc_forward
	.long	0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c	?inv
	.long	0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300	?inv
	.long	0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704	?inv
	.long	0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08	?inv
Lk_mc_backward:	# mc_backward
	.long	0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e	?inv
	.long	0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a	?inv
	.long	0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506	?inv
	.long	0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102	?inv
Lk_sr:		# sr
	.long	0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f	?inv
	.long	0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b	?inv
	.long	0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07	?inv
	.long	0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603	?inv

##
## "Hot" constants
##
Lk_inv:		# inv, inva
	.long	0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704	?rev
	.long	0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03	?rev
Lk_ipt:		# input transform (lo, hi)
	.long	0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca	?rev
	.long	0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd	?rev
Lk_sbo:		# sbou, sbot
	.long	0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15	?rev
	.long	0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e	?rev
Lk_sb1:		# sb1u, sb1t
	.long	0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b	?rev
	.long	0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5	?rev
Lk_sb2:		# sb2u, sb2t
	.long	0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2	?rev
	.long	0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e	?rev

##
## Decryption stuff
##
Lk_dipt:	# decryption input transform
	.long	0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15	?rev
	.long	0x00650560, 0xe683e386, 0x94f191f4, 0x72177712	?rev
Lk_dsbo:	# decryption sbox final output
	.long	0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7	?rev
	.long	0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca	?rev
Lk_dsb9:	# decryption sbox output *9*u, *9*t
	.long	0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca	?rev
	.long	0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72	?rev
Lk_dsbd:	# decryption sbox output *D*u, *D*t
	.long	0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5	?rev
	.long	0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129	?rev
Lk_dsbb:	# decryption sbox output *B*u, *B*t
	.long	0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660	?rev
	.long	0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3	?rev
Lk_dsbe:	# decryption sbox output *E*u, *E*t
	.long	0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222	?rev
	.long	0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794	?rev

##
## Key schedule constants
##
Lk_dksd:	# decryption key schedule: invskew x*D
	.long	0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007	?rev
	.long	0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f	?rev
Lk_dksb:	# decryption key schedule: invskew x*B
	.long	0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603	?rev
	.long	0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9	?rev
Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
	.long	0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553	?rev
	.long	0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd	?rev
Lk_dks9:	# decryption key schedule: invskew x*9
	.long	0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a	?rev
	.long	0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b	?rev

Lk_rcon:	# rcon
	.long	0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70	?asis
Lk_s63:
	.long	0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b	?asis

Lk_opt:		# output transform
	.long	0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7	?rev
	.long	0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1	?rev
Lk_deskew:	# deskew tables: inverts the sbox's "skew"
	.long	0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d	?rev
	.long	0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128	?rev

.align	5
Lconsts:
	mflr	r0
	bcl	20,31,\$+4
	mflr	r12	#vvvvv "distance between . and _vpaes_consts
	addi	r12,r12,-0x308
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.asciz	"Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"
.align	6
___
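
# For reference (a sketch, not used by the generator): Lconsts above leaves
# the address of _vpaes_consts in r12, and the *_preheat routines below pull
# each table in with lvx at a fixed displacement from that base.  Assuming
# 16 bytes per .long row, the displacements work out roughly as follows.
my %vpaes_const_offset = (
	Lk_mc_forward => 0x000,	Lk_mc_backward => 0x040, Lk_sr   => 0x080,
	Lk_inv  => 0x0c0,	Lk_ipt  => 0x0e0,	Lk_sbo  => 0x100,
	Lk_sb1  => 0x120,	Lk_sb2  => 0x140,	Lk_dipt => 0x160,
	Lk_dsbo => 0x180,	Lk_dsb9 => 0x1a0,	Lk_dsbd => 0x1c0,
	Lk_dsbb => 0x1e0,	Lk_dsbe => 0x200,	Lk_dksd => 0x220,
	Lk_dksb => 0x240,	Lk_dkse => 0x260,	Lk_dks9 => 0x280,
	Lk_rcon => 0x2a0,	Lk_s63  => 0x2b0,	Lk_opt  => 0x2c0,
	Lk_deskew => 0x2e0,
);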
my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));
{
my ($inp,$out,$key) = map("r$_",(3..5));

my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));

$code.=<<___;
##
## _aes_preheat
##
## Fills register %r10 -> .aes_consts (so you can -fPIC)
## and %xmm9-%xmm15 as specified below.
##
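## (In this PowerPC port the equivalent is: Lconsts leaves the address of
## _vpaes_consts in r12, v7 is kept all-zero, v8 and v9 hold the 0x04.. and
## 0x0f.. nibble constants, and the lookup tables land in v10-v19, as the
## lvx offsets below show.  The %r10/%xmm9-%xmm15 wording is carried over
## from the x86_64 original.)
##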
.align 4
_vpaes_encrypt_preheat:
	mflr r8
	bl Lconsts
	mtlr r8
	li r11, 0xc0 # Lk_inv
	li r10, 0xd0
	li r9, 0xe0 # Lk_ipt
	li r8, 0xf0
	vxor v7, v7, v7 # 0x00..00
	vspltisb v8,4 # 0x04..04
	vspltisb v9,0x0f # 0x0f..0f
	lvx $invlo, r12, r11
	li r11, 0x100
	lvx $invhi, r12, r10
	li r10, 0x110
	lvx $iptlo, r12, r9
	li r9, 0x120
	lvx $ipthi, r12, r8
	li r8, 0x130
	lvx $sbou, r12, r11
	li r11, 0x140
	lvx $sbot, r12, r10
	li r10, 0x150
	lvx $sb1u, r12, r9
	lvx $sb1t, r12, r8
	lvx $sb2u, r12, r11
	lvx $sb2t, r12, r10
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,0,0

##
## _aes_encrypt_core
##
## AES-encrypt %xmm0.
##
## Inputs:
##  %xmm0 = input
##  %xmm9-%xmm15 as in _vpaes_preheat
##  (%rdx) = scheduled keys
##
## Output in %xmm0
## Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax
##
##
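## (PowerPC mapping, for orientation: the x86 comments on each line quote the
## vpaes-x86_64.pl original.  Here the state travels in v0, v7 is zero, v8 and
## v9 are the nibble-shift and 0x0f masks, the tables sit in v10-v19 as loaded
## by _vpaes_encrypt_preheat, and the scheduled keys are streamed in with lvx
## off $key/r9 and re-aligned with $keyperm rather than addressed as (%rdx).)
##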
.align 5
_vpaes_encrypt_core:
	lwz r8, 240($key) # pull rounds
	li r9, 16
	lvx v5, 0, $key # vmovdqu (%r9), %xmm5 # round0 key
	li r11, 0x10
	lvx v6, r9, $key
	addi r9, r9, 16
	?vperm v5, v5, v6, $keyperm # align round key
	addi r10, r11, 0x40
	vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
	vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm1
	vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm3, %xmm2
	vxor v0, v0, v5 # vpxor %xmm5, %xmm1, %xmm0
	vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
	mtctr r8
	b Lenc_entry

.align 4
Lenc_loop:
	# middle of middle round
	vperm v4, $sb1t, v7, v2 # vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
	lvx v1, r12, r11 # vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
	addi r11, r11, 16
	vperm v0, $sb1u, v7, v3 # vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
	vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	andi. r11, r11, 0x30 # and \$0x30, %r11 # ... mod 4
	vperm v5, $sb2t, v7, v2 # vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
	vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	vperm v2, $sb2u, v7, v3 # vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
	lvx v4, r12, r10 # vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
	addi r10, r11, 0x40
	vperm v3, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
	vxor v2, v2, v5 # vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
	vperm v0, v0, v7, v4 # vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
	vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
	vperm v4, v3, v7, v1 # vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
	vxor v0, v0, v3 # vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
	vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D

Lenc_entry:
	# top of round
	vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	vperm v5, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
	vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	vand v0, v0, v9
	vxor v3, v3, v5 # vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	vmr v5, v6
	lvx v6, r9, $key # vmovdqu (%r9), %xmm5
	vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	addi r9, r9, 16
	vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	?vperm v5, v5, v6, $keyperm # align round key
	vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	bdnz Lenc_loop

	# middle of last round
	addi r10, r11, 0x80
	# vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
	# vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
	vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
	lvx v1, r12, r10 # vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
	vperm v0, $sbot, v7, v3 # vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
	vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	vperm v0, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm0
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,0,0
.globl .vpaes_encrypt
.align 5
.vpaes_encrypt:
	$STU $sp,-$FRAME($sp)
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mflr r6
	mfspr r7, 256 # save vrsave
	stvx v20,r10,$sp
	addi r10,r10,32
	stvx v21,r11,$sp
	addi r11,r11,32
	stvx v22,r10,$sp
	addi r10,r10,32
	stvx v23,r11,$sp
	addi r11,r11,32
	stvx v24,r10,$sp
	addi r10,r10,32
	stvx v25,r11,$sp
	addi r11,r11,32
	stvx v26,r10,$sp
	addi r10,r10,32
	stvx v27,r11,$sp
	addi r11,r11,32
	stvx v28,r10,$sp
	addi r10,r10,32
	stvx v29,r11,$sp
	addi r11,r11,32
	stvx v30,r10,$sp
	stvx v31,r11,$sp
	stw r7,`$FRAME-4`($sp) # save vrsave
	li r0, -1
	$PUSH r6,`$FRAME+$LRSAVE`($sp)
	mtspr 256, r0 # preserve all AltiVec registers

	bl _vpaes_encrypt_preheat

	?lvsl $inpperm, 0, $inp # prepare for unaligned access
	lvx v0, 0, $inp
	addi $inp, $inp, 15 # 15 is not a typo
	?lvsr $outperm, 0, $out
	?lvsl $keyperm, 0, $key # prepare for unaligned access
	lvx $inptail, 0, $inp # redundant in aligned case
	?vperm v0, v0, $inptail, $inpperm

	bl _vpaes_encrypt_core

	andi. r8, $out, 15
	li r9, 16
	beq Lenc_out_aligned

	vperm v0, v0, v0, $outperm # rotate right/left
	mtctr r9
Lenc_out_unaligned:
	stvebx v0, 0, $out
	addi $out, $out, 1
	bdnz Lenc_out_unaligned
	b Lenc_done

.align 4
Lenc_out_aligned:
	stvx v0, 0, $out
Lenc_done:
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mtlr r6
	mtspr 256, r7 # restore vrsave
	lvx v20,r10,$sp
	addi r10,r10,32
	lvx v21,r11,$sp
	addi r11,r11,32
	lvx v22,r10,$sp
	addi r10,r10,32
	lvx v23,r11,$sp
	addi r11,r11,32
	lvx v24,r10,$sp
	addi r10,r10,32
	lvx v25,r11,$sp
	addi r11,r11,32
	lvx v26,r10,$sp
	addi r10,r10,32
	lvx v27,r11,$sp
	addi r11,r11,32
	lvx v28,r10,$sp
	addi r10,r10,32
	lvx v29,r11,$sp
	addi r11,r11,32
	lvx v30,r10,$sp
	lvx v31,r11,$sp
	addi $sp,$sp,$FRAME
	blr
	.long 0
	.byte 0,12,0x04,1,0x80,0,3,0
	.long 0
.size .vpaes_encrypt,.-.vpaes_encrypt
.align 4
_vpaes_decrypt_preheat:
	mflr r8
	bl Lconsts
	mtlr r8
	li r11, 0xc0 # Lk_inv
	li r10, 0xd0
	li r9, 0x160 # Ldipt
	li r8, 0x170
	vxor v7, v7, v7 # 0x00..00
	vspltisb v8,4 # 0x04..04
	vspltisb v9,0x0f # 0x0f..0f
	lvx $invlo, r12, r11
	li r11, 0x180
	lvx $invhi, r12, r10
	li r10, 0x190
	lvx $iptlo, r12, r9
	li r9, 0x1a0
	lvx $ipthi, r12, r8
	li r8, 0x1b0
	lvx $sbou, r12, r11
	li r11, 0x1c0
	lvx $sbot, r12, r10
	li r10, 0x1d0
	lvx $sb9u, r12, r9
	li r9, 0x1e0
	lvx $sb9t, r12, r8
	li r8, 0x1f0
	lvx $sbdu, r12, r11
	li r11, 0x200
	lvx $sbdt, r12, r10
	li r10, 0x210
	lvx $sbbu, r12, r9
	lvx $sbbt, r12, r8
	lvx $sbeu, r12, r11
	lvx $sbet, r12, r10
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,0,0
##
## Decryption core
##
## Same API as encryption core.
##
.align 4
_vpaes_decrypt_core:
	lwz r8, 240($key) # pull rounds
	li r9, 16
	lvx v5, 0, $key # vmovdqu (%r9), %xmm4 # round0 key
	li r11, 0x30
	lvx v6, r9, $key
	addi r9, r9, 16
	?vperm v5, v5, v6, $keyperm # align round key
	vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
	vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
	vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm1, %xmm0
	vxor v0, v0, v5 # vpxor %xmm4, %xmm2, %xmm2
	vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
	mtctr r8
	b Ldec_entry

.align 4
Ldec_loop:
	#
	# Inverse mix columns
	#
	lvx v0, r12, r11 # v5 and v0 are flipped
	# vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
	# vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
	vperm v4, $sb9u, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
	subi r11, r11, 16
	vperm v1, $sb9t, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
	andi. r11, r11, 0x30
	vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0
	# vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
	vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	# vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
	vperm v4, $sbdu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
	vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	vperm v1, $sbdt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
	vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	# vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
	vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	# vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
	vperm v4, $sbbu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
	vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	vperm v1, $sbbt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
	vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	# vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
	vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	# vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
	vperm v4, $sbeu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
	vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	vperm v1, $sbet, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
	vxor v0, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	vxor v0, v0, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch

Ldec_entry:
	# top of round
	vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	vperm v2, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
	vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	vand v0, v0, v9
	vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	vmr v5, v6
	lvx v6, r9, $key # vmovdqu (%r9), %xmm0
	vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	addi r9, r9, 16
	vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	?vperm v5, v5, v6, $keyperm # align round key
	vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	bdnz Ldec_loop

	# middle of last round
	addi r10, r11, 0x80
	# vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
	vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
	# vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
	lvx v2, r12, r10 # vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
	vperm v1, $sbot, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
	vxor v4, v4, v5 # vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
	vxor v0, v1, v4 # vpxor %xmm4, %xmm1, %xmm0 # 0 = A
	vperm v0, v0, v7, v2 # vpshufb %xmm2, %xmm0, %xmm0
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,0,0
.globl .vpaes_decrypt
.align 5
.vpaes_decrypt:
	$STU $sp,-$FRAME($sp)
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mflr r6
	mfspr r7, 256 # save vrsave
	stvx v20,r10,$sp
	addi r10,r10,32
	stvx v21,r11,$sp
	addi r11,r11,32
	stvx v22,r10,$sp
	addi r10,r10,32
	stvx v23,r11,$sp
	addi r11,r11,32
	stvx v24,r10,$sp
	addi r10,r10,32
	stvx v25,r11,$sp
	addi r11,r11,32
	stvx v26,r10,$sp
	addi r10,r10,32
	stvx v27,r11,$sp
	addi r11,r11,32
	stvx v28,r10,$sp
	addi r10,r10,32
	stvx v29,r11,$sp
	addi r11,r11,32
	stvx v30,r10,$sp
	stvx v31,r11,$sp
	stw r7,`$FRAME-4`($sp) # save vrsave
	li r0, -1
	$PUSH r6,`$FRAME+$LRSAVE`($sp)
	mtspr 256, r0 # preserve all AltiVec registers

	bl _vpaes_decrypt_preheat

	?lvsl $inpperm, 0, $inp # prepare for unaligned access
	lvx v0, 0, $inp
	addi $inp, $inp, 15 # 15 is not a typo
	?lvsr $outperm, 0, $out
	?lvsl $keyperm, 0, $key
	lvx $inptail, 0, $inp # redundant in aligned case
	?vperm v0, v0, $inptail, $inpperm

	bl _vpaes_decrypt_core

	andi. r8, $out, 15
	li r9, 16
	beq Ldec_out_aligned

	vperm v0, v0, v0, $outperm # rotate right/left
	mtctr r9
Ldec_out_unaligned:
	stvebx v0, 0, $out
	addi $out, $out, 1
	bdnz Ldec_out_unaligned
	b Ldec_done

.align 4
Ldec_out_aligned:
	stvx v0, 0, $out
Ldec_done:
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mtlr r6
	mtspr 256, r7 # restore vrsave
	lvx v20,r10,$sp
	addi r10,r10,32
	lvx v21,r11,$sp
	addi r11,r11,32
	lvx v22,r10,$sp
	addi r10,r10,32
	lvx v23,r11,$sp
	addi r11,r11,32
	lvx v24,r10,$sp
	addi r10,r10,32
	lvx v25,r11,$sp
	addi r11,r11,32
	lvx v26,r10,$sp
	addi r10,r10,32
	lvx v27,r11,$sp
	addi r11,r11,32
	lvx v28,r10,$sp
	addi r10,r10,32
	lvx v29,r11,$sp
	addi r11,r11,32
	lvx v30,r10,$sp
	lvx v31,r11,$sp
	addi $sp,$sp,$FRAME
	blr
	.long 0
	.byte 0,12,0x04,1,0x80,0,3,0
	.long 0
.size .vpaes_decrypt,.-.vpaes_decrypt
.globl .vpaes_cbc_encrypt
.align 5
.vpaes_cbc_encrypt:
	${UCMP}i r5,16
	bltlr-

	$STU $sp,-`($FRAME+2*$SIZE_T)`($sp)
	mflr r0
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mfspr r12, 256
	stvx v20,r10,$sp
	addi r10,r10,32
	stvx v21,r11,$sp
	addi r11,r11,32
	stvx v22,r10,$sp
	addi r10,r10,32
	stvx v23,r11,$sp
	addi r11,r11,32
	stvx v24,r10,$sp
	addi r10,r10,32
	stvx v25,r11,$sp
	addi r11,r11,32
	stvx v26,r10,$sp
	addi r10,r10,32
	stvx v27,r11,$sp
	addi r11,r11,32
	stvx v28,r10,$sp
	addi r10,r10,32
	stvx v29,r11,$sp
	addi r11,r11,32
	stvx v30,r10,$sp
	stvx v31,r11,$sp
	stw r12,`$FRAME-4`($sp) # save vrsave
	$PUSH r30,`$FRAME+$SIZE_T*0`($sp)
	$PUSH r31,`$FRAME+$SIZE_T*1`($sp)
	li r9, -16
	$PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)

	and r30, r5, r9 # copy length&-16
	andi. r9, $out, 15 # is $out aligned?
	mr r5, r6 # copy pointer to key
	mr r31, r7 # copy pointer to iv
	li r6, -1
	mcrf cr1, cr0 # put aside $out alignment flag
	mr r7, r12 # copy vrsave
	mtspr 256, r6 # preserve all AltiVec registers

	lvx v24, 0, r31 # load [potentially unaligned] iv
	li r9, 15
	?lvsl $inpperm, 0, r31
	lvx v25, r9, r31
	?vperm v24, v24, v25, $inpperm

	cmpwi r8, 0 # test direction
	neg r8, $inp # prepare for unaligned access
	vxor v7, v7, v7
	?lvsl $keyperm, 0, $key
	?lvsr $outperm, 0, $out
	?lvsr $inpperm, 0, r8 # -$inp
	vnor $outmask, v7, v7 # 0xff..ff
	lvx $inptail, 0, $inp
	?vperm $outmask, v7, $outmask, $outperm
	addi $inp, $inp, 15 # 15 is not a typo

	beq Lcbc_decrypt

	bl _vpaes_encrypt_preheat
	li r0, 16

	beq cr1, Lcbc_enc_loop # $out is aligned

	vmr v0, $inptail
	lvx $inptail, 0, $inp
	addi $inp, $inp, 16
	?vperm v0, v0, $inptail, $inpperm
	vxor v0, v0, v24 # ^= iv

	bl _vpaes_encrypt_core

	andi. r8, $out, 15
	vmr v24, v0 # put aside iv
	sub r9, $out, r8
	vperm $outhead, v0, v0, $outperm # rotate right/left

Lcbc_enc_head:
	stvebx $outhead, r8, r9
	cmpwi r8, 15
	addi r8, r8, 1
	bne Lcbc_enc_head

	sub. r30, r30, r0 # len -= 16
	addi $out, $out, 16
	beq Lcbc_unaligned_done

Lcbc_enc_loop:
	vmr v0, $inptail
	lvx $inptail, 0, $inp
	addi $inp, $inp, 16
	?vperm v0, v0, $inptail, $inpperm
	vxor v0, v0, v24 # ^= iv

	bl _vpaes_encrypt_core

	vmr v24, v0 # put aside iv
	sub. r30, r30, r0 # len -= 16
	vperm v0, v0, v0, $outperm # rotate right/left
	vsel v1, $outhead, v0, $outmask
	vmr $outhead, v0
	stvx v1, 0, $out
	addi $out, $out, 16
	bne Lcbc_enc_loop

	b Lcbc_done

.align 5
Lcbc_decrypt:
	bl _vpaes_decrypt_preheat
	li r0, 16

	beq cr1, Lcbc_dec_loop # $out is aligned

	vmr v0, $inptail
	lvx $inptail, 0, $inp
	addi $inp, $inp, 16
	?vperm v0, v0, $inptail, $inpperm
	vmr v25, v0 # put aside input

	bl _vpaes_decrypt_core

	andi. r8, $out, 15
	vxor v0, v0, v24 # ^= iv
	vmr v24, v25
	sub r9, $out, r8
	vperm $outhead, v0, v0, $outperm # rotate right/left

Lcbc_dec_head:
	stvebx $outhead, r8, r9
	cmpwi r8, 15
	addi r8, r8, 1
	bne Lcbc_dec_head

	sub. r30, r30, r0 # len -= 16
	addi $out, $out, 16
	beq Lcbc_unaligned_done

Lcbc_dec_loop:
	vmr v0, $inptail
	lvx $inptail, 0, $inp
	addi $inp, $inp, 16
	?vperm v0, v0, $inptail, $inpperm
	vmr v25, v0 # put aside input

	bl _vpaes_decrypt_core

	vxor v0, v0, v24 # ^= iv
	vmr v24, v25
	sub. r30, r30, r0 # len -= 16
	vperm v0, v0, v0, $outperm # rotate right/left
	vsel v1, $outhead, v0, $outmask
	vmr $outhead, v0
	stvx v1, 0, $out
	addi $out, $out, 16
	bne Lcbc_dec_loop

Lcbc_done:
	beq cr1, Lcbc_write_iv # $out is aligned

Lcbc_unaligned_done:
	andi. r8, $out, 15
	sub $out, $out, r8
	li r9, 0
Lcbc_tail:
	stvebx $outhead, r9, $out
	addi r9, r9, 1
	cmpw r9, r8
	bne Lcbc_tail

Lcbc_write_iv:
	neg r8, r31 # write [potentially unaligned] iv
	li r10, 4
	?lvsl $outperm, 0, r8
	li r11, 8
	li r12, 12
	vperm v24, v24, v24, $outperm # rotate right/left
	stvewx v24, 0, r31 # ivp is at least 32-bit aligned
	stvewx v24, r10, r31
	stvewx v24, r11, r31
	stvewx v24, r12, r31

	mtspr 256, r7 # restore vrsave
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	lvx v20,r10,$sp
	addi r10,r10,32
	lvx v21,r11,$sp
	addi r11,r11,32
	lvx v22,r10,$sp
	addi r10,r10,32
	lvx v23,r11,$sp
	addi r11,r11,32
	lvx v24,r10,$sp
	addi r10,r10,32
	lvx v25,r11,$sp
	addi r11,r11,32
	lvx v26,r10,$sp
	addi r10,r10,32
	lvx v27,r11,$sp
	addi r11,r11,32
	lvx v28,r10,$sp
	addi r10,r10,32
	lvx v29,r11,$sp
	addi r11,r11,32
	lvx v30,r10,$sp
	lvx v31,r11,$sp
Lcbc_abort:
	$POP r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
	$POP r30,`$FRAME+$SIZE_T*0`($sp)
	$POP r31,`$FRAME+$SIZE_T*1`($sp)
	mtlr r0
	addi $sp,$sp,`$FRAME+$SIZE_T*2`
	blr
	.long 0
	.byte 0,12,0x04,1,0x80,2,6,0
	.long 0
.size .vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
___
}
{
my ($inp,$bits,$out)=map("r$_",(3..5));
my $dir="cr1";
my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));

$code.=<<___;
########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
.align 4
_vpaes_key_preheat:
	mflr r8
	bl Lconsts
	mtlr r8
	li r11, 0xc0 # Lk_inv
	li r10, 0xd0
	li r9, 0xe0 # L_ipt
	li r8, 0xf0
	vspltisb v8,4 # 0x04..04
	vxor v9,v9,v9 # 0x00..00
	lvx $invlo, r12, r11 # Lk_inv
	li r11, 0x120
	lvx $invhi, r12, r10
	li r10, 0x130
	lvx $iptlo, r12, r9 # Lk_ipt
	li r9, 0x220
	lvx $ipthi, r12, r8
	li r8, 0x230
	lvx v14, r12, r11 # Lk_sb1
	li r11, 0x240
	lvx v15, r12, r10
	li r10, 0x250
	lvx v16, r12, r9 # Lk_dksd
	li r9, 0x260
	lvx v17, r12, r8
	li r8, 0x270
	lvx v18, r12, r11 # Lk_dksb
	li r11, 0x280
	lvx v19, r12, r10
	li r10, 0x290
	lvx v20, r12, r9 # Lk_dkse
	li r9, 0x2a0
	lvx v21, r12, r8
	li r8, 0x2b0
	lvx v22, r12, r11 # Lk_dks9
	lvx v23, r12, r10
	lvx v24, r12, r9 # Lk_rcon
	lvx v25, 0, r12 # Lk_mc_forward[0]
	lvx v26, r12, r8 # Lks63
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,0,0
.align 4
_vpaes_schedule_core:
	mflr r7

	bl _vpaes_key_preheat # load the tables

	#lvx v0, 0, $inp # vmovdqu (%rdi), %xmm0 # load key (unaligned)
	neg r8, $inp # prepare for unaligned access
	lvx v0, 0, $inp
	addi $inp, $inp, 15 # 15 is not a typo
	?lvsr $inpperm, 0, r8 # -$inp
	lvx v6, 0, $inp # v6 serves as inptail
	addi $inp, $inp, 8
	?vperm v0, v0, v6, $inpperm

	# input transform
	vmr v3, v0 # vmovdqa %xmm0, %xmm3
	bl _vpaes_schedule_transform
	vmr v7, v0 # vmovdqa %xmm0, %xmm7

	bne $dir, Lschedule_am_decrypting

	# encrypting, output zeroth round key after transform
	li r8, 0x30 # mov \$0x30,%r8d
	li r9, 4
	li r10, 8
	li r11, 12

	?lvsr $outperm, 0, $out # prepare for unaligned access
	vnor $outmask, v9, v9 # 0xff..ff
	?vperm $outmask, v9, $outmask, $outperm

	#stvx v0, 0, $out # vmovdqu %xmm0, (%rdx)
	vperm $outhead, v0, v0, $outperm # rotate right/left
	stvewx $outhead, 0, $out # some are superfluous
	stvewx $outhead, r9, $out
	stvewx $outhead, r10, $out
	addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
	stvewx $outhead, r11, $out
	b Lschedule_go

Lschedule_am_decrypting:
	srwi r8, $bits, 1 # shr \$1,%r8d
	andi. r8, r8, 32 # and \$32,%r8d
	xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
	addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
	# decrypting, output zeroth round key after shiftrows
	lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
	li r9, 4
	li r10, 8
	li r11, 12
	vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3

	neg r0, $out # prepare for unaligned access
	?lvsl $outperm, 0, r0
	vnor $outmask, v9, v9 # 0xff..ff
	?vperm $outmask, $outmask, v9, $outperm

	#stvx v4, 0, $out # vmovdqu %xmm3, (%rdx)
	vperm $outhead, v4, v4, $outperm # rotate right/left
	stvewx $outhead, 0, $out # some are superfluous
	stvewx $outhead, r9, $out
	stvewx $outhead, r10, $out
	addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
	stvewx $outhead, r11, $out
	addi $out, $out, 15 # 15 is not a typo
	xori r8, r8, 0x30 # xor \$0x30, %r8

Lschedule_go:
	cmplwi $bits, 192 # cmp \$192, %esi
	bgt Lschedule_256
	beq Lschedule_192
	# 128: fall through

##
## .schedule_128
##
## 128-bit specific part of key schedule.
##
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
##
Lschedule_128:
	li r0, 10 # mov \$10, %esi
	mtctr r0

Loop_schedule_128:
	bl _vpaes_schedule_round
	bdz Lschedule_mangle_last # dec %esi
	bl _vpaes_schedule_mangle # write output
	b Loop_schedule_128

##
## .aes_schedule_192
##
## 192-bit specific part of key schedule.
##
## The main body of this schedule is the same as the 128-bit
## schedule, but with more smearing. The long, high side is
## stored in %xmm7 as before, and the short, low side is in
## the high bits of %xmm6.
##
## This schedule is somewhat nastier, however, because each
## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round
## keys.
##
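## (Port note: in this file the high side lives in v7 and the low side in v6,
## with v9 as the zero vector; the 8-byte "very unaligned" load of key part 2
## is assembled from ?vperm and ?vsldoi below instead of an unaligned vmovdqu.)
##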
.align 4
Lschedule_192:
	li r0, 4 # mov \$4, %esi
	lvx v0, 0, $inp
	?vperm v0, v6, v0, $inpperm
	?vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
	bl _vpaes_schedule_transform # input transform
	?vsldoi v6, v0, v9, 8
	?vsldoi v6, v9, v6, 8 # clobber "low" side with zeros
	mtctr r0

Loop_schedule_192:
	bl _vpaes_schedule_round
	?vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0
	bl _vpaes_schedule_mangle # save key n
	bl _vpaes_schedule_192_smear
	bl _vpaes_schedule_mangle # save key n+1
	bl _vpaes_schedule_round
	bdz Lschedule_mangle_last # dec %esi
	bl _vpaes_schedule_mangle # save key n+2
	bl _vpaes_schedule_192_smear
	b Loop_schedule_192

##
## .aes_schedule_256
##
## 256-bit specific part of key schedule.
##
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
##
.align 4
Lschedule_256:
	li r0, 7 # mov \$7, %esi
	addi $inp, $inp, 8
	lvx v0, 0, $inp # vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
	?vperm v0, v6, v0, $inpperm
	bl _vpaes_schedule_transform # input transform
	mtctr r0

Loop_schedule_256:
	bl _vpaes_schedule_mangle # output low result
	vmr v6, v0 # vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6

	# high round
	bl _vpaes_schedule_round
	bdz Lschedule_mangle_last # dec %esi
	bl _vpaes_schedule_mangle

	# low round. swap xmm7 and xmm6
	?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
	vmr v5, v7 # vmovdqa %xmm7, %xmm5
	vmr v7, v6 # vmovdqa %xmm6, %xmm7
	bl _vpaes_schedule_low_round
	vmr v7, v5 # vmovdqa %xmm5, %xmm7

	b Loop_schedule_256

##
## .aes_schedule_mangle_last
##
## Mangler for last round of key schedule
## Mangles %xmm0
## when encrypting, outputs out(%xmm0) ^ 63
## when decrypting, outputs unskew(%xmm0)
##
## Always called right before return... jumps to cleanup and exits
##
.align 4
Lschedule_mangle_last:
	# schedule last round key from xmm0
	li r11, 0x2e0 # lea .Lk_deskew(%rip),%r11
	li r9, 0x2f0
	bne $dir, Lschedule_mangle_last_dec

	# encrypting
	lvx v1, r8, r10 # vmovdqa (%r8,%r10),%xmm1
	li r11, 0x2c0 # lea .Lk_opt(%rip), %r11 # prepare to output transform
	li r9, 0x2d0 # prepare to output transform
	vperm v0, v0, v0, v1 # vpshufb %xmm1, %xmm0, %xmm0 # output permute

	lvx $iptlo, r11, r12 # reload $ipt
	lvx $ipthi, r9, r12
	addi $out, $out, 16 # add \$16, %rdx
	vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
	bl _vpaes_schedule_transform # output transform

	#stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
	vperm v0, v0, v0, $outperm # rotate right/left
	li r10, 4
	vsel v2, $outhead, v0, $outmask
	li r11, 8
	stvx v2, 0, $out
	li r12, 12
	stvewx v0, 0, $out # some (or all) are redundant
	stvewx v0, r10, $out
	stvewx v0, r11, $out
	stvewx v0, r12, $out
	b Lschedule_mangle_done

.align 4
Lschedule_mangle_last_dec:
	lvx $iptlo, r11, r12 # reload $ipt
	lvx $ipthi, r9, r12
	addi $out, $out, -16 # add \$-16, %rdx
	vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
	bl _vpaes_schedule_transform # output transform

	#stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
	addi r9, $out, -15 # -15 is not a typo
	vperm v0, v0, v0, $outperm # rotate right/left
	li r10, 4
	vsel v2, $outhead, v0, $outmask
	li r11, 8
	stvx v2, 0, $out
	li r12, 12
	stvewx v0, 0, r9 # some (or all) are redundant
	stvewx v0, r10, r9
	stvewx v0, r11, r9
	stvewx v0, r12, r9

Lschedule_mangle_done:
	mtlr r7
	# cleanup
	vxor v0, v0, v0 # vpxor %xmm0, %xmm0, %xmm0
	vxor v1, v1, v1 # vpxor %xmm1, %xmm1, %xmm1
	vxor v2, v2, v2 # vpxor %xmm2, %xmm2, %xmm2
	vxor v3, v3, v3 # vpxor %xmm3, %xmm3, %xmm3
	vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
	vxor v5, v5, v5 # vpxor %xmm5, %xmm5, %xmm5
	vxor v6, v6, v6 # vpxor %xmm6, %xmm6, %xmm6
	vxor v7, v7, v7 # vpxor %xmm7, %xmm7, %xmm7
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,0,0

##
## .aes_schedule_192_smear
##
## Smear the short, low side in the 192-bit key schedule.
##
## Inputs:
## %xmm7: high side, b a x y
## %xmm6: low side, d c 0 0
## %xmm13: 0
##
## Outputs:
## %xmm6: b+c+d b+c 0 0
## %xmm0: b+c+d b+c b a
##
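## (Port note: here the high side is v7, the low side is v6 and the zero
## vector is v9 rather than %xmm13; the vpshufd steps are rebuilt from
## ?vspltw and ?vsldoi below.)
##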
.align 4
_vpaes_schedule_192_smear:
	?vspltw v0, v7, 3
	?vsldoi v1, v9, v6, 12 # vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
	?vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
	vxor v6, v6, v1 # vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
	vxor v6, v6, v0 # vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
	vmr v0, v6
	?vsldoi v6, v6, v9, 8
	?vsldoi v6, v9, v6, 8 # clobber low side with zeros
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,0,0

##
## .aes_schedule_round
##
## Runs one main round of the key schedule on %xmm0, %xmm7
##
## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of
## %xmm7.
##
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## next rcon.
##
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
##
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm4, %r11.
##
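## (Port note: %xmm0 and %xmm7 are v0 and v7 here, the rcon vector %xmm8 is
## $rcon/v24, and v9 is the zero vector; vpalignr is emulated with ?vsldoi,
## so no general-purpose registers are clobbered in this port.)
##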
.align 4
_vpaes_schedule_round:
	# extract rcon from xmm8
	#vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
	?vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1
	?vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8
	vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7

	# rotate
	?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
	?vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0

	# fall through...

	# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	# smear xmm7
	?vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1
	vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
	vspltisb v1, 0x0f # 0x0f..0f
	?vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4

	# subbytes
	vand v1, v1, v0 # vpand %xmm9, %xmm0, %xmm1 # 0 = k
	vsrb v0, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	vxor v7, v7, v4 # vpxor %xmm4, %xmm7, %xmm7
	vperm v2, $invhi, v9, v1 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
	vxor v1, v1, v0 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	vperm v3, $invlo, v9, v0 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	vperm v4, $invlo, v9, v1 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	vxor v7, v7, v26 # vpxor .Lk_s63(%rip), %xmm7, %xmm7
	vperm v3, $invlo, v9, v3 # vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
	vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	vperm v2, $invlo, v9, v4 # vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
	vxor v3, v3, v1 # vpxor %xmm1, %xmm3, %xmm3 # 2 = io
	vxor v2, v2, v0 # vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
	vperm v4, v15, v9, v3 # vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
	vperm v1, v14, v9, v2 # vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
	vxor v1, v1, v4 # vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output

	# add in smeared stuff
	vxor v0, v1, v7 # vpxor %xmm7, %xmm1, %xmm0
	vxor v7, v1, v7 # vmovdqa %xmm0, %xmm7
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,0,0

##
## .aes_schedule_transform
##
## Linear-transform %xmm0 according to tables at (%r11)
##
## Requires that %xmm9 = 0x0F0F... as in preheat
## Output in %xmm0
## Clobbers %xmm2
##
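## (Port note: the tables live in $iptlo/$ipthi, which callers point at
## Lk_ipt, Lk_opt or Lk_deskew as needed.  No explicit 0x0f mask is applied
## here because vperm is given the table twice, so only the low four bits of
## each index byte end up mattering.)
##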
.align 4
_vpaes_schedule_transform:
	#vand v1, v0, v9 # vpand %xmm9, %xmm0, %xmm1
	vsrb v2, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
	# vmovdqa (%r11), %xmm2 # lo
	vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
	# vmovdqa 16(%r11), %xmm1 # hi
	vperm v2, $ipthi, $ipthi, v2 # vpshufb %xmm0, %xmm1, %xmm0
	vxor v0, v0, v2 # vpxor %xmm2, %xmm0, %xmm0
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,0,0

##
## .aes_schedule_mangle
##
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
## xor with 0x63
## multiply by circulant 0,1,1,1
## apply shiftrows transform
##
## On decrypt,
## xor with 0x63
## multiply by "inverse mixcolumns" circulant E,B,D,9
## deskew
## apply shiftrows transform
##
##
## Writes out to (%rdx), and increments or decrements it
## Keeps track of round number mod 4 in %r8
## Preserves xmm0
## Clobbers xmm1-xmm5
##
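## (Port note: "(%rdx)" is the key schedule pointer $out here, written with
## stvx through the $outperm/$outhead/$outmask collation so that unaligned
## output works; the round number mod 4 is still tracked in r8, which indexes
## the Lk_sr table via r10.)
##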
.align 4
_vpaes_schedule_mangle:
	#vmr v4, v0 # vmovdqa %xmm0, %xmm4 # save xmm0 for later
	# vmovdqa .Lk_mc_forward(%rip),%xmm5
	bne $dir, Lschedule_mangle_dec

	# encrypting
	vxor v4, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm4
	addi $out, $out, 16 # add \$16, %rdx
	vperm v4, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm4
	vperm v1, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm1
	vperm v3, v1, v1, v25 # vpshufb %xmm5, %xmm1, %xmm3
	vxor v4, v4, v1 # vpxor %xmm1, %xmm4, %xmm4
	lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
	vxor v3, v3, v4 # vpxor %xmm4, %xmm3, %xmm3

	vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
	addi r8, r8, -16 # add \$-16, %r8
	andi. r8, r8, 0x30 # and \$0x30, %r8

	#stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
	vperm v1, v3, v3, $outperm # rotate right/left
	vsel v2, $outhead, v1, $outmask
	vmr $outhead, v1
	stvx v2, 0, $out
	blr

.align 4
Lschedule_mangle_dec:
	# inverse mix columns
	# lea .Lk_dksd(%rip),%r11
	vsrb v1, v0, v8 # vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
	#and v4, v0, v9 # vpand %xmm9, %xmm4, %xmm4 # 4 = lo

	# vmovdqa 0x00(%r11), %xmm2
	vperm v2, v16, v16, v0 # vpshufb %xmm4, %xmm2, %xmm2
	# vmovdqa 0x10(%r11), %xmm3
	vperm v3, v17, v17, v1 # vpshufb %xmm1, %xmm3, %xmm3
	vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
	vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3

	# vmovdqa 0x20(%r11), %xmm2
	vperm v2, v18, v18, v0 # vpshufb %xmm4, %xmm2, %xmm2
	vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
	# vmovdqa 0x30(%r11), %xmm3
	vperm v3, v19, v19, v1 # vpshufb %xmm1, %xmm3, %xmm3
	vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
	vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3

	# vmovdqa 0x40(%r11), %xmm2
	vperm v2, v20, v20, v0 # vpshufb %xmm4, %xmm2, %xmm2
	vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
	# vmovdqa 0x50(%r11), %xmm3
	vperm v3, v21, v21, v1 # vpshufb %xmm1, %xmm3, %xmm3
	vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3

	# vmovdqa 0x60(%r11), %xmm2
	vperm v2, v22, v22, v0 # vpshufb %xmm4, %xmm2, %xmm2
	vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
	# vmovdqa 0x70(%r11), %xmm4
	vperm v4, v23, v23, v1 # vpshufb %xmm1, %xmm4, %xmm4
	lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
	vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
	vxor v3, v4, v2 # vpxor %xmm2, %xmm4, %xmm3

	addi $out, $out, -16 # add \$-16, %rdx

	vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
	addi r8, r8, -16 # add \$-16, %r8
	andi. r8, r8, 0x30 # and \$0x30, %r8

	#stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
	vperm v1, v3, v3, $outperm # rotate right/left
	vsel v2, $outhead, v1, $outmask
	vmr $outhead, v1
	stvx v2, 0, $out
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,0,0
.globl .vpaes_set_encrypt_key
.align 5
.vpaes_set_encrypt_key:
	$STU $sp,-$FRAME($sp)
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mflr r0
	mfspr r6, 256 # save vrsave
	stvx v20,r10,$sp
	addi r10,r10,32
	stvx v21,r11,$sp
	addi r11,r11,32
	stvx v22,r10,$sp
	addi r10,r10,32
	stvx v23,r11,$sp
	addi r11,r11,32
	stvx v24,r10,$sp
	addi r10,r10,32
	stvx v25,r11,$sp
	addi r11,r11,32
	stvx v26,r10,$sp
	addi r10,r10,32
	stvx v27,r11,$sp
	addi r11,r11,32
	stvx v28,r10,$sp
	addi r10,r10,32
	stvx v29,r11,$sp
	addi r11,r11,32
	stvx v30,r10,$sp
	stvx v31,r11,$sp
	stw r6,`$FRAME-4`($sp) # save vrsave
	li r7, -1
	$PUSH r0, `$FRAME+$LRSAVE`($sp)
	mtspr 256, r7 # preserve all AltiVec registers

	srwi r9, $bits, 5 # shr \$5,%eax
	addi r9, r9, 6 # add \$5,%eax
	stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;

	cmplw $dir, $bits, $bits # set encrypt direction
	li r8, 0x30 # mov \$0x30,%r8d
	bl _vpaes_schedule_core

	$POP r0, `$FRAME+$LRSAVE`($sp)
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mtspr 256, r6 # restore vrsave
	mtlr r0
	xor r3, r3, r3
	lvx v20,r10,$sp
	addi r10,r10,32
	lvx v21,r11,$sp
	addi r11,r11,32
	lvx v22,r10,$sp
	addi r10,r10,32
	lvx v23,r11,$sp
	addi r11,r11,32
	lvx v24,r10,$sp
	addi r10,r10,32
	lvx v25,r11,$sp
	addi r11,r11,32
	lvx v26,r10,$sp
	addi r10,r10,32
	lvx v27,r11,$sp
	addi r11,r11,32
	lvx v28,r10,$sp
	addi r10,r10,32
	lvx v29,r11,$sp
	addi r11,r11,32
	lvx v30,r10,$sp
	lvx v31,r11,$sp
	addi $sp,$sp,$FRAME
	blr
	.long 0
	.byte 0,12,0x04,1,0x80,0,3,0
	.long 0
.size .vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key
.globl .vpaes_set_decrypt_key
.align 4
.vpaes_set_decrypt_key:
	$STU $sp,-$FRAME($sp)
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mflr r0
	mfspr r6, 256 # save vrsave
	stvx v20,r10,$sp
	addi r10,r10,32
	stvx v21,r11,$sp
	addi r11,r11,32
	stvx v22,r10,$sp
	addi r10,r10,32
	stvx v23,r11,$sp
	addi r11,r11,32
	stvx v24,r10,$sp
	addi r10,r10,32
	stvx v25,r11,$sp
	addi r11,r11,32
	stvx v26,r10,$sp
	addi r10,r10,32
	stvx v27,r11,$sp
	addi r11,r11,32
	stvx v28,r10,$sp
	addi r10,r10,32
	stvx v29,r11,$sp
	addi r11,r11,32
	stvx v30,r10,$sp
	stvx v31,r11,$sp
	stw r6,`$FRAME-4`($sp) # save vrsave
	li r7, -1
	$PUSH r0, `$FRAME+$LRSAVE`($sp)
	mtspr 256, r7 # preserve all AltiVec registers

	srwi r9, $bits, 5 # shr \$5,%eax
	addi r9, r9, 6 # add \$5,%eax
	stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
	slwi r9, r9, 4 # shl \$4,%eax
	add $out, $out, r9 # lea (%rdx,%rax),%rdx

	cmplwi $dir, $bits, 0 # set decrypt direction
	srwi r8, $bits, 1 # shr \$1,%r8d
	andi. r8, r8, 32 # and \$32,%r8d
	xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
	bl _vpaes_schedule_core

	$POP r0, `$FRAME+$LRSAVE`($sp)
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mtspr 256, r6 # restore vrsave
	mtlr r0
	xor r3, r3, r3
	lvx v20,r10,$sp
	addi r10,r10,32
	lvx v21,r11,$sp
	addi r11,r11,32
	lvx v22,r10,$sp
	addi r10,r10,32
	lvx v23,r11,$sp
	addi r11,r11,32
	lvx v24,r10,$sp
	addi r10,r10,32
	lvx v25,r11,$sp
	addi r11,r11,32
	lvx v26,r10,$sp
	addi r10,r10,32
	lvx v27,r11,$sp
	addi r11,r11,32
	lvx v28,r10,$sp
	addi r10,r10,32
	lvx v29,r11,$sp
	addi r11,r11,32
	lvx v30,r10,$sp
	lvx v31,r11,$sp
	addi $sp,$sp,$FRAME
	blr
	.long 0
	.byte 0,12,0x04,1,0x80,0,3,0
	.long 0
.size .vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
___
}
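
# The loop below post-processes $code: "?"-tagged .long rows and instructions
# are endianness-sensitive and get rewritten for little-endian flavours.  As a
# reference-only sketch (a hypothetical helper, not called anywhere), this is
# what a "?rev" row amounts to: the four 32-bit words are flattened to bytes
# and the whole 16-byte vector is emitted in reverse order, while "?inv" rows
# instead have each byte XOR-ed with 0xf because they hold vperm indices.
sub _demo_rev_row {
	my @words = @_;		# e.g. the four .long values of one table row
	my @bytes = map { ($_>>24)&0xff, ($_>>16)&0xff, ($_>>8)&0xff, $_&0xff } @words;
	return reverse(@bytes);	# what "?rev" does on a little-endian target
}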
my $consts=1;
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	# constants table endian-specific conversion
	if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
	    my $conv=$2;
	    my @bytes=();

	    # convert to endian-agnostic format
	    foreach (split(/,\s+/,$1)) {
		my $l = /^0/?oct:int;
		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
	    }

	    # little-endian conversion
	    if ($flavour =~ /le$/o) {
		SWITCH: for($conv) {
		    /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
		    /\?rev/ && do { @bytes=reverse(@bytes); last; };
		}
	    }

	    #emit
	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
	    next;
	}
	$consts=0 if (m/Lconsts:/o);	# end of table

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	if ($flavour =~ /le$/o) {	# little-endian
	    s/\?lvsr/lvsl/o or
	    s/\?lvsl/lvsr/o or
	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
	} else {			# big-endian
	    s/\?([a-z]+)/$1/o;
	}

	print $_,"\n";
}

close STDOUT;