
  1. #! /usr/bin/env perl
  2. # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. ######################################################################
  9. ## Constant-time SSSE3 AES core implementation.
  10. ## version 0.1
  11. ##
  12. ## By Mike Hamburg (Stanford University), 2009
  13. ## Public domain.
  14. ##
  15. ## For details see http://shiftleft.org/papers/vector_aes/ and
  16. ## http://crypto.stanford.edu/vpaes/.
  17. # CBC encrypt/decrypt performance in cycles per byte processed with
  18. # 128-bit key.
  19. #
  20. #                    aes-ppc.pl          this
  21. # PPC74x0/G4e        35.5/52.1/(23.8)    11.9(*)/15.4
  22. # PPC970/G5          37.9/55.0/(28.5)    22.2/28.5
  23. # POWER6             42.7/54.3/(28.2)    63.0/92.8(**)
  24. # POWER7             32.3/42.9/(18.4)    18.5/23.3
  25. #
  26. # (*)  This is ~10% worse than reported in the paper. The reason is
  27. #      twofold. First, this module doesn't make any assumptions about
  28. #      key schedule (or data, for that matter) alignment and handles
  29. #      it in-line. Second, being transliterated from vpaes-x86_64.pl,
  30. #      it relies on "nested inversion", which is better suited for
  31. #      Intel CPUs.
  32. # (**) Inadequate POWER6 performance is due to astronomical AltiVec
  33. #      latency, 9 cycles per simple logical operation.
  34. $flavour = shift;
  35. if ($flavour =~ /64/) {
  36. $SIZE_T =8;
  37. $LRSAVE =2*$SIZE_T;
  38. $STU ="stdu";
  39. $POP ="ld";
  40. $PUSH ="std";
  41. $UCMP ="cmpld";
  42. } elsif ($flavour =~ /32/) {
  43. $SIZE_T =4;
  44. $LRSAVE =$SIZE_T;
  45. $STU ="stwu";
  46. $POP ="lwz";
  47. $PUSH ="stw";
  48. $UCMP ="cmplw";
  49. } else { die "nonsense $flavour"; }
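# Illustrative invocation (assumed; the OpenSSL build system normally drives
# this script, and the flavour names below are the usual perlasm ones):
#
#   perl vpaes-ppc.pl linux64le vpaes-ppc.s   # 64-bit little-endian
#   perl vpaes-ppc.pl linux32   vpaes-ppc.s   # 32-bit big-endian
#
# Only the /64/, /32/ and /le$/ patterns matter here; the flavour string and
# the output file name are passed straight through to ppc-xlate.pl.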
  50. $sp="r1";
  51. $FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
  52. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  53. ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  54. ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  55. die "can't locate ppc-xlate.pl";
  56. open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
  57. $code.=<<___;
  58. .machine "any"
  59. .text
  60. .align 7 # totally strategic alignment
  61. _vpaes_consts:
  62. Lk_mc_forward: # mc_forward
  63. .long 0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c ?inv
  64. .long 0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300 ?inv
  65. .long 0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704 ?inv
  66. .long 0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08 ?inv
  67. Lk_mc_backward: # mc_backward
  68. .long 0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e ?inv
  69. .long 0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a ?inv
  70. .long 0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506 ?inv
  71. .long 0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102 ?inv
  72. Lk_sr: # sr
  73. .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f ?inv
  74. .long 0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b ?inv
  75. .long 0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07 ?inv
  76. .long 0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603 ?inv
  77. ##
  78. ## "Hot" constants
  79. ##
  80. Lk_inv: # inv, inva
  81. .long 0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704 ?rev
  82. .long 0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03 ?rev
  83. Lk_ipt: # input transform (lo, hi)
  84. .long 0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca ?rev
  85. .long 0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd ?rev
  86. Lk_sbo: # sbou, sbot
  87. .long 0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15 ?rev
  88. .long 0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e ?rev
  89. Lk_sb1: # sb1u, sb1t
  90. .long 0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b ?rev
  91. .long 0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5 ?rev
  92. Lk_sb2: # sb2u, sb2t
  93. .long 0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2 ?rev
  94. .long 0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e ?rev
  95. ##
  96. ## Decryption stuff
  97. ##
  98. Lk_dipt: # decryption input transform
  99. .long 0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15 ?rev
  100. .long 0x00650560, 0xe683e386, 0x94f191f4, 0x72177712 ?rev
  101. Lk_dsbo: # decryption sbox final output
  102. .long 0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7 ?rev
  103. .long 0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca ?rev
  104. Lk_dsb9: # decryption sbox output *9*u, *9*t
  105. .long 0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca ?rev
  106. .long 0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72 ?rev
  107. Lk_dsbd: # decryption sbox output *D*u, *D*t
  108. .long 0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5 ?rev
  109. .long 0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129 ?rev
  110. Lk_dsbb: # decryption sbox output *B*u, *B*t
  111. .long 0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660 ?rev
  112. .long 0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3 ?rev
  113. Lk_dsbe: # decryption sbox output *E*u, *E*t
  114. .long 0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222 ?rev
  115. .long 0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794 ?rev
  116. ##
  117. ## Key schedule constants
  118. ##
  119. Lk_dksd: # decryption key schedule: invskew x*D
  120. .long 0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007 ?rev
  121. .long 0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f ?rev
  122. Lk_dksb: # decryption key schedule: invskew x*B
  123. .long 0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603 ?rev
  124. .long 0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9 ?rev
  125. Lk_dkse: # decryption key schedule: invskew x*E + 0x63
  126. .long 0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553 ?rev
  127. .long 0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd ?rev
  128. Lk_dks9: # decryption key schedule: invskew x*9
  129. .long 0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a ?rev
  130. .long 0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b ?rev
  131. Lk_rcon: # rcon
  132. .long 0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70 ?asis
  133. Lk_s63:
  134. .long 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b ?asis
  135. Lk_opt: # output transform
  136. .long 0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7 ?rev
  137. .long 0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1 ?rev
  138. Lk_deskew: # deskew tables: inverts the sbox's "skew"
  139. .long 0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d ?rev
  140. .long 0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128 ?rev
  141. .align 5
  142. Lconsts:
  143. mflr r0
  144. bcl 20,31,\$+4
  145. mflr r12 #vvvvv "distance between . and _vpaes_consts
  146. addi r12,r12,-0x308
  147. mtlr r0
  148. blr
  149. .long 0
  150. .byte 0,12,0x14,0,0,0,0,0
  151. .asciz "Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"
  152. .align 6
  153. ___
  154. my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));
  155. {
  156. my ($inp,$out,$key) = map("r$_",(3..5));
  157. my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
  158. my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
  159. my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));
  160. $code.=<<___;
  161. ##
  162. ## _aes_preheat
  163. ##
  164. ## Fills register %r10 -> .aes_consts (so you can -fPIC)
  165. ## and %xmm9-%xmm15 as specified below.
  166. ##
  167. .align 4
  168. _vpaes_encrypt_preheat:
  169. mflr r8
  170. bl Lconsts
  171. mtlr r8
  172. li r11, 0xc0 # Lk_inv
  173. li r10, 0xd0
  174. li r9, 0xe0 # Lk_ipt
  175. li r8, 0xf0
  176. vxor v7, v7, v7 # 0x00..00
  177. vspltisb v8,4 # 0x04..04
  178. vspltisb v9,0x0f # 0x0f..0f
  179. lvx $invlo, r12, r11
  180. li r11, 0x100
  181. lvx $invhi, r12, r10
  182. li r10, 0x110
  183. lvx $iptlo, r12, r9
  184. li r9, 0x120
  185. lvx $ipthi, r12, r8
  186. li r8, 0x130
  187. lvx $sbou, r12, r11
  188. li r11, 0x140
  189. lvx $sbot, r12, r10
  190. li r10, 0x150
  191. lvx $sb1u, r12, r9
  192. lvx $sb1t, r12, r8
  193. lvx $sb2u, r12, r11
  194. lvx $sb2t, r12, r10
  195. blr
  196. .long 0
  197. .byte 0,12,0x14,0,0,0,0,0
  198. ##
  199. ## _aes_encrypt_core
  200. ##
  201. ## AES-encrypt %xmm0.
  202. ##
  203. ## Inputs:
  204. ## %xmm0 = input
  205. ## %xmm9-%xmm15 as in _vpaes_preheat
  206. ## (%rdx) = scheduled keys
  207. ##
  208. ## Output in %xmm0
  209. ## Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax
  210. ##
  211. ##
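## (PPC note, added for orientation -- the %xmm/%r references above are kept
## from the x86_64 original.  In this transliteration, roughly: v0 = state,
## v5/v6 = round-key stream being aligned through keyperm (v31), v7 = zero,
## v8 = 0x04..04, v9 = 0x0f..0f, v10-v19 = the tables loaded by
## _vpaes_encrypt_preheat, r12 = _vpaes_consts, r8 = rounds, r9 = offset
## into the key schedule.)
##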
  212. .align 5
  213. _vpaes_encrypt_core:
  214. lwz r8, 240($key) # pull rounds
  215. li r9, 16
  216. lvx v5, 0, $key # vmovdqu (%r9), %xmm5 # round0 key
  217. li r11, 0x10
  218. lvx v6, r9, $key
  219. addi r9, r9, 16
  220. ?vperm v5, v5, v6, $keyperm # align round key
  221. addi r10, r11, 0x40
  222. vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
  223. vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm1
  224. vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm3, %xmm2
  225. vxor v0, v0, v5 # vpxor %xmm5, %xmm1, %xmm0
  226. vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
  227. mtctr r8
  228. b Lenc_entry
  229. .align 4
  230. Lenc_loop:
  231. # middle of middle round
  232. vperm v4, $sb1t, v7, v2 # vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
  233. lvx v1, r12, r11 # vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
  234. addi r11, r11, 16
  235. vperm v0, $sb1u, v7, v3 # vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
  236. vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
  237. andi. r11, r11, 0x30 # and \$0x30, %r11 # ... mod 4
  238. vperm v5, $sb2t, v7, v2 # vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
  239. vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
  240. vperm v2, $sb2u, v7, v3 # vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
  241. lvx v4, r12, r10 # vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
  242. addi r10, r11, 0x40
  243. vperm v3, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
  244. vxor v2, v2, v5 # vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
  245. vperm v0, v0, v7, v4 # vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
  246. vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
  247. vperm v4, v3, v7, v1 # vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
  248. vxor v0, v0, v3 # vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
  249. vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
  250. Lenc_entry:
  251. # top of round
  252. vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
  253. vperm v5, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
  254. vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
  255. vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
  256. vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
  257. vand v0, v0, v9
  258. vxor v3, v3, v5 # vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
  259. vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
  260. vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
  261. vmr v5, v6
  262. lvx v6, r9, $key # vmovdqu (%r9), %xmm5
  263. vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
  264. addi r9, r9, 16
  265. vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
  266. ?vperm v5, v5, v6, $keyperm # align round key
  267. vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
  268. bdnz Lenc_loop
  269. # middle of last round
  270. addi r10, r11, 0x80
  271. # vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
  272. # vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
  273. vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
  274. lvx v1, r12, r10 # vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
  275. vperm v0, $sbot, v7, v3 # vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
  276. vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
  277. vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
  278. vperm v0, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm0
  279. blr
  280. .long 0
  281. .byte 0,12,0x14,0,0,0,0,0
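##
## vpaes_encrypt: single-block entry point.  Assumed C prototype, inferred
## from the register usage and mirroring the other OpenSSL AES modules:
##   void vpaes_encrypt(const unsigned char *in, unsigned char *out,
##                      const AES_KEY *key);
## with r3 = in, r4 = out, r5 = key.  vpaes_decrypt further below takes the
## same arguments.
##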
  282. .globl .vpaes_encrypt
  283. .align 5
  284. .vpaes_encrypt:
  285. $STU $sp,-$FRAME($sp)
  286. li r10,`15+6*$SIZE_T`
  287. li r11,`31+6*$SIZE_T`
  288. mflr r6
  289. mfspr r7, 256 # save vrsave
  290. stvx v20,r10,$sp
  291. addi r10,r10,32
  292. stvx v21,r11,$sp
  293. addi r11,r11,32
  294. stvx v22,r10,$sp
  295. addi r10,r10,32
  296. stvx v23,r11,$sp
  297. addi r11,r11,32
  298. stvx v24,r10,$sp
  299. addi r10,r10,32
  300. stvx v25,r11,$sp
  301. addi r11,r11,32
  302. stvx v26,r10,$sp
  303. addi r10,r10,32
  304. stvx v27,r11,$sp
  305. addi r11,r11,32
  306. stvx v28,r10,$sp
  307. addi r10,r10,32
  308. stvx v29,r11,$sp
  309. addi r11,r11,32
  310. stvx v30,r10,$sp
  311. stvx v31,r11,$sp
  312. stw r7,`$FRAME-4`($sp) # save vrsave
  313. li r0, -1
  314. $PUSH r6,`$FRAME+$LRSAVE`($sp)
  315. mtspr 256, r0 # preserve all AltiVec registers
  316. bl _vpaes_encrypt_preheat
  317. ?lvsl $inpperm, 0, $inp # prepare for unaligned access
  318. lvx v0, 0, $inp
  319. addi $inp, $inp, 15 # 15 is not a typo
  320. ?lvsr $outperm, 0, $out
  321. ?lvsl $keyperm, 0, $key # prepare for unaligned access
  322. lvx $inptail, 0, $inp # redundant in aligned case
  323. ?vperm v0, v0, $inptail, $inpperm
  324. bl _vpaes_encrypt_core
  325. andi. r8, $out, 15
  326. li r9, 16
  327. beq Lenc_out_aligned
  328. vperm v0, v0, v0, $outperm # rotate right/left
  329. mtctr r9
  330. Lenc_out_unaligned:
  331. stvebx v0, 0, $out
  332. addi $out, $out, 1
  333. bdnz Lenc_out_unaligned
  334. b Lenc_done
  335. .align 4
  336. Lenc_out_aligned:
  337. stvx v0, 0, $out
  338. Lenc_done:
  339. li r10,`15+6*$SIZE_T`
  340. li r11,`31+6*$SIZE_T`
  341. mtlr r6
  342. mtspr 256, r7 # restore vrsave
  343. lvx v20,r10,$sp
  344. addi r10,r10,32
  345. lvx v21,r11,$sp
  346. addi r11,r11,32
  347. lvx v22,r10,$sp
  348. addi r10,r10,32
  349. lvx v23,r11,$sp
  350. addi r11,r11,32
  351. lvx v24,r10,$sp
  352. addi r10,r10,32
  353. lvx v25,r11,$sp
  354. addi r11,r11,32
  355. lvx v26,r10,$sp
  356. addi r10,r10,32
  357. lvx v27,r11,$sp
  358. addi r11,r11,32
  359. lvx v28,r10,$sp
  360. addi r10,r10,32
  361. lvx v29,r11,$sp
  362. addi r11,r11,32
  363. lvx v30,r10,$sp
  364. lvx v31,r11,$sp
  365. addi $sp,$sp,$FRAME
  366. blr
  367. .long 0
  368. .byte 0,12,0x04,1,0x80,0,3,0
  369. .long 0
  370. .size .vpaes_encrypt,.-.vpaes_encrypt
  371. .align 4
  372. _vpaes_decrypt_preheat:
  373. mflr r8
  374. bl Lconsts
  375. mtlr r8
  376. li r11, 0xc0 # Lk_inv
  377. li r10, 0xd0
  378. li r9, 0x160 # Lk_dipt
  379. li r8, 0x170
  380. vxor v7, v7, v7 # 0x00..00
  381. vspltisb v8,4 # 0x04..04
  382. vspltisb v9,0x0f # 0x0f..0f
  383. lvx $invlo, r12, r11
  384. li r11, 0x180
  385. lvx $invhi, r12, r10
  386. li r10, 0x190
  387. lvx $iptlo, r12, r9
  388. li r9, 0x1a0
  389. lvx $ipthi, r12, r8
  390. li r8, 0x1b0
  391. lvx $sbou, r12, r11
  392. li r11, 0x1c0
  393. lvx $sbot, r12, r10
  394. li r10, 0x1d0
  395. lvx $sb9u, r12, r9
  396. li r9, 0x1e0
  397. lvx $sb9t, r12, r8
  398. li r8, 0x1f0
  399. lvx $sbdu, r12, r11
  400. li r11, 0x200
  401. lvx $sbdt, r12, r10
  402. li r10, 0x210
  403. lvx $sbbu, r12, r9
  404. lvx $sbbt, r12, r8
  405. lvx $sbeu, r12, r11
  406. lvx $sbet, r12, r10
  407. blr
  408. .long 0
  409. .byte 0,12,0x14,0,0,0,0,0
  410. ##
  411. ## Decryption core
  412. ##
  413. ## Same API as encryption core.
  414. ##
  415. .align 4
  416. _vpaes_decrypt_core:
  417. lwz r8, 240($key) # pull rounds
  418. li r9, 16
  419. lvx v5, 0, $key # vmovdqu (%r9), %xmm4 # round0 key
  420. li r11, 0x30
  421. lvx v6, r9, $key
  422. addi r9, r9, 16
  423. ?vperm v5, v5, v6, $keyperm # align round key
  424. vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
  425. vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
  426. vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm1, %xmm0
  427. vxor v0, v0, v5 # vpxor %xmm4, %xmm2, %xmm2
  428. vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
  429. mtctr r8
  430. b Ldec_entry
  431. .align 4
  432. Ldec_loop:
  433. #
  434. # Inverse mix columns
  435. #
  436. lvx v0, r12, r11 # v5 and v0 are flipped
  437. # vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
  438. # vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
  439. vperm v4, $sb9u, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
  440. subi r11, r11, 16
  441. vperm v1, $sb9t, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
  442. andi. r11, r11, 0x30
  443. vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0
  444. # vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
  445. vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  446. # vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
  447. vperm v4, $sbdu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
  448. vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  449. vperm v1, $sbdt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
  450. vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  451. # vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
  452. vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  453. # vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
  454. vperm v4, $sbbu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
  455. vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  456. vperm v1, $sbbt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
  457. vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  458. # vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
  459. vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  460. # vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
  461. vperm v4, $sbeu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
  462. vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  463. vperm v1, $sbet, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
  464. vxor v0, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  465. vxor v0, v0, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  466. Ldec_entry:
  467. # top of round
  468. vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
  469. vperm v2, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
  470. vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
  471. vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
  472. vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
  473. vand v0, v0, v9
  474. vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
  475. vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
  476. vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
  477. vmr v5, v6
  478. lvx v6, r9, $key # vmovdqu (%r9), %xmm0
  479. vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
  480. addi r9, r9, 16
  481. vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
  482. ?vperm v5, v5, v6, $keyperm # align round key
  483. vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
  484. bdnz Ldec_loop
  485. # middle of last round
  486. addi r10, r11, 0x80
  487. # vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
  488. vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
  489. # vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
  490. lvx v2, r12, r10 # vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
  491. vperm v1, $sbot, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
  492. vxor v4, v4, v5 # vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
  493. vxor v0, v1, v4 # vpxor %xmm4, %xmm1, %xmm0 # 0 = A
  494. vperm v0, v0, v7, v2 # vpshufb %xmm2, %xmm0, %xmm0
  495. blr
  496. .long 0
  497. .byte 0,12,0x14,0,0,0,0,0
  498. .globl .vpaes_decrypt
  499. .align 5
  500. .vpaes_decrypt:
  501. $STU $sp,-$FRAME($sp)
  502. li r10,`15+6*$SIZE_T`
  503. li r11,`31+6*$SIZE_T`
  504. mflr r6
  505. mfspr r7, 256 # save vrsave
  506. stvx v20,r10,$sp
  507. addi r10,r10,32
  508. stvx v21,r11,$sp
  509. addi r11,r11,32
  510. stvx v22,r10,$sp
  511. addi r10,r10,32
  512. stvx v23,r11,$sp
  513. addi r11,r11,32
  514. stvx v24,r10,$sp
  515. addi r10,r10,32
  516. stvx v25,r11,$sp
  517. addi r11,r11,32
  518. stvx v26,r10,$sp
  519. addi r10,r10,32
  520. stvx v27,r11,$sp
  521. addi r11,r11,32
  522. stvx v28,r10,$sp
  523. addi r10,r10,32
  524. stvx v29,r11,$sp
  525. addi r11,r11,32
  526. stvx v30,r10,$sp
  527. stvx v31,r11,$sp
  528. stw r7,`$FRAME-4`($sp) # save vrsave
  529. li r0, -1
  530. $PUSH r6,`$FRAME+$LRSAVE`($sp)
  531. mtspr 256, r0 # preserve all AltiVec registers
  532. bl _vpaes_decrypt_preheat
  533. ?lvsl $inpperm, 0, $inp # prepare for unaligned access
  534. lvx v0, 0, $inp
  535. addi $inp, $inp, 15 # 15 is not a typo
  536. ?lvsr $outperm, 0, $out
  537. ?lvsl $keyperm, 0, $key
  538. lvx $inptail, 0, $inp # redundant in aligned case
  539. ?vperm v0, v0, $inptail, $inpperm
  540. bl _vpaes_decrypt_core
  541. andi. r8, $out, 15
  542. li r9, 16
  543. beq Ldec_out_aligned
  544. vperm v0, v0, v0, $outperm # rotate right/left
  545. mtctr r9
  546. Ldec_out_unaligned:
  547. stvebx v0, 0, $out
  548. addi $out, $out, 1
  549. bdnz Ldec_out_unaligned
  550. b Ldec_done
  551. .align 4
  552. Ldec_out_aligned:
  553. stvx v0, 0, $out
  554. Ldec_done:
  555. li r10,`15+6*$SIZE_T`
  556. li r11,`31+6*$SIZE_T`
  557. mtlr r6
  558. mtspr 256, r7 # restore vrsave
  559. lvx v20,r10,$sp
  560. addi r10,r10,32
  561. lvx v21,r11,$sp
  562. addi r11,r11,32
  563. lvx v22,r10,$sp
  564. addi r10,r10,32
  565. lvx v23,r11,$sp
  566. addi r11,r11,32
  567. lvx v24,r10,$sp
  568. addi r10,r10,32
  569. lvx v25,r11,$sp
  570. addi r11,r11,32
  571. lvx v26,r10,$sp
  572. addi r10,r10,32
  573. lvx v27,r11,$sp
  574. addi r11,r11,32
  575. lvx v28,r10,$sp
  576. addi r10,r10,32
  577. lvx v29,r11,$sp
  578. addi r11,r11,32
  579. lvx v30,r10,$sp
  580. lvx v31,r11,$sp
  581. addi $sp,$sp,$FRAME
  582. blr
  583. .long 0
  584. .byte 0,12,0x04,1,0x80,0,3,0
  585. .long 0
  586. .size .vpaes_decrypt,.-.vpaes_decrypt
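##
## vpaes_cbc_encrypt: CBC en/decrypt of a multiple of 16 bytes (lengths
## below 16 return immediately, and the low 4 bits of the length are
## ignored).  Assumed C prototype, inferred from the register usage below:
##   void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
##                          size_t length, const AES_KEY *key,
##                          unsigned char *ivec, int enc);
## r3 = in, r4 = out, r5 = length, r6 = key, r7 = ivec, r8 = enc
## (non-zero selects encryption).
##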
  587. .globl .vpaes_cbc_encrypt
  588. .align 5
  589. .vpaes_cbc_encrypt:
  590. ${UCMP}i r5,16
  591. bltlr-
  592. $STU $sp,-`($FRAME+2*$SIZE_T)`($sp)
  593. mflr r0
  594. li r10,`15+6*$SIZE_T`
  595. li r11,`31+6*$SIZE_T`
  596. mfspr r12, 256
  597. stvx v20,r10,$sp
  598. addi r10,r10,32
  599. stvx v21,r11,$sp
  600. addi r11,r11,32
  601. stvx v22,r10,$sp
  602. addi r10,r10,32
  603. stvx v23,r11,$sp
  604. addi r11,r11,32
  605. stvx v24,r10,$sp
  606. addi r10,r10,32
  607. stvx v25,r11,$sp
  608. addi r11,r11,32
  609. stvx v26,r10,$sp
  610. addi r10,r10,32
  611. stvx v27,r11,$sp
  612. addi r11,r11,32
  613. stvx v28,r10,$sp
  614. addi r10,r10,32
  615. stvx v29,r11,$sp
  616. addi r11,r11,32
  617. stvx v30,r10,$sp
  618. stvx v31,r11,$sp
  619. stw r12,`$FRAME-4`($sp) # save vrsave
  620. $PUSH r30,`$FRAME+$SIZE_T*0`($sp)
  621. $PUSH r31,`$FRAME+$SIZE_T*1`($sp)
  622. li r9, -16
  623. $PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
  624. and r30, r5, r9 # copy length&-16
  625. andi. r9, $out, 15 # is $out aligned?
  626. mr r5, r6 # copy pointer to key
  627. mr r31, r7 # copy pointer to iv
  628. li r6, -1
  629. mcrf cr1, cr0 # put aside $out alignment flag
  630. mr r7, r12 # copy vrsave
  631. mtspr 256, r6 # preserve all AltiVec registers
  632. lvx v24, 0, r31 # load [potentially unaligned] iv
  633. li r9, 15
  634. ?lvsl $inpperm, 0, r31
  635. lvx v25, r9, r31
  636. ?vperm v24, v24, v25, $inpperm
  637. cmpwi r8, 0 # test direction
  638. neg r8, $inp # prepare for unaligned access
  639. vxor v7, v7, v7
  640. ?lvsl $keyperm, 0, $key
  641. ?lvsr $outperm, 0, $out
  642. ?lvsr $inpperm, 0, r8 # -$inp
  643. vnor $outmask, v7, v7 # 0xff..ff
  644. lvx $inptail, 0, $inp
  645. ?vperm $outmask, v7, $outmask, $outperm
  646. addi $inp, $inp, 15 # 15 is not a typo
  647. beq Lcbc_decrypt
  648. bl _vpaes_encrypt_preheat
  649. li r0, 16
  650. beq cr1, Lcbc_enc_loop # $out is aligned
  651. vmr v0, $inptail
  652. lvx $inptail, 0, $inp
  653. addi $inp, $inp, 16
  654. ?vperm v0, v0, $inptail, $inpperm
  655. vxor v0, v0, v24 # ^= iv
  656. bl _vpaes_encrypt_core
  657. andi. r8, $out, 15
  658. vmr v24, v0 # put aside iv
  659. sub r9, $out, r8
  660. vperm $outhead, v0, v0, $outperm # rotate right/left
  661. Lcbc_enc_head:
  662. stvebx $outhead, r8, r9
  663. cmpwi r8, 15
  664. addi r8, r8, 1
  665. bne Lcbc_enc_head
  666. sub. r30, r30, r0 # len -= 16
  667. addi $out, $out, 16
  668. beq Lcbc_unaligned_done
  669. Lcbc_enc_loop:
  670. vmr v0, $inptail
  671. lvx $inptail, 0, $inp
  672. addi $inp, $inp, 16
  673. ?vperm v0, v0, $inptail, $inpperm
  674. vxor v0, v0, v24 # ^= iv
  675. bl _vpaes_encrypt_core
  676. vmr v24, v0 # put aside iv
  677. sub. r30, r30, r0 # len -= 16
  678. vperm v0, v0, v0, $outperm # rotate right/left
  679. vsel v1, $outhead, v0, $outmask
  680. vmr $outhead, v0
  681. stvx v1, 0, $out
  682. addi $out, $out, 16
  683. bne Lcbc_enc_loop
  684. b Lcbc_done
  685. .align 5
  686. Lcbc_decrypt:
  687. bl _vpaes_decrypt_preheat
  688. li r0, 16
  689. beq cr1, Lcbc_dec_loop # $out is aligned
  690. vmr v0, $inptail
  691. lvx $inptail, 0, $inp
  692. addi $inp, $inp, 16
  693. ?vperm v0, v0, $inptail, $inpperm
  694. vmr v25, v0 # put aside input
  695. bl _vpaes_decrypt_core
  696. andi. r8, $out, 15
  697. vxor v0, v0, v24 # ^= iv
  698. vmr v24, v25
  699. sub r9, $out, r8
  700. vperm $outhead, v0, v0, $outperm # rotate right/left
  701. Lcbc_dec_head:
  702. stvebx $outhead, r8, r9
  703. cmpwi r8, 15
  704. addi r8, r8, 1
  705. bne Lcbc_dec_head
  706. sub. r30, r30, r0 # len -= 16
  707. addi $out, $out, 16
  708. beq Lcbc_unaligned_done
  709. Lcbc_dec_loop:
  710. vmr v0, $inptail
  711. lvx $inptail, 0, $inp
  712. addi $inp, $inp, 16
  713. ?vperm v0, v0, $inptail, $inpperm
  714. vmr v25, v0 # put aside input
  715. bl _vpaes_decrypt_core
  716. vxor v0, v0, v24 # ^= iv
  717. vmr v24, v25
  718. sub. r30, r30, r0 # len -= 16
  719. vperm v0, v0, v0, $outperm # rotate right/left
  720. vsel v1, $outhead, v0, $outmask
  721. vmr $outhead, v0
  722. stvx v1, 0, $out
  723. addi $out, $out, 16
  724. bne Lcbc_dec_loop
  725. Lcbc_done:
  726. beq cr1, Lcbc_write_iv # $out is aligned
  727. Lcbc_unaligned_done:
  728. andi. r8, $out, 15
  729. sub $out, $out, r8
  730. li r9, 0
  731. Lcbc_tail:
  732. stvebx $outhead, r9, $out
  733. addi r9, r9, 1
  734. cmpw r9, r8
  735. bne Lcbc_tail
  736. Lcbc_write_iv:
  737. neg r8, r31 # write [potentially unaligned] iv
  738. li r10, 4
  739. ?lvsl $outperm, 0, r8
  740. li r11, 8
  741. li r12, 12
  742. vperm v24, v24, v24, $outperm # rotate right/left
  743. stvewx v24, 0, r31 # ivp is at least 32-bit aligned
  744. stvewx v24, r10, r31
  745. stvewx v24, r11, r31
  746. stvewx v24, r12, r31
  747. mtspr 256, r7 # restore vrsave
  748. li r10,`15+6*$SIZE_T`
  749. li r11,`31+6*$SIZE_T`
  750. lvx v20,r10,$sp
  751. addi r10,r10,32
  752. lvx v21,r11,$sp
  753. addi r11,r11,32
  754. lvx v22,r10,$sp
  755. addi r10,r10,32
  756. lvx v23,r11,$sp
  757. addi r11,r11,32
  758. lvx v24,r10,$sp
  759. addi r10,r10,32
  760. lvx v25,r11,$sp
  761. addi r11,r11,32
  762. lvx v26,r10,$sp
  763. addi r10,r10,32
  764. lvx v27,r11,$sp
  765. addi r11,r11,32
  766. lvx v28,r10,$sp
  767. addi r10,r10,32
  768. lvx v29,r11,$sp
  769. addi r11,r11,32
  770. lvx v30,r10,$sp
  771. lvx v31,r11,$sp
  772. Lcbc_abort:
  773. $POP r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
  774. $POP r30,`$FRAME+$SIZE_T*0`($sp)
  775. $POP r31,`$FRAME+$SIZE_T*1`($sp)
  776. mtlr r0
  777. addi $sp,$sp,`$FRAME+$SIZE_T*2`
  778. blr
  779. .long 0
  780. .byte 0,12,0x04,1,0x80,2,6,0
  781. .long 0
  782. .size .vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
  783. ___
  784. }
  785. {
  786. my ($inp,$bits,$out)=map("r$_",(3..5));
  787. my $dir="cr1";
  788. my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));
  789. $code.=<<___;
  790. ########################################################
  791. ## ##
  792. ## AES key schedule ##
  793. ## ##
  794. ########################################################
  795. .align 4
  796. _vpaes_key_preheat:
  797. mflr r8
  798. bl Lconsts
  799. mtlr r8
  800. li r11, 0xc0 # Lk_inv
  801. li r10, 0xd0
  802. li r9, 0xe0 # Lk_ipt
  803. li r8, 0xf0
  804. vspltisb v8,4 # 0x04..04
  805. vxor v9,v9,v9 # 0x00..00
  806. lvx $invlo, r12, r11 # Lk_inv
  807. li r11, 0x120
  808. lvx $invhi, r12, r10
  809. li r10, 0x130
  810. lvx $iptlo, r12, r9 # Lk_ipt
  811. li r9, 0x220
  812. lvx $ipthi, r12, r8
  813. li r8, 0x230
  814. lvx v14, r12, r11 # Lk_sb1
  815. li r11, 0x240
  816. lvx v15, r12, r10
  817. li r10, 0x250
  818. lvx v16, r12, r9 # Lk_dksd
  819. li r9, 0x260
  820. lvx v17, r12, r8
  821. li r8, 0x270
  822. lvx v18, r12, r11 # Lk_dksb
  823. li r11, 0x280
  824. lvx v19, r12, r10
  825. li r10, 0x290
  826. lvx v20, r12, r9 # Lk_dkse
  827. li r9, 0x2a0
  828. lvx v21, r12, r8
  829. li r8, 0x2b0
  830. lvx v22, r12, r11 # Lk_dks9
  831. lvx v23, r12, r10
  832. lvx v24, r12, r9 # Lk_rcon
  833. lvx v25, 0, r12 # Lk_mc_forward[0]
  834. lvx v26, r12, r8 # Lk_s63
  835. blr
  836. .long 0
  837. .byte 0,12,0x14,0,0,0,0,0
  838. .align 4
  839. _vpaes_schedule_core:
  840. mflr r7
  841. bl _vpaes_key_preheat # load the tables
  842. #lvx v0, 0, $inp # vmovdqu (%rdi), %xmm0 # load key (unaligned)
  843. neg r8, $inp # prepare for unaligned access
  844. lvx v0, 0, $inp
  845. addi $inp, $inp, 15 # 15 is not a typo
  846. ?lvsr $inpperm, 0, r8 # -$inp
  847. lvx v6, 0, $inp # v6 serves as inptail
  848. addi $inp, $inp, 8
  849. ?vperm v0, v0, v6, $inpperm
  850. # input transform
  851. vmr v3, v0 # vmovdqa %xmm0, %xmm3
  852. bl _vpaes_schedule_transform
  853. vmr v7, v0 # vmovdqa %xmm0, %xmm7
  854. bne $dir, Lschedule_am_decrypting
  855. # encrypting, output zeroth round key after transform
  856. li r8, 0x30 # mov \$0x30,%r8d
  857. li r9, 4
  858. li r10, 8
  859. li r11, 12
  860. ?lvsr $outperm, 0, $out # prepare for unaligned access
  861. vnor $outmask, v9, v9 # 0xff..ff
  862. ?vperm $outmask, v9, $outmask, $outperm
  863. #stvx v0, 0, $out # vmovdqu %xmm0, (%rdx)
  864. vperm $outhead, v0, v0, $outperm # rotate right/left
  865. stvewx $outhead, 0, $out # some are superfluous
  866. stvewx $outhead, r9, $out
  867. stvewx $outhead, r10, $out
  868. addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
  869. stvewx $outhead, r11, $out
  870. b Lschedule_go
  871. Lschedule_am_decrypting:
  872. srwi r8, $bits, 1 # shr \$1,%r8d
  873. andi. r8, r8, 32 # and \$32,%r8d
  874. xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
  875. addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
  876. # decrypting, output zeroth round key after shiftrows
  877. lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
  878. li r9, 4
  879. li r10, 8
  880. li r11, 12
  881. vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
  882. neg r0, $out # prepare for unaligned access
  883. ?lvsl $outperm, 0, r0
  884. vnor $outmask, v9, v9 # 0xff..ff
  885. ?vperm $outmask, $outmask, v9, $outperm
  886. #stvx v4, 0, $out # vmovdqu %xmm3, (%rdx)
  887. vperm $outhead, v4, v4, $outperm # rotate right/left
  888. stvewx $outhead, 0, $out # some are superfluous
  889. stvewx $outhead, r9, $out
  890. stvewx $outhead, r10, $out
  891. addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
  892. stvewx $outhead, r11, $out
  893. addi $out, $out, 15 # 15 is not a typo
  894. xori r8, r8, 0x30 # xor \$0x30, %r8
  895. Lschedule_go:
  896. cmplwi $bits, 192 # cmp \$192, %esi
  897. bgt Lschedule_256
  898. beq Lschedule_192
  899. # 128: fall through
  900. ##
  901. ## .schedule_128
  902. ##
  903. ## 128-bit specific part of key schedule.
  904. ##
  905. ## This schedule is really simple, because all its parts
  906. ## are accomplished by the subroutines.
  907. ##
  908. Lschedule_128:
  909. li r0, 10 # mov \$10, %esi
  910. mtctr r0
  911. Loop_schedule_128:
  912. bl _vpaes_schedule_round
  913. bdz Lschedule_mangle_last # dec %esi
  914. bl _vpaes_schedule_mangle # write output
  915. b Loop_schedule_128
  916. ##
  917. ## .aes_schedule_192
  918. ##
  919. ## 192-bit specific part of key schedule.
  920. ##
  921. ## The main body of this schedule is the same as the 128-bit
  922. ## schedule, but with more smearing. The long, high side is
  923. ## stored in %xmm7 as before, and the short, low side is in
  924. ## the high bits of %xmm6.
  925. ##
  926. ## This schedule is somewhat nastier, however, because each
  927. ## round produces 192 bits of key material, or 1.5 round keys.
  928. ## Therefore, on each cycle we do 2 rounds and produce 3 round
  929. ## keys.
  930. ##
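## (Round-key accounting, added as a sanity check: AES-192 needs 13 round
## keys.  One is written before Lschedule_go, each of the first three passes
## of the loop below writes three via _vpaes_schedule_mangle, the fourth
## pass writes two before bdz fires, and Lschedule_mangle_last writes the
## final one: 1 + 3*3 + 2 + 1 = 13.)
##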
  931. .align 4
  932. Lschedule_192:
  933. li r0, 4 # mov \$4, %esi
  934. lvx v0, 0, $inp
  935. ?vperm v0, v6, v0, $inpperm
  936. ?vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
  937. bl _vpaes_schedule_transform # input transform
  938. ?vsldoi v6, v0, v9, 8
  939. ?vsldoi v6, v9, v6, 8 # clobber "low" side with zeros
  940. mtctr r0
  941. Loop_schedule_192:
  942. bl _vpaes_schedule_round
  943. ?vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0
  944. bl _vpaes_schedule_mangle # save key n
  945. bl _vpaes_schedule_192_smear
  946. bl _vpaes_schedule_mangle # save key n+1
  947. bl _vpaes_schedule_round
  948. bdz Lschedule_mangle_last # dec %esi
  949. bl _vpaes_schedule_mangle # save key n+2
  950. bl _vpaes_schedule_192_smear
  951. b Loop_schedule_192
  952. ##
  953. ## .aes_schedule_256
  954. ##
  955. ## 256-bit specific part of key schedule.
  956. ##
  957. ## The structure here is very similar to the 128-bit
  958. ## schedule, but with an additional "low side" in
  959. ## %xmm6. The low side's rounds are the same as the
  960. ## high side's, except no rcon and no rotation.
  961. ##
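## (Round-key accounting, same spirit as for 192 above: AES-256 needs 15
## round keys.  One is written before Lschedule_go, the first six passes of
## the loop below write two each, the seventh writes one before bdz fires,
## and Lschedule_mangle_last writes the last: 1 + 6*2 + 1 + 1 = 15.)
##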
  962. .align 4
  963. Lschedule_256:
  964. li r0, 7 # mov \$7, %esi
  965. addi $inp, $inp, 8
  966. lvx v0, 0, $inp # vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
  967. ?vperm v0, v6, v0, $inpperm
  968. bl _vpaes_schedule_transform # input transform
  969. mtctr r0
  970. Loop_schedule_256:
  971. bl _vpaes_schedule_mangle # output low result
  972. vmr v6, v0 # vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
  973. # high round
  974. bl _vpaes_schedule_round
  975. bdz Lschedule_mangle_last # dec %esi
  976. bl _vpaes_schedule_mangle
  977. # low round. swap xmm7 and xmm6
  978. ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
  979. vmr v5, v7 # vmovdqa %xmm7, %xmm5
  980. vmr v7, v6 # vmovdqa %xmm6, %xmm7
  981. bl _vpaes_schedule_low_round
  982. vmr v7, v5 # vmovdqa %xmm5, %xmm7
  983. b Loop_schedule_256
  984. ##
  985. ## .aes_schedule_mangle_last
  986. ##
  987. ## Mangler for last round of key schedule
  988. ## Mangles %xmm0
  989. ## when encrypting, outputs out(%xmm0) ^ 63
  990. ## when decrypting, outputs unskew(%xmm0)
  991. ##
  992. ## Always called right before return... jumps to cleanup and exits
  993. ##
  994. .align 4
  995. Lschedule_mangle_last:
  996. # schedule last round key from xmm0
  997. li r11, 0x2e0 # lea .Lk_deskew(%rip),%r11
  998. li r9, 0x2f0
  999. bne $dir, Lschedule_mangle_last_dec
  1000. # encrypting
  1001. lvx v1, r8, r10 # vmovdqa (%r8,%r10),%xmm1
  1002. li r11, 0x2c0 # lea .Lk_opt(%rip), %r11 # prepare to output transform
  1003. li r9, 0x2d0 # prepare to output transform
  1004. vperm v0, v0, v0, v1 # vpshufb %xmm1, %xmm0, %xmm0 # output permute
  1005. lvx $iptlo, r11, r12 # reload $ipt
  1006. lvx $ipthi, r9, r12
  1007. addi $out, $out, 16 # add \$16, %rdx
  1008. vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
  1009. bl _vpaes_schedule_transform # output transform
  1010. #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
  1011. vperm v0, v0, v0, $outperm # rotate right/left
  1012. li r10, 4
  1013. vsel v2, $outhead, v0, $outmask
  1014. li r11, 8
  1015. stvx v2, 0, $out
  1016. li r12, 12
  1017. stvewx v0, 0, $out # some (or all) are redundant
  1018. stvewx v0, r10, $out
  1019. stvewx v0, r11, $out
  1020. stvewx v0, r12, $out
  1021. b Lschedule_mangle_done
  1022. .align 4
  1023. Lschedule_mangle_last_dec:
  1024. lvx $iptlo, r11, r12 # reload $ipt
  1025. lvx $ipthi, r9, r12
  1026. addi $out, $out, -16 # add \$-16, %rdx
  1027. vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
  1028. bl _vpaes_schedule_transform # output transform
  1029. #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
  1030. addi r9, $out, -15 # -15 is not a typo
  1031. vperm v0, v0, v0, $outperm # rotate right/left
  1032. li r10, 4
  1033. vsel v2, $outhead, v0, $outmask
  1034. li r11, 8
  1035. stvx v2, 0, $out
  1036. li r12, 12
  1037. stvewx v0, 0, r9 # some (or all) are redundant
  1038. stvewx v0, r10, r9
  1039. stvewx v0, r11, r9
  1040. stvewx v0, r12, r9
  1041. Lschedule_mangle_done:
  1042. mtlr r7
  1043. # cleanup
  1044. vxor v0, v0, v0 # vpxor %xmm0, %xmm0, %xmm0
  1045. vxor v1, v1, v1 # vpxor %xmm1, %xmm1, %xmm1
  1046. vxor v2, v2, v2 # vpxor %xmm2, %xmm2, %xmm2
  1047. vxor v3, v3, v3 # vpxor %xmm3, %xmm3, %xmm3
  1048. vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
  1049. vxor v5, v5, v5 # vpxor %xmm5, %xmm5, %xmm5
  1050. vxor v6, v6, v6 # vpxor %xmm6, %xmm6, %xmm6
  1051. vxor v7, v7, v7 # vpxor %xmm7, %xmm7, %xmm7
  1052. blr
  1053. .long 0
  1054. .byte 0,12,0x14,0,0,0,0,0
  1055. ##
  1056. ## .aes_schedule_192_smear
  1057. ##
  1058. ## Smear the short, low side in the 192-bit key schedule.
  1059. ##
  1060. ## Inputs:
  1061. ## %xmm7: high side, b a x y
  1062. ## %xmm6: low side, d c 0 0
  1063. ## %xmm13: 0
  1064. ##
  1065. ## Outputs:
  1066. ## %xmm6: b+c+d b+c 0 0
  1067. ## %xmm0: b+c+d b+c b a
  1068. ##
  1069. .align 4
  1070. _vpaes_schedule_192_smear:
  1071. ?vspltw v0, v7, 3
  1072. ?vsldoi v1, v9, v6, 12 # vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
  1073. ?vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
  1074. vxor v6, v6, v1 # vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
  1075. vxor v6, v6, v0 # vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
  1076. vmr v0, v6
  1077. ?vsldoi v6, v6, v9, 8
  1078. ?vsldoi v6, v9, v6, 8 # clobber low side with zeros
  1079. blr
  1080. .long 0
  1081. .byte 0,12,0x14,0,0,0,0,0
  1082. ##
  1083. ## .aes_schedule_round
  1084. ##
  1085. ## Runs one main round of the key schedule on %xmm0, %xmm7
  1086. ##
  1087. ## Specifically, runs subbytes on the high dword of %xmm0
  1088. ## then rotates it by one byte and xors into the low dword of
  1089. ## %xmm7.
  1090. ##
  1091. ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
  1092. ## next rcon.
  1093. ##
  1094. ## Smears the dwords of %xmm7 by xoring the low into the
  1095. ## second low, result into third, result into highest.
  1096. ##
  1097. ## Returns results in %xmm7 = %xmm0.
  1098. ## Clobbers %xmm1-%xmm4, %r11.
  1099. ##
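## (For orientation only -- a rough scalar sketch of the expansion step this
## corresponds to, in FIPS-197-like notation; the actual code works on the
## basis-changed vpaes representation:
##     t      = SubWord(RotWord(w[i-1])) ^ Rcon
##     w[i]   = w[i-4] ^ t
##     w[i+1] = w[i-3] ^ w[i]        # the dword "smear"
##     w[i+2] = w[i-2] ^ w[i+1]
##     w[i+3] = w[i-1] ^ w[i+2]
## )
##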
  1100. .align 4
  1101. _vpaes_schedule_round:
  1102. # extract rcon from xmm8
  1103. #vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
  1104. ?vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1
  1105. ?vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8
  1106. vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
  1107. # rotate
  1108. ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
  1109. ?vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0
  1110. # fall through...
  1111. # low round: same as high round, but no rotation and no rcon.
  1112. _vpaes_schedule_low_round:
  1113. # smear xmm7
  1114. ?vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1
  1115. vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
  1116. vspltisb v1, 0x0f # 0x0f..0f
  1117. ?vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4
  1118. # subbytes
  1119. vand v1, v1, v0 # vpand %xmm9, %xmm0, %xmm1 # 0 = k
  1120. vsrb v0, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
  1121. vxor v7, v7, v4 # vpxor %xmm4, %xmm7, %xmm7
  1122. vperm v2, $invhi, v9, v1 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
  1123. vxor v1, v1, v0 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
  1124. vperm v3, $invlo, v9, v0 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
  1125. vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
  1126. vperm v4, $invlo, v9, v1 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
  1127. vxor v7, v7, v26 # vpxor .Lk_s63(%rip), %xmm7, %xmm7
  1128. vperm v3, $invlo, v9, v3 # vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
  1129. vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
  1130. vperm v2, $invlo, v9, v4 # vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
  1131. vxor v3, v3, v1 # vpxor %xmm1, %xmm3, %xmm3 # 2 = io
  1132. vxor v2, v2, v0 # vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
  1133. vperm v4, v15, v9, v3 # vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
  1134. vperm v1, v14, v9, v2 # vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
  1135. vxor v1, v1, v4 # vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
  1136. # add in smeared stuff
  1137. vxor v0, v1, v7 # vpxor %xmm7, %xmm1, %xmm0
  1138. vxor v7, v1, v7 # vmovdqa %xmm0, %xmm7
  1139. blr
  1140. .long 0
  1141. .byte 0,12,0x14,0,0,0,0,0
  1142. ##
  1143. ## .aes_schedule_transform
  1144. ##
  1145. ## Linear-transform %xmm0 according to tables at (%r11)
  1146. ##
  1147. ## Requires that %xmm9 = 0x0F0F... as in preheat
  1148. ## Output in %xmm0
  1149. ## Clobbers %xmm2
  1150. ##
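## (In scalar terms this is the usual split-nibble table lookup -- a sketch
## with hypothetical table names:
##     out[i] = Tlo[ in[i] & 0x0f ] ^ Thi[ in[i] >> 4 ]
## where Tlo/Thi are the two 16-byte tables the caller has loaded, e.g.
## Lk_ipt, Lk_opt or Lk_deskew.)
##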
  1151. .align 4
  1152. _vpaes_schedule_transform:
  1153. #vand v1, v0, v9 # vpand %xmm9, %xmm0, %xmm1
  1154. vsrb v2, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
  1155. # vmovdqa (%r11), %xmm2 # lo
  1156. vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
  1157. # vmovdqa 16(%r11), %xmm1 # hi
  1158. vperm v2, $ipthi, $ipthi, v2 # vpshufb %xmm0, %xmm1, %xmm0
  1159. vxor v0, v0, v2 # vpxor %xmm2, %xmm0, %xmm0
  1160. blr
  1161. .long 0
  1162. .byte 0,12,0x14,0,0,0,0,0
  1163. ##
  1164. ## .aes_schedule_mangle
  1165. ##
  1166. ## Mangle xmm0 from (basis-transformed) standard version
  1167. ## to our version.
  1168. ##
  1169. ## On encrypt,
  1170. ## xor with 0x63
  1171. ## multiply by circulant 0,1,1,1
  1172. ## apply shiftrows transform
  1173. ##
  1174. ## On decrypt,
  1175. ## xor with 0x63
  1176. ## multiply by "inverse mixcolumns" circulant E,B,D,9
  1177. ## deskew
  1178. ## apply shiftrows transform
  1179. ##
  1180. ##
  1181. ## Writes out to (%rdx), and increments or decrements it
  1182. ## Keeps track of round number mod 4 in %r8
  1183. ## Preserves xmm0
  1184. ## Clobbers xmm1-xmm5
  1185. ##
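## (PPC note: the "multiply by circulant 0,1,1,1" above is realised below as
## three successive vperm rotations by Lk_mc_forward[0] (v25) xor'ed
## together, and the shiftrows step is the final vperm by the Lk_sr entry
## indexed by r8.)
##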
  1186. .align 4
  1187. _vpaes_schedule_mangle:
  1188. #vmr v4, v0 # vmovdqa %xmm0, %xmm4 # save xmm0 for later
  1189. # vmovdqa .Lk_mc_forward(%rip),%xmm5
  1190. bne $dir, Lschedule_mangle_dec
  1191. # encrypting
  1192. vxor v4, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm4
  1193. addi $out, $out, 16 # add \$16, %rdx
  1194. vperm v4, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm4
  1195. vperm v1, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm1
  1196. vperm v3, v1, v1, v25 # vpshufb %xmm5, %xmm1, %xmm3
  1197. vxor v4, v4, v1 # vpxor %xmm1, %xmm4, %xmm4
  1198. lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
  1199. vxor v3, v3, v4 # vpxor %xmm4, %xmm3, %xmm3
  1200. vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
  1201. addi r8, r8, -16 # add \$-16, %r8
  1202. andi. r8, r8, 0x30 # and \$0x30, %r8
  1203. #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
  1204. vperm v1, v3, v3, $outperm # rotate right/left
  1205. vsel v2, $outhead, v1, $outmask
  1206. vmr $outhead, v1
  1207. stvx v2, 0, $out
  1208. blr
  1209. .align 4
  1210. Lschedule_mangle_dec:
  1211. # inverse mix columns
  1212. # lea .Lk_dksd(%rip),%r11
  1213. vsrb v1, v0, v8 # vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
  1214. #and v4, v0, v9 # vpand %xmm9, %xmm4, %xmm4 # 4 = lo
  1215. # vmovdqa 0x00(%r11), %xmm2
  1216. vperm v2, v16, v16, v0 # vpshufb %xmm4, %xmm2, %xmm2
  1217. # vmovdqa 0x10(%r11), %xmm3
  1218. vperm v3, v17, v17, v1 # vpshufb %xmm1, %xmm3, %xmm3
  1219. vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
  1220. vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
  1221. # vmovdqa 0x20(%r11), %xmm2
  1222. vperm v2, v18, v18, v0 # vpshufb %xmm4, %xmm2, %xmm2
  1223. vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
  1224. # vmovdqa 0x30(%r11), %xmm3
  1225. vperm v3, v19, v19, v1 # vpshufb %xmm1, %xmm3, %xmm3
  1226. vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
  1227. vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
  1228. # vmovdqa 0x40(%r11), %xmm2
  1229. vperm v2, v20, v20, v0 # vpshufb %xmm4, %xmm2, %xmm2
  1230. vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
  1231. # vmovdqa 0x50(%r11), %xmm3
  1232. vperm v3, v21, v21, v1 # vpshufb %xmm1, %xmm3, %xmm3
  1233. vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
  1234. # vmovdqa 0x60(%r11), %xmm2
  1235. vperm v2, v22, v22, v0 # vpshufb %xmm4, %xmm2, %xmm2
  1236. vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
  1237. # vmovdqa 0x70(%r11), %xmm4
  1238. vperm v4, v23, v23, v1 # vpshufb %xmm1, %xmm4, %xmm4
  1239. lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
  1240. vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
  1241. vxor v3, v4, v2 # vpxor %xmm2, %xmm4, %xmm3
  1242. addi $out, $out, -16 # add \$-16, %rdx
  1243. vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
  1244. addi r8, r8, -16 # add \$-16, %r8
  1245. andi. r8, r8, 0x30 # and \$0x30, %r8
  1246. #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
  1247. vperm v1, v3, v3, $outperm # rotate right/left
  1248. vsel v2, $outhead, v1, $outmask
  1249. vmr $outhead, v1
  1250. stvx v2, 0, $out
  1251. blr
  1252. .long 0
  1253. .byte 0,12,0x14,0,0,0,0,0
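##
## Assumed C prototypes for the two key-setup entry points below (mirroring
## the other OpenSSL AES modules; r3 = userKey, r4 = bits, r5 = key, and
## both return 0 in r3):
##   int vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
##                             AES_KEY *key);
##   int vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
##                             AES_KEY *key);
##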
  1254. .globl .vpaes_set_encrypt_key
  1255. .align 5
  1256. .vpaes_set_encrypt_key:
  1257. $STU $sp,-$FRAME($sp)
  1258. li r10,`15+6*$SIZE_T`
  1259. li r11,`31+6*$SIZE_T`
  1260. mflr r0
  1261. mfspr r6, 256 # save vrsave
  1262. stvx v20,r10,$sp
  1263. addi r10,r10,32
  1264. stvx v21,r11,$sp
  1265. addi r11,r11,32
  1266. stvx v22,r10,$sp
  1267. addi r10,r10,32
  1268. stvx v23,r11,$sp
  1269. addi r11,r11,32
  1270. stvx v24,r10,$sp
  1271. addi r10,r10,32
  1272. stvx v25,r11,$sp
  1273. addi r11,r11,32
  1274. stvx v26,r10,$sp
  1275. addi r10,r10,32
  1276. stvx v27,r11,$sp
  1277. addi r11,r11,32
  1278. stvx v28,r10,$sp
  1279. addi r10,r10,32
  1280. stvx v29,r11,$sp
  1281. addi r11,r11,32
  1282. stvx v30,r10,$sp
  1283. stvx v31,r11,$sp
  1284. stw r6,`$FRAME-4`($sp) # save vrsave
  1285. li r7, -1
  1286. $PUSH r0, `$FRAME+$LRSAVE`($sp)
  1287. mtspr 256, r7 # preserve all AltiVec registers
  1288. srwi r9, $bits, 5 # shr \$5,%eax
  1289. addi r9, r9, 6 # add \$5,%eax
  1290. stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
  1291. cmplw $dir, $bits, $bits # set encrypt direction
  1292. li r8, 0x30 # mov \$0x30,%r8d
  1293. bl _vpaes_schedule_core
  1294. $POP r0, `$FRAME+$LRSAVE`($sp)
  1295. li r10,`15+6*$SIZE_T`
  1296. li r11,`31+6*$SIZE_T`
  1297. mtspr 256, r6 # restore vrsave
  1298. mtlr r0
  1299. xor r3, r3, r3
  1300. lvx v20,r10,$sp
  1301. addi r10,r10,32
  1302. lvx v21,r11,$sp
  1303. addi r11,r11,32
  1304. lvx v22,r10,$sp
  1305. addi r10,r10,32
  1306. lvx v23,r11,$sp
  1307. addi r11,r11,32
  1308. lvx v24,r10,$sp
  1309. addi r10,r10,32
  1310. lvx v25,r11,$sp
  1311. addi r11,r11,32
  1312. lvx v26,r10,$sp
  1313. addi r10,r10,32
  1314. lvx v27,r11,$sp
  1315. addi r11,r11,32
  1316. lvx v28,r10,$sp
  1317. addi r10,r10,32
  1318. lvx v29,r11,$sp
  1319. addi r11,r11,32
  1320. lvx v30,r10,$sp
  1321. lvx v31,r11,$sp
  1322. addi $sp,$sp,$FRAME
  1323. blr
  1324. .long 0
  1325. .byte 0,12,0x04,1,0x80,0,3,0
  1326. .long 0
  1327. .size .vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key
  1328. .globl .vpaes_set_decrypt_key
  1329. .align 4
  1330. .vpaes_set_decrypt_key:
  1331. $STU $sp,-$FRAME($sp)
  1332. li r10,`15+6*$SIZE_T`
  1333. li r11,`31+6*$SIZE_T`
  1334. mflr r0
  1335. mfspr r6, 256 # save vrsave
  1336. stvx v20,r10,$sp
  1337. addi r10,r10,32
  1338. stvx v21,r11,$sp
  1339. addi r11,r11,32
  1340. stvx v22,r10,$sp
  1341. addi r10,r10,32
  1342. stvx v23,r11,$sp
  1343. addi r11,r11,32
  1344. stvx v24,r10,$sp
  1345. addi r10,r10,32
  1346. stvx v25,r11,$sp
  1347. addi r11,r11,32
  1348. stvx v26,r10,$sp
  1349. addi r10,r10,32
  1350. stvx v27,r11,$sp
  1351. addi r11,r11,32
  1352. stvx v28,r10,$sp
  1353. addi r10,r10,32
  1354. stvx v29,r11,$sp
  1355. addi r11,r11,32
  1356. stvx v30,r10,$sp
  1357. stvx v31,r11,$sp
  1358. stw r6,`$FRAME-4`($sp) # save vrsave
  1359. li r7, -1
  1360. $PUSH r0, `$FRAME+$LRSAVE`($sp)
  1361. mtspr 256, r7 # preserve all AltiVec registers
  1362. srwi r9, $bits, 5 # shr \$5,%eax
  1363. addi r9, r9, 6 # add \$5,%eax
  1364. stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
  1365. slwi r9, r9, 4 # shl \$4,%eax
  1366. add $out, $out, r9 # lea (%rdx,%rax),%rdx
  1367. cmplwi $dir, $bits, 0 # set decrypt direction
  1368. srwi r8, $bits, 1 # shr \$1,%r8d
  1369. andi. r8, r8, 32 # and \$32,%r8d
  1370. xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
  1371. bl _vpaes_schedule_core
  1372. $POP r0, `$FRAME+$LRSAVE`($sp)
  1373. li r10,`15+6*$SIZE_T`
  1374. li r11,`31+6*$SIZE_T`
  1375. mtspr 256, r6 # restore vrsave
  1376. mtlr r0
  1377. xor r3, r3, r3
  1378. lvx v20,r10,$sp
  1379. addi r10,r10,32
  1380. lvx v21,r11,$sp
  1381. addi r11,r11,32
  1382. lvx v22,r10,$sp
  1383. addi r10,r10,32
  1384. lvx v23,r11,$sp
  1385. addi r11,r11,32
  1386. lvx v24,r10,$sp
  1387. addi r10,r10,32
  1388. lvx v25,r11,$sp
  1389. addi r11,r11,32
  1390. lvx v26,r10,$sp
  1391. addi r10,r10,32
  1392. lvx v27,r11,$sp
  1393. addi r11,r11,32
  1394. lvx v28,r10,$sp
  1395. addi r10,r10,32
  1396. lvx v29,r11,$sp
  1397. addi r11,r11,32
  1398. lvx v30,r10,$sp
  1399. lvx v31,r11,$sp
  1400. addi $sp,$sp,$FRAME
  1401. blr
  1402. .long 0
  1403. .byte 0,12,0x04,1,0x80,0,3,0
  1404. .long 0
  1405. .size .vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
  1406. ___
  1407. }
  1408. my $consts=1;
  1409. foreach (split("\n",$code)) {
  1410. s/\`([^\`]*)\`/eval $1/geo;
  1411. # constants table endian-specific conversion
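# Worked example (illustrative): on a little-endian flavour the line
#   .long 0x01020300, 0x05060704, ...   ?inv
# is emitted as ".byte 0x0e,0x0d,0x0c,0x0f,0x0a,..." -- every permute index
# xored with 0xf, mirroring a big-endian lvsl/vperm index for LE use --
# while "?rev" lines have their 16 bytes reversed and "?asis" lines are
# left alone.  On big-endian flavours the bytes are simply emitted in their
# natural order, so the .byte form is correct regardless of host endianness.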
  1412. if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
  1413. my $conv=$2;
  1414. my @bytes=();
  1415. # convert to endian-agnostic format
  1416. foreach (split(/,\s+/,$1)) {
  1417. my $l = /^0/?oct:int;
  1418. push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
  1419. }
  1420. # little-endian conversion
  1421. if ($flavour =~ /le$/o) {
  1422. SWITCH: for($conv) {
  1423. /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
  1424. /\?rev/ && do { @bytes=reverse(@bytes); last; };
  1425. }
  1426. }
  1427. #emit
  1428. print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
  1429. next;
  1430. }
  1431. $consts=0 if (m/Lconsts:/o); # end of table
  1432. # instructions prefixed with '?' are endian-specific and need
  1433. # to be adjusted accordingly...
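# For example (illustrative): on little-endian targets "?lvsl"/"?lvsr" swap
# mnemonics, "?vperm vD,vA,vB,vC" has its vA and vB source operands
# exchanged, "?vsldoi ...,8" becomes a shift by 16-8, and "?vspltw ...,3"
# becomes a splat of word 3-3; on big-endian targets the '?' is stripped.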
  1434. if ($flavour =~ /le$/o) { # little-endian
  1435. s/\?lvsr/lvsl/o or
  1436. s/\?lvsl/lvsr/o or
  1437. s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
  1438. s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
  1439. s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
  1440. } else { # big-endian
  1441. s/\?([a-z]+)/$1/o;
  1442. }
  1443. print $_,"\n";
  1444. }
  1445. close STDOUT;