vpaes-ppc.pl 42 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. ######################################################################
  9. ## Constant-time SSSE3 AES core implementation.
  10. ## version 0.1
  11. ##
  12. ## By Mike Hamburg (Stanford University), 2009
  13. ## Public domain.
  14. ##
  15. ## For details see http://shiftleft.org/papers/vector_aes/ and
  16. ## http://crypto.stanford.edu/vpaes/.
  17. # CBC encrypt/decrypt performance in cycles per byte processed with
  18. # 128-bit key.
  19. #
  20. # aes-ppc.pl this
  21. # PPC74x0/G4e 35.5/52.1/(23.8) 11.9(*)/15.4
  22. # PPC970/G5 37.9/55.0/(28.5) 22.2/28.5
  23. # POWER6 42.7/54.3/(28.2) 63.0/92.8(**)
  24. # POWER7 32.3/42.9/(18.4) 18.5/23.3
  25. #
  26. # (*) This is ~10% worse than reported in paper. The reason is
  27. # twofold. This module doesn't make any assumption about
  28. # key schedule (or data for that matter) alignment and handles
  29. # it in-line. Secondly it, being transliterated from
  30. # vpaes-x86_64.pl, relies on "nested inversion" better suited
  31. # for Intel CPUs.
  32. # (**) Inadequate POWER6 performance is due to astronomic AltiVec
  33. # latency, 9 cycles per simple logical operation.
  34. # $output is the last argument if it looks like a file (it has an extension)
  35. # $flavour is the first argument if it doesn't look like a file
  # ---------------------------------------------------------------------
  # Script preamble: parse the command line, derive the word-size dependent
  # ABI parameters, locate the ppc-xlate.pl post-processor and redirect
  # STDOUT into it.
  #
  # Command line: [flavour] [output]
  #   $output  - last argument, taken only if it has a file extension
  #   $flavour - first argument, taken only if it contains no dot
  # ---------------------------------------------------------------------
  $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

  # Pick pointer size, the offset of the LR save slot and the load/store/
  # compare mnemonics that match the target word size.  Anything that names
  # neither a 64-bit nor a 32-bit flavour is rejected.
  if ($flavour =~ /64/) {
      $SIZE_T = 8;
      $LRSAVE = 2*$SIZE_T;
      $STU    = "stdu";
      $POP    = "ld";
      $PUSH   = "std";
      $UCMP   = "cmpld";
  } elsif ($flavour =~ /32/) {
      $SIZE_T = 4;
      $LRSAVE = $SIZE_T;
      $STU    = "stwu";
      $POP    = "lwz";
      $PUSH   = "stw";
      $UCMP   = "cmplw";
  } else { die "nonsense $flavour"; }

  $sp = "r1";
  # Stack frame: 6 pointer-sized words followed by 13*16 bytes for the
  # v20-v31 offload.  That is twelve 16-byte vector slots plus 16 spare
  # bytes; the generated code stores the vectors starting at offset
  # 15+6*$SIZE_T and keeps VRSAVE in the word at $FRAME-4.
  $FRAME = 6*$SIZE_T+13*16;

  # Directory part of this script's own path (empty when $0 carries no
  # directory).  Made explicit so the result never depends on a capture
  # group left over from an earlier match.
  $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

  # ppc-xlate.pl lives either next to this script or in ../../perlasm.
  ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  die "can't locate ppc-xlate.pl";

  # Everything printed from here on is piped through ppc-xlate.pl.
  # The low-precedence "or" is required: with "||" the die would attach to
  # the (always true) command string and a failed open would go unnoticed.
  open STDOUT,"| $^X $xlate $flavour \"$output\""
      or die "can't call $xlate: $!";
  61. $code.=<<___;
  62. .machine "any"
  63. .text
  64. .align 7 # totally strategic alignment
  # Constant tables. Every .long line below is one 16-byte vector.
  # The preheat routines and the cores address these tables as
  # r12 + offset, where r12 = _vpaes_consts (see Lconsts), so neither the
  # order nor the size of the tables may change without updating every
  # hard-coded offset, including the 0x308 used in Lconsts.
  # Byte offsets from _vpaes_consts:
  #   0x000 Lk_mc_forward   0x040 Lk_mc_backward  0x080 Lk_sr
  #   0x0c0 Lk_inv          0x0e0 Lk_ipt          0x100 Lk_sbo
  #   0x120 Lk_sb1          0x140 Lk_sb2
  #   0x160 Lk_dipt         0x180 Lk_dsbo         0x1a0 Lk_dsb9
  #   0x1c0 Lk_dsbd         0x1e0 Lk_dsbb         0x200 Lk_dsbe
  #   0x220 Lk_dksd         0x240 Lk_dksb         0x260 Lk_dkse
  #   0x280 Lk_dks9         0x2a0 Lk_rcon         0x2b0 Lk_s63
  #   0x2c0 Lk_opt          0x2e0 Lk_deskew       0x300 end of tables
  # NOTE(review): the trailing ?inv, ?rev and ?asis markers are presumably
  # endianness directives consumed by ppc-xlate.pl - confirm there.
  65. _vpaes_consts:
  66. Lk_mc_forward: # mc_forward
  67. .long 0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c ?inv
  68. .long 0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300 ?inv
  69. .long 0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704 ?inv
  70. .long 0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08 ?inv
  71. Lk_mc_backward: # mc_backward
  72. .long 0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e ?inv
  73. .long 0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a ?inv
  74. .long 0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506 ?inv
  75. .long 0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102 ?inv
  76. Lk_sr: # sr
  77. .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f ?inv
  78. .long 0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b ?inv
  79. .long 0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07 ?inv
  80. .long 0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603 ?inv
  81. ##
  82. ## "Hot" constants
  83. ##
  84. Lk_inv: # inv, inva
  85. .long 0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704 ?rev
  86. .long 0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03 ?rev
  87. Lk_ipt: # input transform (lo, hi)
  88. .long 0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca ?rev
  89. .long 0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd ?rev
  90. Lk_sbo: # sbou, sbot
  91. .long 0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15 ?rev
  92. .long 0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e ?rev
  93. Lk_sb1: # sb1u, sb1t
  94. .long 0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b ?rev
  95. .long 0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5 ?rev
  96. Lk_sb2: # sb2u, sb2t
  97. .long 0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2 ?rev
  98. .long 0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e ?rev
  99. ##
  100. ## Decryption stuff
  101. ##
  102. Lk_dipt: # decryption input transform
  103. .long 0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15 ?rev
  104. .long 0x00650560, 0xe683e386, 0x94f191f4, 0x72177712 ?rev
  105. Lk_dsbo: # decryption sbox final output
  106. .long 0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7 ?rev
  107. .long 0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca ?rev
  108. Lk_dsb9: # decryption sbox output *9*u, *9*t
  109. .long 0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca ?rev
  110. .long 0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72 ?rev
  111. Lk_dsbd: # decryption sbox output *D*u, *D*t
  112. .long 0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5 ?rev
  113. .long 0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129 ?rev
  114. Lk_dsbb: # decryption sbox output *B*u, *B*t
  115. .long 0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660 ?rev
  116. .long 0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3 ?rev
  117. Lk_dsbe: # decryption sbox output *E*u, *E*t
  118. .long 0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222 ?rev
  119. .long 0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794 ?rev
  120. ##
  121. ## Key schedule constants
  122. ##
  123. Lk_dksd: # decryption key schedule: invskew x*D
  124. .long 0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007 ?rev
  125. .long 0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f ?rev
  126. Lk_dksb: # decryption key schedule: invskew x*B
  127. .long 0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603 ?rev
  128. .long 0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9 ?rev
  129. Lk_dkse: # decryption key schedule: invskew x*E + 0x63
  130. .long 0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553 ?rev
  131. .long 0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd ?rev
  132. Lk_dks9: # decryption key schedule: invskew x*9
  133. .long 0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a ?rev
  134. .long 0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b ?rev
  135. Lk_rcon: # rcon
  136. .long 0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70 ?asis
  137. Lk_s63:
  138. .long 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b ?asis
  139. Lk_opt: # output transform
  140. .long 0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7 ?rev
  141. .long 0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1 ?rev
  142. Lk_deskew: # deskew tables: inverts the sbox's "skew"
  143. .long 0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d ?rev
  144. .long 0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128 ?rev
  145. .align 5
  # Lconsts: position-independent lookup of the constant tables.
  # Returns the address of _vpaes_consts in r12. The caller's LR is kept in
  # r0 and put back before returning, so r0 and r12 are the only registers
  # changed.
  # How it works: bcl 20,31 to the next instruction leaves the address of
  # that next instruction (the second mflr) in LR. The tables above occupy
  # 0x300 bytes and Lconsts starts right after them, the second mflr is its
  # third instruction, i.e. it sits 0x308 bytes past _vpaes_consts. Any
  # change to the table sizes or to the instructions before the second mflr
  # must be reflected in the -0x308 below.
  146. Lconsts:
  147. mflr r0
  148. bcl 20,31,\$+4
  149. mflr r12 #vvvvv "distance between . and _vpaes_consts
  150. addi r12,r12,-0x308
  151. mtlr r0
  152. blr
  153. .long 0
  154. .byte 0,12,0x14,0,0,0,0,0
  155. .asciz "Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"
  156. .align 6
  157. ___
  # Symbolic names for the vector registers used for unaligned access.
  # They are declared outside the braces below so that every following
  # section of the file can interpolate them: v26..v31 in this order.
  158. my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));
  159. {
  # Names local to the encrypt/decrypt/CBC section.
  # r3, r4, r5: input pointer, output pointer, key schedule pointer.
  160. my ($inp,$out,$key) = map("r$_",(3..5));
  # v10..v15: tables kept resident by both preheat routines.
  161. my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
  # v16..v19: encryption-only tables.
  162. my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
  # v16..v23: decryption-only tables. These deliberately reuse v16..v19, so
  # the encryption and decryption table sets cannot be live at the same time.
  163. my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));
  164. $code.=<<___;
  165. ##
  166. ## _aes_preheat
  167. ##
  168. ## Fills register %r10 -> .aes_consts (so you can -fPIC)
  169. ## and %xmm9-%xmm15 as specified below.
  170. ##
  # PowerPC notes (the header above still uses the x86_64 register names):
  #   r12      = address of _vpaes_consts, obtained by calling Lconsts
  #   v7       = 0x00..00, v8 = 0x04..04, v9 = 0x0f..0f
  #   v10, v11 = Lk_inv  (offsets 0xc0, 0xd0)
  #   v12, v13 = Lk_ipt  (offsets 0xe0, 0xf0)
  #   v14, v15 = Lk_sbo  (offsets 0x100, 0x110)
  #   v16, v17 = Lk_sb1  (offsets 0x120, 0x130)
  #   v18, v19 = Lk_sb2  (offsets 0x140, 0x150)
  # The caller's LR is parked in r8 around the call to Lconsts. r8-r11 are
  # then reused as offset registers, so r0 (via Lconsts) and r8-r12 are
  # clobbered.
  171. .align 4
  172. _vpaes_encrypt_preheat:
  173. mflr r8
  174. bl Lconsts
  175. mtlr r8
  176. li r11, 0xc0 # Lk_inv
  177. li r10, 0xd0
  178. li r9, 0xe0 # Lk_ipt
  179. li r8, 0xf0
  180. vxor v7, v7, v7 # 0x00..00
  181. vspltisb v8,4 # 0x04..04
  182. vspltisb v9,0x0f # 0x0f..0f
  183. lvx $invlo, r12, r11
  184. li r11, 0x100
  185. lvx $invhi, r12, r10
  186. li r10, 0x110
  187. lvx $iptlo, r12, r9
  188. li r9, 0x120
  189. lvx $ipthi, r12, r8
  190. li r8, 0x130
  191. lvx $sbou, r12, r11
  192. li r11, 0x140
  193. lvx $sbot, r12, r10
  194. li r10, 0x150
  195. lvx $sb1u, r12, r9
  196. lvx $sb1t, r12, r8
  197. lvx $sb2u, r12, r11
  198. lvx $sb2t, r12, r10
  199. blr
  200. .long 0
  201. .byte 0,12,0x14,0,0,0,0,0
  202. ##
  203. ## _aes_encrypt_core
  204. ##
  205. ## AES-encrypt %xmm0.
  206. ##
  207. ## Inputs:
  208. ## %xmm0 = input
  209. ## %xmm9-%xmm15 as in _vpaes_preheat
  210. ## (%rdx) = scheduled keys
  211. ##
  212. ## Output in %xmm0
  213. ## Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax
  214. ##
  215. ##
  # PowerPC register usage (the header above uses the x86_64 names, and the
  # trailing comments on each instruction quote the x86_64 original):
  #   v0      : block to encrypt on entry, result on exit
  #   r5      : key schedule, the round count is read from offset 240
  #   r12     : _vpaes_consts, v7-v19 as left by _vpaes_encrypt_preheat
  #   v31     : permute vector that aligns round keys, set up by the caller
  #   r9      : offset of the next round key, v5 = current round key,
  #             v6 = the following 16 bytes used to realign it
  #   r11     : cycles through 0x00, 0x10, 0x20, 0x30 (addi + andi. 0x30) and
  #             selects the Lk_mc_forward row, r10 = r11 + 0x40 selects the
  #             matching Lk_mc_backward row
  #   CTR     : number of rounds still to run
  # Clobbers v1-v6, r8-r11, CTR and cr0 (through andi.).
  216. .align 5
  217. _vpaes_encrypt_core:
  218. lwz r8, 240($key) # pull rounds
  219. li r9, 16
  220. lvx v5, 0, $key # vmovdqu (%r9), %xmm5 # round0 key
  221. li r11, 0x10
  222. lvx v6, r9, $key
  223. addi r9, r9, 16
  224. ?vperm v5, v5, v6, $keyperm # align round key
  225. addi r10, r11, 0x40
  226. vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
  227. vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm1
  228. vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm3, %xmm2
  229. vxor v0, v0, v5 # vpxor %xmm5, %xmm1, %xmm0
  230. vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
  231. mtctr r8
  232. b Lenc_entry
  233. .align 4
  234. Lenc_loop:
  235. # middle of middle round
  236. vperm v4, $sb1t, v7, v2 # vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
  237. lvx v1, r12, r11 # vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
  238. addi r11, r11, 16
  239. vperm v0, $sb1u, v7, v3 # vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
  240. vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
  241. andi. r11, r11, 0x30 # and \$0x30, %r11 # ... mod 4
  242. vperm v5, $sb2t, v7, v2 # vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
  243. vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
  244. vperm v2, $sb2u, v7, v3 # vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
  245. lvx v4, r12, r10 # vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
  246. addi r10, r11, 0x40
  247. vperm v3, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
  248. vxor v2, v2, v5 # vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
  249. vperm v0, v0, v7, v4 # vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
  250. vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
  251. vperm v4, v3, v7, v1 # vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
  252. vxor v0, v0, v3 # vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
  253. vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
  254. Lenc_entry:
  255. # top of round
  # The next round key is fetched and realigned here (vmr/lvx/?vperm on
  # v5 and v6), interleaved with the nibble inversion of the state.
  256. vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
  257. vperm v5, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
  258. vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
  259. vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
  260. vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
  261. vand v0, v0, v9
  262. vxor v3, v3, v5 # vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
  263. vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
  264. vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
  265. vmr v5, v6
  266. lvx v6, r9, $key # vmovdqu (%r9), %xmm5
  267. vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
  268. addi r9, r9, 16
  269. vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
  270. ?vperm v5, v5, v6, $keyperm # align round key
  271. vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
  272. bdnz Lenc_loop
  273. # middle of last round
  # r11 + 0x80 is the offset of the Lk_sr row used for the final permutation.
  274. addi r10, r11, 0x80
  275. # vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
  276. # vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
  277. vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
  278. lvx v1, r12, r10 # vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
  279. vperm v0, $sbot, v7, v3 # vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
  280. vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
  281. vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
  282. vperm v0, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm0
  283. blr
  284. .long 0
  285. .byte 0,12,0x14,0,0,0,0,0
  # .vpaes_encrypt: encrypt one 16-byte block.
  #   r3 = input block, r4 = output block, r5 = key schedule.
  # None of the three pointers has to be 16-byte aligned: the input is
  # assembled from two aligned loads, round keys are realigned through v31
  # inside the core, and the output is written byte by byte when r4 is not
  # aligned.
  # A new stack frame holds v20-v31 and the caller's VRSAVE. LR is written
  # to the LR save slot but is restored from r6, and VRSAVE is restored from
  # r7, so r6 and r7 must stay untouched by the routines called in between.
  286. .globl .vpaes_encrypt
  287. .align 5
  288. .vpaes_encrypt:
  289. $STU $sp,-$FRAME($sp)
  290. li r10,`15+6*$SIZE_T`
  291. li r11,`31+6*$SIZE_T`
  292. mflr r6
  293. mfspr r7, 256 # save vrsave
  294. stvx v20,r10,$sp
  295. addi r10,r10,32
  296. stvx v21,r11,$sp
  297. addi r11,r11,32
  298. stvx v22,r10,$sp
  299. addi r10,r10,32
  300. stvx v23,r11,$sp
  301. addi r11,r11,32
  302. stvx v24,r10,$sp
  303. addi r10,r10,32
  304. stvx v25,r11,$sp
  305. addi r11,r11,32
  306. stvx v26,r10,$sp
  307. addi r10,r10,32
  308. stvx v27,r11,$sp
  309. addi r11,r11,32
  310. stvx v28,r10,$sp
  311. addi r10,r10,32
  312. stvx v29,r11,$sp
  313. addi r11,r11,32
  314. stvx v30,r10,$sp
  315. stvx v31,r11,$sp
  316. stw r7,`$FRAME-4`($sp) # save vrsave
  317. li r0, -1
  318. $PUSH r6,`$FRAME+$LRSAVE`($sp)
  319. mtspr 256, r0 # preserve all AltiVec registers
  320. bl _vpaes_encrypt_preheat
  321. ?lvsl $inpperm, 0, $inp # prepare for unaligned access
  322. lvx v0, 0, $inp
  323. addi $inp, $inp, 15 # 15 is not a typo
  324. ?lvsr $outperm, 0, $out
  325. ?lvsl $keyperm, 0, $key # prepare for unaligned access
  326. lvx $inptail, 0, $inp # redundant in aligned case
  327. ?vperm v0, v0, $inptail, $inpperm
  328. bl _vpaes_encrypt_core
  # Store the result: one aligned vector store when r4 is 16-byte aligned,
  # otherwise rotate the block and emit it as 16 single-byte stores.
  329. andi. r8, $out, 15
  330. li r9, 16
  331. beq Lenc_out_aligned
  332. vperm v0, v0, v0, $outperm # rotate right/left
  333. mtctr r9
  334. Lenc_out_unaligned:
  335. stvebx v0, 0, $out
  336. addi $out, $out, 1
  337. bdnz Lenc_out_unaligned
  338. b Lenc_done
  339. .align 4
  340. Lenc_out_aligned:
  341. stvx v0, 0, $out
  342. Lenc_done:
  343. li r10,`15+6*$SIZE_T`
  344. li r11,`31+6*$SIZE_T`
  345. mtlr r6
  346. mtspr 256, r7 # restore vrsave
  347. lvx v20,r10,$sp
  348. addi r10,r10,32
  349. lvx v21,r11,$sp
  350. addi r11,r11,32
  351. lvx v22,r10,$sp
  352. addi r10,r10,32
  353. lvx v23,r11,$sp
  354. addi r11,r11,32
  355. lvx v24,r10,$sp
  356. addi r10,r10,32
  357. lvx v25,r11,$sp
  358. addi r11,r11,32
  359. lvx v26,r10,$sp
  360. addi r10,r10,32
  361. lvx v27,r11,$sp
  362. addi r11,r11,32
  363. lvx v28,r10,$sp
  364. addi r10,r10,32
  365. lvx v29,r11,$sp
  366. addi r11,r11,32
  367. lvx v30,r10,$sp
  368. lvx v31,r11,$sp
  369. addi $sp,$sp,$FRAME
  370. blr
  371. .long 0
  372. .byte 0,12,0x04,1,0x80,0,3,0
  373. .long 0
  374. .size .vpaes_encrypt,.-.vpaes_encrypt
  375. .align 4
  # _vpaes_decrypt_preheat: load the constants needed by the decryption core.
  #   r12      = address of _vpaes_consts, obtained by calling Lconsts
  #   v7       = 0x00..00, v8 = 0x04..04, v9 = 0x0f..0f
  #   v10, v11 = Lk_inv   (offsets 0xc0, 0xd0)
  #   v12, v13 = Lk_dipt  (offsets 0x160, 0x170)
  #   v14, v15 = Lk_dsbo  (offsets 0x180, 0x190)
  #   v16, v17 = Lk_dsb9  (offsets 0x1a0, 0x1b0)
  #   v18, v19 = Lk_dsbd  (offsets 0x1c0, 0x1d0)
  #   v20, v21 = Lk_dsbb  (offsets 0x1e0, 0x1f0)
  #   v22, v23 = Lk_dsbe  (offsets 0x200, 0x210)
  # The caller's LR is parked in r8 around the call to Lconsts. r0 (via
  # Lconsts) and r8-r12 are clobbered.
  376. _vpaes_decrypt_preheat:
  377. mflr r8
  378. bl Lconsts
  379. mtlr r8
  380. li r11, 0xc0 # Lk_inv
  381. li r10, 0xd0
  382. li r9, 0x160 # Ldipt
  383. li r8, 0x170
  384. vxor v7, v7, v7 # 0x00..00
  385. vspltisb v8,4 # 0x04..04
  386. vspltisb v9,0x0f # 0x0f..0f
  387. lvx $invlo, r12, r11
  388. li r11, 0x180
  389. lvx $invhi, r12, r10
  390. li r10, 0x190
  391. lvx $iptlo, r12, r9
  392. li r9, 0x1a0
  393. lvx $ipthi, r12, r8
  394. li r8, 0x1b0
  395. lvx $sbou, r12, r11
  396. li r11, 0x1c0
  397. lvx $sbot, r12, r10
  398. li r10, 0x1d0
  399. lvx $sb9u, r12, r9
  400. li r9, 0x1e0
  401. lvx $sb9t, r12, r8
  402. li r8, 0x1f0
  403. lvx $sbdu, r12, r11
  404. li r11, 0x200
  405. lvx $sbdt, r12, r10
  406. li r10, 0x210
  407. lvx $sbbu, r12, r9
  408. lvx $sbbt, r12, r8
  409. lvx $sbeu, r12, r11
  410. lvx $sbet, r12, r10
  411. blr
  412. .long 0
  413. .byte 0,12,0x14,0,0,0,0,0
  414. ##
  415. ## Decryption core
  416. ##
  417. ## Same API as encryption core.
  418. ##
  # PowerPC register usage:
  #   v0      : block to decrypt on entry, result on exit
  #   r5      : key schedule, the round count is read from offset 240
  #   r12     : _vpaes_consts, v7-v23 as left by _vpaes_decrypt_preheat
  #   v31     : permute vector that aligns round keys, set up by the caller
  #   r9      : offset of the next round key, v5 = current round key,
  #             v6 = the following 16 bytes used to realign it
  #   r11     : starts at 0x30 and steps down by 16 modulo 0x40
  #             (subi + andi. 0x30). Inside the loop it selects the
  #             Lk_mc_forward row loaded into v0, after the loop r11 + 0x80
  #             selects the Lk_sr row for the final permutation
  #   CTR     : number of rounds still to run
  # Inside the loop the running state is accumulated in v5 while v0 holds
  # the permutation (see the "flipped" remark below).
  # Clobbers v1-v6, r8-r11, CTR and cr0 (through andi.).
  419. .align 4
  420. _vpaes_decrypt_core:
  421. lwz r8, 240($key) # pull rounds
  422. li r9, 16
  423. lvx v5, 0, $key # vmovdqu (%r9), %xmm4 # round0 key
  424. li r11, 0x30
  425. lvx v6, r9, $key
  426. addi r9, r9, 16
  427. ?vperm v5, v5, v6, $keyperm # align round key
  428. vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
  429. vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
  430. vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm1, %xmm0
  431. vxor v0, v0, v5 # vpxor %xmm4, %xmm2, %xmm2
  432. vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
  433. mtctr r8
  434. b Ldec_entry
  435. .align 4
  436. Ldec_loop:
  437. #
  438. # Inverse mix columns
  439. #
  440. lvx v0, r12, r11 # v5 and v0 are flipped
  441. # vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
  442. # vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
  443. vperm v4, $sb9u, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
  444. subi r11, r11, 16
  445. vperm v1, $sb9t, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
  446. andi. r11, r11, 0x30
  447. vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0
  448. # vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
  449. vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  450. # vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
  451. vperm v4, $sbdu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
  452. vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  453. vperm v1, $sbdt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
  454. vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  455. # vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
  456. vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  457. # vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
  458. vperm v4, $sbbu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
  459. vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  460. vperm v1, $sbbt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
  461. vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  462. # vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
  463. vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  464. # vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
  465. vperm v4, $sbeu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
  466. vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  467. vperm v1, $sbet, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
  468. vxor v0, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  469. vxor v0, v0, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  470. Ldec_entry:
  471. # top of round
  472. vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
  473. vperm v2, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
  474. vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
  475. vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
  476. vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
  477. vand v0, v0, v9
  478. vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
  479. vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
  480. vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
  481. vmr v5, v6
  482. lvx v6, r9, $key # vmovdqu (%r9), %xmm0
  483. vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
  484. addi r9, r9, 16
  485. vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
  486. ?vperm v5, v5, v6, $keyperm # align round key
  487. vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
  488. bdnz Ldec_loop
  489. # middle of last round
  490. addi r10, r11, 0x80
  491. # vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
  492. vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
  493. # vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
  494. lvx v2, r12, r10 # vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
  495. vperm v1, $sbot, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
  496. vxor v4, v4, v5 # vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
  497. vxor v0, v1, v4 # vpxor %xmm4, %xmm1, %xmm0 # 0 = A
  498. vperm v0, v0, v7, v2 # vpshufb %xmm2, %xmm0, %xmm0
  499. blr
  500. .long 0
  501. .byte 0,12,0x14,0,0,0,0,0
  # .vpaes_decrypt: decrypt one 16-byte block.
  #   r3 = input block, r4 = output block, r5 = key schedule.
  # Same structure as .vpaes_encrypt: none of the pointers has to be
  # 16-byte aligned, v20-v31 and VRSAVE are saved in a new stack frame,
  # LR is carried in r6 and the caller's VRSAVE in r7 across the calls to
  # _vpaes_decrypt_preheat and _vpaes_decrypt_core.
  502. .globl .vpaes_decrypt
  503. .align 5
  504. .vpaes_decrypt:
  505. $STU $sp,-$FRAME($sp)
  506. li r10,`15+6*$SIZE_T`
  507. li r11,`31+6*$SIZE_T`
  508. mflr r6
  509. mfspr r7, 256 # save vrsave
  510. stvx v20,r10,$sp
  511. addi r10,r10,32
  512. stvx v21,r11,$sp
  513. addi r11,r11,32
  514. stvx v22,r10,$sp
  515. addi r10,r10,32
  516. stvx v23,r11,$sp
  517. addi r11,r11,32
  518. stvx v24,r10,$sp
  519. addi r10,r10,32
  520. stvx v25,r11,$sp
  521. addi r11,r11,32
  522. stvx v26,r10,$sp
  523. addi r10,r10,32
  524. stvx v27,r11,$sp
  525. addi r11,r11,32
  526. stvx v28,r10,$sp
  527. addi r10,r10,32
  528. stvx v29,r11,$sp
  529. addi r11,r11,32
  530. stvx v30,r10,$sp
  531. stvx v31,r11,$sp
  532. stw r7,`$FRAME-4`($sp) # save vrsave
  533. li r0, -1
  534. $PUSH r6,`$FRAME+$LRSAVE`($sp)
  535. mtspr 256, r0 # preserve all AltiVec registers
  536. bl _vpaes_decrypt_preheat
  537. ?lvsl $inpperm, 0, $inp # prepare for unaligned access
  538. lvx v0, 0, $inp
  539. addi $inp, $inp, 15 # 15 is not a typo
  540. ?lvsr $outperm, 0, $out
  541. ?lvsl $keyperm, 0, $key
  542. lvx $inptail, 0, $inp # redundant in aligned case
  543. ?vperm v0, v0, $inptail, $inpperm
  544. bl _vpaes_decrypt_core
  # Store the result: one aligned vector store when r4 is 16-byte aligned,
  # otherwise rotate the block and emit it as 16 single-byte stores.
  545. andi. r8, $out, 15
  546. li r9, 16
  547. beq Ldec_out_aligned
  548. vperm v0, v0, v0, $outperm # rotate right/left
  549. mtctr r9
  550. Ldec_out_unaligned:
  551. stvebx v0, 0, $out
  552. addi $out, $out, 1
  553. bdnz Ldec_out_unaligned
  554. b Ldec_done
  555. .align 4
  556. Ldec_out_aligned:
  557. stvx v0, 0, $out
  558. Ldec_done:
  559. li r10,`15+6*$SIZE_T`
  560. li r11,`31+6*$SIZE_T`
  561. mtlr r6
  562. mtspr 256, r7 # restore vrsave
  563. lvx v20,r10,$sp
  564. addi r10,r10,32
  565. lvx v21,r11,$sp
  566. addi r11,r11,32
  567. lvx v22,r10,$sp
  568. addi r10,r10,32
  569. lvx v23,r11,$sp
  570. addi r11,r11,32
  571. lvx v24,r10,$sp
  572. addi r10,r10,32
  573. lvx v25,r11,$sp
  574. addi r11,r11,32
  575. lvx v26,r10,$sp
  576. addi r10,r10,32
  577. lvx v27,r11,$sp
  578. addi r11,r11,32
  579. lvx v28,r10,$sp
  580. addi r10,r10,32
  581. lvx v29,r11,$sp
  582. addi r11,r11,32
  583. lvx v30,r10,$sp
  584. lvx v31,r11,$sp
  585. addi $sp,$sp,$FRAME
  586. blr
  587. .long 0
  588. .byte 0,12,0x04,1,0x80,0,3,0
  589. .long 0
  590. .size .vpaes_decrypt,.-.vpaes_decrypt
  # .vpaes_cbc_encrypt: CBC-mode encryption or decryption.
  #   r3 = input, r4 = output, r5 = length in bytes, r6 = key schedule,
  #   r7 = iv (read on entry, overwritten with the next iv on exit),
  #   r8 = direction: zero selects decryption, non-zero encryption.
  # Returns at once, before a frame is created, when the length is below
  # 16. Only the length rounded down to a multiple of 16 is processed.
  # Input, output and key schedule may be unaligned. The iv is loaded as an
  # unaligned vector but written back with 32-bit stores.
  # Register roles after the prologue:
  #   r30 = bytes left, r31 = iv pointer, r5 = key schedule,
  #   r7  = caller's VRSAVE, r0 = 16 inside the loops,
  #   cr1 = result of testing r4 for 16-byte alignment (eq means aligned),
  #   v24 = chaining value, v25 = copy of the ciphertext block (decrypt),
  #   v26 = input tail, v27 = input permute, v28 = output head,
  #   v29 = output permute, v30 = output mask, v31 = key permute.
  591. .globl .vpaes_cbc_encrypt
  592. .align 5
  593. .vpaes_cbc_encrypt:
  594. ${UCMP}i r5,16
  595. bltlr-
  596. $STU $sp,-`($FRAME+2*$SIZE_T)`($sp)
  597. mflr r0
  598. li r10,`15+6*$SIZE_T`
  599. li r11,`31+6*$SIZE_T`
  600. mfspr r12, 256
  601. stvx v20,r10,$sp
  602. addi r10,r10,32
  603. stvx v21,r11,$sp
  604. addi r11,r11,32
  605. stvx v22,r10,$sp
  606. addi r10,r10,32
  607. stvx v23,r11,$sp
  608. addi r11,r11,32
  609. stvx v24,r10,$sp
  610. addi r10,r10,32
  611. stvx v25,r11,$sp
  612. addi r11,r11,32
  613. stvx v26,r10,$sp
  614. addi r10,r10,32
  615. stvx v27,r11,$sp
  616. addi r11,r11,32
  617. stvx v28,r10,$sp
  618. addi r10,r10,32
  619. stvx v29,r11,$sp
  620. addi r11,r11,32
  621. stvx v30,r10,$sp
  622. stvx v31,r11,$sp
  623. stw r12,`$FRAME-4`($sp) # save vrsave
  624. $PUSH r30,`$FRAME+$SIZE_T*0`($sp)
  625. $PUSH r31,`$FRAME+$SIZE_T*1`($sp)
  626. li r9, -16
  627. $PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
  628. and r30, r5, r9 # copy length&-16
  629. andi. r9, $out, 15 # is $out aligned?
  630. mr r5, r6 # copy pointer to key
  631. mr r31, r7 # copy pointer to iv
  632. li r6, -1
  633. mcrf cr1, cr0 # put aside $out alignment flag
  634. mr r7, r12 # copy vrsave
  635. mtspr 256, r6 # preserve all AltiVec registers
  636. lvx v24, 0, r31 # load [potentially unaligned] iv
  637. li r9, 15
  638. ?lvsl $inpperm, 0, r31
  639. lvx v25, r9, r31
  640. ?vperm v24, v24, v25, $inpperm
  # cr0 set by the cmpwi below stays live until the beq to Lcbc_decrypt.
  # None of the instructions in between is a record form, do not insert one.
  641. cmpwi r8, 0 # test direction
  642. neg r8, $inp # prepare for unaligned access
  643. vxor v7, v7, v7
  644. ?lvsl $keyperm, 0, $key
  645. ?lvsr $outperm, 0, $out
  646. ?lvsr $inpperm, 0, r8 # -$inp
  647. vnor $outmask, v7, v7 # 0xff..ff
  648. lvx $inptail, 0, $inp
  649. ?vperm $outmask, v7, $outmask, $outperm
  650. addi $inp, $inp, 15 # 15 is not a typo
  651. beq Lcbc_decrypt
  652. bl _vpaes_encrypt_preheat
  653. li r0, 16
  654. beq cr1, Lcbc_enc_loop # $out is aligned
  # Unaligned output: the first block is processed outside the loop and only
  # its leading bytes, up to the next 16-byte boundary, are stored here one
  # byte at a time. The rest of it stays in the output head register and is
  # merged into later stores.
  655. vmr v0, $inptail
  656. lvx $inptail, 0, $inp
  657. addi $inp, $inp, 16
  658. ?vperm v0, v0, $inptail, $inpperm
  659. vxor v0, v0, v24 # ^= iv
  660. bl _vpaes_encrypt_core
  661. andi. r8, $out, 15
  662. vmr v24, v0 # put aside iv
  663. sub r9, $out, r8
  664. vperm $outhead, v0, v0, $outperm # rotate right/left
  665. Lcbc_enc_head:
  666. stvebx $outhead, r8, r9
  667. cmpwi r8, 15
  668. addi r8, r8, 1
  669. bne Lcbc_enc_head
  670. sub. r30, r30, r0 # len -= 16
  671. addi $out, $out, 16
  672. beq Lcbc_unaligned_done
  673. Lcbc_enc_loop:
  674. vmr v0, $inptail
  675. lvx $inptail, 0, $inp
  676. addi $inp, $inp, 16
  677. ?vperm v0, v0, $inptail, $inpperm
  678. vxor v0, v0, v24 # ^= iv
  679. bl _vpaes_encrypt_core
  680. vmr v24, v0 # put aside iv
  681. sub. r30, r30, r0 # len -= 16
  682. vperm v0, v0, v0, $outperm # rotate right/left
  683. vsel v1, $outhead, v0, $outmask
  684. vmr $outhead, v0
  685. stvx v1, 0, $out
  686. addi $out, $out, 16
  687. bne Lcbc_enc_loop
  688. b Lcbc_done
  689. .align 5
  690. Lcbc_decrypt:
  # Decryption mirrors the encryption path. The ciphertext block is copied
  # to v25 before the core runs so that it can become the next chaining
  # value after the plaintext has been xored with the current one.
  691. bl _vpaes_decrypt_preheat
  692. li r0, 16
  693. beq cr1, Lcbc_dec_loop # $out is aligned
  694. vmr v0, $inptail
  695. lvx $inptail, 0, $inp
  696. addi $inp, $inp, 16
  697. ?vperm v0, v0, $inptail, $inpperm
  698. vmr v25, v0 # put aside input
  699. bl _vpaes_decrypt_core
  700. andi. r8, $out, 15
  701. vxor v0, v0, v24 # ^= iv
  702. vmr v24, v25
  703. sub r9, $out, r8
  704. vperm $outhead, v0, v0, $outperm # rotate right/left
  705. Lcbc_dec_head:
  706. stvebx $outhead, r8, r9
  707. cmpwi r8, 15
  708. addi r8, r8, 1
  709. bne Lcbc_dec_head
  710. sub. r30, r30, r0 # len -= 16
  711. addi $out, $out, 16
  712. beq Lcbc_unaligned_done
  713. Lcbc_dec_loop:
  714. vmr v0, $inptail
  715. lvx $inptail, 0, $inp
  716. addi $inp, $inp, 16
  717. ?vperm v0, v0, $inptail, $inpperm
  718. vmr v25, v0 # put aside input
  719. bl _vpaes_decrypt_core
  720. vxor v0, v0, v24 # ^= iv
  721. vmr v24, v25
  722. sub. r30, r30, r0 # len -= 16
  723. vperm v0, v0, v0, $outperm # rotate right/left
  724. vsel v1, $outhead, v0, $outmask
  725. vmr $outhead, v0
  726. stvx v1, 0, $out
  727. addi $out, $out, 16
  728. bne Lcbc_dec_loop
  729. Lcbc_done:
  730. beq cr1, Lcbc_write_iv # $out is aligned
  731. Lcbc_unaligned_done:
  # Unaligned output only: flush the bytes still held in the output head
  # register, from the last 16-byte boundary up to the end of the output.
  732. andi. r8, $out, 15
  733. sub $out, $out, r8
  734. li r9, 0
  735. Lcbc_tail:
  736. stvebx $outhead, r9, $out
  737. addi r9, r9, 1
  738. cmpw r9, r8
  739. bne Lcbc_tail
  740. Lcbc_write_iv:
  # r12 is reused as a plain offset from here on, the constants pointer is
  # no longer needed.
  741. neg r8, r31 # write [potentially unaligned] iv
  742. li r10, 4
  743. ?lvsl $outperm, 0, r8
  744. li r11, 8
  745. li r12, 12
  746. vperm v24, v24, v24, $outperm # rotate right/left
  747. stvewx v24, 0, r31 # ivp is at least 32-bit aligned
  748. stvewx v24, r10, r31
  749. stvewx v24, r11, r31
  750. stvewx v24, r12, r31
  751. mtspr 256, r7 # restore vrsave
  752. li r10,`15+6*$SIZE_T`
  753. li r11,`31+6*$SIZE_T`
  754. lvx v20,r10,$sp
  755. addi r10,r10,32
  756. lvx v21,r11,$sp
  757. addi r11,r11,32
  758. lvx v22,r10,$sp
  759. addi r10,r10,32
  760. lvx v23,r11,$sp
  761. addi r11,r11,32
  762. lvx v24,r10,$sp
  763. addi r10,r10,32
  764. lvx v25,r11,$sp
  765. addi r11,r11,32
  766. lvx v26,r10,$sp
  767. addi r10,r10,32
  768. lvx v27,r11,$sp
  769. addi r11,r11,32
  770. lvx v28,r10,$sp
  771. addi r10,r10,32
  772. lvx v29,r11,$sp
  773. addi r11,r11,32
  774. lvx v30,r10,$sp
  775. lvx v31,r11,$sp
  # NOTE(review): no branch to Lcbc_abort is visible in this part of the
  # file, the early exit above uses bltlr- instead.
  776. Lcbc_abort:
  777. $POP r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
  778. $POP r30,`$FRAME+$SIZE_T*0`($sp)
  779. $POP r31,`$FRAME+$SIZE_T*1`($sp)
  780. mtlr r0
  781. addi $sp,$sp,`$FRAME+$SIZE_T*2`
  782. blr
  783. .long 0
  784. .byte 0,12,0x04,1,0x80,2,6,0
  785. .long 0
  786. .size .vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
  787. ___
  788. }
  789. {
  790. my ($inp,$bits,$out)=map("r$_",(3..5));
  791. my $dir="cr1";
  792. my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));
  793. $code.=<<___;
  794. ########################################################
  795. ## ##
  796. ## AES key schedule ##
  797. ## ##
  798. ########################################################
  799. .align 4
  # _vpaes_key_preheat: load the constants used by the key schedule.
  #   r12      = address of _vpaes_consts, obtained by calling Lconsts
  #   v8       = 0x04..04, v9 = 0x00..00
  #              (note: v9 is zero here, unlike in the cipher preheats)
  #   v10, v11 = Lk_inv   (offsets 0xc0, 0xd0)
  #   v12, v13 = Lk_ipt   (offsets 0xe0, 0xf0)
  #   v14, v15 = Lk_sb1   (offsets 0x120, 0x130)
  #   v16, v17 = Lk_dksd  (offsets 0x220, 0x230)
  #   v18, v19 = Lk_dksb  (offsets 0x240, 0x250)
  #   v20, v21 = Lk_dkse  (offsets 0x260, 0x270)
  #   v22, v23 = Lk_dks9  (offsets 0x280, 0x290)
  #   v24      = Lk_rcon  (offset 0x2a0)
  #   v25      = first row of Lk_mc_forward (offset 0)
  #   v26      = Lk_s63   (offset 0x2b0)
  # The caller's LR is parked in r8 around the call to Lconsts. r0 (via
  # Lconsts) and r8-r12 are clobbered.
  800. _vpaes_key_preheat:
  801. mflr r8
  802. bl Lconsts
  803. mtlr r8
  804. li r11, 0xc0 # Lk_inv
  805. li r10, 0xd0
  806. li r9, 0xe0 # L_ipt
  807. li r8, 0xf0
  808. vspltisb v8,4 # 0x04..04
  809. vxor v9,v9,v9 # 0x00..00
  810. lvx $invlo, r12, r11 # Lk_inv
  811. li r11, 0x120
  812. lvx $invhi, r12, r10
  813. li r10, 0x130
  814. lvx $iptlo, r12, r9 # Lk_ipt
  815. li r9, 0x220
  816. lvx $ipthi, r12, r8
  817. li r8, 0x230
  818. lvx v14, r12, r11 # Lk_sb1
  819. li r11, 0x240
  820. lvx v15, r12, r10
  821. li r10, 0x250
  822. lvx v16, r12, r9 # Lk_dksd
  823. li r9, 0x260
  824. lvx v17, r12, r8
  825. li r8, 0x270
  826. lvx v18, r12, r11 # Lk_dksb
  827. li r11, 0x280
  828. lvx v19, r12, r10
  829. li r10, 0x290
  830. lvx v20, r12, r9 # Lk_dkse
  831. li r9, 0x2a0
  832. lvx v21, r12, r8
  833. li r8, 0x2b0
  834. lvx v22, r12, r11 # Lk_dks9
  835. lvx v23, r12, r10
  836. lvx v24, r12, r9 # Lk_rcon
  837. lvx v25, 0, r12 # Lk_mc_forward[0]
  838. lvx v26, r12, r8 # Lks63
  839. blr
  840. .long 0
  841. .byte 0,12,0x14,0,0,0,0,0
  842. .align 4
  843. _vpaes_schedule_core:
  844. mflr r7
  845. bl _vpaes_key_preheat # load the tables
  846. #lvx v0, 0, $inp # vmovdqu (%rdi), %xmm0 # load key (unaligned)
  847. neg r8, $inp # prepare for unaligned access
  848. lvx v0, 0, $inp
  849. addi $inp, $inp, 15 # 15 is not typo
  850. ?lvsr $inpperm, 0, r8 # -$inp
  851. lvx v6, 0, $inp # v6 serves as inptail
  852. addi $inp, $inp, 8
  853. ?vperm v0, v0, v6, $inpperm
  854. # input transform
  855. vmr v3, v0 # vmovdqa %xmm0, %xmm3
  856. bl _vpaes_schedule_transform
  857. vmr v7, v0 # vmovdqa %xmm0, %xmm7
  858. bne $dir, Lschedule_am_decrypting
  859. # encrypting, output zeroth round key after transform
  860. li r8, 0x30 # mov \$0x30,%r8d
  861. li r9, 4
  862. li r10, 8
  863. li r11, 12
  864. ?lvsr $outperm, 0, $out # prepare for unaligned access
  865. vnor $outmask, v9, v9 # 0xff..ff
  866. ?vperm $outmask, v9, $outmask, $outperm
  867. #stvx v0, 0, $out # vmovdqu %xmm0, (%rdx)
  868. vperm $outhead, v0, v0, $outperm # rotate right/left
  869. stvewx $outhead, 0, $out # some are superfluous
  870. stvewx $outhead, r9, $out
  871. stvewx $outhead, r10, $out
  872. addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
  873. stvewx $outhead, r11, $out
  874. b Lschedule_go
  875. Lschedule_am_decrypting:
  876. srwi r8, $bits, 1 # shr \$1,%r8d
  877. andi. r8, r8, 32 # and \$32,%r8d
  878. xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
  879. addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
  880. # decrypting, output zeroth round key after shiftrows
  881. lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
  882. li r9, 4
  883. li r10, 8
  884. li r11, 12
  885. vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
  886. neg r0, $out # prepare for unaligned access
  887. ?lvsl $outperm, 0, r0
  888. vnor $outmask, v9, v9 # 0xff..ff
  889. ?vperm $outmask, $outmask, v9, $outperm
  890. #stvx v4, 0, $out # vmovdqu %xmm3, (%rdx)
  891. vperm $outhead, v4, v4, $outperm # rotate right/left
  892. stvewx $outhead, 0, $out # some are superfluous
  893. stvewx $outhead, r9, $out
  894. stvewx $outhead, r10, $out
  895. addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
  896. stvewx $outhead, r11, $out
  897. addi $out, $out, 15 # 15 is not typo
  898. xori r8, r8, 0x30 # xor \$0x30, %r8
  899. Lschedule_go:
  900. cmplwi $bits, 192 # cmp \$192, %esi
  901. bgt Lschedule_256
  902. beq Lschedule_192
  903. # 128: fall through
  904. ##
  905. ## .schedule_128
  906. ##
  907. ## 128-bit specific part of key schedule.
  908. ##
  909. ## This schedule is really simple, because all its parts
  910. ## are accomplished by the subroutines.
  911. ##
  912. Lschedule_128:
  913. li r0, 10 # mov \$10, %esi
  914. mtctr r0
  915. Loop_schedule_128:
  916. bl _vpaes_schedule_round
  917. bdz Lschedule_mangle_last # dec %esi
  918. bl _vpaes_schedule_mangle # write output
  919. b Loop_schedule_128
  920. ##
  921. ## .aes_schedule_192
  922. ##
  923. ## 192-bit specific part of key schedule.
  924. ##
  925. ## The main body of this schedule is the same as the 128-bit
  926. ## schedule, but with more smearing. The long, high side is
  927. ## stored in %xmm7 as before, and the short, low side is in
  928. ## the high bits of %xmm6.
  929. ##
  930. ## This schedule is somewhat nastier, however, because each
  931. ## round produces 192 bits of key material, or 1.5 round keys.
  932. ## Therefore, on each cycle we do 2 rounds and produce 3 round
  933. ## keys.
  934. ##
  935. .align 4
  936. Lschedule_192:
  937. li r0, 4 # mov \$4, %esi
  938. lvx v0, 0, $inp
  939. ?vperm v0, v6, v0, $inpperm
  940. ?vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
  941. bl _vpaes_schedule_transform # input transform
  942. ?vsldoi v6, v0, v9, 8
  943. ?vsldoi v6, v9, v6, 8 # clobber "low" side with zeros
  944. mtctr r0
  945. Loop_schedule_192:
  946. bl _vpaes_schedule_round
  947. ?vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0
  948. bl _vpaes_schedule_mangle # save key n
  949. bl _vpaes_schedule_192_smear
  950. bl _vpaes_schedule_mangle # save key n+1
  951. bl _vpaes_schedule_round
  952. bdz Lschedule_mangle_last # dec %esi
  953. bl _vpaes_schedule_mangle # save key n+2
  954. bl _vpaes_schedule_192_smear
  955. b Loop_schedule_192
  956. ##
  957. ## .aes_schedule_256
  958. ##
  959. ## 256-bit specific part of key schedule.
  960. ##
  961. ## The structure here is very similar to the 128-bit
  962. ## schedule, but with an additional "low side" in
  963. ## %xmm6. The low side's rounds are the same as the
  964. ## high side's, except no rcon and no rotation.
  965. ##
  966. .align 4
  967. Lschedule_256:
  968. li r0, 7 # mov \$7, %esi
  969. addi $inp, $inp, 8
  970. lvx v0, 0, $inp # vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
  971. ?vperm v0, v6, v0, $inpperm
  972. bl _vpaes_schedule_transform # input transform
  973. mtctr r0
  974. Loop_schedule_256:
  975. bl _vpaes_schedule_mangle # output low result
  976. vmr v6, v0 # vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
  977. # high round
  978. bl _vpaes_schedule_round
  979. bdz Lschedule_mangle_last # dec %esi
  980. bl _vpaes_schedule_mangle
  981. # low round. swap xmm7 and xmm6
  982. ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
  983. vmr v5, v7 # vmovdqa %xmm7, %xmm5
  984. vmr v7, v6 # vmovdqa %xmm6, %xmm7
  985. bl _vpaes_schedule_low_round
  986. vmr v7, v5 # vmovdqa %xmm5, %xmm7
  987. b Loop_schedule_256
  988. ##
  989. ## .aes_schedule_mangle_last
  990. ##
  991. ## Mangler for last round of key schedule
  992. ## Mangles %xmm0
  993. ## when encrypting, outputs out(%xmm0) ^ 63
  994. ## when decrypting, outputs unskew(%xmm0)
  995. ##
  996. ## Always called right before return... jumps to cleanup and exits
  997. ##
  998. .align 4
  999. Lschedule_mangle_last:
  1000. # schedule last round key from xmm0
  1001. li r11, 0x2e0 # lea .Lk_deskew(%rip),%r11
  1002. li r9, 0x2f0
  1003. bne $dir, Lschedule_mangle_last_dec
  1004. # encrypting
  1005. lvx v1, r8, r10 # vmovdqa (%r8,%r10),%xmm1
  1006. li r11, 0x2c0 # lea .Lk_opt(%rip), %r11 # prepare to output transform
  1007. li r9, 0x2d0 # prepare to output transform
  1008. vperm v0, v0, v0, v1 # vpshufb %xmm1, %xmm0, %xmm0 # output permute
  1009. lvx $iptlo, r11, r12 # reload $ipt
  1010. lvx $ipthi, r9, r12
  1011. addi $out, $out, 16 # add \$16, %rdx
  1012. vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
  1013. bl _vpaes_schedule_transform # output transform
  1014. #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
  1015. vperm v0, v0, v0, $outperm # rotate right/left
  1016. li r10, 4
  1017. vsel v2, $outhead, v0, $outmask
  1018. li r11, 8
  1019. stvx v2, 0, $out
  1020. li r12, 12
  1021. stvewx v0, 0, $out # some (or all) are redundant
  1022. stvewx v0, r10, $out
  1023. stvewx v0, r11, $out
  1024. stvewx v0, r12, $out
  1025. b Lschedule_mangle_done
  1026. .align 4
  1027. Lschedule_mangle_last_dec:
  1028. lvx $iptlo, r11, r12 # reload $ipt
  1029. lvx $ipthi, r9, r12
  1030. addi $out, $out, -16 # add \$-16, %rdx
  1031. vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
  1032. bl _vpaes_schedule_transform # output transform
  1033. #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
  1034. addi r9, $out, -15 # -15 is not typo
  1035. vperm v0, v0, v0, $outperm # rotate right/left
  1036. li r10, 4
  1037. vsel v2, $outhead, v0, $outmask
  1038. li r11, 8
  1039. stvx v2, 0, $out
  1040. li r12, 12
  1041. stvewx v0, 0, r9 # some (or all) are redundant
  1042. stvewx v0, r10, r9
  1043. stvewx v0, r11, r9
  1044. stvewx v0, r12, r9
  1045. Lschedule_mangle_done:
  1046. mtlr r7
  1047. # cleanup
  1048. vxor v0, v0, v0 # vpxor %xmm0, %xmm0, %xmm0
  1049. vxor v1, v1, v1 # vpxor %xmm1, %xmm1, %xmm1
  1050. vxor v2, v2, v2 # vpxor %xmm2, %xmm2, %xmm2
  1051. vxor v3, v3, v3 # vpxor %xmm3, %xmm3, %xmm3
  1052. vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
  1053. vxor v5, v5, v5 # vpxor %xmm5, %xmm5, %xmm5
  1054. vxor v6, v6, v6 # vpxor %xmm6, %xmm6, %xmm6
  1055. vxor v7, v7, v7 # vpxor %xmm7, %xmm7, %xmm7
  1056. blr
  1057. .long 0
  1058. .byte 0,12,0x14,0,0,0,0,0
  1059. ##
  1060. ## .aes_schedule_192_smear
  1061. ##
  1062. ## Smear the short, low side in the 192-bit key schedule.
  1063. ##
  1064. ## Inputs (x86 register names from the original; xmmN is vN for N = 0, 6, 7):
  1065. ## %xmm7: high side, b a x y
  1066. ## %xmm6: low side, d c 0 0
  1067. ## %xmm13: 0 (this port takes its zeros from v9 instead)
  1068. ##
  1069. ## Outputs (v1 is also overwritten as scratch):
  1070. ## %xmm6: b+c+d b+c 0 0
  1071. ## %xmm0: b+c+d b+c b a
  1072. ##
  1073. .align 4
  1074. _vpaes_schedule_192_smear:
  1075. ?vspltw v0, v7, 3 # splat word 3 of v7 across v0 (index is rewritten to 3-3 for little-endian)
  1076. ?vsldoi v1, v9, v6, 12 # vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
  1077. ?vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
  1078. vxor v6, v6, v1 # vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
  1079. vxor v6, v6, v0 # vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
  1080. vmr v0, v6 # v0 = full smeared value, copied before v6 is partly cleared below
  1081. ?vsldoi v6, v6, v9, 8 # first of two 8-byte shifts through the zero register v9
  1082. ?vsldoi v6, v9, v6, 8 # clobber low side with zeros
  1083. blr
  1084. .long 0
  1085. .byte 0,12,0x14,0,0,0,0,0
  1086. ##
  1087. ## .aes_schedule_round
  1088. ##
  1089. ## Runs one main round of the key schedule on %xmm0, %xmm7
  1090. ##
  1091. ## Specifically, runs subbytes on the high dword of %xmm0
  1092. ## then rotates it by one byte and xors into the low dword of
  1093. ## %xmm7.
  1094. ##
  1095. ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
  1096. ## next rcon. (xmm8 is the rcon vector register in this port.)
  1097. ##
  1098. ## Smears the dwords of %xmm7 by xoring the low into the
  1099. ## second low, result into third, result into highest.
  1100. ##
  1101. ## Returns results in %xmm7 = %xmm0.
  1102. ## Clobbers %xmm1-%xmm4, %r11 (this port writes v1-v4 and does not touch r11).
  1103. ## Register names above are the x86 originals; xmm0-xmm7 are v0-v7 here.
  1104. .align 4
  1105. _vpaes_schedule_round:
  1106. # extract rcon from xmm8
  1107. #vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4 (left disabled: v9 supplies the zeros on the next line)
  1108. ?vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1
  1109. ?vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8
  1110. vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
  1111. # rotate
  1112. ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
  1113. ?vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0
  1114. # fall through...
  1115. # low round: same as high round, but no rotation and no rcon.
  1116. _vpaes_schedule_low_round: # second entry point, called directly by the 256-bit schedule loop
  1117. # smear xmm7
  1118. ?vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1
  1119. vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
  1120. vspltisb v1, 0x0f # 0x0f..0f
  1121. ?vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4
  1122. # subbytes
  1123. vand v1, v1, v0 # vpand %xmm9, %xmm0, %xmm1 # 0 = k
  1124. vsrb v0, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
  1125. vxor v7, v7, v4 # vpxor %xmm4, %xmm7, %xmm7
  1126. vperm v2, $invhi, v9, v1 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
  1127. vxor v1, v1, v0 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
  1128. vperm v3, $invlo, v9, v0 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
  1129. vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
  1130. vperm v4, $invlo, v9, v1 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
  1131. vxor v7, v7, v26 # vpxor .Lk_s63(%rip), %xmm7, %xmm7
  1132. vperm v3, $invlo, v9, v3 # vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
  1133. vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
  1134. vperm v2, $invlo, v9, v4 # vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
  1135. vxor v3, v3, v1 # vpxor %xmm1, %xmm3, %xmm3 # 2 = io
  1136. vxor v2, v2, v0 # vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
  1137. vperm v4, v15, v9, v3 # vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
  1138. vperm v1, v14, v9, v2 # vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
  1139. vxor v1, v1, v4 # vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
  1140. # add in smeared stuff
  1141. vxor v0, v1, v7 # vpxor %xmm7, %xmm1, %xmm0
  1142. vxor v7, v1, v7 # vmovdqa %xmm0, %xmm7 (same xor recomputed into v7 rather than copied from v0)
  1143. blr
  1144. .long 0
  1145. .byte 0,12,0x14,0,0,0,0,0
  1146. ##
  1147. ## .aes_schedule_transform
  1148. ##
  1149. ## Linear-transform %xmm0 according to tables at (%r11)
  1150. ## (this port: v0 is transformed with the iptlo/ipthi vector registers; nothing is loaded from r11 here)
  1151. ## Requires that %xmm9 = 0x0F0F... as in preheat (x86 form only; the masking vand is disabled below)
  1152. ## Output in %xmm0
  1153. ## Clobbers %xmm2 (v2 here)
  1154. ##
  1155. .align 4
  1156. _vpaes_schedule_transform:
  1157. #vand v1, v0, v9 # vpand %xmm9, %xmm0, %xmm1 (left disabled: each vperm below names one table for both sources, so only the low 4 index bits matter)
  1158. vsrb v2, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 (result goes to v2; per-byte shift count comes from v8, presumably 4 - confirm in the preheat code)
  1159. # vmovdqa (%r11), %xmm2 # lo
  1160. vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
  1161. # vmovdqa 16(%r11), %xmm1 # hi
  1162. vperm v2, $ipthi, $ipthi, v2 # vpshufb %xmm0, %xmm1, %xmm0
  1163. vxor v0, v0, v2 # vpxor %xmm2, %xmm0, %xmm0
  1164. blr
  1165. .long 0
  1166. .byte 0,12,0x14,0,0,0,0,0
  1167. ##
  1168. ## .aes_schedule_mangle
  1169. ##
  1170. ## Mangle xmm0 from (basis-transformed) standard version
  1171. ## to our version.
  1172. ##
  1173. ## On encrypt,
  1174. ## xor with 0x63
  1175. ## multiply by circulant 0,1,1,1
  1176. ## apply shiftrows transform
  1177. ##
  1178. ## On decrypt,
  1179. ## xor with 0x63
  1180. ## multiply by "inverse mixcolumns" circulant E,B,D,9
  1181. ## deskew
  1182. ## apply shiftrows transform
  1183. ##
  1184. ## PPC port: xmm0 is v0, (%rdx) is the out pointer, and r8 indexes the table whose address is in r10.
  1185. ## Writes out to (%rdx), and increments or decrements it
  1186. ## Keeps track of round number mod 4 in %r8
  1187. ## Preserves xmm0
  1188. ## Clobbers xmm1-xmm5 (this port writes v1-v4 and the outhead register; v25 stands in for xmm5 and is only read)
  1189. ##
  1190. .align 4
  1191. _vpaes_schedule_mangle:
  1192. #vmr v4, v0 # vmovdqa %xmm0, %xmm4 # save xmm0 for later
  1193. # vmovdqa .Lk_mc_forward(%rip),%xmm5
  1194. bne $dir, Lschedule_mangle_dec # the dir condition field reads not-equal when set up for decryption
  1195. # encrypting
  1196. vxor v4, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm4
  1197. addi $out, $out, 16 # add \$16, %rdx
  1198. vperm v4, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm4
  1199. vperm v1, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm1
  1200. vperm v3, v1, v1, v25 # vpshufb %xmm5, %xmm1, %xmm3
  1201. vxor v4, v4, v1 # vpxor %xmm1, %xmm4, %xmm4
  1202. lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
  1203. vxor v3, v3, v4 # vpxor %xmm4, %xmm3, %xmm3
  1204. vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
  1205. addi r8, r8, -16 # add \$-16, %r8
  1206. andi. r8, r8, 0x30 # and \$0x30, %r8
  1207. #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
  1208. vperm v1, v3, v3, $outperm # rotate right/left
  1209. vsel v2, $outhead, v1, $outmask # combine with the rotated key carried over from the previous call
  1210. vmr $outhead, v1 # carry this rotated key into the next store
  1211. stvx v2, 0, $out
  1212. blr
  1213. .align 4
  1214. Lschedule_mangle_dec:
  1215. # inverse mix columns
  1216. # lea .Lk_dksd(%rip),%r11
  1217. vsrb v1, v0, v8 # vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
  1218. #and v4, v0, v9 # vpand %xmm9, %xmm4, %xmm4 # 4 = lo
  1219. # vmovdqa 0x00(%r11), %xmm2
  1220. vperm v2, v16, v16, v0 # vpshufb %xmm4, %xmm2, %xmm2
  1221. # vmovdqa 0x10(%r11), %xmm3
  1222. vperm v3, v17, v17, v1 # vpshufb %xmm1, %xmm3, %xmm3
  1223. vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
  1224. vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
  1225. # vmovdqa 0x20(%r11), %xmm2
  1226. vperm v2, v18, v18, v0 # vpshufb %xmm4, %xmm2, %xmm2
  1227. vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
  1228. # vmovdqa 0x30(%r11), %xmm3
  1229. vperm v3, v19, v19, v1 # vpshufb %xmm1, %xmm3, %xmm3
  1230. vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
  1231. vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
  1232. # vmovdqa 0x40(%r11), %xmm2
  1233. vperm v2, v20, v20, v0 # vpshufb %xmm4, %xmm2, %xmm2
  1234. vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
  1235. # vmovdqa 0x50(%r11), %xmm3
  1236. vperm v3, v21, v21, v1 # vpshufb %xmm1, %xmm3, %xmm3
  1237. vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
  1238. # vmovdqa 0x60(%r11), %xmm2
  1239. vperm v2, v22, v22, v0 # vpshufb %xmm4, %xmm2, %xmm2
  1240. vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
  1241. # vmovdqa 0x70(%r11), %xmm4
  1242. vperm v4, v23, v23, v1 # vpshufb %xmm1, %xmm4, %xmm4
  1243. lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
  1244. vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
  1245. vxor v3, v4, v2 # vpxor %xmm2, %xmm4, %xmm3
  1246. addi $out, $out, -16 # add \$-16, %rdx
  1247. vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
  1248. addi r8, r8, -16 # add \$-16, %r8
  1249. andi. r8, r8, 0x30 # and \$0x30, %r8
  1250. #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
  1251. vperm v1, v3, v3, $outperm # rotate right/left
  1252. vsel v2, $outhead, v1, $outmask # combine with the rotated key carried over from the previous call
  1253. vmr $outhead, v1 # carry this rotated key into the next store
  1254. stvx v2, 0, $out
  1255. blr
  1256. .long 0
  1257. .byte 0,12,0x14,0,0,0,0,0
  1258. .globl .vpaes_set_encrypt_key
  1259. .align 5
  1260. .vpaes_set_encrypt_key: # build the encryption key schedule via _vpaes_schedule_core; leaves 0 in r3
  1261. $STU $sp,-$FRAME($sp) # allocate the stack frame
  1262. li r10,`15+6*$SIZE_T` # r10 and r11 start 16 bytes apart and each advance by 32: alternating vector save slots
  1263. li r11,`31+6*$SIZE_T`
  1264. mflr r0 # return address, stored into the frame further down
  1265. mfspr r6, 256 # save vrsave
  1266. stvx v20,r10,$sp # save v20-v31; they are reloaded before returning
  1267. addi r10,r10,32
  1268. stvx v21,r11,$sp
  1269. addi r11,r11,32
  1270. stvx v22,r10,$sp
  1271. addi r10,r10,32
  1272. stvx v23,r11,$sp
  1273. addi r11,r11,32
  1274. stvx v24,r10,$sp
  1275. addi r10,r10,32
  1276. stvx v25,r11,$sp
  1277. addi r11,r11,32
  1278. stvx v26,r10,$sp
  1279. addi r10,r10,32
  1280. stvx v27,r11,$sp
  1281. addi r11,r11,32
  1282. stvx v28,r10,$sp
  1283. addi r10,r10,32
  1284. stvx v29,r11,$sp
  1285. addi r11,r11,32
  1286. stvx v30,r10,$sp
  1287. stvx v31,r11,$sp
  1288. stw r6,`$FRAME-4`($sp) # save vrsave
  1289. li r7, -1 # all-ones value written to vrsave below
  1290. $PUSH r0, `$FRAME+$LRSAVE`($sp) # store the return address
  1291. mtspr 256, r7 # preserve all AltiVec registers
  1292. srwi r9, $bits, 5 # shr \$5,%eax
  1293. addi r9, r9, 6 # add \$5,%eax (x86 reference; this port adds 6)
  1294. stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; (x86 reference; the value stored here is nbits/32+6)
  1295. cmplw $dir, $bits, $bits # set encrypt direction (both operands are the same, so the field reads equal)
  1296. li r8, 0x30 # mov \$0x30,%r8d
  1297. bl _vpaes_schedule_core
  1298. $POP r0, `$FRAME+$LRSAVE`($sp) # reload the return address
  1299. li r10,`15+6*$SIZE_T`
  1300. li r11,`31+6*$SIZE_T`
  1301. mtspr 256, r6 # restore vrsave
  1302. mtlr r0
  1303. xor r3, r3, r3 # r3 = 0
  1304. lvx v20,r10,$sp # restore v20-v31
  1305. addi r10,r10,32
  1306. lvx v21,r11,$sp
  1307. addi r11,r11,32
  1308. lvx v22,r10,$sp
  1309. addi r10,r10,32
  1310. lvx v23,r11,$sp
  1311. addi r11,r11,32
  1312. lvx v24,r10,$sp
  1313. addi r10,r10,32
  1314. lvx v25,r11,$sp
  1315. addi r11,r11,32
  1316. lvx v26,r10,$sp
  1317. addi r10,r10,32
  1318. lvx v27,r11,$sp
  1319. addi r11,r11,32
  1320. lvx v28,r10,$sp
  1321. addi r10,r10,32
  1322. lvx v29,r11,$sp
  1323. addi r11,r11,32
  1324. lvx v30,r10,$sp
  1325. lvx v31,r11,$sp
  1326. addi $sp,$sp,$FRAME # release the stack frame
  1327. blr
  1328. .long 0
  1329. .byte 0,12,0x04,1,0x80,0,3,0
  1330. .long 0
  1331. .size .vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key
  1332. .globl .vpaes_set_decrypt_key
  1333. .align 4
  1334. .vpaes_set_decrypt_key: # build the decryption key schedule via _vpaes_schedule_core; leaves 0 in r3
  1335. $STU $sp,-$FRAME($sp) # allocate the stack frame
  1336. li r10,`15+6*$SIZE_T` # r10 and r11 start 16 bytes apart and each advance by 32: alternating vector save slots
  1337. li r11,`31+6*$SIZE_T`
  1338. mflr r0 # return address, stored into the frame further down
  1339. mfspr r6, 256 # save vrsave
  1340. stvx v20,r10,$sp # save v20-v31; they are reloaded before returning
  1341. addi r10,r10,32
  1342. stvx v21,r11,$sp
  1343. addi r11,r11,32
  1344. stvx v22,r10,$sp
  1345. addi r10,r10,32
  1346. stvx v23,r11,$sp
  1347. addi r11,r11,32
  1348. stvx v24,r10,$sp
  1349. addi r10,r10,32
  1350. stvx v25,r11,$sp
  1351. addi r11,r11,32
  1352. stvx v26,r10,$sp
  1353. addi r10,r10,32
  1354. stvx v27,r11,$sp
  1355. addi r11,r11,32
  1356. stvx v28,r10,$sp
  1357. addi r10,r10,32
  1358. stvx v29,r11,$sp
  1359. addi r11,r11,32
  1360. stvx v30,r10,$sp
  1361. stvx v31,r11,$sp
  1362. stw r6,`$FRAME-4`($sp) # save vrsave
  1363. li r7, -1 # all-ones value written to vrsave below
  1364. $PUSH r0, `$FRAME+$LRSAVE`($sp) # store the return address
  1365. mtspr 256, r7 # preserve all AltiVec registers
  1366. srwi r9, $bits, 5 # shr \$5,%eax
  1367. addi r9, r9, 6 # add \$5,%eax (x86 reference; this port adds 6)
  1368. stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; (x86 reference; the value stored here is nbits/32+6)
  1369. slwi r9, r9, 4 # shl \$4,%eax
  1370. add $out, $out, r9 # lea (%rdx,%rax),%rdx (advance out by 16 bytes per round; the decrypt path then stores at descending addresses)
  1371. cmplwi $dir, $bits, 0 # set decrypt direction (the field reads not-equal for any non-zero bit count)
  1372. srwi r8, $bits, 1 # shr \$1,%r8d
  1373. andi. r8, r8, 32 # and \$32,%r8d
  1374. xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
  1375. bl _vpaes_schedule_core
  1376. $POP r0, `$FRAME+$LRSAVE`($sp) # reload the return address
  1377. li r10,`15+6*$SIZE_T`
  1378. li r11,`31+6*$SIZE_T`
  1379. mtspr 256, r6 # restore vrsave
  1380. mtlr r0
  1381. xor r3, r3, r3 # r3 = 0
  1382. lvx v20,r10,$sp # restore v20-v31
  1383. addi r10,r10,32
  1384. lvx v21,r11,$sp
  1385. addi r11,r11,32
  1386. lvx v22,r10,$sp
  1387. addi r10,r10,32
  1388. lvx v23,r11,$sp
  1389. addi r11,r11,32
  1390. lvx v24,r10,$sp
  1391. addi r10,r10,32
  1392. lvx v25,r11,$sp
  1393. addi r11,r11,32
  1394. lvx v26,r10,$sp
  1395. addi r10,r10,32
  1396. lvx v27,r11,$sp
  1397. addi r11,r11,32
  1398. lvx v28,r10,$sp
  1399. addi r10,r10,32
  1400. lvx v29,r11,$sp
  1401. addi r11,r11,32
  1402. lvx v30,r10,$sp
  1403. lvx v31,r11,$sp
  1404. addi $sp,$sp,$FRAME # release the stack frame
  1405. blr
  1406. .long 0
  1407. .byte 0,12,0x04,1,0x80,0,3,0
  1408. .long 0
  1409. .size .vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
  1410. ___
  1411. }
  # Post-process the generated assembly one line at a time and print it.
  #
  # Each line of $code goes through up to three steps:
  #   1. text between backticks is evaluated as Perl and replaced by its value;
  #   2. while still inside the constants table (until "Lconsts:" is seen),
  #      ".long <words> ?tag" rows are re-emitted as endian-neutral ".byte"
  #      rows and nothing else is done with them;
  #   3. '?'-prefixed instructions are adjusted for the target endianness.
  my $in_const_table = 1;
  my $little_endian  = ($flavour =~ /le$/);

  # Little-endian rewrites, tried in this order.  The first one that
  # matches wins and the remaining ones are skipped for that line.
  my @le_rewrites = (
      sub { $_[0] =~ s/\?lvsr/lvsl/ },
      sub { $_[0] =~ s/\?lvsl/lvsr/ },
      # vperm: swap the two source operands
      sub { $_[0] =~ s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/ },
      # vsldoi: swap the sources and turn the shift N into "16-N"
      sub { $_[0] =~ s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/ },
      # vspltw: turn the word index N into "3-N"
      sub { $_[0] =~ s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/ },
  );

  for my $line (split("\n", $code)) {
      $line =~ s/\`([^\`]*)\`/eval $1/ge;

      # constants table endian-specific conversion
      if ($in_const_table && $line =~ m/\.long\s+(.+)\s+(\?[a-z]*)$/) {
          my ($values, $tag) = ($1, $2);

          # Spell every 32-bit word out as four bytes, most significant first.
          my @bytes;
          for my $word (split(/,\s+/, $values)) {
              my $n = ($word =~ /^0/) ? oct($word) : int($word);
              push @bytes, map { ($n >> $_) & 0xff } 24, 16, 8, 0;
          }

          # Only little-endian targets need the table rows altered.
          if ($little_endian) {
              if ($tag =~ /\?inv/) {
                  $_ ^= 0xf for @bytes;
              }
              elsif ($tag =~ /\?rev/) {
                  @bytes = reverse @bytes;
              }
          }

          print ".byte\t", join(',', map { sprintf("0x%02x", $_) } @bytes), "\n";
          next;
      }

      $in_const_table = 0 if $line =~ m/Lconsts:/;    # end of table

      if ($little_endian) {
          for my $rewrite (@le_rewrites) {
              last if $rewrite->($line);
          }
      }
      else {
          # Big-endian: the instruction is already right, just drop the '?'.
          $line =~ s/\?([a-z]+)/$1/;
      }

      print $line, "\n";
  }

  close STDOUT or die "error closing STDOUT: $!";