#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for AES instructions as per PowerISA
# specification version 2.07, first implemented by the POWER8
# processor. The module is endian-agnostic in the sense that it
# supports both big- and little-endian cases. Data alignment in
# parallelizable modes is handled with VSX loads and stores, which
# implies the MSR.VSX flag being set. It should also be noted that the
# ISA specification doesn't prohibit alignment exceptions for these
# instructions on page boundaries. Initially alignment was handled in
# a pure AltiVec/VMX way [data is aligned programmatically, which in
# turn guarantees exception-free execution], but that turned out to
# hamper performance when vcipher instructions are interleaved. It's
# reckoned that eventual misalignment penalties at page boundaries are
# on average lower than the additional overhead of the pure AltiVec
# approach.
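#
# Like other CRYPTOGAMS perlasm modules, this script is not assembled
# directly: it pipes its output through ppc-xlate.pl. A typical
# invocation (flavour and output name are illustrative) is
#
#	perl aesp8-ppc.pl linux64le aesp8-ppc.s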
$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
	$UCMP	="cmpld";
	$SHL	="sldi";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
	$SHL	="slwi";
} else { die "nonsense $flavour"; }

$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$FRAME=8*$SIZE_T;
$prefix="aes_p8";

$sp="r1";
$vrsave="r12";
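# Note: SPR 256 is VRSAVE; every entry point below saves the caller's
# value in $vrsave, advertises which vector registers it clobbers by
# writing a new mask, and restores the original on exit.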
#########################################################################
{{{	# Key setup procedures						#
my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));

$code.=<<___;
.machine	"any"

.text

.align	7
rcon:
.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	?rev
.long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
.long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
.long	0,0,0,0						?asis
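# Lconsts returns the address of the rcon table above without any
# relocations: bcl 20,31,\$+4 deposits the address of the following
# instruction in LR, and the mflr/addi pair subtracts the fixed
# distance (0x48 bytes) back to rcon.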
Lconsts:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$ptr		# distance between . and rcon
	addi	$ptr,$ptr,-0x48
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.asciz	"AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.globl	.${prefix}_set_encrypt_key
.align	5
.${prefix}_set_encrypt_key:
Lset_encrypt_key:
	mflr		r11
	$PUSH		r11,$LRSAVE($sp)

	li		$ptr,-1
	${UCMP}i	$inp,0
	beq-		Lenc_key_abort		# if ($inp==0) return -1;
	${UCMP}i	$out,0
	beq-		Lenc_key_abort		# if ($out==0) return -1;
	li		$ptr,-2
	cmpwi		$bits,128
	blt-		Lenc_key_abort
	cmpwi		$bits,256
	bgt-		Lenc_key_abort
	andi.		r0,$bits,0x3f
	bne-		Lenc_key_abort

	lis		r0,0xfff0
	mfspr		$vrsave,256
	mtspr		256,r0

	bl		Lconsts
	mtlr		r11

	neg		r9,$inp
	lvx		$in0,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	lvsr		$key,0,r9		# borrow $key
	li		r8,0x20
	cmpwi		$bits,192
	lvx		$in1,0,$inp
	le?vspltisb	$mask,0x0f		# borrow $mask
	lvx		$rcon,0,$ptr
	le?vxor		$key,$key,$mask		# adjust for byte swap
	lvx		$mask,r8,$ptr
	addi		$ptr,$ptr,0x10
	vperm		$in0,$in0,$in1,$key	# align [and byte swap in LE]
	li		$cnt,8
	vxor		$zero,$zero,$zero
	mtctr		$cnt

	?lvsr		$outperm,0,$out
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$zero,$outmask,$outperm

	blt		Loop128
	addi		$inp,$inp,8
	beq		L192
	addi		$inp,$inp,8
	b		L256

.align	4
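# Each 128-bit round key is derived from the previous one: with
# w[0..3] the previous key's 32-bit words,
#	t     = SubWord(RotWord(w[3])) xor rcon
#	w'[0] = w[0] xor t
#	w'[i] = w'[i-1] xor w[i],  i = 1..3
# The vperm "rotate-n-splat" plus vcipherlast against the splatted
# rcon computes t in all four lanes (ShiftRows is a no-op on a splat),
# and the vsldoi/vxor cascade below realizes the running XOR.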
Loop128:
	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	vperm		$outtail,$in0,$in0,$outperm	# rotate
	vsel		$stage,$outhead,$outtail,$outmask
	vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	stvx		$stage,0,$out
	addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vadduwm		$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key
	bdnz		Loop128

	lvx		$rcon,0,$ptr		# last two round keys

	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	vperm		$outtail,$in0,$in0,$outperm	# rotate
	vsel		$stage,$outhead,$outtail,$outmask
	vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	stvx		$stage,0,$out
	addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vadduwm		$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key

	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	vperm		$outtail,$in0,$in0,$outperm	# rotate
	vsel		$stage,$outhead,$outtail,$outmask
	vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	stvx		$stage,0,$out
	addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vxor		$in0,$in0,$key
	vperm		$outtail,$in0,$in0,$outperm	# rotate
	vsel		$stage,$outhead,$outtail,$outmask
	vmr		$outhead,$outtail
	stvx		$stage,0,$out

	addi		$inp,$out,15		# 15 is not typo
	addi		$out,$out,0x50

	li		$rounds,10
	b		Ldone

.align	4
L192:
	lvx		$tmp,0,$inp
	li		$cnt,4
	vperm		$outtail,$in0,$in0,$outperm	# rotate
	vsel		$stage,$outhead,$outtail,$outmask
	vmr		$outhead,$outtail
	stvx		$stage,0,$out
	addi		$out,$out,16
	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
	vspltisb	$key,8			# borrow $key
	mtctr		$cnt
	vsububm		$mask,$mask,$key	# adjust the mask

Loop192:
	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	vcipherlast	$key,$key,$rcon

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp

	vsldoi		$stage,$zero,$in1,8
	vspltw		$tmp,$in0,3
	vxor		$tmp,$tmp,$in1
	vsldoi		$in1,$zero,$in1,12	# >>32
	vadduwm		$rcon,$rcon,$rcon
	vxor		$in1,$in1,$tmp
	vxor		$in0,$in0,$key
	vxor		$in1,$in1,$key
	vsldoi		$stage,$stage,$in0,8

	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	vperm		$outtail,$stage,$stage,$outperm	# rotate
	vsel		$stage,$outhead,$outtail,$outmask
	vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	stvx		$stage,0,$out
	addi		$out,$out,16

	vsldoi		$stage,$in0,$in1,8
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vperm		$outtail,$stage,$stage,$outperm	# rotate
	vsel		$stage,$outhead,$outtail,$outmask
	vmr		$outhead,$outtail
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	stvx		$stage,0,$out
	addi		$out,$out,16

	vspltw		$tmp,$in0,3
	vxor		$tmp,$tmp,$in1
	vsldoi		$in1,$zero,$in1,12	# >>32
	vadduwm		$rcon,$rcon,$rcon
	vxor		$in1,$in1,$tmp
	vxor		$in0,$in0,$key
	vxor		$in1,$in1,$key
	vperm		$outtail,$in0,$in0,$outperm	# rotate
	vsel		$stage,$outhead,$outtail,$outmask
	vmr		$outhead,$outtail
	stvx		$stage,0,$out
	addi		$inp,$out,15		# 15 is not typo
	addi		$out,$out,16
	bdnz		Loop192

	li		$rounds,12
	addi		$out,$out,0x20
	b		Ldone

.align	4
L256:
	lvx		$tmp,0,$inp
	li		$cnt,7
	li		$rounds,14
	vperm		$outtail,$in0,$in0,$outperm	# rotate
	vsel		$stage,$outhead,$outtail,$outmask
	vmr		$outhead,$outtail
	stvx		$stage,0,$out
	addi		$out,$out,16
	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
	mtctr		$cnt

Loop256:
	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	vperm		$outtail,$in1,$in1,$outperm	# rotate
	vsel		$stage,$outhead,$outtail,$outmask
	vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	stvx		$stage,0,$out
	addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vadduwm		$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key
	vperm		$outtail,$in0,$in0,$outperm	# rotate
	vsel		$stage,$outhead,$outtail,$outmask
	vmr		$outhead,$outtail
	stvx		$stage,0,$out
	addi		$inp,$out,15		# 15 is not typo
	addi		$out,$out,16
	bdz		Ldone

	vspltw		$key,$in0,3		# just splat
	vsldoi		$tmp,$zero,$in1,12	# >>32
	vsbox		$key,$key

	vxor		$in1,$in1,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in1,$in1,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in1,$in1,$tmp

	vxor		$in1,$in1,$key
	b		Loop256

.align	4
Ldone:
	lvx		$in1,0,$inp		# redundant in aligned case
	vsel		$in1,$outhead,$in1,$outmask
	stvx		$in1,0,$inp
	li		$ptr,0
	mtspr		256,$vrsave
	stw		$rounds,0($out)

Lenc_key_abort:
	mr		r3,$ptr
	blr
	.long		0
	.byte		0,12,0x14,1,0,0,3,0
	.long		0
.size	.${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
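# set_decrypt_key builds the encryption schedule via Lset_encrypt_key
# and then merely swaps the round keys end-for-end in place; vncipher
# applies the round key before its InvMixColumns step, so the reversed
# encryption schedule can be consumed as-is, with no InvMixColumns
# pass over the keys.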
.globl	.${prefix}_set_decrypt_key
.align	5
.${prefix}_set_decrypt_key:
	$STU		$sp,-$FRAME($sp)
	mflr		r10
	$PUSH		r10,$FRAME+$LRSAVE($sp)
	bl		Lset_encrypt_key
	mtlr		r10

	cmpwi		r3,0
	bne-		Ldec_key_abort

	slwi		$cnt,$rounds,4
	subi		$inp,$out,240		# first round key
	srwi		$rounds,$rounds,1
	add		$out,$inp,$cnt		# last round key
	mtctr		$rounds

Ldeckey:
	lwz		r0, 0($inp)
	lwz		r6, 4($inp)
	lwz		r7, 8($inp)
	lwz		r8, 12($inp)
	addi		$inp,$inp,16
	lwz		r9, 0($out)
	lwz		r10,4($out)
	lwz		r11,8($out)
	lwz		r12,12($out)
	stw		r0, 0($out)
	stw		r6, 4($out)
	stw		r7, 8($out)
	stw		r8, 12($out)
	subi		$out,$out,16
	stw		r9, -16($inp)
	stw		r10,-12($inp)
	stw		r11,-8($inp)
	stw		r12,-4($inp)
	bdnz		Ldeckey

	xor		r3,r3,r3		# return value
Ldec_key_abort:
	addi		$sp,$sp,$FRAME
	blr
	.long		0
	.byte		0,12,4,1,0x80,0,3,0
	.long		0
.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
___
}}}
#########################################################################
{{{	# Single block en- and decrypt procedures			#
sub gen_block () {
my $dir = shift;
my $n   = $dir eq "de" ? "n" : "";
my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));

$code.=<<___;
.globl	.${prefix}_${dir}crypt
.align	5
.${prefix}_${dir}crypt:
	lwz		$rounds,240($key)
	lis		r0,0xfc00
	mfspr		$vrsave,256
	li		$idx,15			# 15 is not typo
	mtspr		256,r0

	lvx		v0,0,$inp
	neg		r11,$out
	lvx		v1,$idx,$inp
	lvsl		v2,0,$inp		# inpperm
	le?vspltisb	v4,0x0f
	?lvsl		v3,0,r11		# outperm
	le?vxor		v2,v2,v4
	li		$idx,16
	vperm		v0,v0,v1,v2		# align [and byte swap in LE]
	lvx		v1,0,$key
	?lvsl		v5,0,$key		# keyperm
	srwi		$rounds,$rounds,1
	lvx		v2,$idx,$key
	addi		$idx,$idx,16
	subi		$rounds,$rounds,1
	?vperm		v1,v1,v2,v5		# align round key

	vxor		v0,v0,v1
	lvx		v1,$idx,$key
	addi		$idx,$idx,16
	mtctr		$rounds
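	# CTR was loaded with (rounds/2 - 1): each iteration of the loop
	# below applies two rounds, aligning the round-key pair on the
	# fly with ?vperm, and the trailing v${n}cipher/v${n}cipherlast
	# pair finishes the block.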
Loop_${dir}c:
	?vperm		v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx		v2,$idx,$key
	addi		$idx,$idx,16
	?vperm		v1,v1,v2,v5
	v${n}cipher	v0,v0,v1
	lvx		v1,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_${dir}c

	?vperm		v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx		v2,$idx,$key
	?vperm		v1,v1,v2,v5
	v${n}cipherlast	v0,v0,v1

	vspltisb	v2,-1
	vxor		v1,v1,v1
	li		$idx,15			# 15 is not typo
	?vperm		v2,v1,v2,v3		# outmask
	le?vxor		v3,v3,v4
	lvx		v1,0,$out		# outhead
	vperm		v0,v0,v0,v3		# rotate [and byte swap in LE]
	vsel		v1,v1,v0,v2
	lvx		v4,$idx,$out
	stvx		v1,0,$out
	vsel		v0,v0,v4,v2
	stvx		v0,$idx,$out

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,3,0
	.long		0
.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
#########################################################################
{{{	# CBC en- and decrypt procedures				#
my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
						map("v$_",(4..10));

$code.=<<___;
.globl	.${prefix}_cbc_encrypt
.align	5
.${prefix}_cbc_encrypt:
	${UCMP}i	$len,16
	bltlr-

	cmpwi		$enc,0			# test direction
	lis		r0,0xffe0
	mfspr		$vrsave,256
	mtspr		256,r0

	li		$idx,15
	vxor		$rndkey0,$rndkey0,$rndkey0
	le?vspltisb	$tmp,0x0f

	lvx		$ivec,0,$ivp		# load [unaligned] iv
	lvsl		$inpperm,0,$ivp
	lvx		$inptail,$idx,$ivp
	le?vxor		$inpperm,$inpperm,$tmp
	vperm		$ivec,$ivec,$inptail,$inpperm

	neg		r11,$inp
	?lvsl		$keyperm,0,$key		# prepare for unaligned key
	lwz		$rounds,240($key)

	lvsr		$inpperm,0,r11		# prepare for unaligned load
	lvx		$inptail,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	le?vxor		$inpperm,$inpperm,$tmp

	?lvsr		$outperm,0,$out		# prepare for unaligned store
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	le?vxor		$outperm,$outperm,$tmp

	srwi		$rounds,$rounds,1
	li		$idx,16
	subi		$rounds,$rounds,1
	beq		Lcbc_dec
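# CBC encryption is inherently serial -- each block's cipher input is
# plaintext XOR previous ciphertext -- so it proceeds one block at a
# time; decryption has no such dependency and takes the 8x-wide path
# below whenever at least 128 bytes remain.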
Lcbc_enc:
	vmr		$inout,$inptail
	lvx		$inptail,0,$inp
	addi		$inp,$inp,16
	mtctr		$rounds
	subi		$len,$len,16		# len-=16

	lvx		$rndkey0,0,$key
	vperm		$inout,$inout,$inptail,$inpperm
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor		$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	vxor		$inout,$inout,$ivec

Loop_cbc_enc:
	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipher		$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_cbc_enc

	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	li		$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipherlast	$ivec,$inout,$rndkey0
	${UCMP}i	$len,16

	vperm		$tmp,$ivec,$ivec,$outperm
	vsel		$inout,$outhead,$tmp,$outmask
	vmr		$outhead,$tmp
	stvx		$inout,0,$out
	addi		$out,$out,16
	bge		Lcbc_enc
	b		Lcbc_done

.align	4
Lcbc_dec:
	${UCMP}i	$len,128
	bge		_aesp8_cbc_decrypt8x
	vmr		$tmp,$inptail
	lvx		$inptail,0,$inp
	addi		$inp,$inp,16
	mtctr		$rounds
	subi		$len,$len,16		# len-=16

	lvx		$rndkey0,0,$key
	vperm		$tmp,$tmp,$inptail,$inpperm
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor		$inout,$tmp,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16

Loop_cbc_dec:
	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vncipher	$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vncipher	$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_cbc_dec

	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vncipher	$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	li		$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vncipherlast	$inout,$inout,$rndkey0
	${UCMP}i	$len,16

	vxor		$inout,$inout,$ivec
	vmr		$ivec,$tmp
	vperm		$tmp,$inout,$inout,$outperm
	vsel		$inout,$outhead,$tmp,$outmask
	vmr		$outhead,$tmp
	stvx		$inout,0,$out
	addi		$out,$out,16
	bge		Lcbc_dec

Lcbc_done:
	addi		$out,$out,-1
	lvx		$inout,0,$out		# redundant in aligned case
	vsel		$inout,$outhead,$inout,$outmask
	stvx		$inout,0,$out

	neg		$enc,$ivp		# write [unaligned] iv
	li		$idx,15			# 15 is not typo
	vxor		$rndkey0,$rndkey0,$rndkey0
	vspltisb	$outmask,-1
	le?vspltisb	$tmp,0x0f
	?lvsl		$outperm,0,$enc
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	le?vxor		$outperm,$outperm,$tmp
	lvx		$outhead,0,$ivp
	vperm		$ivec,$ivec,$ivec,$outperm
	vsel		$inout,$outhead,$ivec,$outmask
	lvx		$inptail,$idx,$ivp
	stvx		$inout,0,$ivp
	vsel		$inout,$ivec,$inptail,$outmask
	stvx		$inout,$idx,$ivp

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,6,0
	.long		0
___
#########################################################################
{{	# Optimized CBC decrypt procedure				#
my $key_="r11";
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
    $x00=0 if ($flavour =~ /osx/);
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
my $rndkey0="v23";	# v24-v25 rotating buffer for first few round keys
			# v26-v31 last 6 round keys
my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment

$code.=<<___;
.align	5
_aesp8_cbc_decrypt8x:
	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
	li		r10,`$FRAME+8*16+15`
	li		r11,`$FRAME+8*16+31`
	stvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	stvx		v21,r11,$sp
	addi		r11,r11,32
	stvx		v22,r10,$sp
	addi		r10,r10,32
	stvx		v23,r11,$sp
	addi		r11,r11,32
	stvx		v24,r10,$sp
	addi		r10,r10,32
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
	addi		r10,r10,32
	stvx		v27,r11,$sp
	addi		r11,r11,32
	stvx		v28,r10,$sp
	addi		r10,r10,32
	stvx		v29,r11,$sp
	addi		r11,r11,32
	stvx		v30,r10,$sp
	stvx		v31,r11,$sp
	li		r0,-1
	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
	li		$x10,0x10
	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	li		$x20,0x20
	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	li		$x30,0x30
	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	li		$x40,0x40
	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	li		$x50,0x50
	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	li		$x60,0x60
	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	li		$x70,0x70
	mtspr		256,r0

	subi		$rounds,$rounds,3	# -4 in total
	subi		$len,$len,128		# bias

	lvx		$rndkey0,$x00,$key	# load key schedule
	lvx		v30,$x10,$key
	addi		$key,$key,0x20
	lvx		v31,$x00,$key
	?vperm		$rndkey0,$rndkey0,v30,$keyperm
	addi		$key_,$sp,$FRAME+15
	mtctr		$rounds
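	# Pre-align the whole key schedule once: round keys that don't
	# fit in v26-v31 are ?vperm-aligned and parked in a stack buffer
	# at $key_, through which v24/v25 rotate during the main loop;
	# the last six round keys stay resident in v26-v31.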
Load_cbc_dec_key:
	?vperm		v24,v30,v31,$keyperm
	lvx		v30,$x10,$key
	addi		$key,$key,0x20
	stvx		v24,$x00,$key_		# off-load round[1]
	?vperm		v25,v31,v30,$keyperm
	lvx		v31,$x00,$key
	stvx		v25,$x10,$key_		# off-load round[2]
	addi		$key_,$key_,0x20
	bdnz		Load_cbc_dec_key

	lvx		v26,$x10,$key
	?vperm		v24,v30,v31,$keyperm
	lvx		v27,$x20,$key
	stvx		v24,$x00,$key_		# off-load round[3]
	?vperm		v25,v31,v26,$keyperm
	lvx		v28,$x30,$key
	stvx		v25,$x10,$key_		# off-load round[4]
	addi		$key_,$sp,$FRAME+15	# rewind $key_
	?vperm		v26,v26,v27,$keyperm
	lvx		v29,$x40,$key
	?vperm		v27,v27,v28,$keyperm
	lvx		v30,$x50,$key
	?vperm		v28,v28,v29,$keyperm
	lvx		v31,$x60,$key
	?vperm		v29,v29,v30,$keyperm
	lvx		$out0,$x70,$key		# borrow $out0
	?vperm		v30,v30,v31,$keyperm
	lvx		v24,$x00,$key_		# pre-load round[1]
	?vperm		v31,v31,$out0,$keyperm
	lvx		v25,$x10,$key_		# pre-load round[2]

	#lvx		$inptail,0,$inp		# "caller" already did this
	#addi		$inp,$inp,15		# 15 is not typo
	subi		$inp,$inp,15		# undo "caller"
	le?li		$idx,8
	lvx_u		$in0,$x00,$inp		# load first 8 "words"
	le?lvsl		$inpperm,0,$idx
	le?vspltisb	$tmp,0x0f
	lvx_u		$in1,$x10,$inp
	le?vxor		$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
	lvx_u		$in2,$x20,$inp
	le?vperm	$in0,$in0,$in0,$inpperm
	lvx_u		$in3,$x30,$inp
	le?vperm	$in1,$in1,$in1,$inpperm
	lvx_u		$in4,$x40,$inp
	le?vperm	$in2,$in2,$in2,$inpperm
	vxor		$out0,$in0,$rndkey0
	lvx_u		$in5,$x50,$inp
	le?vperm	$in3,$in3,$in3,$inpperm
	vxor		$out1,$in1,$rndkey0
	lvx_u		$in6,$x60,$inp
	le?vperm	$in4,$in4,$in4,$inpperm
	vxor		$out2,$in2,$rndkey0
	lvx_u		$in7,$x70,$inp
	addi		$inp,$inp,0x80
	le?vperm	$in5,$in5,$in5,$inpperm
	vxor		$out3,$in3,$rndkey0
	le?vperm	$in6,$in6,$in6,$inpperm
	vxor		$out4,$in4,$rndkey0
	le?vperm	$in7,$in7,$in7,$inpperm
	vxor		$out5,$in5,$rndkey0
	vxor		$out6,$in6,$rndkey0
	vxor		$out7,$in7,$rndkey0

	mtctr		$rounds
	b		Loop_cbc_dec8x
.align	5
Loop_cbc_dec8x:
	vncipher	$out0,$out0,v24
	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vncipher	$out0,$out0,v25
	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25
	lvx		v25,$x10,$key_		# round[4]
	bdnz		Loop_cbc_dec8x

	subic		$len,$len,128		# $len-=128
	vncipher	$out0,$out0,v24
	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24

	subfe.		r0,r0,r0		# borrow?-1:0
	vncipher	$out0,$out0,v25
	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25

	and		r0,r0,$len
	vncipher	$out0,$out0,v26
	vncipher	$out1,$out1,v26
	vncipher	$out2,$out2,v26
	vncipher	$out3,$out3,v26
	vncipher	$out4,$out4,v26
	vncipher	$out5,$out5,v26
	vncipher	$out6,$out6,v26
	vncipher	$out7,$out7,v26

	add		$inp,$inp,r0		# $inp is adjusted in such
						# way that at exit from the
						# loop inX-in7 are loaded
						# with last "words"
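						# (r0 is zero when another
						# full 128-byte pass follows,
						# otherwise the negative
						# distance that rewinds $inp
						# onto the final 128 bytes)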
	vncipher	$out0,$out0,v27
	vncipher	$out1,$out1,v27
	vncipher	$out2,$out2,v27
	vncipher	$out3,$out3,v27
	vncipher	$out4,$out4,v27
	vncipher	$out5,$out5,v27
	vncipher	$out6,$out6,v27
	vncipher	$out7,$out7,v27

	addi		$key_,$sp,$FRAME+15	# rewind $key_
	vncipher	$out0,$out0,v28
	vncipher	$out1,$out1,v28
	vncipher	$out2,$out2,v28
	vncipher	$out3,$out3,v28
	vncipher	$out4,$out4,v28
	vncipher	$out5,$out5,v28
	vncipher	$out6,$out6,v28
	vncipher	$out7,$out7,v28
	lvx		v24,$x00,$key_		# re-pre-load round[1]

	vncipher	$out0,$out0,v29
	vncipher	$out1,$out1,v29
	vncipher	$out2,$out2,v29
	vncipher	$out3,$out3,v29
	vncipher	$out4,$out4,v29
	vncipher	$out5,$out5,v29
	vncipher	$out6,$out6,v29
	vncipher	$out7,$out7,v29
	lvx		v25,$x10,$key_		# re-pre-load round[2]

	vncipher	$out0,$out0,v30
	vxor		$ivec,$ivec,v31		# xor with last round key
	vncipher	$out1,$out1,v30
	vxor		$in0,$in0,v31
	vncipher	$out2,$out2,v30
	vxor		$in1,$in1,v31
	vncipher	$out3,$out3,v30
	vxor		$in2,$in2,v31
	vncipher	$out4,$out4,v30
	vxor		$in3,$in3,v31
	vncipher	$out5,$out5,v30
	vxor		$in4,$in4,v31
	vncipher	$out6,$out6,v30
	vxor		$in5,$in5,v31
	vncipher	$out7,$out7,v30
	vxor		$in6,$in6,v31

	vncipherlast	$out0,$out0,$ivec
	vncipherlast	$out1,$out1,$in0
	lvx_u		$in0,$x00,$inp		# load next input block
	vncipherlast	$out2,$out2,$in1
	lvx_u		$in1,$x10,$inp
	vncipherlast	$out3,$out3,$in2
	le?vperm	$in0,$in0,$in0,$inpperm
	lvx_u		$in2,$x20,$inp
	vncipherlast	$out4,$out4,$in3
	le?vperm	$in1,$in1,$in1,$inpperm
	lvx_u		$in3,$x30,$inp
	vncipherlast	$out5,$out5,$in4
	le?vperm	$in2,$in2,$in2,$inpperm
	lvx_u		$in4,$x40,$inp
	vncipherlast	$out6,$out6,$in5
	le?vperm	$in3,$in3,$in3,$inpperm
	lvx_u		$in5,$x50,$inp
	vncipherlast	$out7,$out7,$in6
	le?vperm	$in4,$in4,$in4,$inpperm
	lvx_u		$in6,$x60,$inp
	vmr		$ivec,$in7
	le?vperm	$in5,$in5,$in5,$inpperm
	lvx_u		$in7,$x70,$inp
	addi		$inp,$inp,0x80

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$in6,$in6,$in6,$inpperm
	vxor		$out0,$in0,$rndkey0
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$in7,$in7,$in7,$inpperm
	vxor		$out1,$in1,$rndkey0
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	vxor		$out2,$in2,$rndkey0
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	vxor		$out3,$in3,$rndkey0
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	vxor		$out4,$in4,$rndkey0
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x50,$out
	vxor		$out5,$in5,$rndkey0
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x60,$out
	vxor		$out6,$in6,$rndkey0
	stvx_u		$out7,$x70,$out
	addi		$out,$out,0x80
	vxor		$out7,$in7,$rndkey0

	mtctr		$rounds
	beq		Loop_cbc_dec8x		# did $len-=128 borrow?

	addic.		$len,$len,128
	beq		Lcbc_dec8x_done
	nop
	nop
Loop_cbc_dec8x_tail:				# up to 7 "words" tail...
	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25
	lvx		v25,$x10,$key_		# round[4]
	bdnz		Loop_cbc_dec8x_tail

	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24

	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25

	vncipher	$out1,$out1,v26
	vncipher	$out2,$out2,v26
	vncipher	$out3,$out3,v26
	vncipher	$out4,$out4,v26
	vncipher	$out5,$out5,v26
	vncipher	$out6,$out6,v26
	vncipher	$out7,$out7,v26

	vncipher	$out1,$out1,v27
	vncipher	$out2,$out2,v27
	vncipher	$out3,$out3,v27
	vncipher	$out4,$out4,v27
	vncipher	$out5,$out5,v27
	vncipher	$out6,$out6,v27
	vncipher	$out7,$out7,v27

	vncipher	$out1,$out1,v28
	vncipher	$out2,$out2,v28
	vncipher	$out3,$out3,v28
	vncipher	$out4,$out4,v28
	vncipher	$out5,$out5,v28
	vncipher	$out6,$out6,v28
	vncipher	$out7,$out7,v28

	vncipher	$out1,$out1,v29
	vncipher	$out2,$out2,v29
	vncipher	$out3,$out3,v29
	vncipher	$out4,$out4,v29
	vncipher	$out5,$out5,v29
	vncipher	$out6,$out6,v29
	vncipher	$out7,$out7,v29

	vncipher	$out1,$out1,v30
	vxor		$ivec,$ivec,v31		# last round key
	vncipher	$out2,$out2,v30
	vxor		$in1,$in1,v31
	vncipher	$out3,$out3,v30
	vxor		$in2,$in2,v31
	vncipher	$out4,$out4,v30
	vxor		$in3,$in3,v31
	vncipher	$out5,$out5,v30
	vxor		$in4,$in4,v31
	vncipher	$out6,$out6,v30
	vxor		$in5,$in5,v31
	vncipher	$out7,$out7,v30
	vxor		$in6,$in6,v31

	cmplwi		$len,32			# switch($len)
	blt		Lcbc_dec8x_one
	nop
	beq		Lcbc_dec8x_two
	cmplwi		$len,64
	blt		Lcbc_dec8x_three
	nop
	beq		Lcbc_dec8x_four
	cmplwi		$len,96
	blt		Lcbc_dec8x_five
	nop
	beq		Lcbc_dec8x_six

Lcbc_dec8x_seven:
	vncipherlast	$out1,$out1,$ivec
	vncipherlast	$out2,$out2,$in1
	vncipherlast	$out3,$out3,$in2
	vncipherlast	$out4,$out4,$in3
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out1,$out1,$out1,$inpperm
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x00,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x10,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x20,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x30,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x40,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x50,$out
	stvx_u		$out7,$x60,$out
	addi		$out,$out,0x70
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_six:
	vncipherlast	$out2,$out2,$ivec
	vncipherlast	$out3,$out3,$in2
	vncipherlast	$out4,$out4,$in3
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out2,$out2,$out2,$inpperm
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x00,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x10,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x20,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x30,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x40,$out
	stvx_u		$out7,$x50,$out
	addi		$out,$out,0x60
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_five:
	vncipherlast	$out3,$out3,$ivec
	vncipherlast	$out4,$out4,$in3
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out3,$out3,$out3,$inpperm
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x00,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x10,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x20,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x30,$out
	stvx_u		$out7,$x40,$out
	addi		$out,$out,0x50
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_four:
	vncipherlast	$out4,$out4,$ivec
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out4,$out4,$out4,$inpperm
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x00,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x10,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x20,$out
	stvx_u		$out7,$x30,$out
	addi		$out,$out,0x40
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_three:
	vncipherlast	$out5,$out5,$ivec
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out5,$out5,$out5,$inpperm
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x00,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x10,$out
	stvx_u		$out7,$x20,$out
	addi		$out,$out,0x30
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_two:
	vncipherlast	$out6,$out6,$ivec
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out6,$out6,$out6,$inpperm
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x00,$out
	stvx_u		$out7,$x10,$out
	addi		$out,$out,0x20
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_one:
	vncipherlast	$out7,$out7,$ivec
	vmr		$ivec,$in7

	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out7,0,$out
	addi		$out,$out,0x10
Lcbc_dec8x_done:
	le?vperm	$ivec,$ivec,$ivec,$inpperm
	stvx_u		$ivec,0,$ivp		# write [unaligned] iv

	li		r10,`$FRAME+15`
	li		r11,`$FRAME+31`
	stvx		$inpperm,r10,$sp	# wipe copies of round keys
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32

	mtspr		256,$vrsave
	lvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	lvx		v21,r11,$sp
	addi		r11,r11,32
	lvx		v22,r10,$sp
	addi		r10,r10,32
	lvx		v23,r11,$sp
	addi		r11,r11,32
	lvx		v24,r10,$sp
	addi		r10,r10,32
	lvx		v25,r11,$sp
	addi		r11,r11,32
	lvx		v26,r10,$sp
	addi		r10,r10,32
	lvx		v27,r11,$sp
	addi		r11,r11,32
	lvx		v28,r10,$sp
	addi		r10,r10,32
	lvx		v29,r11,$sp
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
	blr
	.long		0
	.byte		0,12,0x04,0,0x80,6,6,0
	.long		0
.size	.${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
___
}}	}}}
#########################################################################
{{{	# CTR procedure[s]						#
my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
						map("v$_",(4..11));
my $dat=$tmp;

$code.=<<___;
.globl	.${prefix}_ctr32_encrypt_blocks
.align	5
.${prefix}_ctr32_encrypt_blocks:
	${UCMP}i	$len,1
	bltlr-

	lis		r0,0xfff0
	mfspr		$vrsave,256
	mtspr		256,r0

	li		$idx,15
	vxor		$rndkey0,$rndkey0,$rndkey0
	le?vspltisb	$tmp,0x0f

	lvx		$ivec,0,$ivp		# load [unaligned] iv
	lvsl		$inpperm,0,$ivp
	lvx		$inptail,$idx,$ivp
	vspltisb	$one,1
	le?vxor		$inpperm,$inpperm,$tmp
	vperm		$ivec,$ivec,$inptail,$inpperm
	vsldoi		$one,$rndkey0,$one,1
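	# $one is now {0,0,0,1}: vspltisb filled it with 0x01 bytes and
	# the vsldoi kept only the last one, so vadduwm below bumps just
	# the low-order 32-bit counter word of the IV.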
	neg		r11,$inp
	?lvsl		$keyperm,0,$key		# prepare for unaligned key
	lwz		$rounds,240($key)

	lvsr		$inpperm,0,r11		# prepare for unaligned load
	lvx		$inptail,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	le?vxor		$inpperm,$inpperm,$tmp

	srwi		$rounds,$rounds,1
	li		$idx,16
	subi		$rounds,$rounds,1

	${UCMP}i	$len,8
	bge		_aesp8_ctr32_encrypt8x

	?lvsr		$outperm,0,$out		# prepare for unaligned store
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	le?vxor		$outperm,$outperm,$tmp

	lvx		$rndkey0,0,$key
	mtctr		$rounds
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor		$inout,$ivec,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	b		Loop_ctr32_enc

.align	5
Loop_ctr32_enc:
	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipher		$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_ctr32_enc

	vadduwm		$ivec,$ivec,$one
	vmr		$dat,$inptail
	lvx		$inptail,0,$inp
	addi		$inp,$inp,16
	subic.		$len,$len,1		# blocks--

	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	vperm		$dat,$dat,$inptail,$inpperm
	li		$idx,16
	?vperm		$rndkey1,$rndkey0,$rndkey1,$keyperm
	lvx		$rndkey0,0,$key
	vxor		$dat,$dat,$rndkey1	# last round key
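	# Trick: vcipherlast XORs its second operand in as the round
	# key, so feeding it (input block xor last round key) yields
	# E(counter) xor input -- the CTR ciphertext -- in one go.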
	vcipherlast	$inout,$inout,$dat

	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	vperm		$inout,$inout,$inout,$outperm
	vsel		$dat,$outhead,$inout,$outmask
	mtctr		$rounds
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vmr		$outhead,$inout
	vxor		$inout,$ivec,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	stvx		$dat,0,$out
	addi		$out,$out,16
	bne		Loop_ctr32_enc

	addi		$out,$out,-1
	lvx		$inout,0,$out		# redundant in aligned case
	vsel		$inout,$outhead,$inout,$outmask
	stvx		$inout,0,$out

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,6,0
	.long		0
___
#########################################################################
{{	# Optimized CTR procedure					#
my $key_="r11";
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
    $x00=0 if ($flavour =~ /osx/);
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
my $rndkey0="v23";	# v24-v25 rotating buffer for first few round keys
			# v26-v31 last 6 round keys
my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
my ($two,$three,$four)=($outhead,$outperm,$outmask);

$code.=<<___;
.align	5
_aesp8_ctr32_encrypt8x:
	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
	li		r10,`$FRAME+8*16+15`
	li		r11,`$FRAME+8*16+31`
	stvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	stvx		v21,r11,$sp
	addi		r11,r11,32
	stvx		v22,r10,$sp
	addi		r10,r10,32
	stvx		v23,r11,$sp
	addi		r11,r11,32
	stvx		v24,r10,$sp
	addi		r10,r10,32
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
	addi		r10,r10,32
	stvx		v27,r11,$sp
	addi		r11,r11,32
	stvx		v28,r10,$sp
	addi		r10,r10,32
	stvx		v29,r11,$sp
	addi		r11,r11,32
	stvx		v30,r10,$sp
	stvx		v31,r11,$sp
	li		r0,-1
	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
	li		$x10,0x10
	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	li		$x20,0x20
	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	li		$x30,0x30
	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	li		$x40,0x40
	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	li		$x50,0x50
	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	li		$x60,0x60
	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	li		$x70,0x70
	mtspr		256,r0

	subi		$rounds,$rounds,3	# -4 in total

	lvx		$rndkey0,$x00,$key	# load key schedule
	lvx		v30,$x10,$key
	addi		$key,$key,0x20
	lvx		v31,$x00,$key
	?vperm		$rndkey0,$rndkey0,v30,$keyperm
	addi		$key_,$sp,$FRAME+15
	mtctr		$rounds
Load_ctr32_enc_key:
	?vperm		v24,v30,v31,$keyperm
	lvx		v30,$x10,$key
	addi		$key,$key,0x20
	stvx		v24,$x00,$key_		# off-load round[1]
	?vperm		v25,v31,v30,$keyperm
	lvx		v31,$x00,$key
	stvx		v25,$x10,$key_		# off-load round[2]
	addi		$key_,$key_,0x20
	bdnz		Load_ctr32_enc_key

	lvx		v26,$x10,$key
	?vperm		v24,v30,v31,$keyperm
	lvx		v27,$x20,$key
	stvx		v24,$x00,$key_		# off-load round[3]
	?vperm		v25,v31,v26,$keyperm
	lvx		v28,$x30,$key
	stvx		v25,$x10,$key_		# off-load round[4]
	addi		$key_,$sp,$FRAME+15	# rewind $key_
	?vperm		v26,v26,v27,$keyperm
	lvx		v29,$x40,$key
	?vperm		v27,v27,v28,$keyperm
	lvx		v30,$x50,$key
	?vperm		v28,v28,v29,$keyperm
	lvx		v31,$x60,$key
	?vperm		v29,v29,v30,$keyperm
	lvx		$out0,$x70,$key		# borrow $out0
	?vperm		v30,v30,v31,$keyperm
	lvx		v24,$x00,$key_		# pre-load round[1]
	?vperm		v31,v31,$out0,$keyperm
	lvx		v25,$x10,$key_		# pre-load round[2]

	vadduwm		$two,$one,$one
	subi		$inp,$inp,15		# undo "caller"
	$SHL		$len,$len,4
	vadduwm		$out1,$ivec,$one	# counter values ...
	vadduwm		$out2,$ivec,$two
	vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
	le?li		$idx,8
	vadduwm		$out3,$out1,$two
	vxor		$out1,$out1,$rndkey0
	le?lvsl		$inpperm,0,$idx
	vadduwm		$out4,$out2,$two
	vxor		$out2,$out2,$rndkey0
	le?vspltisb	$tmp,0x0f
	vadduwm		$out5,$out3,$two
	vxor		$out3,$out3,$rndkey0
	le?vxor		$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
	vadduwm		$out6,$out4,$two
	vxor		$out4,$out4,$rndkey0
	vadduwm		$out7,$out5,$two
	vxor		$out5,$out5,$rndkey0
	vadduwm		$ivec,$out6,$two	# next counter value
	vxor		$out6,$out6,$rndkey0
	vxor		$out7,$out7,$rndkey0

	mtctr		$rounds
	b		Loop_ctr32_enc8x

.align	5
Loop_ctr32_enc8x:
	vcipher		$out0,$out0,v24
	vcipher		$out1,$out1,v24
	vcipher		$out2,$out2,v24
	vcipher		$out3,$out3,v24
	vcipher		$out4,$out4,v24
	vcipher		$out5,$out5,v24
	vcipher		$out6,$out6,v24
	vcipher		$out7,$out7,v24
Loop_ctr32_enc8x_middle:
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vcipher		$out0,$out0,v25
	vcipher		$out1,$out1,v25
	vcipher		$out2,$out2,v25
	vcipher		$out3,$out3,v25
	vcipher		$out4,$out4,v25
	vcipher		$out5,$out5,v25
	vcipher		$out6,$out6,v25
	vcipher		$out7,$out7,v25
	lvx		v25,$x10,$key_		# round[4]
	bdnz		Loop_ctr32_enc8x

	subic		r11,$len,256		# $len-256, borrow $key_
	vcipher		$out0,$out0,v24
	vcipher		$out1,$out1,v24
	vcipher		$out2,$out2,v24
	vcipher		$out3,$out3,v24
	vcipher		$out4,$out4,v24
	vcipher		$out5,$out5,v24
	vcipher		$out6,$out6,v24
	vcipher		$out7,$out7,v24

	subfe		r0,r0,r0		# borrow?-1:0
	vcipher		$out0,$out0,v25
	vcipher		$out1,$out1,v25
	vcipher		$out2,$out2,v25
	vcipher		$out3,$out3,v25
	vcipher		$out4,$out4,v25
	vcipher		$out5,$out5,v25
	vcipher		$out6,$out6,v25
	vcipher		$out7,$out7,v25

	and		r0,r0,r11
	addi		$key_,$sp,$FRAME+15	# rewind $key_
	vcipher		$out0,$out0,v26
	vcipher		$out1,$out1,v26
	vcipher		$out2,$out2,v26
	vcipher		$out3,$out3,v26
	vcipher		$out4,$out4,v26
	vcipher		$out5,$out5,v26
	vcipher		$out6,$out6,v26
	vcipher		$out7,$out7,v26
	lvx		v24,$x00,$key_		# re-pre-load round[1]

	subic		$len,$len,129		# $len-=129
	vcipher		$out0,$out0,v27
	addi		$len,$len,1		# $len-=128 really
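	# subic by 129 rather than 128 makes the borrow fire when
	# exactly 128 bytes remain as well: the eight blocks already in
	# flight are the last ones then; the addi above restores the
	# true $len-=128.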
	vcipher		$out1,$out1,v27
	vcipher		$out2,$out2,v27
	vcipher		$out3,$out3,v27
	vcipher		$out4,$out4,v27
	vcipher		$out5,$out5,v27
	vcipher		$out6,$out6,v27
	vcipher		$out7,$out7,v27
	lvx		v25,$x10,$key_		# re-pre-load round[2]

	vcipher		$out0,$out0,v28
	lvx_u		$in0,$x00,$inp		# load input
	vcipher		$out1,$out1,v28
	lvx_u		$in1,$x10,$inp
	vcipher		$out2,$out2,v28
	lvx_u		$in2,$x20,$inp
	vcipher		$out3,$out3,v28
	lvx_u		$in3,$x30,$inp
	vcipher		$out4,$out4,v28
	lvx_u		$in4,$x40,$inp
	vcipher		$out5,$out5,v28
	lvx_u		$in5,$x50,$inp
	vcipher		$out6,$out6,v28
	lvx_u		$in6,$x60,$inp
	vcipher		$out7,$out7,v28
	lvx_u		$in7,$x70,$inp
	addi		$inp,$inp,0x80

	vcipher		$out0,$out0,v29
	le?vperm	$in0,$in0,$in0,$inpperm
	vcipher		$out1,$out1,v29
	le?vperm	$in1,$in1,$in1,$inpperm
	vcipher		$out2,$out2,v29
	le?vperm	$in2,$in2,$in2,$inpperm
	vcipher		$out3,$out3,v29
	le?vperm	$in3,$in3,$in3,$inpperm
	vcipher		$out4,$out4,v29
	le?vperm	$in4,$in4,$in4,$inpperm
	vcipher		$out5,$out5,v29
	le?vperm	$in5,$in5,$in5,$inpperm
	vcipher		$out6,$out6,v29
	le?vperm	$in6,$in6,$in6,$inpperm
	vcipher		$out7,$out7,v29
	le?vperm	$in7,$in7,$in7,$inpperm

	add		$inp,$inp,r0		# $inp is adjusted in such
						# way that at exit from the
						# loop inX-in7 are loaded
						# with last "words"
	subfe.		r0,r0,r0		# borrow?-1:0

	vcipher		$out0,$out0,v30
	vxor		$in0,$in0,v31		# xor with last round key
	vcipher		$out1,$out1,v30
	vxor		$in1,$in1,v31
	vcipher		$out2,$out2,v30
	vxor		$in2,$in2,v31
	vcipher		$out3,$out3,v30
	vxor		$in3,$in3,v31
	vcipher		$out4,$out4,v30
	vxor		$in4,$in4,v31
	vcipher		$out5,$out5,v30
	vxor		$in5,$in5,v31
	vcipher		$out6,$out6,v30
	vxor		$in6,$in6,v31
	vcipher		$out7,$out7,v30
	vxor		$in7,$in7,v31

	bne		Lctr32_enc8x_break	# did $len-129 borrow?
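	# As in the scalar path, each input block was pre-XORed with the
	# last round key above, so the vcipherlast below produces
	# E(counter) xor plaintext directly.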
	vcipherlast	$in0,$out0,$in0
	vcipherlast	$in1,$out1,$in1
	vadduwm		$out1,$ivec,$one	# counter values ...
	vcipherlast	$in2,$out2,$in2
	vadduwm		$out2,$ivec,$two
	vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
	vcipherlast	$in3,$out3,$in3
	vadduwm		$out3,$out1,$two
	vxor		$out1,$out1,$rndkey0
	vcipherlast	$in4,$out4,$in4
	vadduwm		$out4,$out2,$two
	vxor		$out2,$out2,$rndkey0
	vcipherlast	$in5,$out5,$in5
	vadduwm		$out5,$out3,$two
	vxor		$out3,$out3,$rndkey0
	vcipherlast	$in6,$out6,$in6
	vadduwm		$out6,$out4,$two
	vxor		$out4,$out4,$rndkey0
	vcipherlast	$in7,$out7,$in7
	vadduwm		$out7,$out5,$two
	vxor		$out5,$out5,$rndkey0
	le?vperm	$in0,$in0,$in0,$inpperm
	vadduwm		$ivec,$out6,$two	# next counter value
	vxor		$out6,$out6,$rndkey0
	le?vperm	$in1,$in1,$in1,$inpperm
	vxor		$out7,$out7,$rndkey0
	mtctr		$rounds

	vcipher		$out0,$out0,v24
	stvx_u		$in0,$x00,$out
	le?vperm	$in2,$in2,$in2,$inpperm
	vcipher		$out1,$out1,v24
	stvx_u		$in1,$x10,$out
	le?vperm	$in3,$in3,$in3,$inpperm
	vcipher		$out2,$out2,v24
	stvx_u		$in2,$x20,$out
	le?vperm	$in4,$in4,$in4,$inpperm
	vcipher		$out3,$out3,v24
	stvx_u		$in3,$x30,$out
	le?vperm	$in5,$in5,$in5,$inpperm
	vcipher		$out4,$out4,v24
	stvx_u		$in4,$x40,$out
	le?vperm	$in6,$in6,$in6,$inpperm
	vcipher		$out5,$out5,v24
	stvx_u		$in5,$x50,$out
	le?vperm	$in7,$in7,$in7,$inpperm
	vcipher		$out6,$out6,v24
	stvx_u		$in6,$x60,$out
	vcipher		$out7,$out7,v24
	stvx_u		$in7,$x70,$out
	addi		$out,$out,0x80

	b		Loop_ctr32_enc8x_middle

.align	5
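	# Here $len holds (bytes remaining - 128), a multiple of 16
	# between -0x70 and 0, so the comparisons below select how many
	# of the eight blocks in flight correspond to real input.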
  1501. Lctr32_enc8x_break:
  1502. cmpwi $len,-0x60
  1503. blt Lctr32_enc8x_one
  1504. nop
  1505. beq Lctr32_enc8x_two
  1506. cmpwi $len,-0x40
  1507. blt Lctr32_enc8x_three
  1508. nop
  1509. beq Lctr32_enc8x_four
  1510. cmpwi $len,-0x20
  1511. blt Lctr32_enc8x_five
  1512. nop
  1513. beq Lctr32_enc8x_six
  1514. cmpwi $len,0x00
  1515. blt Lctr32_enc8x_seven
Lctr32_enc8x_eight:
	vcipherlast	$out0,$out0,$in0
	vcipherlast	$out1,$out1,$in1
	vcipherlast	$out2,$out2,$in2
	vcipherlast	$out3,$out3,$in3
	vcipherlast	$out4,$out4,$in4
	vcipherlast	$out5,$out5,$in5
	vcipherlast	$out6,$out6,$in6
	vcipherlast	$out7,$out7,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x50,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x60,$out
	stvx_u		$out7,$x70,$out
	addi		$out,$out,0x80
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_seven:
	vcipherlast	$out0,$out0,$in1
	vcipherlast	$out1,$out1,$in2
	vcipherlast	$out2,$out2,$in3
	vcipherlast	$out3,$out3,$in4
	vcipherlast	$out4,$out4,$in5
	vcipherlast	$out5,$out5,$in6
	vcipherlast	$out6,$out6,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x50,$out
	stvx_u		$out6,$x60,$out
	addi		$out,$out,0x70
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_six:
	vcipherlast	$out0,$out0,$in2
	vcipherlast	$out1,$out1,$in3
	vcipherlast	$out2,$out2,$in4
	vcipherlast	$out3,$out3,$in5
	vcipherlast	$out4,$out4,$in6
	vcipherlast	$out5,$out5,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	stvx_u		$out5,$x50,$out
	addi		$out,$out,0x60
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_five:
	vcipherlast	$out0,$out0,$in3
	vcipherlast	$out1,$out1,$in4
	vcipherlast	$out2,$out2,$in5
	vcipherlast	$out3,$out3,$in6
	vcipherlast	$out4,$out4,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	stvx_u		$out4,$x40,$out
	addi		$out,$out,0x50
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_four:
	vcipherlast	$out0,$out0,$in4
	vcipherlast	$out1,$out1,$in5
	vcipherlast	$out2,$out2,$in6
	vcipherlast	$out3,$out3,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	stvx_u		$out3,$x30,$out
	addi		$out,$out,0x40
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_three:
	vcipherlast	$out0,$out0,$in5
	vcipherlast	$out1,$out1,$in6
	vcipherlast	$out2,$out2,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	stvx_u		$out2,$x20,$out
	addi		$out,$out,0x30
	b		Lctr32_enc8x_done
.align	5
Lctr32_enc8x_two:
	vcipherlast	$out0,$out0,$in6
	vcipherlast	$out1,$out1,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	stvx_u		$out1,$x10,$out
	addi		$out,$out,0x20
	b		Lctr32_enc8x_done
.align	5
Lctr32_enc8x_one:
	vcipherlast	$out0,$out0,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	stvx_u		$out0,0,$out
	addi		$out,$out,0x10

Lctr32_enc8x_done:
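	# Scrub the stack slots that held spilled round keys by overwriting
	# them with $inpperm (a public permutation vector), so no key
	# material survives in the dead frame.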
	li		r10,`$FRAME+15`
	li		r11,`$FRAME+31`
	stvx		$inpperm,r10,$sp	# wipe copies of round keys
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32

	mtspr		256,$vrsave
	lvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	lvx		v21,r11,$sp
	addi		r11,r11,32
	lvx		v22,r10,$sp
	addi		r10,r10,32
	lvx		v23,r11,$sp
	addi		r11,r11,32
	lvx		v24,r10,$sp
	addi		r10,r10,32
	lvx		v25,r11,$sp
	addi		r11,r11,32
	lvx		v26,r10,$sp
	addi		r10,r10,32
	lvx		v27,r11,$sp
	addi		r11,r11,32
	lvx		v28,r10,$sp
	addi		r10,r10,32
	lvx		v29,r11,$sp
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
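	# Restore the caller's non-volatile GPRs and pop the frame.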
	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
	blr
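	# The .long/.byte lines below are not code but the traceback table,
	# in the style used throughout OpenSSL's PowerPC modules, describing
	# the frame and saved registers for debuggers and unwinders.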
	.long		0
	.byte		0,12,0x04,0,0x80,6,6,0
	.long		0
.size	.${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
___
}}	}}}
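
# Post-processing pass over the generated code: evaluate the backtick'ed
# arithmetic, flatten the constants table into endianness-neutral .byte
# strings, and rewrite the '?'-prefixed endian-sensitive mnemonics for
# the target flavour before printing.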
my $consts=1;
foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	# constants table endian-specific conversion
	if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
	    my $conv=$3;
	    my @bytes=();

	    # convert to endian-agnostic format
	    if ($1 eq "long") {
		foreach (split(/,\s*/,$2)) {
		    my $l = /^0/?oct:int;
		    push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
		}
	    } else {
		@bytes = map(/^0/?oct:int,split(/,\s*/,$2));
	    }

	    # little-endian conversion
	    if ($flavour =~ /le$/o) {
		SWITCH: for($conv) {
		    /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
		    /\?rev/ && do { @bytes=reverse(@bytes);    last; };
		}
	    }

	    # emit
	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
	    next;
	}
	$consts=0 if (m/Lconsts:/o);	# end of table

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	if ($flavour =~ /le$/o) {	# little-endian
	    s/le\?//o		or
	    s/be\?/#be#/o	or
	    s/\?lvsr/lvsl/o	or
	    s/\?lvsl/lvsr/o	or
	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
	} else {			# big-endian
	    s/le\?/#le#/o	or
	    s/be\?//o		or
	    s/\?([a-z]+)/$1/o;
	}
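
	# e.g. on little-endian, "?vperm vD,vA,vB,vC" is printed with vA and
	# vB swapped, a "?vsldoi" shift count N becomes 16-N, and a "?vspltw"
	# lane index N becomes 3-N, so one source serves both byte orders.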
	print $_,"\n";
}

close STDOUT;