# ghashp8-ppc.pl 15 KB


  #! /usr/bin/env perl
  # Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
  #
  # Licensed under the Apache License 2.0 (the "License"). You may not use
  # this file except in compliance with the License. You can obtain a copy
  # in the file LICENSE in the source distribution or at
  # https://www.openssl.org/source/license.html
  #
  # ====================================================================
  # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  # project. The module is, however, dual licensed under OpenSSL and
  # CRYPTOGAMS licenses depending on where you obtain it. For further
  # details see http://www.openssl.org/~appro/cryptogams/.
  # ====================================================================
  #
  # GHASH for PowerISA v2.07.
  #
  # July 2014
  #
  # Accurate performance measurements are problematic, because it's
  # always virtualized setup with possibly throttled processor.
  # Relative comparison is therefore more informative. This initial
  # version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
  # faster than "4-bit" integer-only compiler-generated 64-bit code.
  # "Initial version" means that there is room for further improvement.
  #
  # May 2016
  #
  # 2x aggregated reduction improves performance by 50% (resulting
  # performance on POWER8 is 1 cycle per processed byte), and 4x
  # aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
  # POWER9 delivers 0.51 cpb.
  # Command line: [flavour] [output-file]
  #
  # $output is the last argument if it looks like a file (it has an extension)
  # $flavour is the first argument if it doesn't look like a file
  $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

  # Pick the pointer size, the link-register save offset and the
  # size-dependent mnemonics (store-with-update, load, store, unsigned
  # compare, shift-right-immediate) that the templates below interpolate.
  # Any flavour that names neither 64 nor 32 is rejected.
  if ($flavour =~ /64/) {
      $SIZE_T=8;
      $LRSAVE=2*$SIZE_T;
      $STU="stdu";
      $POP="ld";
      $PUSH="std";
      $UCMP="cmpld";
      $SHRI="srdi";
  } elsif ($flavour =~ /32/) {
      $SIZE_T=4;
      $LRSAVE=$SIZE_T;
      $STU="stwu";
      $POP="lwz";
      $PUSH="stw";
      $UCMP="cmplw";
      $SHRI="srwi";
  } else { die "nonsense $flavour"; }
  $sp="r1";			# stack pointer register
  $FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

  # Directory part of this script's own path (with trailing separator).
  # Take it from the match result rather than from an unconditional $1,
  # so a failed match yields an empty prefix instead of a stale capture.
  $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

  # Look for the translator next to this script first, then in the
  # shared perlasm directory two levels up.
  ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  die "can't locate ppc-xlate.pl";

  # Everything printed to STDOUT from here on is piped through the
  # translator, which receives the flavour and the output file name.
  open STDOUT,"| $^X $xlate $flavour \"$output\""
      or die "can't call $xlate: $!";
  # Symbolic register names interpolated into the assembly templates.
  my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block

  # v0-v3: product accumulators (low/middle/high) and the input block.
  my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
  # v4-v12: zero, temporaries, reduction constant, H and its halves,
  # and the byte-swap mask used by the le? (little-endian) lines.
  my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
  # v13-v19: second accumulator set, second input block, H^2 and halves.
  my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
  my $vrsave="r12";	# holds the caller's SPR 256 value across each routine
  # gcm_init_p8, first part: r3 = table to fill, r4 = hash key H.
  #
  # Builds the 0xc2....01 reduction constant, "twists" H (shift left by
  # one with the carry folded back through the constant) and stores:
  #   0x00       reduction constant
  #   0x10-0x30  H   as lo-half / full / hi-half
  #   0x40-0x60  H^2 as lo-half / full / hi-half
  # On exit from this template r8/r9/r10 already hold 0x70/0x80/0x90 for
  # the template that follows, and $IN/$IN1 hold H and H^2.
  $code=<<___;
.machine	"any"

.text

.globl	.gcm_init_p8
.align	5
.gcm_init_p8:
	li		r0,-4096
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$H,0,r4			# load H

	vspltisb	$xC2,-16		# 0xf0
	vspltisb	$t0,1			# one
	vaddubm		$xC2,$xC2,$xC2		# 0xe0
	vxor		$zero,$zero,$zero
	vor		$xC2,$xC2,$t0		# 0xe1
	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
	vsldoi		$t1,$zero,$t0,1		# ...1
	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
	vspltisb	$t2,7
	vor		$xC2,$xC2,$t1		# 0xc2....01
	vspltb		$t1,$H,0		# most significant byte
	vsl		$H,$H,$t0		# H<<=1
	vsrab		$t1,$t1,$t2		# broadcast carry bit
	vand		$t1,$t1,$xC2
	vxor		$IN,$H,$t1		# twisted H

	vsldoi		$H,$IN,$IN,8		# twist even more ...
	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
	vsldoi		$Hl,$zero,$H,8		# ... and split
	vsldoi		$Hh,$H,$zero,8

	stvx_u		$xC2,0,r3		# save pre-computed table
	stvx_u		$Hl,r8,r3
	li		r8,0x40
	stvx_u		$H, r9,r3
	li		r9,0x50
	stvx_u		$Hh,r10,r3
	li		r10,0x60

	vpmsumd		$Xl,$IN,$Hl		# H.lo·H.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·H.lo+H.lo·H.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·H.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$IN1,$Xl,$t1

	vsldoi		$H2,$IN1,$IN1,8
	vsldoi		$H2l,$zero,$H2,8
	vsldoi		$H2h,$H2,$zero,8

	stvx_u		$H2l,r8,r3		# save H^2
	li		r8,0x70
	stvx_u		$H2,r9,r3
	li		r9,0x80
	stvx_u		$H2h,r10,r3
	li		r10,0x90
___
  # gcm_init_p8, second part: multiplies H and H^2 (left in $IN/$IN1 by
  # the first part) by H^2 in two interleaved streams, giving H^3 and
  # H^4, reduces both and stores them as lo-half / full / hi-half:
  #   0x70-0x90  H^3
  #   0xa0-0xc0  H^4
  # H's own registers were stored by the first part, so they are reused
  # here as extra temporaries ($t4..$t6) before being overwritten with
  # the H^3 split.
  {
  my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
  $code.=<<___;
	vpmsumd		$Xl,$IN,$H2l		# H.lo·H^2.lo
	vpmsumd		$Xl1,$IN1,$H2l		# H^2.lo·H^2.lo
	vpmsumd		$Xm,$IN,$H2		# H.hi·H^2.lo+H.lo·H^2.hi
	vpmsumd		$Xm1,$IN1,$H2		# H^2.hi·H^2.lo+H^2.lo·H^2.hi
	vpmsumd		$Xh,$IN,$H2h		# H.hi·H^2.hi
	vpmsumd		$Xh1,$IN1,$H2h		# H^2.hi·H^2.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
	vpmsumd		$t6,$Xl1,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vsldoi		$t4,$Xm1,$zero,8
	vsldoi		$t5,$zero,$Xm1,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1
	vxor		$Xl1,$Xl1,$t4
	vxor		$Xh1,$Xh1,$t5

	vsldoi		$Xl,$Xl,$Xl,8
	vsldoi		$Xl1,$Xl1,$Xl1,8
	vxor		$Xl,$Xl,$t2
	vxor		$Xl1,$Xl1,$t6

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vsldoi		$t5,$Xl1,$Xl1,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vpmsumd		$Xl1,$Xl1,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$t5,$t5,$Xh1
	vxor		$Xl,$Xl,$t1
	vxor		$Xl1,$Xl1,$t5

	vsldoi		$H,$Xl,$Xl,8
	vsldoi		$H2,$Xl1,$Xl1,8
	vsldoi		$Hl,$zero,$H,8
	vsldoi		$Hh,$H,$zero,8
	vsldoi		$H2l,$zero,$H2,8
	vsldoi		$H2h,$H2,$zero,8

	stvx_u		$Hl,r8,r3		# save H^3
	li		r8,0xa0
	stvx_u		$H,r9,r3
	li		r9,0xb0
	stvx_u		$Hh,r10,r3
	li		r10,0xc0
	stvx_u		$H2l,r8,r3		# save H^4
	stvx_u		$H2,r9,r3
	stvx_u		$H2h,r10,r3

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_init_p8,.-.gcm_init_p8
___
  }
  # gcm_gmult_p8: $Xip (r3) = Xi, $Htbl (r4) = table from gcm_init_p8.
  #   Loads Xi, multiplies it by H, applies the two-phase reduction and
  #   writes the result back to Xi.
  #
  # gcm_ghash_p8: $Xip, $Htbl, $inp (r5) = input, $len (r6) = byte count.
  #   With 64 bytes or more it branches to Lgcm_ghash_p8_4x. Otherwise it
  #   xors the first block into Xi and either finishes with a single
  #   multiplication by H (Lshort) or runs Loop_2x, which consumes two
  #   blocks per pass using H^2 and H with one shared reduction.
  #
  # Lines prefixed le?/be? are kept or commented out per endianness when
  # the code is printed; on little-endian $lemask byte-swaps each block
  # after loading and Xi before storing.
  $code.=<<___;
.globl	.gcm_gmult_p8
.align	5
.gcm_gmult_p8:
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$IN,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	le?lvsl		$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	le?vxor		$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	le?vperm	$IN,$IN,$IN,$lemask
	vxor		$zero,$zero,$zero

	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_gmult_p8,.-.gcm_gmult_p8

.globl	.gcm_ghash_p8
.align	5
.gcm_ghash_p8:
	li		r0,-4096
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$Xl,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	li		r8,0x40
	le?lvsl		$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	li		r9,0x50
	le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	li		r10,0x60
	le?vxor		$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	le?vperm	$Xl,$Xl,$Xl,$lemask
	vxor		$zero,$zero,$zero

	${UCMP}i	$len,64
	bge		Lgcm_ghash_p8_4x

	lvx_u		$IN,0,$inp
	addi		$inp,$inp,16
	subic.		$len,$len,16
	le?vperm	$IN,$IN,$IN,$lemask
	vxor		$IN,$IN,$Xl
	beq		Lshort

	lvx_u		$H2l,r8,$Htbl		# load H^2
	li		r8,16
	lvx_u		$H2, r9,$Htbl
	add		r9,$inp,$len		# end of input
	lvx_u		$H2h,r10,$Htbl
	be?b		Loop_2x

.align	5
Loop_2x:
	lvx_u		$IN1,0,$inp
	le?vperm	$IN1,$IN1,$IN1,$lemask

	subic		$len,$len,32
	vpmsumd		$Xl,$IN,$H2l		# H^2.lo·Xi.lo
	vpmsumd		$Xl1,$IN1,$Hl		# H.lo·Xi+1.lo
	subfe		r0,r0,r0		# borrow?-1:0
	vpmsumd		$Xm,$IN,$H2		# H^2.hi·Xi.lo+H^2.lo·Xi.hi
	vpmsumd		$Xm1,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+1.hi
	and		r0,r0,$len
	vpmsumd		$Xh,$IN,$H2h		# H^2.hi·Xi.hi
	vpmsumd		$Xh1,$IN1,$Hh		# H.hi·Xi+1.hi
	add		$inp,$inp,r0

	vxor		$Xl,$Xl,$Xl1
	vxor		$Xm,$Xm,$Xm1

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xh,$Xh,$Xh1
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2
	lvx_u		$IN,r8,$inp
	addi		$inp,$inp,32

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	le?vperm	$IN,$IN,$IN,$lemask
	vxor		$t1,$t1,$Xh
	vxor		$IN,$IN,$t1
	vxor		$IN,$IN,$Xl
	$UCMP		r9,$inp
	bgt		Loop_2x			# done yet?

	cmplwi		$len,0
	bne		Leven

Lshort:
	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh

Leven:
	vxor		$Xl,$Xl,$t1
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,4,0
	.long		0
___
  # Lgcm_ghash_p8_4x: continuation of gcm_ghash_p8 for inputs of 64 bytes
  # or more (entered with Xi in $Xl and H/Hl/Hh/xC2 already loaded).
  #
  # * Allocates a $FRAME-byte stack frame and spills v20-v31, which are
  #   used here for the extra accumulators, inputs, H^3 and H^4; they are
  #   reloaded and the frame released at Ldone_4x.
  # * $len is converted from bytes to 16-byte blocks ($SHRI by 4) so that
  #   the record-form subtractions can steer the loop via the sign.
  # * Loop_4x consumes four blocks per pass: the running value is
  #   multiplied by H^4 while the next blocks are multiplied by H^3, H^2
  #   and H. The H^2/H products for the outer halves are merged into
  #   single vpmsumd operations through the packed $H21l/$H21h operands.
  # * Ltail_4x performs the final multiply and reduction, then dispatches
  #   to Lone/Ltwo/Lthree for 1-3 leftover blocks. Each of these moves a
  #   lower power of H into the H4 registers and re-enters Ltail_4x.
  #
  # Register reuse: $H21l/$H21h take over the Hl/Hh registers and
  # $loperm/$hiperm take over the H2l/H2h registers, which is why the
  # H2l/H2h loads below are commented out.
  {
  my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
      $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
  my $IN0=$IN;
  my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);

  $code.=<<___;
.align	5
.gcm_ghash_p8_4x:
Lgcm_ghash_p8_4x:
	$STU		$sp,-$FRAME($sp)
	li		r10,`15+6*$SIZE_T`
	li		r11,`31+6*$SIZE_T`
	stvx		v20,r10,$sp
	addi		r10,r10,32
	stvx		v21,r11,$sp
	addi		r11,r11,32
	stvx		v22,r10,$sp
	addi		r10,r10,32
	stvx		v23,r11,$sp
	addi		r11,r11,32
	stvx		v24,r10,$sp
	addi		r10,r10,32
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
	addi		r10,r10,32
	stvx		v27,r11,$sp
	addi		r11,r11,32
	stvx		v28,r10,$sp
	addi		r10,r10,32
	stvx		v29,r11,$sp
	addi		r11,r11,32
	stvx		v30,r10,$sp
	li		r10,0x60
	stvx		v31,r11,$sp
	li		r0,-1
	stw		$vrsave,`$FRAME-4`($sp)	# save vrsave
	mtspr		256,r0			# preserve all AltiVec registers

	lvsl		$t0,0,r8		# 0x0001..0e0f
	#lvx_u		$H2l,r8,$Htbl		# load H^2
	li		r8,0x70
	lvx_u		$H2, r9,$Htbl
	li		r9,0x80
	vspltisb	$t1,8			# 0x0808..0808
	#lvx_u		$H2h,r10,$Htbl
	li		r10,0x90
	lvx_u		$H3l,r8,$Htbl		# load H^3
	li		r8,0xa0
	lvx_u		$H3, r9,$Htbl
	li		r9,0xb0
	lvx_u		$H3h,r10,$Htbl
	li		r10,0xc0
	lvx_u		$H4l,r8,$Htbl		# load H^4
	li		r8,0x10
	lvx_u		$H4, r9,$Htbl
	li		r9,0x20
	lvx_u		$H4h,r10,$Htbl
	li		r10,0x30

	vsldoi		$t2,$zero,$t1,8		# 0x0000..0808
	vaddubm		$hiperm,$t0,$t2		# 0x0001..1617
	vaddubm		$loperm,$t1,$hiperm	# 0x0809..1e1f

	$SHRI		$len,$len,4		# this allows to use sign bit
						# as carry
	lvx_u		$IN0,0,$inp		# load input
	lvx_u		$IN1,r8,$inp
	subic.		$len,$len,8
	lvx_u		$IN2,r9,$inp
	lvx_u		$IN3,r10,$inp
	addi		$inp,$inp,0x40
	le?vperm	$IN0,$IN0,$IN0,$lemask
	le?vperm	$IN1,$IN1,$IN1,$lemask
	le?vperm	$IN2,$IN2,$IN2,$lemask
	le?vperm	$IN3,$IN3,$IN3,$lemask

	vxor		$Xh,$IN0,$Xl

	vpmsumd		$Xl1,$IN1,$H3l
	vpmsumd		$Xm1,$IN1,$H3
	vpmsumd		$Xh1,$IN1,$H3h

	vperm		$H21l,$H2,$H,$hiperm
	vperm		$t0,$IN2,$IN3,$loperm
	vperm		$H21h,$H2,$H,$loperm
	vperm		$t1,$IN2,$IN3,$hiperm
	vpmsumd		$Xm2,$IN2,$H2		# H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
	vpmsumd		$Xl3,$t0,$H21l		# H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
	vpmsumd		$Xm3,$IN3,$H		# H.hi·Xi+3.lo +H.lo·Xi+3.hi
	vpmsumd		$Xh3,$t1,$H21h		# H^2.hi·Xi+2.hi+H.hi·Xi+3.hi

	vxor		$Xm2,$Xm2,$Xm1
	vxor		$Xl3,$Xl3,$Xl1
	vxor		$Xm3,$Xm3,$Xm2
	vxor		$Xh3,$Xh3,$Xh1

	blt		Ltail_4x

Loop_4x:
	lvx_u		$IN0,0,$inp
	lvx_u		$IN1,r8,$inp
	subic.		$len,$len,4
	lvx_u		$IN2,r9,$inp
	lvx_u		$IN3,r10,$inp
	addi		$inp,$inp,0x40
	le?vperm	$IN1,$IN1,$IN1,$lemask
	le?vperm	$IN2,$IN2,$IN2,$lemask
	le?vperm	$IN3,$IN3,$IN3,$lemask
	le?vperm	$IN0,$IN0,$IN0,$lemask

	vpmsumd		$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
	vpmsumd		$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
	vpmsumd		$Xh,$Xh,$H4h		# H^4.hi·Xi.hi
	vpmsumd		$Xl1,$IN1,$H3l
	vpmsumd		$Xm1,$IN1,$H3
	vpmsumd		$Xh1,$IN1,$H3h

	vxor		$Xl,$Xl,$Xl3
	vxor		$Xm,$Xm,$Xm3
	vxor		$Xh,$Xh,$Xh3
	vperm		$t0,$IN2,$IN3,$loperm
	vperm		$t1,$IN2,$IN3,$hiperm

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
	vpmsumd		$Xl3,$t0,$H21l		# H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
	vpmsumd		$Xh3,$t1,$H21h		# H.hi·Xi+3.hi +H^2.hi·Xi+2.hi

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xm2,$IN2,$H2		# H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
	vpmsumd		$Xm3,$IN3,$H		# H.hi·Xi+3.lo +H.lo·Xi+3.hi
	vpmsumd		$Xl,$Xl,$xC2

	vxor		$Xl3,$Xl3,$Xl1
	vxor		$Xh3,$Xh3,$Xh1
	vxor		$Xh,$Xh,$IN0
	vxor		$Xm2,$Xm2,$Xm1
	vxor		$Xh,$Xh,$t1
	vxor		$Xm3,$Xm3,$Xm2
	vxor		$Xh,$Xh,$Xl
	bge		Loop_4x

Ltail_4x:
	vpmsumd		$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
	vpmsumd		$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
	vpmsumd		$Xh,$Xh,$H4h		# H^4.hi·Xi.hi

	vxor		$Xl,$Xl,$Xl3
	vxor		$Xm,$Xm,$Xm3

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xh,$Xh,$Xh3
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	addic.		$len,$len,4
	beq		Ldone_4x

	lvx_u		$IN0,0,$inp
	${UCMP}i	$len,2
	li		$len,-4
	blt		Lone
	lvx_u		$IN1,r8,$inp
	beq		Ltwo

Lthree:
	lvx_u		$IN2,r9,$inp
	le?vperm	$IN0,$IN0,$IN0,$lemask
	le?vperm	$IN1,$IN1,$IN1,$lemask
	le?vperm	$IN2,$IN2,$IN2,$lemask

	vxor		$Xh,$IN0,$Xl
	vmr		$H4l,$H3l
	vmr		$H4, $H3
	vmr		$H4h,$H3h

	vperm		$t0,$IN1,$IN2,$loperm
	vperm		$t1,$IN1,$IN2,$hiperm
	vpmsumd		$Xm2,$IN1,$H2		# H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
	vpmsumd		$Xm3,$IN2,$H		# H.hi·Xi+2.lo +H.lo·Xi+2.hi
	vpmsumd		$Xl3,$t0,$H21l		# H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
	vpmsumd		$Xh3,$t1,$H21h		# H^2.hi·Xi+1.hi+H.hi·Xi+2.hi

	vxor		$Xm3,$Xm3,$Xm2
	b		Ltail_4x

.align	4
Ltwo:
	le?vperm	$IN0,$IN0,$IN0,$lemask
	le?vperm	$IN1,$IN1,$IN1,$lemask

	vxor		$Xh,$IN0,$Xl
	vperm		$t0,$zero,$IN1,$loperm
	vperm		$t1,$zero,$IN1,$hiperm

	vsldoi		$H4l,$zero,$H2,8
	vmr		$H4, $H2
	vsldoi		$H4h,$H2,$zero,8

	vpmsumd		$Xl3,$t0, $H21l		# H.lo·Xi+1.lo
	vpmsumd		$Xm3,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+1.hi
	vpmsumd		$Xh3,$t1, $H21h		# H.hi·Xi+1.hi

	b		Ltail_4x

.align	4
Lone:
	le?vperm	$IN0,$IN0,$IN0,$lemask

	vsldoi		$H4l,$zero,$H,8
	vmr		$H4, $H
	vsldoi		$H4h,$H,$zero,8

	vxor		$Xh,$IN0,$Xl
	vxor		$Xl3,$Xl3,$Xl3
	vxor		$Xm3,$Xm3,$Xm3
	vxor		$Xh3,$Xh3,$Xh3

	b		Ltail_4x

Ldone_4x:
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	li		r10,`15+6*$SIZE_T`
	li		r11,`31+6*$SIZE_T`
	mtspr		256,$vrsave
	lvx		v20,r10,$sp
	addi		r10,r10,32
	lvx		v21,r11,$sp
	addi		r11,r11,32
	lvx		v22,r10,$sp
	addi		r10,r10,32
	lvx		v23,r11,$sp
	addi		r11,r11,32
	lvx		v24,r10,$sp
	addi		r10,r10,32
	lvx		v25,r11,$sp
	addi		r11,r11,32
	lvx		v26,r10,$sp
	addi		r10,r10,32
	lvx		v27,r11,$sp
	addi		r11,r11,32
	lvx		v28,r10,$sp
	addi		r10,r10,32
	lvx		v29,r11,$sp
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
	addi		$sp,$sp,$FRAME
	blr
	.long		0
	.byte		0,12,0x04,0,0x80,0,4,0
	.long		0
___
  }
  # Close gcm_ghash_p8 (its .size covers the 4x continuation as well) and
  # append the identification string.
  $code.=<<___;
.size	.gcm_ghash_p8,.-.gcm_ghash_p8

.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___
  # Post-process the accumulated template line by line and print it to
  # STDOUT:
  #   * `expr` spans are replaced by the value of evaluating expr in Perl
  #     (used for the frame offsets);
  #   * le?/be? prefixes select endian-specific lines: the prefix matching
  #     the target is dropped, the other one is turned into a #le#/#be#
  #     comment marker so the line is disabled.
  foreach (split("\n",$code)) {
      s/\`([^\`]*)\`/eval $1/geo;

      if ($flavour =~ /le$/o) {	# little-endian
          s/le\?//o		or
          s/be\?/#be#/o;
      } else {
          s/le\?/#le#/o	or
          s/be\?//o;
      }
      print $_,"\n";
  }

  # Closing the pipe flushes it, so any failure is reported with the
  # system error for diagnosis.
  close STDOUT or die "error closing STDOUT: $!"; # enforce flush