poly1305-ppc.pl 42 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2016-2024 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL
  11. # project. The module is dual licensed under OpenSSL and CRYPTOGAMS
  12. # licenses depending on where you obtain it. For further details see
  13. # https://github.com/dot-asm/cryptogams/.
  14. # ====================================================================
  15. #
  16. # This module implements Poly1305 hash for PowerPC.
  17. #
  18. # June 2015
  19. #
  20. # Numbers are cycles per processed byte with poly1305_blocks alone,
  21. # and improvement coefficients relative to gcc-generated code.
  22. #
  23. #                   -m32            -m64
  24. #
  25. # Freescale e300    14.8/+80%       -
  26. # PPC74x0           7.60/+60%       -
  27. # PPC970            7.00/+114%      3.51/+205%
  28. # POWER7            3.75/+260%      1.93/+100%
  29. # POWER8            -               2.03/+200%
  30. # POWER9            -               2.00/+150%
  31. #
  32. # Do we need floating-point implementation for PPC? Results presented
  33. # in poly1305_ieee754.c are tricky to compare to, because they are for
  34. # compiler-generated code. On the other hand it's known that floating-
  35. # point performance can be dominated by FPU latency, which means that
  36. # there is a limit even for ideally optimized (and even vectorized) code.
  37. # And this limit is estimated to be higher than above -m64 results. Or
  38. # in other words floating-point implementation can be meaningful to
  39. # consider only in 32-bit application context. We probably have to
  40. # recognize that 32-bit builds are getting less popular on high-end
  41. # systems and therefore tend to target embedded ones, which might not
  42. # even have FPU...
  43. #
  44. # On a side note, Power ISA 2.07 enables vector base 2^26 implementation,
  45. # and POWER8 might have capacity to break 1.0 cycle per byte barrier...
  46. #
  47. # January 2019
  48. #
  49. # ... Unfortunately not:-( Estimate was a projection of ARM result,
  50. # but ARM has a vector multiply-n-add instruction, while PowerISA does
  51. # not, at least not one usable in this context. Improvement is ~40% over -m64
  52. # result above and is ~1.43 on little-endian systems.
  53. # $output is the last argument if it looks like a file (it has an extension)
  54. # $flavour is the first argument if it doesn't look like a file
  55. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  56. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  # The flavour string selects the word size and, with it, the mnemonics the
  # generated code uses for pointer-sized operations:
  #   $SIZE_T - size of a pointer-sized stack slot in bytes
  #   $LRSAVE - offset of the link-register save slot, applied on top of the
  #             frame size (used below as $FRAME+$LRSAVE)
  #   $UCMP   - unsigned compare, $STU - store with update (frame allocation)
  #   $POP    - pointer-sized load, $PUSH - pointer-sized store
  # A flavour that mentions neither 64 nor 32 aborts the script.
  57. if ($flavour =~ /64/) {
  58. $SIZE_T =8;
  59. $LRSAVE =2*$SIZE_T;
  60. $UCMP ="cmpld";
  61. $STU ="stdu";
  62. $POP ="ld";
  63. $PUSH ="std";
  64. } elsif ($flavour =~ /32/) {
  65. $SIZE_T =4;
  66. $LRSAVE =$SIZE_T;
  67. $UCMP ="cmplw";
  68. $STU ="stwu";
  69. $POP ="lwz";
  70. $PUSH ="stw";
  71. } else { die "nonsense $flavour"; }
  72. # Define endianness based on flavour
  73. # i.e.: linux64le
  # $LITTLE_ENDIAN is only ever tested for truth below; it is $SIZE_T (non-zero)
  # when the flavour ends in "le" and 0 otherwise.
  74. $LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
  # Locate ppc-xlate.pl either next to this script or in ../../perlasm relative
  # to it, then re-open STDOUT as a pipe into it so that everything printed by
  # this script is post-processed for the requested flavour and output file.
  75. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  76. ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  77. ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  78. die "can't locate ppc-xlate.pl";
  79. open STDOUT,"| $^X $xlate $flavour \"$output\""
  80. or die "can't call $xlate: $!";
  # Stack frame size used by the scalar poly1305_blocks routines.
  81. $FRAME=24*$SIZE_T;
  82. $sp="r1";
  # Argument registers r3..r6.  poly1305_emit takes (ctx, mac, nonce), so $mac
  # and $nonce are simply second names for the registers behind $inp and $len.
  83. my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
  84. my ($mac,$nonce)=($inp,$len);
  85. my $mask = "r0";
  # Start of the generated assembly text; later sections append to $code.
  86. $code=<<___;
  87. .machine "any"
  88. .text
  89. ___
  90. if ($flavour =~ /64/) {
  91. ###############################################################################
  92. # base 2^64 implementation
  # Register names for the 64-bit scalar code: h0-h2 (hash) are r7-r9, d0-d2
  # (product accumulators) are r10-r12, and r0, r1, s1, t0, t1 (key, 5/4*r1 and
  # temporaries) are r27-r31, which poly1305_blocks saves and restores.
  93. my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(7..12,27..31));
  # poly1305_init_int(ctx, key): zeroes the three 64-bit hash words and the
  # is_base2_26 word at offset 24.  If the key pointer is zero it skips the key
  # setup.  Otherwise the 16 key bytes are loaded as two little-endian 64-bit
  # values, masked with 0x0ffffffc0fffffff and 0x0ffffffc0ffffffc respectively,
  # and stored at ctx+32 and ctx+40.  r3 is cleared before returning.
  94. $code.=<<___;
  95. .globl .poly1305_init_int
  96. .align 4
  97. .poly1305_init_int:
  98. xor r0,r0,r0
  99. std r0,0($ctx) # zero hash value
  100. std r0,8($ctx)
  101. std r0,16($ctx)
  102. stw r0,24($ctx) # clear is_base2_26
  103. $UCMP $inp,r0
  104. beq- Lno_key
  105. ___
  # Little-endian flavour: the key can be fetched with two plain 64-bit loads.
  106. $code.=<<___ if ($LITTLE_ENDIAN);
  107. ld $d0,0($inp) # load key material
  108. ld $d1,8($inp)
  109. ___
  # Big-endian flavour: four byte-reversed 32-bit loads, with the words at
  # offsets 4 and 12 inserted into the upper halves of d0 and d1.
  110. $code.=<<___ if (!$LITTLE_ENDIAN);
  111. li $h0,4
  112. lwbrx $d0,0,$inp # load key material
  113. li $d1,8
  114. lwbrx $h0,$h0,$inp
  115. li $h1,12
  116. lwbrx $d1,$d1,$inp
  117. lwbrx $h1,$h1,$inp
  118. insrdi $d0,$h0,32,0
  119. insrdi $d1,$h1,32,0
  120. ___
  # Rest of poly1305_init_int (clamp and store the key), followed by the
  # prologue of poly1305_blocks(ctx, inp, len, padbit).  The block count is
  # len>>4 and the routine returns at once when that is zero.  Otherwise it
  # allocates $FRAME bytes, saves r27-r31 and the link register, loads key and
  # hash, derives s1 = r1 + (r1>>2), puts the block count in CTR and sets the
  # mask register to 3 for the reduction step.
  121. $code.=<<___;
  122. lis $h1,0xfff # 0x0fff0000
  123. ori $h1,$h1,0xfffc # 0x0ffffffc
  124. insrdi $h1,$h1,32,0 # 0x0ffffffc0ffffffc
  125. ori $h0,$h1,3 # 0x0ffffffc0fffffff
  126. and $d0,$d0,$h0
  127. and $d1,$d1,$h1
  128. std $d0,32($ctx) # store key
  129. std $d1,40($ctx)
  130. Lno_key:
  131. xor r3,r3,r3
  132. blr
  133. .long 0
  134. .byte 0,12,0x14,0,0,0,2,0
  135. .size .poly1305_init_int,.-.poly1305_init_int
  136. .globl .poly1305_blocks
  137. .align 4
  138. .poly1305_blocks:
  139. Lpoly1305_blocks:
  140. srdi. $len,$len,4
  141. beq- Labort
  142. $STU $sp,-$FRAME($sp)
  143. mflr r0
  144. $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
  145. $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
  146. $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
  147. $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
  148. $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
  149. $PUSH r0,`$FRAME+$LRSAVE`($sp)
  150. ld $r0,32($ctx) # load key
  151. ld $r1,40($ctx)
  152. ld $h0,0($ctx) # load hash value
  153. ld $h1,8($ctx)
  154. ld $h2,16($ctx)
  155. srdi $s1,$r1,2
  156. mtctr $len
  157. add $s1,$s1,$r1 # s1 = r1 + r1>>2
  158. li $mask,3
  159. b Loop
  160. .align 4
  161. Loop:
  162. ___
  # Per-block input load, little-endian flavour: two plain 64-bit loads.
  163. $code.=<<___ if ($LITTLE_ENDIAN);
  164. ld $t0,0($inp) # load input
  165. ld $t1,8($inp)
  166. ___
  # Per-block input load, big-endian flavour: byte-reversed 32-bit loads merged
  # into t0 and t1; d0 and d1 serve as scratch here.
  167. $code.=<<___ if (!$LITTLE_ENDIAN);
  168. li $d0,4
  169. lwbrx $t0,0,$inp # load input
  170. li $t1,8
  171. lwbrx $d0,$d0,$inp
  172. li $d1,12
  173. lwbrx $t1,$t1,$inp
  174. lwbrx $d1,$d1,$inp
  175. insrdi $t0,$d0,32,0
  176. insrdi $t1,$d1,32,0
  177. ___
  # Loop body and epilogue: add the 16 input bytes and padbit to the hash,
  # multiply by the key (using s1 for the terms that wrap around), fold the
  # bits above the low two bits of the top word back into h0, and repeat until
  # CTR reaches zero.  Then the hash is stored back, r27-r31 are reloaded and
  # the frame is released.
  178. $code.=<<___;
  179. addi $inp,$inp,16
  180. addc $h0,$h0,$t0 # accumulate input
  181. adde $h1,$h1,$t1
  182. mulld $d0,$h0,$r0 # h0*r0
  183. mulhdu $d1,$h0,$r0
  184. adde $h2,$h2,$padbit
  185. mulld $t0,$h1,$s1 # h1*5*r1
  186. mulhdu $t1,$h1,$s1
  187. addc $d0,$d0,$t0
  188. adde $d1,$d1,$t1
  189. mulld $t0,$h0,$r1 # h0*r1
  190. mulhdu $d2,$h0,$r1
  191. addc $d1,$d1,$t0
  192. addze $d2,$d2
  193. mulld $t0,$h1,$r0 # h1*r0
  194. mulhdu $t1,$h1,$r0
  195. addc $d1,$d1,$t0
  196. adde $d2,$d2,$t1
  197. mulld $t0,$h2,$s1 # h2*5*r1
  198. mulld $t1,$h2,$r0 # h2*r0
  199. addc $d1,$d1,$t0
  200. adde $d2,$d2,$t1
  201. andc $t0,$d2,$mask # final reduction step
  202. and $h2,$d2,$mask
  203. srdi $t1,$t0,2
  204. add $t0,$t0,$t1
  205. addc $h0,$d0,$t0
  206. addze $h1,$d1
  207. addze $h2,$h2
  208. bdnz Loop
  209. std $h0,0($ctx) # store hash value
  210. std $h1,8($ctx)
  211. std $h2,16($ctx)
  212. $POP r27,`$FRAME-$SIZE_T*5`($sp)
  213. $POP r28,`$FRAME-$SIZE_T*4`($sp)
  214. $POP r29,`$FRAME-$SIZE_T*3`($sp)
  215. $POP r30,`$FRAME-$SIZE_T*2`($sp)
  216. $POP r31,`$FRAME-$SIZE_T*1`($sp)
  217. addi $sp,$sp,$FRAME
  218. Labort:
  219. blr
  220. .long 0
  221. .byte 0,12,4,1,0x80,5,4,0
  222. .size .poly1305_blocks,.-.poly1305_blocks
  223. ___
  224. {
  # poly1305_emit(ctx, mac, nonce), 64-bit flavour.  Working registers are
  # r7-r12.  The hash is read twice: once as five 32-bit base 2^26 limbs, which
  # are recombined into base 2^64, and once as three 64-bit base 2^64 words.
  # The negated is_base2_26 word is used as a mask in an xor/and/xor sequence
  # to pick one of the two.  The routine then adds 5 to the hash and shifts the
  # top word of that sum right by two; the negated result is the mask that
  # selects between the hash and hash+5 for the low 128 bits.  Finally the
  # nonce is added and the 16 result bytes are written one at a time, low byte
  # first, to mac[0..7] and mac[8..15] in parallel (the $ctx register is reused
  # as the first output pointer).
  225. my ($h0,$h1,$h2,$h3,$h4,$t0) = map("r$_",(7..12));
  226. $code.=<<___;
  227. .globl .poly1305_emit
  228. .align 5
  229. .poly1305_emit:
  230. lwz $h0,0($ctx) # load hash value base 2^26
  231. lwz $h1,4($ctx)
  232. lwz $h2,8($ctx)
  233. lwz $h3,12($ctx)
  234. lwz $h4,16($ctx)
  235. lwz r0,24($ctx) # is_base2_26
  236. sldi $h1,$h1,26 # base 2^26 -> base 2^64
  237. sldi $t0,$h2,52
  238. srdi $h2,$h2,12
  239. sldi $h3,$h3,14
  240. add $h0,$h0,$h1
  241. addc $h0,$h0,$t0
  242. sldi $t0,$h4,40
  243. srdi $h4,$h4,24
  244. adde $h1,$h2,$h3
  245. addc $h1,$h1,$t0
  246. addze $h2,$h4
  247. ld $h3,0($ctx) # load hash value base 2^64
  248. ld $h4,8($ctx)
  249. ld $t0,16($ctx)
  250. neg r0,r0
  251. xor $h0,$h0,$h3 # choose between radixes
  252. xor $h1,$h1,$h4
  253. xor $h2,$h2,$t0
  254. and $h0,$h0,r0
  255. and $h1,$h1,r0
  256. and $h2,$h2,r0
  257. xor $h0,$h0,$h3
  258. xor $h1,$h1,$h4
  259. xor $h2,$h2,$t0
  260. addic $h3,$h0,5 # compare to modulus
  261. addze $h4,$h1
  262. addze $t0,$h2
  263. srdi $t0,$t0,2 # see if it carried/borrowed
  264. neg $t0,$t0
  265. andc $h0,$h0,$t0
  266. and $h3,$h3,$t0
  267. andc $h1,$h1,$t0
  268. and $h4,$h4,$t0
  269. or $h0,$h0,$h3
  270. or $h1,$h1,$h4
  271. lwz $t0,4($nonce)
  272. lwz $h2,12($nonce)
  273. lwz $h3,0($nonce)
  274. lwz $h4,8($nonce)
  275. insrdi $h3,$t0,32,0
  276. insrdi $h4,$h2,32,0
  277. addc $h0,$h0,$h3 # accumulate nonce
  278. adde $h1,$h1,$h4
  279. addi $ctx,$mac,-1
  280. addi $mac,$mac,7
  281. stbu $h0,1($ctx) # write [little-endian] result
  282. srdi $h0,$h0,8
  283. stbu $h1,1($mac)
  284. srdi $h1,$h1,8
  285. stbu $h0,1($ctx)
  286. srdi $h0,$h0,8
  287. stbu $h1,1($mac)
  288. srdi $h1,$h1,8
  289. stbu $h0,1($ctx)
  290. srdi $h0,$h0,8
  291. stbu $h1,1($mac)
  292. srdi $h1,$h1,8
  293. stbu $h0,1($ctx)
  294. srdi $h0,$h0,8
  295. stbu $h1,1($mac)
  296. srdi $h1,$h1,8
  297. stbu $h0,1($ctx)
  298. srdi $h0,$h0,8
  299. stbu $h1,1($mac)
  300. srdi $h1,$h1,8
  301. stbu $h0,1($ctx)
  302. srdi $h0,$h0,8
  303. stbu $h1,1($mac)
  304. srdi $h1,$h1,8
  305. stbu $h0,1($ctx)
  306. srdi $h0,$h0,8
  307. stbu $h1,1($mac)
  308. srdi $h1,$h1,8
  309. stbu $h0,1($ctx)
  310. stbu $h1,1($mac)
  311. blr
  312. .long 0
  313. .byte 0,12,0x14,0,0,0,3,0
  314. .size .poly1305_emit,.-.poly1305_emit
  315. ___
  316. } } else {
  317. ###############################################################################
  318. # base 2^32 implementation
  # Register names for the 32-bit scalar code, 24 in total: h0-h4 and r0 take
  # r7-r12, the remaining key words, s1-s3, t0-t3, D0-D3 and d0-d3 take
  # r14-r31 (poly1305_blocks saves and restores r14-r31).
  319. my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $s1,$s2,$s3,
  320. $t0,$t1,$t2,$t3, $D0,$D1,$D2,$D3, $d0,$d1,$d2,$d3
  321. ) = map("r$_",(7..12,14..31));
  # poly1305_init_int(ctx, key), first part: zero the five 32-bit hash words
  # and the is_base2_26 word at offset 24, then skip the key setup when the key
  # pointer is zero.
  322. $code.=<<___;
  323. .globl .poly1305_init_int
  324. .align 4
  325. .poly1305_init_int:
  326. xor r0,r0,r0
  327. stw r0,0($ctx) # zero hash value
  328. stw r0,4($ctx)
  329. stw r0,8($ctx)
  330. stw r0,12($ctx)
  331. stw r0,16($ctx)
  332. stw r0,24($ctx) # clear is_base2_26
  333. $UCMP $inp,r0
  334. beq- Lno_key
  335. ___
  # Little-endian flavour of the 32-bit key load: the four key words are
  # fetched with plain word loads into h0-h3.  The mnemonic must be lwz (load
  # word and zero), the same one the little-endian input load in the 32-bit
  # block loop uses; the previous spelling "lw" is not a PowerPC mnemonic, so
  # the generated file could not be assembled for this flavour.
  336. $code.=<<___ if ($LITTLE_ENDIAN);
  337. lwz $h0,0($inp) # load key material
  338. lwz $h1,4($inp)
  339. lwz $h2,8($inp)
  340. lwz $h3,12($inp)
  341. ___
  # Big-endian flavour of the 32-bit key load: four byte-reversed word loads at
  # offsets 0, 4, 8 and 12; h1-h3 briefly hold the offsets themselves.
  342. $code.=<<___ if (!$LITTLE_ENDIAN);
  343. li $h1,4
  344. lwbrx $h0,0,$inp # load key material
  345. li $h2,8
  346. lwbrx $h1,$h1,$inp
  347. li $h3,12
  348. lwbrx $h2,$h2,$inp
  349. lwbrx $h3,$h3,$inp
  350. ___
  # Rest of poly1305_init_int: the top four bits of every key word are cleared
  # and, for words 1-3, the low two bits as well; the result is stored at
  # ctx+32..44 and r3 is cleared.  This is followed by the prologue of
  # poly1305_blocks(ctx, inp, len, padbit): the block count is len>>4, with an
  # immediate return when it is zero; r14-r31 and the link register are saved;
  # key and hash are loaded; si = ri + (ri>>2) is derived for i = 1..3; the
  # block count goes to CTR and the mask register is set to 3.
  351. $code.=<<___;
  352. lis $mask,0xf000 # 0xf0000000
  353. li $r0,-4
  354. andc $r0,$r0,$mask # 0x0ffffffc
  355. andc $h0,$h0,$mask
  356. and $h1,$h1,$r0
  357. and $h2,$h2,$r0
  358. and $h3,$h3,$r0
  359. stw $h0,32($ctx) # store key
  360. stw $h1,36($ctx)
  361. stw $h2,40($ctx)
  362. stw $h3,44($ctx)
  363. Lno_key:
  364. xor r3,r3,r3
  365. blr
  366. .long 0
  367. .byte 0,12,0x14,0,0,0,2,0
  368. .size .poly1305_init_int,.-.poly1305_init_int
  369. .globl .poly1305_blocks
  370. .align 4
  371. .poly1305_blocks:
  372. Lpoly1305_blocks:
  373. srwi. $len,$len,4
  374. beq- Labort
  375. $STU $sp,-$FRAME($sp)
  376. mflr r0
  377. $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
  378. $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
  379. $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
  380. $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
  381. $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
  382. $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
  383. $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
  384. $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
  385. $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
  386. $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
  387. $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
  388. $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
  389. $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
  390. $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
  391. $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
  392. $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
  393. $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
  394. $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
  395. $PUSH r0,`$FRAME+$LRSAVE`($sp)
  396. lwz $r0,32($ctx) # load key
  397. lwz $r1,36($ctx)
  398. lwz $r2,40($ctx)
  399. lwz $r3,44($ctx)
  400. lwz $h0,0($ctx) # load hash value
  401. lwz $h1,4($ctx)
  402. lwz $h2,8($ctx)
  403. lwz $h3,12($ctx)
  404. lwz $h4,16($ctx)
  405. srwi $s1,$r1,2
  406. srwi $s2,$r2,2
  407. srwi $s3,$r3,2
  408. add $s1,$s1,$r1 # si = ri + ri>>2
  409. add $s2,$s2,$r2
  410. add $s3,$s3,$r3
  411. mtctr $len
  412. li $mask,3
  413. b Loop
  414. .align 4
  415. Loop:
  416. ___
  # Per-block input load, little-endian flavour: four plain word loads.
  417. $code.=<<___ if ($LITTLE_ENDIAN);
  418. lwz $d0,0($inp) # load input
  419. lwz $d1,4($inp)
  420. lwz $d2,8($inp)
  421. lwz $d3,12($inp)
  422. ___
  # Per-block input load, big-endian flavour: four byte-reversed word loads;
  # d1-d3 briefly hold the offsets 4, 8 and 12.
  423. $code.=<<___ if (!$LITTLE_ENDIAN);
  424. li $d1,4
  425. lwbrx $d0,0,$inp # load input
  426. li $d2,8
  427. lwbrx $d1,$d1,$inp
  428. li $d3,12
  429. lwbrx $d2,$d2,$inp
  430. lwbrx $d3,$d3,$inp
  431. ___
  # Loop body and epilogue: add the input words and padbit to the hash, form
  # the products of h0-h4 with r0-r3 and s1-s3 as 64-bit low/high pairs
  # (d0-d3 / D0-D3), combine the columns, fold the bits above the low two bits
  # of h4 back into h0 and repeat until CTR reaches zero.  Afterwards the five
  # hash words are stored back, r14-r31 are reloaded and the frame is released.
  432. $code.=<<___;
  433. addi $inp,$inp,16
  434. addc $h0,$h0,$d0 # accumulate input
  435. adde $h1,$h1,$d1
  436. adde $h2,$h2,$d2
  437. mullw $d0,$h0,$r0 # h0*r0
  438. mulhwu $D0,$h0,$r0
  439. mullw $d1,$h0,$r1 # h0*r1
  440. mulhwu $D1,$h0,$r1
  441. mullw $d2,$h0,$r2 # h0*r2
  442. mulhwu $D2,$h0,$r2
  443. adde $h3,$h3,$d3
  444. adde $h4,$h4,$padbit
  445. mullw $d3,$h0,$r3 # h0*r3
  446. mulhwu $D3,$h0,$r3
  447. mullw $t0,$h1,$s3 # h1*s3
  448. mulhwu $t1,$h1,$s3
  449. mullw $t2,$h1,$r0 # h1*r0
  450. mulhwu $t3,$h1,$r0
  451. addc $d0,$d0,$t0
  452. adde $D0,$D0,$t1
  453. mullw $t0,$h1,$r1 # h1*r1
  454. mulhwu $t1,$h1,$r1
  455. addc $d1,$d1,$t2
  456. adde $D1,$D1,$t3
  457. mullw $t2,$h1,$r2 # h1*r2
  458. mulhwu $t3,$h1,$r2
  459. addc $d2,$d2,$t0
  460. adde $D2,$D2,$t1
  461. mullw $t0,$h2,$s2 # h2*s2
  462. mulhwu $t1,$h2,$s2
  463. addc $d3,$d3,$t2
  464. adde $D3,$D3,$t3
  465. mullw $t2,$h2,$s3 # h2*s3
  466. mulhwu $t3,$h2,$s3
  467. addc $d0,$d0,$t0
  468. adde $D0,$D0,$t1
  469. mullw $t0,$h2,$r0 # h2*r0
  470. mulhwu $t1,$h2,$r0
  471. addc $d1,$d1,$t2
  472. adde $D1,$D1,$t3
  473. mullw $t2,$h2,$r1 # h2*r1
  474. mulhwu $t3,$h2,$r1
  475. addc $d2,$d2,$t0
  476. adde $D2,$D2,$t1
  477. mullw $t0,$h3,$s1 # h3*s1
  478. mulhwu $t1,$h3,$s1
  479. addc $d3,$d3,$t2
  480. adde $D3,$D3,$t3
  481. mullw $t2,$h3,$s2 # h3*s2
  482. mulhwu $t3,$h3,$s2
  483. addc $d0,$d0,$t0
  484. adde $D0,$D0,$t1
  485. mullw $t0,$h3,$s3 # h3*s3
  486. mulhwu $t1,$h3,$s3
  487. addc $d1,$d1,$t2
  488. adde $D1,$D1,$t3
  489. mullw $t2,$h3,$r0 # h3*r0
  490. mulhwu $t3,$h3,$r0
  491. addc $d2,$d2,$t0
  492. adde $D2,$D2,$t1
  493. mullw $t0,$h4,$s1 # h4*s1
  494. addc $d3,$d3,$t2
  495. adde $D3,$D3,$t3
  496. addc $d1,$d1,$t0
  497. mullw $t1,$h4,$s2 # h4*s2
  498. addze $D1,$D1
  499. addc $d2,$d2,$t1
  500. addze $D2,$D2
  501. mullw $t2,$h4,$s3 # h4*s3
  502. addc $d3,$d3,$t2
  503. addze $D3,$D3
  504. mullw $h4,$h4,$r0 # h4*r0
  505. addc $h1,$d1,$D0
  506. adde $h2,$d2,$D1
  507. adde $h3,$d3,$D2
  508. adde $h4,$h4,$D3
  509. andc $D0,$h4,$mask # final reduction step
  510. and $h4,$h4,$mask
  511. srwi $D1,$D0,2
  512. add $D0,$D0,$D1
  513. addc $h0,$d0,$D0
  514. addze $h1,$h1
  515. addze $h2,$h2
  516. addze $h3,$h3
  517. addze $h4,$h4
  518. bdnz Loop
  519. stw $h0,0($ctx) # store hash value
  520. stw $h1,4($ctx)
  521. stw $h2,8($ctx)
  522. stw $h3,12($ctx)
  523. stw $h4,16($ctx)
  524. $POP r14,`$FRAME-$SIZE_T*18`($sp)
  525. $POP r15,`$FRAME-$SIZE_T*17`($sp)
  526. $POP r16,`$FRAME-$SIZE_T*16`($sp)
  527. $POP r17,`$FRAME-$SIZE_T*15`($sp)
  528. $POP r18,`$FRAME-$SIZE_T*14`($sp)
  529. $POP r19,`$FRAME-$SIZE_T*13`($sp)
  530. $POP r20,`$FRAME-$SIZE_T*12`($sp)
  531. $POP r21,`$FRAME-$SIZE_T*11`($sp)
  532. $POP r22,`$FRAME-$SIZE_T*10`($sp)
  533. $POP r23,`$FRAME-$SIZE_T*9`($sp)
  534. $POP r24,`$FRAME-$SIZE_T*8`($sp)
  535. $POP r25,`$FRAME-$SIZE_T*7`($sp)
  536. $POP r26,`$FRAME-$SIZE_T*6`($sp)
  537. $POP r27,`$FRAME-$SIZE_T*5`($sp)
  538. $POP r28,`$FRAME-$SIZE_T*4`($sp)
  539. $POP r29,`$FRAME-$SIZE_T*3`($sp)
  540. $POP r30,`$FRAME-$SIZE_T*2`($sp)
  541. $POP r31,`$FRAME-$SIZE_T*1`($sp)
  542. addi $sp,$sp,$FRAME
  543. Labort:
  544. blr
  545. .long 0
  546. .byte 0,12,4,1,0x80,18,4,0
  547. .size .poly1305_blocks,.-.poly1305_blocks
  548. ___
  549. {
  # poly1305_emit(ctx, mac, nonce), 32-bit flavour.  Working registers are
  # r6-r12 plus r0.  The five hash words are loaded; when is_base2_26 is
  # non-zero they are treated as base 2^26 limbs and recombined into base 2^32,
  # otherwise that conversion is skipped.  The routine then propagates the
  # carry of hash+5 through all five words, shifts the top result right by two
  # and turns it into either 0 or 5, which is added to the hash.  The four
  # nonce words are added and the result is written byte by byte, low byte
  # first: h0 and h2 go to mac[0..3] and mac[8..11], then h1 and h3 go to
  # mac[4..7] and mac[12..15] (the $ctx register is reused as the first output
  # pointer).
  550. my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(6..12));
  551. $code.=<<___;
  552. .globl .poly1305_emit
  553. .align 5
  554. .poly1305_emit:
  555. lwz r0,24($ctx) # is_base2_26
  556. lwz $h0,0($ctx) # load hash value
  557. lwz $h1,4($ctx)
  558. lwz $h2,8($ctx)
  559. lwz $h3,12($ctx)
  560. lwz $h4,16($ctx)
  561. cmplwi r0,0
  562. beq Lemit_base2_32
  563. slwi $t0,$h1,26 # base 2^26 -> base 2^32
  564. srwi $h1,$h1,6
  565. slwi $t1,$h2,20
  566. srwi $h2,$h2,12
  567. addc $h0,$h0,$t0
  568. slwi $t0,$h3,14
  569. srwi $h3,$h3,18
  570. adde $h1,$h1,$t1
  571. slwi $t1,$h4,8
  572. srwi $h4,$h4,24
  573. adde $h2,$h2,$t0
  574. adde $h3,$h3,$t1
  575. addze $h4,$h4
  576. Lemit_base2_32:
  577. addic r0,$h0,5 # compare to modulus
  578. addze r0,$h1
  579. addze r0,$h2
  580. addze r0,$h3
  581. addze r0,$h4
  582. srwi r0,r0,2 # see if it carried/borrowed
  583. neg r0,r0
  584. andi. r0,r0,5
  585. addc $h0,$h0,r0
  586. lwz r0,0($nonce)
  587. addze $h1,$h1
  588. lwz $t0,4($nonce)
  589. addze $h2,$h2
  590. lwz $t1,8($nonce)
  591. addze $h3,$h3
  592. lwz $h4,12($nonce)
  593. addc $h0,$h0,r0 # accumulate nonce
  594. adde $h1,$h1,$t0
  595. adde $h2,$h2,$t1
  596. adde $h3,$h3,$h4
  597. addi $ctx,$mac,-1
  598. addi $mac,$mac,7
  599. stbu $h0,1($ctx) # write [little-endian] result
  600. srwi $h0,$h0,8
  601. stbu $h2,1($mac)
  602. srwi $h2,$h2,8
  603. stbu $h0,1($ctx)
  604. srwi $h0,$h0,8
  605. stbu $h2,1($mac)
  606. srwi $h2,$h2,8
  607. stbu $h0,1($ctx)
  608. srwi $h0,$h0,8
  609. stbu $h2,1($mac)
  610. srwi $h2,$h2,8
  611. stbu $h0,1($ctx)
  612. stbu $h2,1($mac)
  613. stbu $h1,1($ctx)
  614. srwi $h1,$h1,8
  615. stbu $h3,1($mac)
  616. srwi $h3,$h3,8
  617. stbu $h1,1($ctx)
  618. srwi $h1,$h1,8
  619. stbu $h3,1($mac)
  620. srwi $h3,$h3,8
  621. stbu $h1,1($ctx)
  622. srwi $h1,$h1,8
  623. stbu $h3,1($mac)
  624. srwi $h3,$h3,8
  625. stbu $h1,1($ctx)
  626. stbu $h3,1($mac)
  627. blr
  628. .long 0
  629. .byte 0,12,0x14,0,0,0,3,0
  630. .size .poly1305_emit,.-.poly1305_emit
  631. ___
  632. } }
  633. {{{
  634. ########################################################################
  635. # PowerISA 2.07/VSX section #
  636. ########################################################################
  # Frame size for the vector routines: $LOCALS plus six more pointer-sized
  # slots, 128 bytes of local variables and 12*16 bytes for saving v20-v31.
  637. my $LOCALS= 6*$SIZE_T;
  638. my $VSXFRAME = $LOCALS + 6*$SIZE_T;
  639. $VSXFRAME += 128; # local variables
  640. $VSXFRAME += 12*16; # v20-v31 offload
  # 4 on big-endian flavours, 0 otherwise; it is xor-ed into word offsets when
  # the 64-bit setup code stores the key-power table.
  # NOTE(review): this test matches "le" anywhere in the flavour, whereas
  # $LITTLE_ENDIAN earlier in the file requires it at the end - confirm the two
  # are meant to differ.
  641. my $BIG_ENDIAN = ($flavour !~ /le/) ? 4 : 0;
  642. ########################################################################
  643. # Layout of opaque area is following:
  644. #
  645. # unsigned __int32 h[5]; # current hash value base 2^26
  646. # unsigned __int32 pad;
  647. # unsigned __int32 is_base2_26, pad;
  648. # unsigned __int64 r[2]; # key value base 2^64
  649. # struct { unsigned __int32 r^2, r^4, r^1, r^3; } r[9];
  650. #
  651. # where r^n are base 2^26 digits of powers of multiplier key. There are
  652. # 5 digits, but last four are interleaved with multiples of 5, totalling
  653. # in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. Order of
  654. # powers is as they appear in register, not memory.
  # Vector register names: H0-H4 = v0-v4, I0-I4 = v5-v9, R0/R1/S1/R2/S2 =
  # v10-v14, ACC0-ACC4 = v15-v19, T0-T4 = v20-v24 and the constants in v25-v31.
  # By default R3/S3/R4/S4 are second names for R1/S1/R2/S2; the 32-bit
  # setup code below rebinds them to I1-I4.
  655. my ($H0, $H1, $H2, $H3, $H4) = map("v$_",(0..4));
  656. my ($I0, $I1, $I2, $I3, $I4) = map("v$_",(5..9));
  657. my ($R0, $R1, $S1, $R2, $S2) = map("v$_",(10..14));
  658. my ($R3, $S3, $R4, $S4) = ($R1, $S1, $R2, $S2);
  659. my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("v$_",(15..19));
  660. my ($T0, $T1, $T2, $T3, $T4) = map("v$_",(20..24));
  661. my ($_26,$_4,$_40,$_14,$mask26,$padbits,$I2perm) = map("v$_",(25..31));
  # Index operands for the indexed vector loads/stores: $x00 is the literal 0,
  # $x60/$x70 are r7/r8 and $x10..$x50 are r27..r31.
  662. my ($x00,$x60,$x70,$x10,$x20,$x30,$x40,$x50) = (0, map("r$_",(7,8,27..31)));
  663. my ($ctx_,$_ctx,$const) = map("r$_",(10..12));
  664. if ($flavour =~ /64/) {
  665. ###############################################################################
  666. # setup phase of poly1305_blocks_vsx is different on 32- and 64-bit platforms,
  667. # but the base 2^26 computational part is same...
  # Scalar register names for this section.  Note that h0-h2 and d0-d2 are
  # r6-r11 here (not r7-r12 as in the scalar base 2^64 code); r0, r1, s1, t0
  # and t1 are again r27-r31.
  668. my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(6..11,27..31));
  669. my $mask = "r0";
  # The heredoc below emits, in order:
  #
  # .poly1305_blocks_vsx - entry point.  Inputs of 128 bytes or more go to
  #   __poly1305_blocks_vsx.  Shorter inputs are handed to the scalar
  #   Lpoly1305_blocks after the stored hash has been brought to base 2^64:
  #   the base 2^26 limbs are recombined, -is_base2_26 is used as a mask to
  #   choose between that value and the stored base 2^64 words, the result is
  #   written back and is_base2_26 is cleared.
  #
  # __poly1305_mul - multiplies h0:h1:h2 by the key in r0/r1 (with s1) and
  #   performs the same reduction step as the scalar loop.  The caller must
  #   have the mask register set.
  #
  # __poly1305_splat - splits h0:h1:h2 into five 26-bit limbs and stores them
  #   as 32-bit words at a 16-byte stride from the address in $t1; for limbs
  #   1-4 the limb is followed by the limb times 5, nine words in total.
  #
  # __poly1305_blocks_vsx - only its setup is emitted here; the heredoc ends in
  #   the middle of the routine.  It allocates $VSXFRAME bytes, saves v20-v31,
  #   VRSAVE, r27-r31 and the link register, calls LPICmeup (defined outside
  #   this part of the file; presumably it leaves the constant table address
  #   in $const - confirm) and loads five constant vectors.  If is_base2_26
  #   (still in r7) is zero, it computes r^1..r^4 with __poly1305_mul and
  #   stores each power with __poly1305_splat into the table at ctx+48, at
  #   word offsets 12, 4, 8 and 0 xor-ed with $BIG_ENDIAN, then loads the
  #   base 2^64 hash and converts it to five base 2^26 limbs in H0-H4.
  670. $code.=<<___;
  671. .globl .poly1305_blocks_vsx
  672. .align 5
  673. .poly1305_blocks_vsx:
  674. lwz r7,24($ctx) # is_base2_26
  675. cmpldi $len,128
  676. bge __poly1305_blocks_vsx
  677. neg r0,r7 # is_base2_26 as mask
  678. lwz r7,0($ctx) # load hash base 2^26
  679. lwz r8,4($ctx)
  680. lwz r9,8($ctx)
  681. lwz r10,12($ctx)
  682. lwz r11,16($ctx)
  683. sldi r8,r8,26 # base 2^26 -> base 2^64
  684. sldi r12,r9,52
  685. add r7,r7,r8
  686. srdi r9,r9,12
  687. sldi r10,r10,14
  688. addc r7,r7,r12
  689. sldi r8,r11,40
  690. adde r9,r9,r10
  691. srdi r11,r11,24
  692. addc r9,r9,r8
  693. addze r11,r11
  694. ld r8,0($ctx) # load hash base 2^64
  695. ld r10,8($ctx)
  696. ld r12,16($ctx)
  697. xor r7,r7,r8 # select between radixes
  698. xor r9,r9,r10
  699. xor r11,r11,r12
  700. and r7,r7,r0
  701. and r9,r9,r0
  702. and r11,r11,r0
  703. xor r7,r7,r8
  704. xor r9,r9,r10
  705. xor r11,r11,r12
  706. li r0,0
  707. std r7,0($ctx) # store hash base 2^64
  708. std r9,8($ctx)
  709. std r11,16($ctx)
  710. stw r0,24($ctx) # clear is_base2_26
  711. b Lpoly1305_blocks
  712. .long 0
  713. .byte 0,12,0x14,0,0,0,4,0
  714. .size .poly1305_blocks_vsx,.-.poly1305_blocks_vsx
  715. .align 5
  716. __poly1305_mul:
  717. mulld $d0,$h0,$r0 # h0*r0
  718. mulhdu $d1,$h0,$r0
  719. mulld $t0,$h1,$s1 # h1*5*r1
  720. mulhdu $t1,$h1,$s1
  721. addc $d0,$d0,$t0
  722. adde $d1,$d1,$t1
  723. mulld $t0,$h0,$r1 # h0*r1
  724. mulhdu $d2,$h0,$r1
  725. addc $d1,$d1,$t0
  726. addze $d2,$d2
  727. mulld $t0,$h1,$r0 # h1*r0
  728. mulhdu $t1,$h1,$r0
  729. addc $d1,$d1,$t0
  730. adde $d2,$d2,$t1
  731. mulld $t0,$h2,$s1 # h2*5*r1
  732. mulld $t1,$h2,$r0 # h2*r0
  733. addc $d1,$d1,$t0
  734. adde $d2,$d2,$t1
  735. andc $t0,$d2,$mask # final reduction step
  736. and $h2,$d2,$mask
  737. srdi $t1,$t0,2
  738. add $t0,$t0,$t1
  739. addc $h0,$d0,$t0
  740. addze $h1,$d1
  741. addze $h2,$h2
  742. blr
  743. .long 0
  744. .byte 0,12,0x14,0,0,0,0,0
  745. .size __poly1305_mul,.-__poly1305_mul
  746. .align 5
  747. __poly1305_splat:
  748. extrdi $d0,$h0,26,38
  749. extrdi $d1,$h0,26,12
  750. stw $d0,0x00($t1)
  751. extrdi $d2,$h0,12,0
  752. slwi $d0,$d1,2
  753. stw $d1,0x10($t1)
  754. add $d0,$d0,$d1 # * 5
  755. stw $d0,0x20($t1)
  756. insrdi $d2,$h1,14,38
  757. slwi $d0,$d2,2
  758. stw $d2,0x30($t1)
  759. add $d0,$d0,$d2 # * 5
  760. stw $d0,0x40($t1)
  761. extrdi $d1,$h1,26,24
  762. extrdi $d2,$h1,24,0
  763. slwi $d0,$d1,2
  764. stw $d1,0x50($t1)
  765. add $d0,$d0,$d1 # * 5
  766. stw $d0,0x60($t1)
  767. insrdi $d2,$h2,3,37
  768. slwi $d0,$d2,2
  769. stw $d2,0x70($t1)
  770. add $d0,$d0,$d2 # * 5
  771. stw $d0,0x80($t1)
  772. blr
  773. .long 0
  774. .byte 0,12,0x14,0,0,0,0,0
  775. .size __poly1305_splat,.-__poly1305_splat
  776. .align 5
  777. __poly1305_blocks_vsx:
  778. $STU $sp,-$VSXFRAME($sp)
  779. mflr r0
  780. li r10,`15+$LOCALS+128`
  781. li r11,`31+$LOCALS+128`
  782. mfspr r12,256
  783. stvx v20,r10,$sp
  784. addi r10,r10,32
  785. stvx v21,r11,$sp
  786. addi r11,r11,32
  787. stvx v22,r10,$sp
  788. addi r10,r10,32
  789. stvx v23,r11,$sp
  790. addi r11,r11,32
  791. stvx v24,r10,$sp
  792. addi r10,r10,32
  793. stvx v25,r11,$sp
  794. addi r11,r11,32
  795. stvx v26,r10,$sp
  796. addi r10,r10,32
  797. stvx v27,r11,$sp
  798. addi r11,r11,32
  799. stvx v28,r10,$sp
  800. addi r10,r10,32
  801. stvx v29,r11,$sp
  802. addi r11,r11,32
  803. stvx v30,r10,$sp
  804. stvx v31,r11,$sp
  805. stw r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# save vrsave
  806. li r12,-1
  807. mtspr 256,r12 # preserve all AltiVec registers
  808. $PUSH r27,`$VSXFRAME-$SIZE_T*5`($sp)
  809. $PUSH r28,`$VSXFRAME-$SIZE_T*4`($sp)
  810. $PUSH r29,`$VSXFRAME-$SIZE_T*3`($sp)
  811. $PUSH r30,`$VSXFRAME-$SIZE_T*2`($sp)
  812. $PUSH r31,`$VSXFRAME-$SIZE_T*1`($sp)
  813. $PUSH r0,`$VSXFRAME+$LRSAVE`($sp)
  814. bl LPICmeup
  815. li $x10,0x10
  816. li $x20,0x20
  817. li $x30,0x30
  818. li $x40,0x40
  819. li $x50,0x50
  820. lvx_u $mask26,$x00,$const
  821. lvx_u $_26,$x10,$const
  822. lvx_u $_40,$x20,$const
  823. lvx_u $I2perm,$x30,$const
  824. lvx_u $padbits,$x40,$const
  825. cmplwi r7,0 # is_base2_26?
  826. bne Lskip_init_vsx
  827. ld $r0,32($ctx) # load key base 2^64
  828. ld $r1,40($ctx)
  829. srdi $s1,$r1,2
  830. li $mask,3
  831. add $s1,$s1,$r1 # s1 = r1 + r1>>2
  832. mr $h0,$r0 # "calculate" r^1
  833. mr $h1,$r1
  834. li $h2,0
  835. addi $t1,$ctx,`48+(12^$BIG_ENDIAN)`
  836. bl __poly1305_splat
  837. bl __poly1305_mul # calculate r^2
  838. addi $t1,$ctx,`48+(4^$BIG_ENDIAN)`
  839. bl __poly1305_splat
  840. bl __poly1305_mul # calculate r^3
  841. addi $t1,$ctx,`48+(8^$BIG_ENDIAN)`
  842. bl __poly1305_splat
  843. bl __poly1305_mul # calculate r^4
  844. addi $t1,$ctx,`48+(0^$BIG_ENDIAN)`
  845. bl __poly1305_splat
  846. ld $h0,0($ctx) # load hash
  847. ld $h1,8($ctx)
  848. ld $h2,16($ctx)
  849. extrdi $d0,$h0,26,38 # base 2^64 -> base 2^26
  850. extrdi $d1,$h0,26,12
  851. extrdi $d2,$h0,12,0
  852. mtvrwz $H0,$d0
  853. insrdi $d2,$h1,14,38
  854. mtvrwz $H1,$d1
  855. extrdi $d1,$h1,26,24
  856. mtvrwz $H2,$d2
  857. extrdi $d2,$h1,24,0
  858. mtvrwz $H3,$d1
  859. insrdi $d2,$h2,3,37
  860. mtvrwz $H4,$d2
  861. ___
  862. } else {
  863. ###############################################################################
  864. # 32-bit initialization
  # Scalar register names for this section: h0-h4 are r7-r11, t0 is r0 and t1
  # is r12.  R3/S3/R4/S4 are rebound to the vector registers of I1-I4 here, so
  # all nine key vectors R0, R1, S1, R2, S2, R3, S3, R4, S4 are distinct.
  865. my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(7..11,0,12));
  866. my ($R3,$S3,$R4,$S4)=($I1,$I2,$I3,$I4);
  # The heredoc below emits, in order:
  #
  # .poly1305_blocks_vsx - entry point.  Inputs of 128 bytes or more go to
  #   __poly1305_blocks_vsx.  Shorter inputs go to the scalar Lpoly1305_blocks,
  #   directly when is_base2_26 is zero, otherwise after the stored base 2^26
  #   limbs have been recombined into base 2^32, written back, and
  #   is_base2_26 has been cleared.
  #
  # __poly1305_mul - vector multiplication of H0-H4 by the key vectors
  #   (vmulouw products accumulated with vaddudm into ACC0-ACC4) followed by
  #   the lazy reduction that leaves the result in H0-H4.  T0 holds the shift
  #   count 2 on return, which the setup code reuses for its "<<2" steps.
  #
  # __poly1305_blocks_vsx - only its setup is emitted here; the heredoc ends in
  #   the middle of the routine.  The prologue is the same as in the 64-bit
  #   flavour: frame allocation, save of v20-v31, VRSAVE, r27-r31 and the link
  #   register, a call to LPICmeup (defined outside this part of the file) and
  #   five constant loads through $const.  If is_base2_26 (still in r7) is
  #   zero, the base 2^32 key is converted to base 2^26 limbs and moved into
  #   the R/S vectors together with the limbs times 5, __poly1305_mul is called
  #   twice to obtain the higher powers, the results are merged into the
  #   r^2:r^4:r^1:r^3 order noted in the assembly comments, the multiples of 5
  #   are recomputed, and the nine vectors are stored starting at ctx+0x30.
  #   Finally the stored hash is loaded and converted to base 2^26 in H0-H4.
  867. $code.=<<___;
  868. .globl .poly1305_blocks_vsx
  869. .align 5
  870. .poly1305_blocks_vsx:
  871. lwz r7,24($ctx) # is_base2_26
  872. cmplwi $len,128
  873. bge __poly1305_blocks_vsx
  874. cmplwi r7,0
  875. beq Lpoly1305_blocks
  876. lwz $h0,0($ctx) # load hash
  877. lwz $h1,4($ctx)
  878. lwz $h2,8($ctx)
  879. lwz $h3,12($ctx)
  880. lwz $h4,16($ctx)
  881. slwi $t0,$h1,26 # base 2^26 -> base 2^32
  882. srwi $h1,$h1,6
  883. slwi $t1,$h2,20
  884. srwi $h2,$h2,12
  885. addc $h0,$h0,$t0
  886. slwi $t0,$h3,14
  887. srwi $h3,$h3,18
  888. adde $h1,$h1,$t1
  889. slwi $t1,$h4,8
  890. srwi $h4,$h4,24
  891. adde $h2,$h2,$t0
  892. li $t0,0
  893. adde $h3,$h3,$t1
  894. addze $h4,$h4
  895. stw $h0,0($ctx) # store hash base 2^32
  896. stw $h1,4($ctx)
  897. stw $h2,8($ctx)
  898. stw $h3,12($ctx)
  899. stw $h4,16($ctx)
  900. stw $t0,24($ctx) # clear is_base2_26
  901. b Lpoly1305_blocks
  902. .long 0
  903. .byte 0,12,0x14,0,0,0,4,0
  904. .size .poly1305_blocks_vsx,.-.poly1305_blocks_vsx
  905. .align 5
  906. __poly1305_mul:
  907. vmulouw $ACC0,$H0,$R0
  908. vmulouw $ACC1,$H1,$R0
  909. vmulouw $ACC2,$H2,$R0
  910. vmulouw $ACC3,$H3,$R0
  911. vmulouw $ACC4,$H4,$R0
  912. vmulouw $T0,$H4,$S1
  913. vaddudm $ACC0,$ACC0,$T0
  914. vmulouw $T0,$H0,$R1
  915. vaddudm $ACC1,$ACC1,$T0
  916. vmulouw $T0,$H1,$R1
  917. vaddudm $ACC2,$ACC2,$T0
  918. vmulouw $T0,$H2,$R1
  919. vaddudm $ACC3,$ACC3,$T0
  920. vmulouw $T0,$H3,$R1
  921. vaddudm $ACC4,$ACC4,$T0
  922. vmulouw $T0,$H3,$S2
  923. vaddudm $ACC0,$ACC0,$T0
  924. vmulouw $T0,$H4,$S2
  925. vaddudm $ACC1,$ACC1,$T0
  926. vmulouw $T0,$H0,$R2
  927. vaddudm $ACC2,$ACC2,$T0
  928. vmulouw $T0,$H1,$R2
  929. vaddudm $ACC3,$ACC3,$T0
  930. vmulouw $T0,$H2,$R2
  931. vaddudm $ACC4,$ACC4,$T0
  932. vmulouw $T0,$H2,$S3
  933. vaddudm $ACC0,$ACC0,$T0
  934. vmulouw $T0,$H3,$S3
  935. vaddudm $ACC1,$ACC1,$T0
  936. vmulouw $T0,$H4,$S3
  937. vaddudm $ACC2,$ACC2,$T0
  938. vmulouw $T0,$H0,$R3
  939. vaddudm $ACC3,$ACC3,$T0
  940. vmulouw $T0,$H1,$R3
  941. vaddudm $ACC4,$ACC4,$T0
  942. vmulouw $T0,$H1,$S4
  943. vaddudm $ACC0,$ACC0,$T0
  944. vmulouw $T0,$H2,$S4
  945. vaddudm $ACC1,$ACC1,$T0
  946. vmulouw $T0,$H3,$S4
  947. vaddudm $ACC2,$ACC2,$T0
  948. vmulouw $T0,$H4,$S4
  949. vaddudm $ACC3,$ACC3,$T0
  950. vmulouw $T0,$H0,$R4
  951. vaddudm $ACC4,$ACC4,$T0
  952. ################################################################
  953. # lazy reduction
  954. vspltisb $T0,2
  955. vsrd $H4,$ACC3,$_26
  956. vsrd $H1,$ACC0,$_26
  957. vand $H3,$ACC3,$mask26
  958. vand $H0,$ACC0,$mask26
  959. vaddudm $H4,$H4,$ACC4 # h3 -> h4
  960. vaddudm $H1,$H1,$ACC1 # h0 -> h1
  961. vsrd $ACC4,$H4,$_26
  962. vsrd $ACC1,$H1,$_26
  963. vand $H4,$H4,$mask26
  964. vand $H1,$H1,$mask26
  965. vaddudm $H0,$H0,$ACC4
  966. vaddudm $H2,$ACC2,$ACC1 # h1 -> h2
  967. vsld $ACC4,$ACC4,$T0 # <<2
  968. vsrd $ACC2,$H2,$_26
  969. vand $H2,$H2,$mask26
  970. vaddudm $H0,$H0,$ACC4 # h4 -> h0
  971. vaddudm $H3,$H3,$ACC2 # h2 -> h3
  972. vsrd $ACC0,$H0,$_26
  973. vsrd $ACC3,$H3,$_26
  974. vand $H0,$H0,$mask26
  975. vand $H3,$H3,$mask26
  976. vaddudm $H1,$H1,$ACC0 # h0 -> h1
  977. vaddudm $H4,$H4,$ACC3 # h3 -> h4
  978. blr
  979. .long 0
  980. .byte 0,12,0x14,0,0,0,0,0
  981. .size __poly1305_mul,.-__poly1305_mul
  982. .align 5
  983. __poly1305_blocks_vsx:
  984. $STU $sp,-$VSXFRAME($sp)
  985. mflr r0
  986. li r10,`15+$LOCALS+128`
  987. li r11,`31+$LOCALS+128`
  988. mfspr r12,256
  989. stvx v20,r10,$sp
  990. addi r10,r10,32
  991. stvx v21,r11,$sp
  992. addi r11,r11,32
  993. stvx v22,r10,$sp
  994. addi r10,r10,32
  995. stvx v23,r11,$sp
  996. addi r11,r11,32
  997. stvx v24,r10,$sp
  998. addi r10,r10,32
  999. stvx v25,r11,$sp
  1000. addi r11,r11,32
  1001. stvx v26,r10,$sp
  1002. addi r10,r10,32
  1003. stvx v27,r11,$sp
  1004. addi r11,r11,32
  1005. stvx v28,r10,$sp
  1006. addi r10,r10,32
  1007. stvx v29,r11,$sp
  1008. addi r11,r11,32
  1009. stvx v30,r10,$sp
  1010. stvx v31,r11,$sp
  1011. stw r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# save vrsave
  1012. li r12,-1
  1013. mtspr 256,r12 # preserve all AltiVec registers
  1014. $PUSH r27,`$VSXFRAME-$SIZE_T*5`($sp)
  1015. $PUSH r28,`$VSXFRAME-$SIZE_T*4`($sp)
  1016. $PUSH r29,`$VSXFRAME-$SIZE_T*3`($sp)
  1017. $PUSH r30,`$VSXFRAME-$SIZE_T*2`($sp)
  1018. $PUSH r31,`$VSXFRAME-$SIZE_T*1`($sp)
  1019. $PUSH r0,`$VSXFRAME+$LRSAVE`($sp)
  1020. bl LPICmeup
  1021. li $x10,0x10
  1022. li $x20,0x20
  1023. li $x30,0x30
  1024. li $x40,0x40
  1025. li $x50,0x50
  1026. lvx_u $mask26,$x00,$const
  1027. lvx_u $_26,$x10,$const
  1028. lvx_u $_40,$x20,$const
  1029. lvx_u $I2perm,$x30,$const
  1030. lvx_u $padbits,$x40,$const
  1031. cmplwi r7,0 # is_base2_26?
  1032. bne Lskip_init_vsx
  1033. lwz $h1,32($ctx) # load key base 2^32
  1034. lwz $h2,36($ctx)
  1035. lwz $h3,40($ctx)
  1036. lwz $h4,44($ctx)
  1037. extrwi $h0,$h1,26,6 # base 2^32 -> base 2^26
  1038. extrwi $h1,$h1,6,0
  1039. insrwi $h1,$h2,20,6
  1040. extrwi $h2,$h2,12,0
  1041. insrwi $h2,$h3,14,6
  1042. extrwi $h3,$h3,18,0
  1043. insrwi $h3,$h4,8,6
  1044. extrwi $h4,$h4,24,0
  1045. mtvrwz $R0,$h0
  1046. slwi $h0,$h1,2
  1047. mtvrwz $R1,$h1
  1048. add $h1,$h1,$h0
  1049. mtvrwz $S1,$h1
  1050. slwi $h1,$h2,2
  1051. mtvrwz $R2,$h2
  1052. add $h2,$h2,$h1
  1053. mtvrwz $S2,$h2
  1054. slwi $h2,$h3,2
  1055. mtvrwz $R3,$h3
  1056. add $h3,$h3,$h2
  1057. mtvrwz $S3,$h3
  1058. slwi $h3,$h4,2
  1059. mtvrwz $R4,$h4
  1060. add $h4,$h4,$h3
  1061. mtvrwz $S4,$h4
  1062. vmr $H0,$R0
  1063. vmr $H1,$R1
  1064. vmr $H2,$R2
  1065. vmr $H3,$R3
  1066. vmr $H4,$R4
  1067. bl __poly1305_mul # r^1:- * r^1:-
  1068. vpermdi $R0,$H0,$R0,0b00
  1069. vpermdi $R1,$H1,$R1,0b00
  1070. vpermdi $R2,$H2,$R2,0b00
  1071. vpermdi $R3,$H3,$R3,0b00
  1072. vpermdi $R4,$H4,$R4,0b00
  1073. vpermdi $H0,$H0,$H0,0b00
  1074. vpermdi $H1,$H1,$H1,0b00
  1075. vpermdi $H2,$H2,$H2,0b00
  1076. vpermdi $H3,$H3,$H3,0b00
  1077. vpermdi $H4,$H4,$H4,0b00
  1078. vsld $S1,$R1,$T0 # <<2
  1079. vsld $S2,$R2,$T0
  1080. vsld $S3,$R3,$T0
  1081. vsld $S4,$R4,$T0
  1082. vaddudm $S1,$S1,$R1
  1083. vaddudm $S2,$S2,$R2
  1084. vaddudm $S3,$S3,$R3
  1085. vaddudm $S4,$S4,$R4
  1086. bl __poly1305_mul # r^2:r^2 * r^2:r^1
  1087. addi $h0,$ctx,0x60
  1088. lwz $h1,0($ctx) # load hash
  1089. lwz $h2,4($ctx)
  1090. lwz $h3,8($ctx)
  1091. lwz $h4,12($ctx)
  1092. lwz $t0,16($ctx)
  1093. vmrgow $R0,$R0,$H0 # r^2:r^4:r^1:r^3
  1094. vmrgow $R1,$R1,$H1
  1095. vmrgow $R2,$R2,$H2
  1096. vmrgow $R3,$R3,$H3
  1097. vmrgow $R4,$R4,$H4
  1098. vslw $S1,$R1,$T0 # <<2
  1099. vslw $S2,$R2,$T0
  1100. vslw $S3,$R3,$T0
  1101. vslw $S4,$R4,$T0
  1102. vadduwm $S1,$S1,$R1
  1103. vadduwm $S2,$S2,$R2
  1104. vadduwm $S3,$S3,$R3
  1105. vadduwm $S4,$S4,$R4
  1106. stvx_u $R0,$x30,$ctx
  1107. stvx_u $R1,$x40,$ctx
  1108. stvx_u $S1,$x50,$ctx
  1109. stvx_u $R2,$x00,$h0
  1110. stvx_u $S2,$x10,$h0
  1111. stvx_u $R3,$x20,$h0
  1112. stvx_u $S3,$x30,$h0
  1113. stvx_u $R4,$x40,$h0
  1114. stvx_u $S4,$x50,$h0
  1115. extrwi $h0,$h1,26,6 # base 2^32 -> base 2^26
  1116. extrwi $h1,$h1,6,0
  1117. mtvrwz $H0,$h0
  1118. insrwi $h1,$h2,20,6
  1119. extrwi $h2,$h2,12,0
  1120. mtvrwz $H1,$h1
  1121. insrwi $h2,$h3,14,6
  1122. extrwi $h3,$h3,18,0
  1123. mtvrwz $H2,$h2
  1124. insrwi $h3,$h4,8,6
  1125. extrwi $h4,$h4,24,0
  1126. mtvrwz $H3,$h3
  1127. insrwi $h4,$t0,3,5
  1128. mtvrwz $H4,$h4
  1129. ___
  1130. }
  1131. $code.=<<___;
  1132. li r0,1
  1133. stw r0,24($ctx) # set is_base2_26
  1134. b Loaded_vsx
  1135. .align 4
  1136. Lskip_init_vsx:
  1137. li $x10,4
  1138. li $x20,8
  1139. li $x30,12
  1140. li $x40,16
  1141. lvwzx_u $H0,$x00,$ctx
  1142. lvwzx_u $H1,$x10,$ctx
  1143. lvwzx_u $H2,$x20,$ctx
  1144. lvwzx_u $H3,$x30,$ctx
  1145. lvwzx_u $H4,$x40,$ctx
  1146. Loaded_vsx:
  1147. li $x10,0x10
  1148. li $x20,0x20
  1149. li $x30,0x30
  1150. li $x40,0x40
  1151. li $x50,0x50
  1152. li $x60,0x60
  1153. li $x70,0x70
  1154. addi $ctx_,$ctx,64 # &ctx->r[1]
  1155. addi $_ctx,$sp,`$LOCALS+15` # &ctx->r[1], r^2:r^4 shadow
  1156. vxor $T0,$T0,$T0 # ensure second half is zero
  1157. vpermdi $H0,$H0,$T0,0b00
  1158. vpermdi $H1,$H1,$T0,0b00
  1159. vpermdi $H2,$H2,$T0,0b00
  1160. vpermdi $H3,$H3,$T0,0b00
  1161. vpermdi $H4,$H4,$T0,0b00
  1162. be?lvx_u $_4,$x50,$const # byte swap mask
  1163. lvx_u $T1,$x00,$inp # load first input block
  1164. lvx_u $T2,$x10,$inp
  1165. lvx_u $T3,$x20,$inp
  1166. lvx_u $T4,$x30,$inp
  1167. be?vperm $T1,$T1,$T1,$_4
  1168. be?vperm $T2,$T2,$T2,$_4
  1169. be?vperm $T3,$T3,$T3,$_4
  1170. be?vperm $T4,$T4,$T4,$_4
  1171. vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26
  1172. vspltisb $_4,4
  1173. vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011
  1174. vspltisb $_14,14
  1175. vpermdi $I3,$T1,$T2,0b11
  1176. vsrd $I1,$I0,$_26
  1177. vsrd $I2,$I2,$_4
  1178. vsrd $I4,$I3,$_40
  1179. vsrd $I3,$I3,$_14
  1180. vand $I0,$I0,$mask26
  1181. vand $I1,$I1,$mask26
  1182. vand $I2,$I2,$mask26
  1183. vand $I3,$I3,$mask26
  1184. vpermdi $T1,$T3,$T4,0b00
  1185. vperm $T2,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011
  1186. vpermdi $T3,$T3,$T4,0b11
  1187. vsrd $T0,$T1,$_26
  1188. vsrd $T2,$T2,$_4
  1189. vsrd $T4,$T3,$_40
  1190. vsrd $T3,$T3,$_14
  1191. vand $T1,$T1,$mask26
  1192. vand $T0,$T0,$mask26
  1193. vand $T2,$T2,$mask26
  1194. vand $T3,$T3,$mask26
  1195. # inp[2]:inp[0]:inp[3]:inp[1]
  1196. vmrgow $I4,$T4,$I4
  1197. vmrgow $I0,$T1,$I0
  1198. vmrgow $I1,$T0,$I1
  1199. vmrgow $I2,$T2,$I2
  1200. vmrgow $I3,$T3,$I3
  1201. vor $I4,$I4,$padbits
  1202. lvx_splt $R0,$x30,$ctx # taking lvx_vsplt out of loop
  1203. lvx_splt $R1,$x00,$ctx_ # gives ~8% improvement
  1204. lvx_splt $S1,$x10,$ctx_
  1205. lvx_splt $R2,$x20,$ctx_
  1206. lvx_splt $S2,$x30,$ctx_
  1207. lvx_splt $T1,$x40,$ctx_
  1208. lvx_splt $T2,$x50,$ctx_
  1209. lvx_splt $T3,$x60,$ctx_
  1210. lvx_splt $T4,$x70,$ctx_
  1211. stvx $R1,$x00,$_ctx
  1212. stvx $S1,$x10,$_ctx
  1213. stvx $R2,$x20,$_ctx
  1214. stvx $S2,$x30,$_ctx
  1215. stvx $T1,$x40,$_ctx
  1216. stvx $T2,$x50,$_ctx
  1217. stvx $T3,$x60,$_ctx
  1218. stvx $T4,$x70,$_ctx
  1219. addi $inp,$inp,0x40
  1220. addi $const,$const,0x50
  1221. addi r0,$len,-64
  1222. srdi r0,r0,6
  1223. mtctr r0
  1224. b Loop_vsx
  1225. .align 4
  1226. Loop_vsx:
  1227. ################################################################
  1228. ## ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
  1229. ## ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
  1230. ## \___________________/
  1231. ##
  1232. ## Note that we start with inp[2:3]*r^2. This is because it
  1233. ## doesn't depend on reduction in previous iteration.
  1234. ################################################################
  1235. ## d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  1236. ## d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  1237. ## d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  1238. ## d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  1239. ## d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  1240. vmuleuw $ACC0,$I0,$R0
  1241. vmuleuw $ACC1,$I0,$R1
  1242. vmuleuw $ACC2,$I0,$R2
  1243. vmuleuw $ACC3,$I1,$R2
  1244. vmuleuw $T0,$I1,$R0
  1245. vaddudm $ACC1,$ACC1,$T0
  1246. vmuleuw $T0,$I1,$R1
  1247. vaddudm $ACC2,$ACC2,$T0
  1248. vmuleuw $ACC4,$I2,$R2
  1249. vmuleuw $T0,$I4,$S1
  1250. vaddudm $ACC0,$ACC0,$T0
  1251. vmuleuw $T0,$I2,$R1
  1252. vaddudm $ACC3,$ACC3,$T0
  1253. lvx $S3,$x50,$_ctx
  1254. vmuleuw $T0,$I3,$R1
  1255. vaddudm $ACC4,$ACC4,$T0
  1256. lvx $R3,$x40,$_ctx
  1257. vaddudm $H2,$H2,$I2
  1258. vaddudm $H0,$H0,$I0
  1259. vaddudm $H3,$H3,$I3
  1260. vaddudm $H1,$H1,$I1
  1261. vaddudm $H4,$H4,$I4
  1262. vmuleuw $T0,$I3,$S2
  1263. vaddudm $ACC0,$ACC0,$T0
  1264. vmuleuw $T0,$I4,$S2
  1265. vaddudm $ACC1,$ACC1,$T0
  1266. vmuleuw $T0,$I2,$R0
  1267. vaddudm $ACC2,$ACC2,$T0
  1268. vmuleuw $T0,$I3,$R0
  1269. vaddudm $ACC3,$ACC3,$T0
  1270. lvx $S4,$x70,$_ctx
  1271. vmuleuw $T0,$I4,$R0
  1272. vaddudm $ACC4,$ACC4,$T0
  1273. lvx $R4,$x60,$_ctx
  1274. vmuleuw $T0,$I2,$S3
  1275. vaddudm $ACC0,$ACC0,$T0
  1276. vmuleuw $T0,$I3,$S3
  1277. vaddudm $ACC1,$ACC1,$T0
  1278. vmuleuw $T0,$I4,$S3
  1279. vaddudm $ACC2,$ACC2,$T0
  1280. vmuleuw $T0,$I0,$R3
  1281. vaddudm $ACC3,$ACC3,$T0
  1282. vmuleuw $T0,$I1,$R3
  1283. vaddudm $ACC4,$ACC4,$T0
  1284. be?lvx_u $_4,$x00,$const # byte swap mask
  1285. lvx_u $T1,$x00,$inp # load next input block
  1286. lvx_u $T2,$x10,$inp
  1287. lvx_u $T3,$x20,$inp
  1288. lvx_u $T4,$x30,$inp
  1289. be?vperm $T1,$T1,$T1,$_4
  1290. be?vperm $T2,$T2,$T2,$_4
  1291. be?vperm $T3,$T3,$T3,$_4
  1292. be?vperm $T4,$T4,$T4,$_4
  1293. vmuleuw $T0,$I1,$S4
  1294. vaddudm $ACC0,$ACC0,$T0
  1295. vmuleuw $T0,$I2,$S4
  1296. vaddudm $ACC1,$ACC1,$T0
  1297. vmuleuw $T0,$I3,$S4
  1298. vaddudm $ACC2,$ACC2,$T0
  1299. vmuleuw $T0,$I4,$S4
  1300. vaddudm $ACC3,$ACC3,$T0
  1301. vmuleuw $T0,$I0,$R4
  1302. vaddudm $ACC4,$ACC4,$T0
  1303. vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26
  1304. vspltisb $_4,4
  1305. vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011
  1306. vpermdi $I3,$T1,$T2,0b11
  1307. # (hash + inp[0:1]) * r^4
  1308. vmulouw $T0,$H0,$R0
  1309. vaddudm $ACC0,$ACC0,$T0
  1310. vmulouw $T0,$H1,$R0
  1311. vaddudm $ACC1,$ACC1,$T0
  1312. vmulouw $T0,$H2,$R0
  1313. vaddudm $ACC2,$ACC2,$T0
  1314. vmulouw $T0,$H3,$R0
  1315. vaddudm $ACC3,$ACC3,$T0
  1316. vmulouw $T0,$H4,$R0
  1317. vaddudm $ACC4,$ACC4,$T0
  1318. vpermdi $T1,$T3,$T4,0b00
  1319. vperm $T2,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011
  1320. vpermdi $T3,$T3,$T4,0b11
  1321. vmulouw $T0,$H2,$S3
  1322. vaddudm $ACC0,$ACC0,$T0
  1323. vmulouw $T0,$H3,$S3
  1324. vaddudm $ACC1,$ACC1,$T0
  1325. vmulouw $T0,$H4,$S3
  1326. vaddudm $ACC2,$ACC2,$T0
  1327. vmulouw $T0,$H0,$R3
  1328. vaddudm $ACC3,$ACC3,$T0
  1329. lvx $S1,$x10,$_ctx
  1330. vmulouw $T0,$H1,$R3
  1331. vaddudm $ACC4,$ACC4,$T0
  1332. lvx $R1,$x00,$_ctx
  1333. vsrd $I1,$I0,$_26
  1334. vsrd $I2,$I2,$_4
  1335. vsrd $I4,$I3,$_40
  1336. vsrd $I3,$I3,$_14
  1337. vmulouw $T0,$H1,$S4
  1338. vaddudm $ACC0,$ACC0,$T0
  1339. vmulouw $T0,$H2,$S4
  1340. vaddudm $ACC1,$ACC1,$T0
  1341. vmulouw $T0,$H3,$S4
  1342. vaddudm $ACC2,$ACC2,$T0
  1343. vmulouw $T0,$H4,$S4
  1344. vaddudm $ACC3,$ACC3,$T0
  1345. lvx $S2,$x30,$_ctx
  1346. vmulouw $T0,$H0,$R4
  1347. vaddudm $ACC4,$ACC4,$T0
  1348. lvx $R2,$x20,$_ctx
  1349. vand $I0,$I0,$mask26
  1350. vand $I1,$I1,$mask26
  1351. vand $I2,$I2,$mask26
  1352. vand $I3,$I3,$mask26
  1353. vmulouw $T0,$H4,$S1
  1354. vaddudm $ACC0,$ACC0,$T0
  1355. vmulouw $T0,$H0,$R1
  1356. vaddudm $ACC1,$ACC1,$T0
  1357. vmulouw $T0,$H1,$R1
  1358. vaddudm $ACC2,$ACC2,$T0
  1359. vmulouw $T0,$H2,$R1
  1360. vaddudm $ACC3,$ACC3,$T0
  1361. vmulouw $T0,$H3,$R1
  1362. vaddudm $ACC4,$ACC4,$T0
  1363. vsrd $T2,$T2,$_4
  1364. vsrd $_4,$T1,$_26
  1365. vsrd $T4,$T3,$_40
  1366. vsrd $T3,$T3,$_14
  1367. vmulouw $T0,$H3,$S2
  1368. vaddudm $ACC0,$ACC0,$T0
  1369. vmulouw $T0,$H4,$S2
  1370. vaddudm $ACC1,$ACC1,$T0
  1371. vmulouw $T0,$H0,$R2
  1372. vaddudm $ACC2,$ACC2,$T0
  1373. vmulouw $T0,$H1,$R2
  1374. vaddudm $ACC3,$ACC3,$T0
  1375. vmulouw $T0,$H2,$R2
  1376. vaddudm $ACC4,$ACC4,$T0
  1377. vand $T1,$T1,$mask26
  1378. vand $_4,$_4,$mask26
  1379. vand $T2,$T2,$mask26
  1380. vand $T3,$T3,$mask26
  1381. ################################################################
  1382. # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
  1383. # and P. Schwabe
  1384. vspltisb $T0,2
  1385. vsrd $H4,$ACC3,$_26
  1386. vsrd $H1,$ACC0,$_26
  1387. vand $H3,$ACC3,$mask26
  1388. vand $H0,$ACC0,$mask26
  1389. vaddudm $H4,$H4,$ACC4 # h3 -> h4
  1390. vaddudm $H1,$H1,$ACC1 # h0 -> h1
  1391. vmrgow $I4,$T4,$I4
  1392. vmrgow $I0,$T1,$I0
  1393. vmrgow $I1,$_4,$I1
  1394. vmrgow $I2,$T2,$I2
  1395. vmrgow $I3,$T3,$I3
  1396. vor $I4,$I4,$padbits
  1397. vsrd $ACC4,$H4,$_26
  1398. vsrd $ACC1,$H1,$_26
  1399. vand $H4,$H4,$mask26
  1400. vand $H1,$H1,$mask26
  1401. vaddudm $H0,$H0,$ACC4
  1402. vaddudm $H2,$ACC2,$ACC1 # h1 -> h2
  1403. vsld $ACC4,$ACC4,$T0 # <<2
  1404. vsrd $ACC2,$H2,$_26
  1405. vand $H2,$H2,$mask26
  1406. vaddudm $H0,$H0,$ACC4 # h4 -> h0
  1407. vaddudm $H3,$H3,$ACC2 # h2 -> h3
  1408. vsrd $ACC0,$H0,$_26
  1409. vsrd $ACC3,$H3,$_26
  1410. vand $H0,$H0,$mask26
  1411. vand $H3,$H3,$mask26
  1412. vaddudm $H1,$H1,$ACC0 # h0 -> h1
  1413. vaddudm $H4,$H4,$ACC3 # h3 -> h4
  1414. addi $inp,$inp,0x40
  1415. bdnz Loop_vsx
  1416. neg $len,$len
  1417. andi. $len,$len,0x30
  1418. sub $inp,$inp,$len
  1419. lvx_u $R0,$x30,$ctx # load all powers
  1420. lvx_u $R1,$x00,$ctx_
  1421. lvx_u $S1,$x10,$ctx_
  1422. lvx_u $R2,$x20,$ctx_
  1423. lvx_u $S2,$x30,$ctx_
  1424. Last_vsx:
  1425. vmuleuw $ACC0,$I0,$R0
  1426. vmuleuw $ACC1,$I1,$R0
  1427. vmuleuw $ACC2,$I2,$R0
  1428. vmuleuw $ACC3,$I3,$R0
  1429. vmuleuw $ACC4,$I4,$R0
  1430. vmuleuw $T0,$I4,$S1
  1431. vaddudm $ACC0,$ACC0,$T0
  1432. vmuleuw $T0,$I0,$R1
  1433. vaddudm $ACC1,$ACC1,$T0
  1434. vmuleuw $T0,$I1,$R1
  1435. vaddudm $ACC2,$ACC2,$T0
  1436. vmuleuw $T0,$I2,$R1
  1437. vaddudm $ACC3,$ACC3,$T0
  1438. lvx_u $S3,$x50,$ctx_
  1439. vmuleuw $T0,$I3,$R1
  1440. vaddudm $ACC4,$ACC4,$T0
  1441. lvx_u $R3,$x40,$ctx_
  1442. vaddudm $H2,$H2,$I2
  1443. vaddudm $H0,$H0,$I0
  1444. vaddudm $H3,$H3,$I3
  1445. vaddudm $H1,$H1,$I1
  1446. vaddudm $H4,$H4,$I4
  1447. vmuleuw $T0,$I3,$S2
  1448. vaddudm $ACC0,$ACC0,$T0
  1449. vmuleuw $T0,$I4,$S2
  1450. vaddudm $ACC1,$ACC1,$T0
  1451. vmuleuw $T0,$I0,$R2
  1452. vaddudm $ACC2,$ACC2,$T0
  1453. vmuleuw $T0,$I1,$R2
  1454. vaddudm $ACC3,$ACC3,$T0
  1455. lvx_u $S4,$x70,$ctx_
  1456. vmuleuw $T0,$I2,$R2
  1457. vaddudm $ACC4,$ACC4,$T0
  1458. lvx_u $R4,$x60,$ctx_
  1459. vmuleuw $T0,$I2,$S3
  1460. vaddudm $ACC0,$ACC0,$T0
  1461. vmuleuw $T0,$I3,$S3
  1462. vaddudm $ACC1,$ACC1,$T0
  1463. vmuleuw $T0,$I4,$S3
  1464. vaddudm $ACC2,$ACC2,$T0
  1465. vmuleuw $T0,$I0,$R3
  1466. vaddudm $ACC3,$ACC3,$T0
  1467. vmuleuw $T0,$I1,$R3
  1468. vaddudm $ACC4,$ACC4,$T0
  1469. vmuleuw $T0,$I1,$S4
  1470. vaddudm $ACC0,$ACC0,$T0
  1471. vmuleuw $T0,$I2,$S4
  1472. vaddudm $ACC1,$ACC1,$T0
  1473. vmuleuw $T0,$I3,$S4
  1474. vaddudm $ACC2,$ACC2,$T0
  1475. vmuleuw $T0,$I4,$S4
  1476. vaddudm $ACC3,$ACC3,$T0
  1477. vmuleuw $T0,$I0,$R4
  1478. vaddudm $ACC4,$ACC4,$T0
  1479. # (hash + inp[0:1]) * r^4
  1480. vmulouw $T0,$H0,$R0
  1481. vaddudm $ACC0,$ACC0,$T0
  1482. vmulouw $T0,$H1,$R0
  1483. vaddudm $ACC1,$ACC1,$T0
  1484. vmulouw $T0,$H2,$R0
  1485. vaddudm $ACC2,$ACC2,$T0
  1486. vmulouw $T0,$H3,$R0
  1487. vaddudm $ACC3,$ACC3,$T0
  1488. vmulouw $T0,$H4,$R0
  1489. vaddudm $ACC4,$ACC4,$T0
  1490. vmulouw $T0,$H2,$S3
  1491. vaddudm $ACC0,$ACC0,$T0
  1492. vmulouw $T0,$H3,$S3
  1493. vaddudm $ACC1,$ACC1,$T0
  1494. vmulouw $T0,$H4,$S3
  1495. vaddudm $ACC2,$ACC2,$T0
  1496. vmulouw $T0,$H0,$R3
  1497. vaddudm $ACC3,$ACC3,$T0
  1498. lvx_u $S1,$x10,$ctx_
  1499. vmulouw $T0,$H1,$R3
  1500. vaddudm $ACC4,$ACC4,$T0
  1501. lvx_u $R1,$x00,$ctx_
  1502. vmulouw $T0,$H1,$S4
  1503. vaddudm $ACC0,$ACC0,$T0
  1504. vmulouw $T0,$H2,$S4
  1505. vaddudm $ACC1,$ACC1,$T0
  1506. vmulouw $T0,$H3,$S4
  1507. vaddudm $ACC2,$ACC2,$T0
  1508. vmulouw $T0,$H4,$S4
  1509. vaddudm $ACC3,$ACC3,$T0
  1510. lvx_u $S2,$x30,$ctx_
  1511. vmulouw $T0,$H0,$R4
  1512. vaddudm $ACC4,$ACC4,$T0
  1513. lvx_u $R2,$x20,$ctx_
  1514. vmulouw $T0,$H4,$S1
  1515. vaddudm $ACC0,$ACC0,$T0
  1516. vmulouw $T0,$H0,$R1
  1517. vaddudm $ACC1,$ACC1,$T0
  1518. vmulouw $T0,$H1,$R1
  1519. vaddudm $ACC2,$ACC2,$T0
  1520. vmulouw $T0,$H2,$R1
  1521. vaddudm $ACC3,$ACC3,$T0
  1522. vmulouw $T0,$H3,$R1
  1523. vaddudm $ACC4,$ACC4,$T0
  1524. vmulouw $T0,$H3,$S2
  1525. vaddudm $ACC0,$ACC0,$T0
  1526. vmulouw $T0,$H4,$S2
  1527. vaddudm $ACC1,$ACC1,$T0
  1528. vmulouw $T0,$H0,$R2
  1529. vaddudm $ACC2,$ACC2,$T0
  1530. vmulouw $T0,$H1,$R2
  1531. vaddudm $ACC3,$ACC3,$T0
  1532. vmulouw $T0,$H2,$R2
  1533. vaddudm $ACC4,$ACC4,$T0
  1534. ################################################################
  1535. # horizontal addition
  1536. vpermdi $H0,$ACC0,$ACC0,0b10
  1537. vpermdi $H1,$ACC1,$ACC1,0b10
  1538. vpermdi $H2,$ACC2,$ACC2,0b10
  1539. vpermdi $H3,$ACC3,$ACC3,0b10
  1540. vpermdi $H4,$ACC4,$ACC4,0b10
  1541. vaddudm $ACC0,$ACC0,$H0
  1542. vaddudm $ACC1,$ACC1,$H1
  1543. vaddudm $ACC2,$ACC2,$H2
  1544. vaddudm $ACC3,$ACC3,$H3
  1545. vaddudm $ACC4,$ACC4,$H4
  1546. ################################################################
  1547. # lazy reduction
  1548. vspltisb $T0,2
  1549. vsrd $H4,$ACC3,$_26
  1550. vsrd $H1,$ACC0,$_26
  1551. vand $H3,$ACC3,$mask26
  1552. vand $H0,$ACC0,$mask26
  1553. vaddudm $H4,$H4,$ACC4 # h3 -> h4
  1554. vaddudm $H1,$H1,$ACC1 # h0 -> h1
  1555. vsrd $ACC4,$H4,$_26
  1556. vsrd $ACC1,$H1,$_26
  1557. vand $H4,$H4,$mask26
  1558. vand $H1,$H1,$mask26
  1559. vaddudm $H0,$H0,$ACC4
  1560. vaddudm $H2,$ACC2,$ACC1 # h1 -> h2
  1561. vsld $ACC4,$ACC4,$T0 # <<2
  1562. vsrd $ACC2,$H2,$_26
  1563. vand $H2,$H2,$mask26
  1564. vaddudm $H0,$H0,$ACC4 # h4 -> h0
  1565. vaddudm $H3,$H3,$ACC2 # h2 -> h3
  1566. vsrd $ACC0,$H0,$_26
  1567. vsrd $ACC3,$H3,$_26
  1568. vand $H0,$H0,$mask26
  1569. vand $H3,$H3,$mask26
  1570. vaddudm $H1,$H1,$ACC0 # h0 -> h1
  1571. vaddudm $H4,$H4,$ACC3 # h3 -> h4
  1572. beq Ldone_vsx
  1573. add r6,$const,$len
  1574. be?lvx_u $_4,$x00,$const # byte swap mask
  1575. lvx_u $T1,$x00,$inp # load last partial input block
  1576. lvx_u $T2,$x10,$inp
  1577. lvx_u $T3,$x20,$inp
  1578. lvx_u $T4,$x30,$inp
  1579. be?vperm $T1,$T1,$T1,$_4
  1580. be?vperm $T2,$T2,$T2,$_4
  1581. be?vperm $T3,$T3,$T3,$_4
  1582. be?vperm $T4,$T4,$T4,$_4
  1583. vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26
  1584. vspltisb $_4,4
  1585. vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011
  1586. vpermdi $I3,$T1,$T2,0b11
  1587. vsrd $I1,$I0,$_26
  1588. vsrd $I2,$I2,$_4
  1589. vsrd $I4,$I3,$_40
  1590. vsrd $I3,$I3,$_14
  1591. vand $I0,$I0,$mask26
  1592. vand $I1,$I1,$mask26
  1593. vand $I2,$I2,$mask26
  1594. vand $I3,$I3,$mask26
  1595. vpermdi $T0,$T3,$T4,0b00
  1596. vperm $T1,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011
  1597. vpermdi $T2,$T3,$T4,0b11
  1598. lvx_u $ACC0,$x00,r6
  1599. lvx_u $ACC1,$x30,r6
  1600. vsrd $T3,$T0,$_26
  1601. vsrd $T1,$T1,$_4
  1602. vsrd $T4,$T2,$_40
  1603. vsrd $T2,$T2,$_14
  1604. vand $T0,$T0,$mask26
  1605. vand $T3,$T3,$mask26
  1606. vand $T1,$T1,$mask26
  1607. vand $T2,$T2,$mask26
  1608. # inp[2]:inp[0]:inp[3]:inp[1]
  1609. vmrgow $I4,$T4,$I4
  1610. vmrgow $I0,$T0,$I0
  1611. vmrgow $I1,$T3,$I1
  1612. vmrgow $I2,$T1,$I2
  1613. vmrgow $I3,$T2,$I3
  1614. vor $I4,$I4,$padbits
  1615. vperm $H0,$H0,$H0,$ACC0 # move hash to right lane
  1616. vand $I0,$I0, $ACC1 # mask redundant input lane[s]
  1617. vperm $H1,$H1,$H1,$ACC0
  1618. vand $I1,$I1, $ACC1
  1619. vperm $H2,$H2,$H2,$ACC0
  1620. vand $I2,$I2, $ACC1
  1621. vperm $H3,$H3,$H3,$ACC0
  1622. vand $I3,$I3, $ACC1
  1623. vperm $H4,$H4,$H4,$ACC0
  1624. vand $I4,$I4, $ACC1
  1625. vaddudm $I0,$I0,$H0 # accumulate hash
  1626. vxor $H0,$H0,$H0 # wipe hash value
  1627. vaddudm $I1,$I1,$H1
  1628. vxor $H1,$H1,$H1
  1629. vaddudm $I2,$I2,$H2
  1630. vxor $H2,$H2,$H2
  1631. vaddudm $I3,$I3,$H3
  1632. vxor $H3,$H3,$H3
  1633. vaddudm $I4,$I4,$H4
  1634. vxor $H4,$H4,$H4
  1635. xor. $len,$len,$len
  1636. b Last_vsx
  1637. .align 4
  1638. Ldone_vsx:
  1639. $POP r0,`$VSXFRAME+$LRSAVE`($sp)
  1640. li $x10,4
  1641. li $x20,8
  1642. li $x30,12
  1643. li $x40,16
  1644. stvwx_u $H0,$x00,$ctx # store hash
  1645. stvwx_u $H1,$x10,$ctx
  1646. stvwx_u $H2,$x20,$ctx
  1647. stvwx_u $H3,$x30,$ctx
  1648. stvwx_u $H4,$x40,$ctx
  1649. lwz r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# pull vrsave
  1650. mtlr r0
  1651. li r10,`15+$LOCALS+128`
  1652. li r11,`31+$LOCALS+128`
  1653. mtspr 256,r12 # restore vrsave
  1654. lvx v20,r10,$sp
  1655. addi r10,r10,32
  1656. lvx v21,r11,$sp
  1657. addi r11,r11,32
  1658. lvx v22,r10,$sp
  1659. addi r10,r10,32
  1660. lvx v23,r11,$sp
  1661. addi r11,r11,32
  1662. lvx v24,r10,$sp
  1663. addi r10,r10,32
  1664. lvx v25,r11,$sp
  1665. addi r11,r11,32
  1666. lvx v26,r10,$sp
  1667. addi r10,r10,32
  1668. lvx v27,r11,$sp
  1669. addi r11,r11,32
  1670. lvx v28,r10,$sp
  1671. addi r10,r10,32
  1672. lvx v29,r11,$sp
  1673. addi r11,r11,32
  1674. lvx v30,r10,$sp
  1675. lvx v31,r11,$sp
  1676. $POP r27,`$VSXFRAME-$SIZE_T*5`($sp)
  1677. $POP r28,`$VSXFRAME-$SIZE_T*4`($sp)
  1678. $POP r29,`$VSXFRAME-$SIZE_T*3`($sp)
  1679. $POP r30,`$VSXFRAME-$SIZE_T*2`($sp)
  1680. $POP r31,`$VSXFRAME-$SIZE_T*1`($sp)
  1681. addi $sp,$sp,$VSXFRAME
  1682. blr
  1683. .long 0
  1684. .byte 0,12,0x04,1,0x80,5,4,0
  1685. .long 0
  1686. .size __poly1305_blocks_vsx,.-__poly1305_blocks_vsx
  # ------------------------------------------------------------------
  # LPICmeup: position-independent lookup of the constant table below.
  # On return the const register holds the address of the first table
  # row. r0 carries the caller's link register across the internal
  # branch-and-link, so r0 is clobbered and LR is preserved.
  # NOTE(review): .align 6 is read here as 2**6 = 64 bytes, which is
  # what the 64-8 and 64-9*4 arithmetic below relies on; confirm for
  # every assembler flavour this file is translated for.
  # ------------------------------------------------------------------
  1687. .align 6
  1688. LPICmeup:
  1689. mflr r0 # stash the caller's return address
  1690. bcl 20,31,\$+4 # branch-and-link to the next instruction: LR = its address
  1691. mflr $const # vvvvvv "distance" between . and 1st data entry
  1692. addi $const,$const,`64-8` # LR is label+8 and the table sits at label+64
  1693. mtlr r0 # put the caller's return address back
  1694. blr
  1695. .long 0
  1696. .byte 0,12,0x14,0,0,0,0,0
  # Nine 4-byte words precede this point (six instructions, one .long,
  # eight .byte), so the padding below makes the table start exactly
  # 64 bytes after the label.
  1697. .space `64-9*4`
  # Constant table, 16 bytes per row. The rows at offsets 0x00-0x40 are
  # loaded right after the call to LPICmeup in the VSX block routine;
  # the row at 0x50 is fetched only on big-endian builds.
  1698. .quad 0x0000000003ffffff,0x0000000003ffffff # mask26
  1699. .quad 0x000000000000001a,0x000000000000001a # _26
  1700. .quad 0x0000000000000028,0x0000000000000028 # _40
  1701. .quad 0x000000000e0f0001,0x000000001e1f1011 # I2perm
  1702. .quad 0x0100000001000000,0x0100000001000000 # padbits
  1703. .quad 0x0706050403020100,0x0f0e0d0c0b0a0908 # byte swap for big-endian
  # Tail handling: the caller advances the table pointer by 0x50 and
  # adds a leftover-length code of 0x10, 0x20 or 0x30. From that address
  # it loads one of the next three rows as a vperm selector (moves the
  # hash to the right lane) and, 0x30 further on, one of the last three
  # rows as a vand mask (clears the redundant input lanes).
  # (Assumes the x00 index used by those loads is zero; confirm.)
  1704. .quad 0x0000000000000000,0x0000000004050607 # magic tail masks
  1705. .quad 0x0405060700000000,0x0000000000000000
  1706. .quad 0x0000000000000000,0x0405060700000000
  1707. .quad 0xffffffff00000000,0xffffffffffffffff
  1708. .quad 0xffffffff00000000,0xffffffff00000000
  1709. .quad 0x0000000000000000,0xffffffff00000000
  1710. ___
  1711. }}}
  # Append the module identification string to the generated assembly.
  $code .= qq{.asciz "Poly1305 for PPC, CRYPTOGAMS by \@dot-asm"\n};

  # A flavour name ending in "le" selects little-endian output; any
  # other flavour is treated as big-endian.
  my $little_endian = ($flavour =~ /le$/);

  # Post-process the accumulated code one line at a time and print it.
  for my $line (split("\n", $code)) {
      # Replace every back-quoted expression with the value it evaluates to.
      $line =~ s/\`([^\`]*)\`/eval($1)/ge;

      # Instructions tagged with "be?" or "le?" are endian-specific:
      # the tag matching the target endianness is removed so the
      # instruction is emitted; failing that, the other tag is rewritten
      # into a "#..#" comment marker. At most one of the two
      # substitutions is applied to a given line.
      if ($little_endian) {
          $line =~ s/le\?// or $line =~ s/be\?/#be#/;
      }
      else {
          $line =~ s/be\?// or $line =~ s/le\?/#le#/;
      }

      print $line, "\n";
  }

  # Buffered write errors only surface at close, so treat a failed
  # close as fatal.
  close STDOUT or die "error closing STDOUT: $!";