#! /usr/bin/env perl
# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for PowerPC.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# and improvement coefficients relative to gcc-generated code.
#
#                       -m32            -m64
#
# Freescale e300        14.8/+80%       -
# PPC74x0               7.60/+60%       -
# PPC970                7.00/+114%      3.51/+205%
# POWER7                3.75/+260%      1.93/+100%
# POWER8                -               2.03/+200%
# POWER9                -               2.00/+150%
#
# Do we need a floating-point implementation for PPC? Results presented
# in poly1305_ieee754.c are tricky to compare to, because they are for
# compiler-generated code. On the other hand it's known that floating-
# point performance can be dominated by FPU latency, which means that
# there is a limit even for ideally optimized (and even vectorized) code.
# And this limit is estimated to be higher than the above -m64 results.
# Or, in other words, a floating-point implementation is meaningful to
# consider only in a 32-bit application context. We probably have to
# recognize that 32-bit builds are getting less popular on high-end
# systems and therefore tend to target embedded ones, which might not
# even have an FPU...
#
# On a side note, Power ISA 2.07 enables a vector base 2^26
# implementation, and POWER8 might have the capacity to break the
# 1.0 cycle per byte barrier...
  # Usage: poly1305-ppc.pl <flavour> [<output>]
#
# <flavour> must contain "64" or "32"; it selects the word size and the
# matching load/store/compare mnemonics used by the generated code.  A
# flavour ending in "le" (e.g. linux64le) selects the little-endian code
# paths.  The optional second argument is appended verbatim to the
# ppc-xlate.pl command line.
$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;			# bytes per register save slot
	$LRSAVE	=2*$SIZE_T;		# LR slot, relative to the caller's frame
	$UCMP	="cmpld";		# unsigned compare
	$STU	="stdu";		# store with update (frame allocation)
	$POP	="ld";			# register restore
	$PUSH	="std";			# register save
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$UCMP	="cmplw";
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else {
	die "nonsense $flavour";
}

# Define endianness based on flavour
# i.e.: linux64le
# Non-zero (equal to $SIZE_T) for little-endian flavours, 0 otherwise.
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

# Directory part of this script's own path, including the trailing
# separator; empty when the script was invoked without a directory part.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Look for the translator next to this script first, then in the
# ../../perlasm directory relative to it.
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through ppc-xlate.pl.
# The low-precedence "or" matters: with "||" the die would attach to the
# (always true) command string and a failed open would go unnoticed.
my $output = shift;
$output = "" unless defined $output;
open(STDOUT, "| $^X $xlate $flavour $output")
    or die "can't call $xlate: $!";

$FRAME=24*$SIZE_T;			# stack frame size of the generated functions

$sp="r1";				# stack pointer register

# Argument registers r3..r6 as used by the generated functions.
my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
# poly1305_emit reuses r4/r5 for its second and third arguments.
my ($mac,$nonce)=($inp,$len);
my $mask = "r0";

# Start of the generated assembly text; later sections append to $code.
$code=<<___;
.machine	"any"
.text
___
  # Append the three entry points (.poly1305_init_int, .poly1305_blocks and
# .poly1305_emit) to $code, using the base 2^64 code for 64-bit flavours
# and the base 2^32 code otherwise.
#
# Context layout as accessed by the generated code (byte offsets from $ctx):
#   64-bit:  0, 8, 16      hash value h0..h2;   32, 40          masked key
#   32-bit:  0, 4, .., 16  hash value h0..h4;   32, 36, 40, 44  masked key
#
# Arguments are taken from r3..r6 ($ctx, $inp, $len, $padbit);
# .poly1305_emit uses r4/r5 as $mac/$nonce.  Expressions in backticks are
# evaluated once the whole text has been assembled.
if ($flavour =~ /64/) {
###############################################################################
# base 2^64 implementation

my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(7..12,27..31));

# .poly1305_init_int: zero the hash value and, unless the key pointer in
# $inp is NULL, load 16 bytes of key material, mask them and store the
# result at 32($ctx)/40($ctx).  Always returns 0 in r3.
$code.=<<___;
.globl	.poly1305_init_int
.align	4
.poly1305_init_int:
	xor	r0,r0,r0
	std	r0,0($ctx)		# zero hash value
	std	r0,8($ctx)
	std	r0,16($ctx)

	$UCMP	$inp,r0
	beq-	Lno_key
___
$code.=<<___ if ($LITTLE_ENDIAN);
	ld	$d0,0($inp)		# load key material
	ld	$d1,8($inp)
___
# Big-endian: byte-reversed 32-bit loads, then merge the halves.
$code.=<<___ if (!$LITTLE_ENDIAN);
	li	$h0,4
	lwbrx	$d0,0,$inp		# load key material
	li	$d1,8
	lwbrx	$h0,$h0,$inp
	li	$h1,12
	lwbrx	$d1,$d1,$inp
	lwbrx	$h1,$h1,$inp
	insrdi	$d0,$h0,32,0
	insrdi	$d1,$h1,32,0
___
# Remainder of .poly1305_init_int, followed by the prologue of
# .poly1305_blocks.  $len>>4 is the number of 16-byte blocks; when it is
# zero the function returns without touching the stack.
$code.=<<___;
	lis	$h1,0xfff		# 0x0fff0000
	ori	$h1,$h1,0xfffc		# 0x0ffffffc
	insrdi	$h1,$h1,32,0		# 0x0ffffffc0ffffffc
	ori	$h0,$h1,3		# 0x0ffffffc0fffffff

	and	$d0,$d0,$h0
	and	$d1,$d1,$h1

	std	$d0,32($ctx)		# store key
	std	$d1,40($ctx)

Lno_key:
	xor	r3,r3,r3
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
.size	.poly1305_init_int,.-.poly1305_init_int

.globl	.poly1305_blocks
.align	4
.poly1305_blocks:
	srdi.	$len,$len,4
	beq-	Labort

	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	ld	$r0,32($ctx)		# load key
	ld	$r1,40($ctx)

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	srdi	$s1,$r1,2
	mtctr	$len
	add	$s1,$s1,$r1		# s1 = r1 + r1>>2
	li	$mask,3
	b	Loop

.align	4
Loop:
___
$code.=<<___ if ($LITTLE_ENDIAN);
	ld	$t0,0($inp)		# load input
	ld	$t1,8($inp)
___
$code.=<<___ if (!$LITTLE_ENDIAN);
	li	$d0,4
	lwbrx	$t0,0,$inp		# load input
	li	$t1,8
	lwbrx	$d0,$d0,$inp
	li	$d1,12
	lwbrx	$t1,$t1,$inp
	lwbrx	$d1,$d1,$inp
	insrdi	$t0,$d0,32,0
	insrdi	$t1,$d1,32,0
___
# Loop body and epilogue of .poly1305_blocks, then the first half of
# .poly1305_emit.  The instruction order is significant: the multiplies are
# interleaved with addc/adde/addze carry chains.
$code.=<<___;
	addi	$inp,$inp,16

	addc	$h0,$h0,$t0		# accumulate input
	adde	$h1,$h1,$t1

	mulld	$d0,$h0,$r0		# h0*r0
	mulhdu	$d1,$h0,$r0
	adde	$h2,$h2,$padbit

	mulld	$t0,$h1,$s1		# h1*5*r1
	mulhdu	$t1,$h1,$s1
	addc	$d0,$d0,$t0
	adde	$d1,$d1,$t1

	mulld	$t0,$h0,$r1		# h0*r1
	mulhdu	$d2,$h0,$r1
	addc	$d1,$d1,$t0
	addze	$d2,$d2

	mulld	$t0,$h1,$r0		# h1*r0
	mulhdu	$t1,$h1,$r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	mulld	$t0,$h2,$s1		# h2*5*r1
	mulld	$t1,$h2,$r0		# h2*r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	andc	$t0,$d2,$mask		# final reduction step
	and	$h2,$d2,$mask
	srdi	$t1,$t0,2
	add	$t0,$t0,$t1
	addc	$h0,$d0,$t0
	addze	$h1,$d1
	addze	$h2,$h2

	bdnz	Loop

	std	$h0,0($ctx)		# store hash value
	std	$h1,8($ctx)
	std	$h2,16($ctx)

	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
Labort:
	blr
	.long	0
	.byte	0,12,4,1,0x80,5,4,0
.size	.poly1305_blocks,.-.poly1305_blocks

.globl	.poly1305_emit
.align	4
.poly1305_emit:
	ld	$h0,0($ctx)		# load hash
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)
	ld	$padbit,0($nonce)	# load nonce
	ld	$nonce,8($nonce)

	addic	$d0,$h0,5		# compare to modulus
	addze	$d1,$h1
	addze	$d2,$h2

	srdi	$mask,$d2,2		# did it carry/borrow?
	neg	$mask,$mask

	andc	$h0,$h0,$mask
	and	$d0,$d0,$mask
	andc	$h1,$h1,$mask
	and	$d1,$d1,$mask
	or	$h0,$h0,$d0
	or	$h1,$h1,$d1
___
$code.=<<___ if (!$LITTLE_ENDIAN);
	rotldi	$padbit,$padbit,32	# flip nonce words
	rotldi	$nonce,$nonce,32
___
$code.=<<___;
	addc	$h0,$h0,$padbit		# accumulate nonce
	adde	$h1,$h1,$nonce
___
$code.=<<___ if ($LITTLE_ENDIAN);
	std	$h0,0($mac)		# write result
	std	$h1,8($mac)
___
# Big-endian: write the 16 result bytes with byte-reversed 32-bit stores.
$code.=<<___ if (!$LITTLE_ENDIAN);
	extrdi	r0,$h0,32,0
	li	$d0,4
	stwbrx	$h0,0,$mac		# write result
	extrdi	$h0,$h1,32,0
	li	$d1,8
	stwbrx	r0,$d0,$mac
	li	$d2,12
	stwbrx	$h1,$d1,$mac
	stwbrx	$h0,$d2,$mac
___
$code.=<<___;
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
.size	.poly1305_emit,.-.poly1305_emit
___
} else {
###############################################################################
# base 2^32 implementation

my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $s1,$s2,$s3,
    $t0,$t1,$t2,$t3, $D0,$D1,$D2,$D3, $d0,$d1,$d2,$d3
   ) = map("r$_",(7..12,14..31));

# .poly1305_init_int: zero the five hash words and, unless the key pointer
# in $inp is NULL, load four key words, mask them and store the result at
# 32..44($ctx).  Always returns 0 in r3.
$code.=<<___;
.globl	.poly1305_init_int
.align	4
.poly1305_init_int:
	xor	r0,r0,r0
	stw	r0,0($ctx)		# zero hash value
	stw	r0,4($ctx)
	stw	r0,8($ctx)
	stw	r0,12($ctx)
	stw	r0,16($ctx)

	$UCMP	$inp,r0
	beq-	Lno_key
___
# "lwz" is the PowerPC load-word mnemonic (as used by every other 32-bit
# load in this file); the former "lw" is not a PowerPC instruction.
$code.=<<___ if ($LITTLE_ENDIAN);
	lwz	$h0,0($inp)		# load key material
	lwz	$h1,4($inp)
	lwz	$h2,8($inp)
	lwz	$h3,12($inp)
___
$code.=<<___ if (!$LITTLE_ENDIAN);
	li	$h1,4
	lwbrx	$h0,0,$inp		# load key material
	li	$h2,8
	lwbrx	$h1,$h1,$inp
	li	$h3,12
	lwbrx	$h2,$h2,$inp
	lwbrx	$h3,$h3,$inp
___
# Remainder of .poly1305_init_int, followed by the prologue of
# .poly1305_blocks.  $len>>4 is the number of 16-byte blocks; when it is
# zero the function returns without touching the stack.
$code.=<<___;
	lis	$mask,0xf000		# 0xf0000000
	li	$r0,-4
	andc	$r0,$r0,$mask		# 0x0ffffffc

	andc	$h0,$h0,$mask
	and	$h1,$h1,$r0
	and	$h2,$h2,$r0
	and	$h3,$h3,$r0

	stw	$h0,32($ctx)		# store key
	stw	$h1,36($ctx)
	stw	$h2,40($ctx)
	stw	$h3,44($ctx)

Lno_key:
	xor	r3,r3,r3
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
.size	.poly1305_init_int,.-.poly1305_init_int

.globl	.poly1305_blocks
.align	4
.poly1305_blocks:
	srwi.	$len,$len,4
	beq-	Labort

	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	lwz	$r0,32($ctx)		# load key
	lwz	$r1,36($ctx)
	lwz	$r2,40($ctx)
	lwz	$r3,44($ctx)

	lwz	$h0,0($ctx)		# load hash value
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)

	srwi	$s1,$r1,2
	srwi	$s2,$r2,2
	srwi	$s3,$r3,2
	add	$s1,$s1,$r1		# si = ri + ri>>2
	add	$s2,$s2,$r2
	add	$s3,$s3,$r3
	mtctr	$len
	li	$mask,3
	b	Loop

.align	4
Loop:
___
$code.=<<___ if ($LITTLE_ENDIAN);
	lwz	$d0,0($inp)		# load input
	lwz	$d1,4($inp)
	lwz	$d2,8($inp)
	lwz	$d3,12($inp)
___
$code.=<<___ if (!$LITTLE_ENDIAN);
	li	$d1,4
	lwbrx	$d0,0,$inp		# load input
	li	$d2,8
	lwbrx	$d1,$d1,$inp
	li	$d3,12
	lwbrx	$d2,$d2,$inp
	lwbrx	$d3,$d3,$inp
___
# Loop body and epilogue of .poly1305_blocks, then the first half of
# .poly1305_emit.  The instruction order is significant: the multiplies are
# interleaved with addc/adde/addze carry chains.
$code.=<<___;
	addi	$inp,$inp,16

	addc	$h0,$h0,$d0		# accumulate input
	adde	$h1,$h1,$d1
	adde	$h2,$h2,$d2

	mullw	$d0,$h0,$r0		# h0*r0
	mulhwu	$D0,$h0,$r0

	mullw	$d1,$h0,$r1		# h0*r1
	mulhwu	$D1,$h0,$r1

	mullw	$d2,$h0,$r2		# h0*r2
	mulhwu	$D2,$h0,$r2

	adde	$h3,$h3,$d3
	adde	$h4,$h4,$padbit

	mullw	$d3,$h0,$r3		# h0*r3
	mulhwu	$D3,$h0,$r3

	mullw	$t0,$h1,$s3		# h1*s3
	mulhwu	$t1,$h1,$s3

	mullw	$t2,$h1,$r0		# h1*r0
	mulhwu	$t3,$h1,$r0
	addc	$d0,$d0,$t0
	adde	$D0,$D0,$t1

	mullw	$t0,$h1,$r1		# h1*r1
	mulhwu	$t1,$h1,$r1
	addc	$d1,$d1,$t2
	adde	$D1,$D1,$t3

	mullw	$t2,$h1,$r2		# h1*r2
	mulhwu	$t3,$h1,$r2
	addc	$d2,$d2,$t0
	adde	$D2,$D2,$t1

	mullw	$t0,$h2,$s2		# h2*s2
	mulhwu	$t1,$h2,$s2
	addc	$d3,$d3,$t2
	adde	$D3,$D3,$t3

	mullw	$t2,$h2,$s3		# h2*s3
	mulhwu	$t3,$h2,$s3
	addc	$d0,$d0,$t0
	adde	$D0,$D0,$t1

	mullw	$t0,$h2,$r0		# h2*r0
	mulhwu	$t1,$h2,$r0
	addc	$d1,$d1,$t2
	adde	$D1,$D1,$t3

	mullw	$t2,$h2,$r1		# h2*r1
	mulhwu	$t3,$h2,$r1
	addc	$d2,$d2,$t0
	adde	$D2,$D2,$t1

	mullw	$t0,$h3,$s1		# h3*s1
	mulhwu	$t1,$h3,$s1
	addc	$d3,$d3,$t2
	adde	$D3,$D3,$t3

	mullw	$t2,$h3,$s2		# h3*s2
	mulhwu	$t3,$h3,$s2
	addc	$d0,$d0,$t0
	adde	$D0,$D0,$t1

	mullw	$t0,$h3,$s3		# h3*s3
	mulhwu	$t1,$h3,$s3
	addc	$d1,$d1,$t2
	adde	$D1,$D1,$t3

	mullw	$t2,$h3,$r0		# h3*r0
	mulhwu	$t3,$h3,$r0
	addc	$d2,$d2,$t0
	adde	$D2,$D2,$t1

	mullw	$t0,$h4,$s1		# h4*s1
	addc	$d3,$d3,$t2
	adde	$D3,$D3,$t3
	addc	$d1,$d1,$t0

	mullw	$t1,$h4,$s2		# h4*s2
	addze	$D1,$D1
	addc	$d2,$d2,$t1
	addze	$D2,$D2

	mullw	$t2,$h4,$s3		# h4*s3
	addc	$d3,$d3,$t2
	addze	$D3,$D3

	mullw	$h4,$h4,$r0		# h4*r0

	addc	$h1,$d1,$D0
	adde	$h2,$d2,$D1
	adde	$h3,$d3,$D2
	adde	$h4,$h4,$D3

	andc	$D0,$h4,$mask		# final reduction step
	and	$h4,$h4,$mask
	srwi	$D1,$D0,2
	add	$D0,$D0,$D1
	addc	$h0,$d0,$D0
	addze	$h1,$h1
	addze	$h2,$h2
	addze	$h3,$h3
	addze	$h4,$h4

	bdnz	Loop

	stw	$h0,0($ctx)		# store hash value
	stw	$h1,4($ctx)
	stw	$h2,8($ctx)
	stw	$h3,12($ctx)
	stw	$h4,16($ctx)

	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
Labort:
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,4,0
.size	.poly1305_blocks,.-.poly1305_blocks

.globl	.poly1305_emit
.align	4
.poly1305_emit:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	lwz	$h0,0($ctx)		# load hash
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)

	addic	$d0,$h0,5		# compare to modulus
	addze	$d1,$h1
	addze	$d2,$h2
	addze	$d3,$h3
	addze	$mask,$h4

	srwi	$mask,$mask,2		# did it carry/borrow?
	neg	$mask,$mask

	andc	$h0,$h0,$mask
	and	$d0,$d0,$mask
	andc	$h1,$h1,$mask
	and	$d1,$d1,$mask
	or	$h0,$h0,$d0
	lwz	$d0,0($nonce)		# load nonce
	andc	$h2,$h2,$mask
	and	$d2,$d2,$mask
	or	$h1,$h1,$d1
	lwz	$d1,4($nonce)
	andc	$h3,$h3,$mask
	and	$d3,$d3,$mask
	or	$h2,$h2,$d2
	lwz	$d2,8($nonce)
	or	$h3,$h3,$d3
	lwz	$d3,12($nonce)

	addc	$h0,$h0,$d0		# accumulate nonce
	adde	$h1,$h1,$d1
	adde	$h2,$h2,$d2
	adde	$h3,$h3,$d3
___
$code.=<<___ if ($LITTLE_ENDIAN);
	stw	$h0,0($mac)		# write result
	stw	$h1,4($mac)
	stw	$h2,8($mac)
	stw	$h3,12($mac)
___
# Big-endian: write the 16 result bytes with byte-reversed 32-bit stores.
$code.=<<___ if (!$LITTLE_ENDIAN);
	li	$d1,4
	stwbrx	$h0,0,$mac		# write result
	li	$d2,8
	stwbrx	$h1,$d1,$mac
	li	$d3,12
	stwbrx	$h2,$d2,$mac
	stwbrx	$h3,$d3,$mac
___
$code.=<<___;
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,4,3,0
.size	.poly1305_emit,.-.poly1305_emit
___
}
  # Append the identification string that ends the generated module.
$code.=<<___;
.asciz	"Poly1305 for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___

# Replace every `expression` embedded in the generated text (the frame
# offsets) with the value Perl computes for it.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;

# STDOUT is a pipe to ppc-xlate.pl, so buffered write errors and a failing
# translator only show up when the pipe is closed.  Checking the result
# keeps a truncated or missing output file from passing as success.
close STDOUT or die "error closing STDOUT: $!";