#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for PowerPC FPU.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# and improvement coefficients relative to gcc-generated code.
#
# Freescale e300	9.78/+30%
# PPC74x0		6.92/+50%
# PPC970		6.03/+80%
# POWER7		3.50/+30%
# POWER8		3.75/+10%

  28. # $output is the last argument if it looks like a file (it has an extension)
  29. # $flavour is the first argument if it doesn't look like a file
  30. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  31. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  32. if ($flavour =~ /64/) {
  33. $SIZE_T =8;
  34. $LRSAVE =2*$SIZE_T;
  35. $UCMP ="cmpld";
  36. $STU ="stdu";
  37. $POP ="ld";
  38. $PUSH ="std";
  39. } elsif ($flavour =~ /32/) {
  40. $SIZE_T =4;
  41. $LRSAVE =$SIZE_T;
  42. $UCMP ="cmplw";
  43. $STU ="stwu";
  44. $POP ="lwz";
  45. $PUSH ="stw";
  46. } else { die "nonsense $flavour"; }
  47. $LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;
  48. $LWXLE = $LITTLE_ENDIAN ? "lwzx" : "lwbrx";
  49. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  50. ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  51. ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  52. die "can't locate ppc-xlate.pl";
  53. open STDOUT,"| $^X $xlate $flavour \"$output\""
  54. or die "can't call $xlate: $!";
  55. $LOCALS=6*$SIZE_T;
  56. $FRAME=$LOCALS+6*8+18*8;
  57. my $sp="r1";
  58. my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
  59. my ($in0,$in1,$in2,$in3,$i1,$i2,$i3) = map("r$_",(7..12,6));
  60. my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
  61. $two0,$two32,$two64,$two96,$two130,$five_two130,
  62. $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
  63. $s2lo,$s2hi,$s3lo,$s3hi,
  64. $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("f$_",(0..31));
  65. # borrowings
  66. my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
  67. my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
  68. my ($y0,$y1,$y2,$y3) = ($c3lo,$c3hi,$c1lo,$c1hi);
  69. $code.=<<___;
  70. .machine "any"
  71. .text
  72. .globl .poly1305_init_fpu
  73. .align 6
  74. .poly1305_init_fpu:
  75. $STU $sp,-$LOCALS($sp) # minimal frame
  76. mflr $padbit
  77. $PUSH $padbit,`$LOCALS+$LRSAVE`($sp)
  78. bl LPICmeup
  79. xor r0,r0,r0
  80. mtlr $padbit # restore lr
  81. lfd $two0,8*0($len) # load constants
  82. lfd $two32,8*1($len)
  83. lfd $two64,8*2($len)
  84. lfd $two96,8*3($len)
  85. lfd $two130,8*4($len)
  86. lfd $five_two130,8*5($len)
  87. stfd $two0,8*0($ctx) # initial hash value, biased 0
  88. stfd $two32,8*1($ctx)
  89. stfd $two64,8*2($ctx)
  90. stfd $two96,8*3($ctx)
  91. $UCMP $inp,r0
  92. beq- Lno_key
  93. lfd $h3lo,8*13($len) # new fpscr
  94. mffs $h3hi # old fpscr
  95. stfd $two0,8*4($ctx) # key "template"
  96. stfd $two32,8*5($ctx)
  97. stfd $two64,8*6($ctx)
  98. stfd $two96,8*7($ctx)
  99. li $in1,4
  100. li $in2,8
  101. li $in3,12
  102. $LWXLE $in0,0,$inp # load key
  103. $LWXLE $in1,$in1,$inp
  104. $LWXLE $in2,$in2,$inp
  105. $LWXLE $in3,$in3,$inp
  106. lis $i1,0xf000 # 0xf0000000
  107. ori $i2,$i1,3 # 0xf0000003
  108. andc $in0,$in0,$i1 # &=0x0fffffff
  109. andc $in1,$in1,$i2 # &=0x0ffffffc
  110. andc $in2,$in2,$i2
  111. andc $in3,$in3,$i2
  112. stw $in0,`8*4+(4^$LITTLE_ENDIAN)`($ctx) # fill "template"
  113. stw $in1,`8*5+(4^$LITTLE_ENDIAN)`($ctx)
  114. stw $in2,`8*6+(4^$LITTLE_ENDIAN)`($ctx)
  115. stw $in3,`8*7+(4^$LITTLE_ENDIAN)`($ctx)
  116. mtfsf 255,$h3lo # fpscr
  117. stfd $two0,8*18($ctx) # copy constants to context
  118. stfd $two32,8*19($ctx)
  119. stfd $two64,8*20($ctx)
  120. stfd $two96,8*21($ctx)
  121. stfd $two130,8*22($ctx)
  122. stfd $five_two130,8*23($ctx)
  123. lfd $h0lo,8*4($ctx) # load [biased] key
  124. lfd $h1lo,8*5($ctx)
  125. lfd $h2lo,8*6($ctx)
  126. lfd $h3lo,8*7($ctx)
  127. fsub $h0lo,$h0lo,$two0 # r0
  128. fsub $h1lo,$h1lo,$two32 # r1
  129. fsub $h2lo,$h2lo,$two64 # r2
  130. fsub $h3lo,$h3lo,$two96 # r3
  131. lfd $two0,8*6($len) # more constants
  132. lfd $two32,8*7($len)
  133. lfd $two64,8*8($len)
  134. lfd $two96,8*9($len)
  135. fmul $h1hi,$h1lo,$five_two130 # s1
  136. fmul $h2hi,$h2lo,$five_two130 # s2
  137. stfd $h3hi,8*15($ctx) # borrow slot for original fpscr
  138. fmul $h3hi,$h3lo,$five_two130 # s3
  139. fadd $h0hi,$h0lo,$two0
  140. stfd $h1hi,8*12($ctx) # put aside for now
  141. fadd $h1hi,$h1lo,$two32
  142. stfd $h2hi,8*13($ctx)
  143. fadd $h2hi,$h2lo,$two64
  144. stfd $h3hi,8*14($ctx)
  145. fadd $h3hi,$h3lo,$two96
  146. fsub $h0hi,$h0hi,$two0
  147. fsub $h1hi,$h1hi,$two32
  148. fsub $h2hi,$h2hi,$two64
  149. fsub $h3hi,$h3hi,$two96
  150. lfd $two0,8*10($len) # more constants
  151. lfd $two32,8*11($len)
  152. lfd $two64,8*12($len)
  153. fsub $h0lo,$h0lo,$h0hi
  154. fsub $h1lo,$h1lo,$h1hi
  155. fsub $h2lo,$h2lo,$h2hi
  156. fsub $h3lo,$h3lo,$h3hi
  157. stfd $h0hi,8*5($ctx) # r0hi
  158. stfd $h1hi,8*7($ctx) # r1hi
  159. stfd $h2hi,8*9($ctx) # r2hi
  160. stfd $h3hi,8*11($ctx) # r3hi
  161. stfd $h0lo,8*4($ctx) # r0lo
  162. stfd $h1lo,8*6($ctx) # r1lo
  163. stfd $h2lo,8*8($ctx) # r2lo
  164. stfd $h3lo,8*10($ctx) # r3lo
  165. lfd $h1lo,8*12($ctx) # s1
  166. lfd $h2lo,8*13($ctx) # s2
  167. lfd $h3lo,8*14($ctx) # s3
  168. lfd $h0lo,8*15($ctx) # pull original fpscr
  169. fadd $h1hi,$h1lo,$two0
  170. fadd $h2hi,$h2lo,$two32
  171. fadd $h3hi,$h3lo,$two64
  172. fsub $h1hi,$h1hi,$two0
  173. fsub $h2hi,$h2hi,$two32
  174. fsub $h3hi,$h3hi,$two64
  175. fsub $h1lo,$h1lo,$h1hi
  176. fsub $h2lo,$h2lo,$h2hi
  177. fsub $h3lo,$h3lo,$h3hi
  178. stfd $h1hi,8*13($ctx) # s1hi
  179. stfd $h2hi,8*15($ctx) # s2hi
  180. stfd $h3hi,8*17($ctx) # s3hi
  181. stfd $h1lo,8*12($ctx) # s1lo
  182. stfd $h2lo,8*14($ctx) # s2lo
  183. stfd $h3lo,8*16($ctx) # s3lo
  184. mtfsf 255,$h0lo # restore fpscr
  185. Lno_key:
  186. xor r3,r3,r3
  187. addi $sp,$sp,$LOCALS
  188. blr
  189. .long 0
  190. .byte 0,12,4,1,0x80,0,2,0
  191. .size .poly1305_init_fpu,.-.poly1305_init_fpu
  192. .globl .poly1305_blocks_fpu
  193. .align 4
  194. .poly1305_blocks_fpu:
  195. srwi. $len,$len,4
  196. beq- Labort
  197. $STU $sp,-$FRAME($sp)
  198. mflr r0
  199. stfd f14,`$FRAME-8*18`($sp)
  200. stfd f15,`$FRAME-8*17`($sp)
  201. stfd f16,`$FRAME-8*16`($sp)
  202. stfd f17,`$FRAME-8*15`($sp)
  203. stfd f18,`$FRAME-8*14`($sp)
  204. stfd f19,`$FRAME-8*13`($sp)
  205. stfd f20,`$FRAME-8*12`($sp)
  206. stfd f21,`$FRAME-8*11`($sp)
  207. stfd f22,`$FRAME-8*10`($sp)
  208. stfd f23,`$FRAME-8*9`($sp)
  209. stfd f24,`$FRAME-8*8`($sp)
  210. stfd f25,`$FRAME-8*7`($sp)
  211. stfd f26,`$FRAME-8*6`($sp)
  212. stfd f27,`$FRAME-8*5`($sp)
  213. stfd f28,`$FRAME-8*4`($sp)
  214. stfd f29,`$FRAME-8*3`($sp)
  215. stfd f30,`$FRAME-8*2`($sp)
  216. stfd f31,`$FRAME-8*1`($sp)
  217. $PUSH r0,`$FRAME+$LRSAVE`($sp)
  218. xor r0,r0,r0
  219. li $in3,1
  220. mtctr $len
  221. neg $len,$len
  222. stw r0,`$LOCALS+8*4+(0^$LITTLE_ENDIAN)`($sp)
  223. stw $in3,`$LOCALS+8*4+(4^$LITTLE_ENDIAN)`($sp)
  224. lfd $two0,8*18($ctx) # load constants
  225. lfd $two32,8*19($ctx)
  226. lfd $two64,8*20($ctx)
  227. lfd $two96,8*21($ctx)
  228. lfd $two130,8*22($ctx)
  229. lfd $five_two130,8*23($ctx)
  230. lfd $h0lo,8*0($ctx) # load [biased] hash value
  231. lfd $h1lo,8*1($ctx)
  232. lfd $h2lo,8*2($ctx)
  233. lfd $h3lo,8*3($ctx)
  234. stfd $two0,`$LOCALS+8*0`($sp) # input "template"
  235. oris $in3,$padbit,`(1023+52+96)<<4`
  236. stfd $two32,`$LOCALS+8*1`($sp)
  237. stfd $two64,`$LOCALS+8*2`($sp)
  238. stw $in3,`$LOCALS+8*3+(0^$LITTLE_ENDIAN)`($sp)
  239. li $i1,4
  240. li $i2,8
  241. li $i3,12
  242. $LWXLE $in0,0,$inp # load input
  243. $LWXLE $in1,$i1,$inp
  244. $LWXLE $in2,$i2,$inp
  245. $LWXLE $in3,$i3,$inp
  246. addi $inp,$inp,16
  247. stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template"
  248. stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
  249. stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
  250. stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
  251. mffs $x0 # original fpscr
  252. lfd $x1,`$LOCALS+8*4`($sp) # new fpscr
  253. lfd $r0lo,8*4($ctx) # load key
  254. lfd $r0hi,8*5($ctx)
  255. lfd $r1lo,8*6($ctx)
  256. lfd $r1hi,8*7($ctx)
  257. lfd $r2lo,8*8($ctx)
  258. lfd $r2hi,8*9($ctx)
  259. lfd $r3lo,8*10($ctx)
  260. lfd $r3hi,8*11($ctx)
  261. lfd $s1lo,8*12($ctx)
  262. lfd $s1hi,8*13($ctx)
  263. lfd $s2lo,8*14($ctx)
  264. lfd $s2hi,8*15($ctx)
  265. lfd $s3lo,8*16($ctx)
  266. lfd $s3hi,8*17($ctx)
  267. stfd $x0,`$LOCALS+8*4`($sp) # save original fpscr
  268. mtfsf 255,$x1
  269. addic $len,$len,1
  270. addze r0,r0
  271. slwi. r0,r0,4
  272. sub $inp,$inp,r0 # conditional rewind
  273. lfd $x0,`$LOCALS+8*0`($sp)
  274. lfd $x1,`$LOCALS+8*1`($sp)
  275. lfd $x2,`$LOCALS+8*2`($sp)
  276. lfd $x3,`$LOCALS+8*3`($sp)
  277. fsub $h0lo,$h0lo,$two0 # de-bias hash value
  278. $LWXLE $in0,0,$inp # modulo-scheduled input load
  279. fsub $h1lo,$h1lo,$two32
  280. $LWXLE $in1,$i1,$inp
  281. fsub $h2lo,$h2lo,$two64
  282. $LWXLE $in2,$i2,$inp
  283. fsub $h3lo,$h3lo,$two96
  284. $LWXLE $in3,$i3,$inp
  285. fsub $x0,$x0,$two0 # de-bias input
  286. addi $inp,$inp,16
  287. fsub $x1,$x1,$two32
  288. fsub $x2,$x2,$two64
  289. fsub $x3,$x3,$two96
  290. fadd $x0,$x0,$h0lo # accumulate input
  291. stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)
  292. fadd $x1,$x1,$h1lo
  293. stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
  294. fadd $x2,$x2,$h2lo
  295. stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
  296. fadd $x3,$x3,$h3lo
  297. stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
  298. b Lentry
  299. .align 4
  300. Loop:
  301. fsub $y0,$y0,$two0 # de-bias input
  302. addic $len,$len,1
  303. fsub $y1,$y1,$two32
  304. addze r0,r0
  305. fsub $y2,$y2,$two64
  306. slwi. r0,r0,4
  307. fsub $y3,$y3,$two96
  308. sub $inp,$inp,r0 # conditional rewind
  309. fadd $h0lo,$h0lo,$y0 # accumulate input
  310. fadd $h0hi,$h0hi,$y1
  311. fadd $h2lo,$h2lo,$y2
  312. fadd $h2hi,$h2hi,$y3
  313. ######################################### base 2^48 -> base 2^32
  314. fadd $c1lo,$h1lo,$two64
  315. $LWXLE $in0,0,$inp # modulo-scheduled input load
  316. fadd $c1hi,$h1hi,$two64
  317. $LWXLE $in1,$i1,$inp
  318. fadd $c3lo,$h3lo,$two130
  319. $LWXLE $in2,$i2,$inp
  320. fadd $c3hi,$h3hi,$two130
  321. $LWXLE $in3,$i3,$inp
  322. fadd $c0lo,$h0lo,$two32
  323. addi $inp,$inp,16
  324. fadd $c0hi,$h0hi,$two32
  325. fadd $c2lo,$h2lo,$two96
  326. fadd $c2hi,$h2hi,$two96
  327. fsub $c1lo,$c1lo,$two64
  328. stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template"
  329. fsub $c1hi,$c1hi,$two64
  330. stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
  331. fsub $c3lo,$c3lo,$two130
  332. stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
  333. fsub $c3hi,$c3hi,$two130
  334. stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
  335. fsub $c0lo,$c0lo,$two32
  336. fsub $c0hi,$c0hi,$two32
  337. fsub $c2lo,$c2lo,$two96
  338. fsub $c2hi,$c2hi,$two96
  339. fsub $h1lo,$h1lo,$c1lo
  340. fsub $h1hi,$h1hi,$c1hi
  341. fsub $h3lo,$h3lo,$c3lo
  342. fsub $h3hi,$h3hi,$c3hi
  343. fsub $h2lo,$h2lo,$c2lo
  344. fsub $h2hi,$h2hi,$c2hi
  345. fsub $h0lo,$h0lo,$c0lo
  346. fsub $h0hi,$h0hi,$c0hi
  347. fadd $h1lo,$h1lo,$c0lo
  348. fadd $h1hi,$h1hi,$c0hi
  349. fadd $h3lo,$h3lo,$c2lo
  350. fadd $h3hi,$h3hi,$c2hi
  351. fadd $h2lo,$h2lo,$c1lo
  352. fadd $h2hi,$h2hi,$c1hi
  353. fmadd $h0lo,$c3lo,$five_two130,$h0lo
  354. fmadd $h0hi,$c3hi,$five_two130,$h0hi
  355. fadd $x1,$h1lo,$h1hi
  356. lfd $s1lo,8*12($ctx) # reload constants
  357. fadd $x3,$h3lo,$h3hi
  358. lfd $s1hi,8*13($ctx)
  359. fadd $x2,$h2lo,$h2hi
  360. lfd $r3lo,8*10($ctx)
  361. fadd $x0,$h0lo,$h0hi
  362. lfd $r3hi,8*11($ctx)
  363. Lentry:
  364. fmul $h0lo,$s3lo,$x1
  365. fmul $h0hi,$s3hi,$x1
  366. fmul $h2lo,$r1lo,$x1
  367. fmul $h2hi,$r1hi,$x1
  368. fmul $h1lo,$r0lo,$x1
  369. fmul $h1hi,$r0hi,$x1
  370. fmul $h3lo,$r2lo,$x1
  371. fmul $h3hi,$r2hi,$x1
  372. fmadd $h0lo,$s1lo,$x3,$h0lo
  373. fmadd $h0hi,$s1hi,$x3,$h0hi
  374. fmadd $h2lo,$s3lo,$x3,$h2lo
  375. fmadd $h2hi,$s3hi,$x3,$h2hi
  376. fmadd $h1lo,$s2lo,$x3,$h1lo
  377. fmadd $h1hi,$s2hi,$x3,$h1hi
  378. fmadd $h3lo,$r0lo,$x3,$h3lo
  379. fmadd $h3hi,$r0hi,$x3,$h3hi
  380. fmadd $h0lo,$s2lo,$x2,$h0lo
  381. fmadd $h0hi,$s2hi,$x2,$h0hi
  382. fmadd $h2lo,$r0lo,$x2,$h2lo
  383. fmadd $h2hi,$r0hi,$x2,$h2hi
  384. fmadd $h1lo,$s3lo,$x2,$h1lo
  385. fmadd $h1hi,$s3hi,$x2,$h1hi
  386. fmadd $h3lo,$r1lo,$x2,$h3lo
  387. fmadd $h3hi,$r1hi,$x2,$h3hi
  388. fmadd $h0lo,$r0lo,$x0,$h0lo
  389. lfd $y0,`$LOCALS+8*0`($sp) # load [biased] input
  390. fmadd $h0hi,$r0hi,$x0,$h0hi
  391. lfd $y1,`$LOCALS+8*1`($sp)
  392. fmadd $h2lo,$r2lo,$x0,$h2lo
  393. lfd $y2,`$LOCALS+8*2`($sp)
  394. fmadd $h2hi,$r2hi,$x0,$h2hi
  395. lfd $y3,`$LOCALS+8*3`($sp)
  396. fmadd $h1lo,$r1lo,$x0,$h1lo
  397. fmadd $h1hi,$r1hi,$x0,$h1hi
  398. fmadd $h3lo,$r3lo,$x0,$h3lo
  399. fmadd $h3hi,$r3hi,$x0,$h3hi
  400. bdnz Loop
  401. ######################################### base 2^48 -> base 2^32
  402. fadd $c0lo,$h0lo,$two32
  403. fadd $c0hi,$h0hi,$two32
  404. fadd $c2lo,$h2lo,$two96
  405. fadd $c2hi,$h2hi,$two96
  406. fadd $c1lo,$h1lo,$two64
  407. fadd $c1hi,$h1hi,$two64
  408. fadd $c3lo,$h3lo,$two130
  409. fadd $c3hi,$h3hi,$two130
  410. fsub $c0lo,$c0lo,$two32
  411. fsub $c0hi,$c0hi,$two32
  412. fsub $c2lo,$c2lo,$two96
  413. fsub $c2hi,$c2hi,$two96
  414. fsub $c1lo,$c1lo,$two64
  415. fsub $c1hi,$c1hi,$two64
  416. fsub $c3lo,$c3lo,$two130
  417. fsub $c3hi,$c3hi,$two130
  418. fsub $h1lo,$h1lo,$c1lo
  419. fsub $h1hi,$h1hi,$c1hi
  420. fsub $h3lo,$h3lo,$c3lo
  421. fsub $h3hi,$h3hi,$c3hi
  422. fsub $h2lo,$h2lo,$c2lo
  423. fsub $h2hi,$h2hi,$c2hi
  424. fsub $h0lo,$h0lo,$c0lo
  425. fsub $h0hi,$h0hi,$c0hi
  426. fadd $h1lo,$h1lo,$c0lo
  427. fadd $h1hi,$h1hi,$c0hi
  428. fadd $h3lo,$h3lo,$c2lo
  429. fadd $h3hi,$h3hi,$c2hi
  430. fadd $h2lo,$h2lo,$c1lo
  431. fadd $h2hi,$h2hi,$c1hi
  432. fmadd $h0lo,$c3lo,$five_two130,$h0lo
  433. fmadd $h0hi,$c3hi,$five_two130,$h0hi
  434. fadd $x1,$h1lo,$h1hi
  435. fadd $x3,$h3lo,$h3hi
  436. fadd $x2,$h2lo,$h2hi
  437. fadd $x0,$h0lo,$h0hi
  438. lfd $h0lo,`$LOCALS+8*4`($sp) # pull saved fpscr
  439. fadd $x1,$x1,$two32 # bias
  440. fadd $x3,$x3,$two96
  441. fadd $x2,$x2,$two64
  442. fadd $x0,$x0,$two0
  443. stfd $x1,8*1($ctx) # store [biased] hash value
  444. stfd $x3,8*3($ctx)
  445. stfd $x2,8*2($ctx)
  446. stfd $x0,8*0($ctx)
  447. mtfsf 255,$h0lo # restore original fpscr
  448. lfd f14,`$FRAME-8*18`($sp)
  449. lfd f15,`$FRAME-8*17`($sp)
  450. lfd f16,`$FRAME-8*16`($sp)
  451. lfd f17,`$FRAME-8*15`($sp)
  452. lfd f18,`$FRAME-8*14`($sp)
  453. lfd f19,`$FRAME-8*13`($sp)
  454. lfd f20,`$FRAME-8*12`($sp)
  455. lfd f21,`$FRAME-8*11`($sp)
  456. lfd f22,`$FRAME-8*10`($sp)
  457. lfd f23,`$FRAME-8*9`($sp)
  458. lfd f24,`$FRAME-8*8`($sp)
  459. lfd f25,`$FRAME-8*7`($sp)
  460. lfd f26,`$FRAME-8*6`($sp)
  461. lfd f27,`$FRAME-8*5`($sp)
  462. lfd f28,`$FRAME-8*4`($sp)
  463. lfd f29,`$FRAME-8*3`($sp)
  464. lfd f30,`$FRAME-8*2`($sp)
  465. lfd f31,`$FRAME-8*1`($sp)
  466. addi $sp,$sp,$FRAME
  467. Labort:
  468. blr
  469. .long 0
  470. .byte 0,12,4,1,0x80,0,4,0
  471. .size .poly1305_blocks_fpu,.-.poly1305_blocks_fpu
  472. ___
  473. {
  474. my ($mac,$nonce)=($inp,$len);
  475. my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3
  476. ) = map("r$_",(7..11,28..31));
  477. my $mask = "r0";
  478. my $FRAME = (6+4)*$SIZE_T;
  479. $code.=<<___;
  480. .globl .poly1305_emit_fpu
  481. .align 4
  482. .poly1305_emit_fpu:
  483. $STU $sp,-$FRAME($sp)
  484. mflr r0
  485. $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
  486. $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
  487. $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
  488. $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
  489. $PUSH r0,`$FRAME+$LRSAVE`($sp)
  490. lwz $d0,`8*0+(0^$LITTLE_ENDIAN)`($ctx) # load hash
  491. lwz $h0,`8*0+(4^$LITTLE_ENDIAN)`($ctx)
  492. lwz $d1,`8*1+(0^$LITTLE_ENDIAN)`($ctx)
  493. lwz $h1,`8*1+(4^$LITTLE_ENDIAN)`($ctx)
  494. lwz $d2,`8*2+(0^$LITTLE_ENDIAN)`($ctx)
  495. lwz $h2,`8*2+(4^$LITTLE_ENDIAN)`($ctx)
  496. lwz $d3,`8*3+(0^$LITTLE_ENDIAN)`($ctx)
  497. lwz $h3,`8*3+(4^$LITTLE_ENDIAN)`($ctx)
  498. lis $mask,0xfff0
  499. andc $d0,$d0,$mask # mask exponent
  500. andc $d1,$d1,$mask
  501. andc $d2,$d2,$mask
  502. andc $d3,$d3,$mask # can be partially reduced...
  503. li $mask,3
  504. srwi $padbit,$d3,2 # ... so reduce
  505. and $h4,$d3,$mask
  506. andc $d3,$d3,$mask
  507. add $d3,$d3,$padbit
  508. ___
  509. if ($SIZE_T==4) {
  510. $code.=<<___;
  511. addc $h0,$h0,$d3
  512. adde $h1,$h1,$d0
  513. adde $h2,$h2,$d1
  514. adde $h3,$h3,$d2
  515. addze $h4,$h4
  516. addic $d0,$h0,5 # compare to modulus
  517. addze $d1,$h1
  518. addze $d2,$h2
  519. addze $d3,$h3
  520. addze $mask,$h4
  521. srwi $mask,$mask,2 # did it carry/borrow?
  522. neg $mask,$mask
  523. srawi $mask,$mask,31 # mask
  524. andc $h0,$h0,$mask
  525. and $d0,$d0,$mask
  526. andc $h1,$h1,$mask
  527. and $d1,$d1,$mask
  528. or $h0,$h0,$d0
  529. lwz $d0,0($nonce) # load nonce
  530. andc $h2,$h2,$mask
  531. and $d2,$d2,$mask
  532. or $h1,$h1,$d1
  533. lwz $d1,4($nonce)
  534. andc $h3,$h3,$mask
  535. and $d3,$d3,$mask
  536. or $h2,$h2,$d2
  537. lwz $d2,8($nonce)
  538. or $h3,$h3,$d3
  539. lwz $d3,12($nonce)
  540. addc $h0,$h0,$d0 # accumulate nonce
  541. adde $h1,$h1,$d1
  542. adde $h2,$h2,$d2
  543. adde $h3,$h3,$d3
  544. ___
  545. } else {
  546. $code.=<<___;
  547. add $h0,$h0,$d3
  548. add $h1,$h1,$d0
  549. add $h2,$h2,$d1
  550. add $h3,$h3,$d2
  551. srdi $d0,$h0,32
  552. add $h1,$h1,$d0
  553. srdi $d1,$h1,32
  554. add $h2,$h2,$d1
  555. srdi $d2,$h2,32
  556. add $h3,$h3,$d2
  557. srdi $d3,$h3,32
  558. add $h4,$h4,$d3
  559. insrdi $h0,$h1,32,0
  560. insrdi $h2,$h3,32,0
  561. addic $d0,$h0,5 # compare to modulus
  562. addze $d1,$h2
  563. addze $d2,$h4
  564. srdi $mask,$d2,2 # did it carry/borrow?
  565. neg $mask,$mask
  566. sradi $mask,$mask,63 # mask
  567. ld $d2,0($nonce) # load nonce
  568. ld $d3,8($nonce)
  569. andc $h0,$h0,$mask
  570. and $d0,$d0,$mask
  571. andc $h2,$h2,$mask
  572. and $d1,$d1,$mask
  573. or $h0,$h0,$d0
  574. or $h2,$h2,$d1
  575. ___
  576. $code.=<<___ if (!$LITTLE_ENDIAN);
  577. rotldi $d2,$d2,32 # flip nonce words
  578. rotldi $d3,$d3,32
  579. ___
  580. $code.=<<___;
  581. addc $h0,$h0,$d2 # accumulate nonce
  582. adde $h2,$h2,$d3
  583. srdi $h1,$h0,32
  584. srdi $h3,$h2,32
  585. ___
  586. }
  587. $code.=<<___ if ($LITTLE_ENDIAN);
  588. stw $h0,0($mac) # write result
  589. stw $h1,4($mac)
  590. stw $h2,8($mac)
  591. stw $h3,12($mac)
  592. ___
  593. $code.=<<___ if (!$LITTLE_ENDIAN);
  594. li $d1,4
  595. stwbrx $h0,0,$mac # write result
  596. li $d2,8
  597. stwbrx $h1,$d1,$mac
  598. li $d3,12
  599. stwbrx $h2,$d2,$mac
  600. stwbrx $h3,$d3,$mac
  601. ___
  602. $code.=<<___;
  603. $POP r28,`$FRAME-$SIZE_T*4`($sp)
  604. $POP r29,`$FRAME-$SIZE_T*3`($sp)
  605. $POP r30,`$FRAME-$SIZE_T*2`($sp)
  606. $POP r31,`$FRAME-$SIZE_T*1`($sp)
  607. addi $sp,$sp,$FRAME
  608. blr
  609. .long 0
  610. .byte 0,12,4,1,0x80,4,3,0
  611. .size .poly1305_emit_fpu,.-.poly1305_emit_fpu
  612. ___
  613. }
  614. # Ugly hack here, because PPC assembler syntax seem to vary too
  615. # much from platforms to platform...
  616. $code.=<<___;
  617. .align 6
  618. LPICmeup:
  619. mflr r0
  620. bcl 20,31,\$+4
  621. mflr $len # vvvvvv "distance" between . and 1st data entry
  622. addi $len,$len,`64-8` # borrow $len
  623. mtlr r0
  624. blr
  625. .long 0
  626. .byte 0,12,0x14,0,0,0,0,0
  627. .space `64-9*4`
  628. .quad 0x4330000000000000 # 2^(52+0)
  629. .quad 0x4530000000000000 # 2^(52+32)
  630. .quad 0x4730000000000000 # 2^(52+64)
  631. .quad 0x4930000000000000 # 2^(52+96)
  632. .quad 0x4b50000000000000 # 2^(52+130)
  633. .quad 0x37f4000000000000 # 5/2^130
  634. .quad 0x4430000000000000 # 2^(52+16+0)
  635. .quad 0x4630000000000000 # 2^(52+16+32)
  636. .quad 0x4830000000000000 # 2^(52+16+64)
  637. .quad 0x4a30000000000000 # 2^(52+16+96)
  638. .quad 0x3e30000000000000 # 2^(52+16+0-96)
  639. .quad 0x4030000000000000 # 2^(52+16+32-96)
  640. .quad 0x4230000000000000 # 2^(52+16+64-96)
  641. .quad 0x0000000000000001 # fpscr: truncate, no exceptions
  642. .asciz "Poly1305 for PPC FPU, CRYPTOGAMS by <appro\@openssl.org>"
  643. .align 4
  644. ___
  645. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  646. print $code;
  647. close STDOUT or die "error closing STDOUT: $!";