  1. #! /usr/bin/env perl
  2. # Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # This module implements Poly1305 hash for SPARCv9, vanilla, as well
  17. # as VIS3 and FMA extensions.
  18. #
  19. # May, August 2015
  20. #
  21. # Numbers are cycles per processed byte with poly1305_blocks alone.
  22. #
  23. # IALU(*) FMA
  24. #
  25. # UltraSPARC III 12.3(**)
  26. # SPARC T3 7.92
  27. # SPARC T4 1.70(***) 6.55
  28. # SPARC64 X 5.60 3.64
  29. #
  30. # (*) Comparison to compiler-generated code is really problematic,
  31. # because the latter's performance varies too much depending on too
  32. # many variables. For example, one can measure from 5x to 15x
  33. # improvement on T4 for gcc-4.6. Well, in T4 case it's a bit
  34. # unfair comparison, because compiler doesn't use VIS3, but
  35. # given same initial conditions coefficient varies from 3x to 9x.
  36. # (**) Pre-III performance should be even worse; floating-point
  37. # performance for UltraSPARC I-IV on the other hand is reported
  38. # to be 4.25 for hand-coded assembly, but they are just too old
  39. # to care about.
  40. # (***) Multi-process benchmark saturates at ~12.5x single-process
  41. # result on 8-core processor, or ~21GBps per 2.85GHz socket.
  42. # $output is the last argument if it looks like a file (it has an extension)
  43. my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  44. open STDOUT,">$output" if $output;
  45. my ($ctx,$inp,$len,$padbit,$shl,$shr) = map("%i$_",(0..5));
  46. my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4) = map("%l$_",(0..7));
  47. my ($h0,$h1,$h2,$h3, $t0,$t1,$t2) = map("%o$_",(0..5,7));
  48. my ($d0,$d1,$d2,$d3) = map("%g$_",(1..4));
  49. $code.=<<___;
  50. #ifndef __ASSEMBLER__
  51. # define __ASSEMBLER__ 1
  52. #endif
  53. #include "crypto/sparc_arch.h"
  54. #ifdef __arch64__
  55. .register %g2,#scratch
  56. .register %g3,#scratch
  57. # define STPTR stx
  58. # define SIZE_T 8
  59. #else
  60. # define STPTR st
  61. # define SIZE_T 4
  62. #endif
  63. #define LOCALS (STACK_BIAS+STACK_FRAME)
  64. .section ".text",#alloc,#execinstr
  65. #ifdef __PIC__
  66. SPARC_PIC_THUNK(%g1)
  67. #endif
  68. .globl poly1305_init
  69. .align 32
  70. poly1305_init:
  71. save %sp,-STACK_FRAME-16,%sp
  72. nop
  73. SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
  74. ld [%g1],%g1
  75. and %g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1
  76. cmp %g1,SPARCV9_FMADD
  77. be .Lpoly1305_init_fma
  78. nop
  79. stx %g0,[$ctx+0]
  80. stx %g0,[$ctx+8] ! zero hash value
  81. brz,pn $inp,.Lno_key
  82. stx %g0,[$ctx+16]
  83. and $inp,7,$shr ! alignment factor
  84. andn $inp,7,$inp
  85. sll $shr,3,$shr ! *8
  86. neg $shr,$shl
  87. sethi %hi(0x0ffffffc),$t0
  88. set 8,$h1
  89. or $t0,%lo(0x0ffffffc),$t0
  90. set 16,$h2
  91. sllx $t0,32,$t1
  92. or $t0,$t1,$t1 ! 0x0ffffffc0ffffffc
  93. or $t1,3,$t0 ! 0x0ffffffc0fffffff
  94. ldxa [$inp+%g0]0x88,$h0 ! load little-endian key
  95. brz,pt $shr,.Lkey_aligned
  96. ldxa [$inp+$h1]0x88,$h1
  97. ldxa [$inp+$h2]0x88,$h2
  98. srlx $h0,$shr,$h0
  99. sllx $h1,$shl,$t2
  100. srlx $h1,$shr,$h1
  101. or $t2,$h0,$h0
  102. sllx $h2,$shl,$h2
  103. or $h2,$h1,$h1
  104. .Lkey_aligned:
  105. and $t0,$h0,$h0
  106. and $t1,$h1,$h1
  107. stx $h0,[$ctx+32+0] ! store key
  108. stx $h1,[$ctx+32+8]
  109. andcc %g1,SPARCV9_VIS3,%g0
  110. be .Lno_key
  111. nop
  112. 1: call .+8
  113. add %o7,poly1305_blocks_vis3-1b,%o7
  114. add %o7,poly1305_emit-poly1305_blocks_vis3,%o5
  115. STPTR %o7,[%i2]
  116. STPTR %o5,[%i2+SIZE_T]
  117. ret
  118. restore %g0,1,%o0 ! return 1
  119. .Lno_key:
  120. ret
  121. restore %g0,%g0,%o0 ! return 0
  122. .type poly1305_init,#function
  123. .size poly1305_init,.-poly1305_init
  124. .globl poly1305_blocks
  125. .align 32
  126. poly1305_blocks:
  127. save %sp,-STACK_FRAME,%sp
  128. srln $len,4,$len
  129. brz,pn $len,.Lno_data
  130. nop
  131. ld [$ctx+32+0],$r1 ! load key
  132. ld [$ctx+32+4],$r0
  133. ld [$ctx+32+8],$r3
  134. ld [$ctx+32+12],$r2
  135. ld [$ctx+0],$h1 ! load hash value
  136. ld [$ctx+4],$h0
  137. ld [$ctx+8],$h3
  138. ld [$ctx+12],$h2
  139. ld [$ctx+16],$h4
  140. and $inp,7,$shr ! alignment factor
  141. andn $inp,7,$inp
  142. set 8,$d1
  143. sll $shr,3,$shr ! *8
  144. set 16,$d2
  145. neg $shr,$shl
  146. srl $r1,2,$s1
  147. srl $r2,2,$s2
  148. add $r1,$s1,$s1
  149. srl $r3,2,$s3
  150. add $r2,$s2,$s2
  151. add $r3,$s3,$s3
  152. .Loop:
  153. ldxa [$inp+%g0]0x88,$d0 ! load little-endian input
  154. brz,pt $shr,.Linp_aligned
  155. ldxa [$inp+$d1]0x88,$d1
  156. ldxa [$inp+$d2]0x88,$d2
  157. srlx $d0,$shr,$d0
  158. sllx $d1,$shl,$t1
  159. srlx $d1,$shr,$d1
  160. or $t1,$d0,$d0
  161. sllx $d2,$shl,$d2
  162. or $d2,$d1,$d1
  163. .Linp_aligned:
  164. srlx $d0,32,$t0
  165. addcc $d0,$h0,$h0 ! accumulate input
  166. srlx $d1,32,$t1
  167. addccc $t0,$h1,$h1
  168. addccc $d1,$h2,$h2
  169. addccc $t1,$h3,$h3
  170. addc $padbit,$h4,$h4
  171. umul $r0,$h0,$d0
  172. umul $r1,$h0,$d1
  173. umul $r2,$h0,$d2
  174. umul $r3,$h0,$d3
  175. sub $len,1,$len
  176. add $inp,16,$inp
  177. umul $s3,$h1,$t0
  178. umul $r0,$h1,$t1
  179. umul $r1,$h1,$t2
  180. add $t0,$d0,$d0
  181. add $t1,$d1,$d1
  182. umul $r2,$h1,$t0
  183. add $t2,$d2,$d2
  184. add $t0,$d3,$d3
  185. umul $s2,$h2,$t1
  186. umul $s3,$h2,$t2
  187. umul $r0,$h2,$t0
  188. add $t1,$d0,$d0
  189. add $t2,$d1,$d1
  190. umul $r1,$h2,$t1
  191. add $t0,$d2,$d2
  192. add $t1,$d3,$d3
  193. umul $s1,$h3,$t2
  194. umul $s2,$h3,$t0
  195. umul $s3,$h3,$t1
  196. add $t2,$d0,$d0
  197. add $t0,$d1,$d1
  198. umul $r0,$h3,$t2
  199. add $t1,$d2,$d2
  200. add $t2,$d3,$d3
  201. umul $s1,$h4,$t0
  202. umul $s2,$h4,$t1
  203. umul $s3,$h4,$t2
  204. umul $r0,$h4,$h4
  205. add $t0,$d1,$d1
  206. add $t1,$d2,$d2
  207. srlx $d0,32,$h1
  208. add $t2,$d3,$d3
  209. srlx $d1,32,$h2
  210. addcc $d1,$h1,$h1
  211. srlx $d2,32,$h3
  212. set 8,$d1
  213. addccc $d2,$h2,$h2
  214. srlx $d3,32,$t0
  215. set 16,$d2
  216. addccc $d3,$h3,$h3
  217. addc $t0,$h4,$h4
  218. srl $h4,2,$t0 ! final reduction step
  219. andn $h4,3,$t1
  220. and $h4,3,$h4
  221. add $t1,$t0,$t0
  222. addcc $t0,$d0,$h0
  223. addccc %g0,$h1,$h1
  224. addccc %g0,$h2,$h2
  225. addccc %g0,$h3,$h3
  226. brnz,pt $len,.Loop
  227. addc %g0,$h4,$h4
  228. st $h1,[$ctx+0] ! store hash value
  229. st $h0,[$ctx+4]
  230. st $h3,[$ctx+8]
  231. st $h2,[$ctx+12]
  232. st $h4,[$ctx+16]
  233. .Lno_data:
  234. ret
  235. restore
  236. .type poly1305_blocks,#function
  237. .size poly1305_blocks,.-poly1305_blocks
  238. ___
  239. ########################################################################
  240. # VIS3 has umulxhi and addxc...
  241. {
  242. my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7));
  243. my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4));
  244. $code.=<<___;
  245. .align 32
  246. poly1305_blocks_vis3:
  247. save %sp,-STACK_FRAME,%sp
  248. srln $len,4,$len
  249. brz,pn $len,.Lno_data
  250. nop
  251. ldx [$ctx+32+0],$R0 ! load key
  252. ldx [$ctx+32+8],$R1
  253. ldx [$ctx+0],$H0 ! load hash value
  254. ldx [$ctx+8],$H1
  255. ld [$ctx+16],$H2
  256. and $inp,7,$shr ! alignment factor
  257. andn $inp,7,$inp
  258. set 8,$r1
  259. sll $shr,3,$shr ! *8
  260. set 16,$r2
  261. neg $shr,$shl
  262. srlx $R1,2,$S1
  263. b .Loop_vis3
  264. add $R1,$S1,$S1
  265. .Loop_vis3:
  266. ldxa [$inp+%g0]0x88,$D0 ! load little-endian input
  267. brz,pt $shr,.Linp_aligned_vis3
  268. ldxa [$inp+$r1]0x88,$D1
  269. ldxa [$inp+$r2]0x88,$D2
  270. srlx $D0,$shr,$D0
  271. sllx $D1,$shl,$T1
  272. srlx $D1,$shr,$D1
  273. or $T1,$D0,$D0
  274. sllx $D2,$shl,$D2
  275. or $D2,$D1,$D1
  276. .Linp_aligned_vis3:
  277. addcc $D0,$H0,$H0 ! accumulate input
  278. sub $len,1,$len
  279. addxccc $D1,$H1,$H1
  280. add $inp,16,$inp
  281. mulx $R0,$H0,$D0 ! r0*h0
  282. addxc $padbit,$H2,$H2
  283. umulxhi $R0,$H0,$D1
  284. mulx $S1,$H1,$T0 ! s1*h1
  285. umulxhi $S1,$H1,$T1
  286. addcc $T0,$D0,$D0
  287. mulx $R1,$H0,$T0 ! r1*h0
  288. addxc $T1,$D1,$D1
  289. umulxhi $R1,$H0,$D2
  290. addcc $T0,$D1,$D1
  291. mulx $R0,$H1,$T0 ! r0*h1
  292. addxc %g0,$D2,$D2
  293. umulxhi $R0,$H1,$T1
  294. addcc $T0,$D1,$D1
  295. mulx $S1,$H2,$T0 ! s1*h2
  296. addxc $T1,$D2,$D2
  297. mulx $R0,$H2,$T1 ! r0*h2
  298. addcc $T0,$D1,$D1
  299. addxc $T1,$D2,$D2
  300. srlx $D2,2,$T0 ! final reduction step
  301. andn $D2,3,$T1
  302. and $D2,3,$H2
  303. add $T1,$T0,$T0
  304. addcc $T0,$D0,$H0
  305. addxccc %g0,$D1,$H1
  306. brnz,pt $len,.Loop_vis3
  307. addxc %g0,$H2,$H2
  308. stx $H0,[$ctx+0] ! store hash value
  309. stx $H1,[$ctx+8]
  310. st $H2,[$ctx+16]
  311. ret
  312. restore
  313. .type poly1305_blocks_vis3,#function
  314. .size poly1305_blocks_vis3,.-poly1305_blocks_vis3
  315. ___
  316. }
  317. my ($mac,$nonce) = ($inp,$len);
  318. $code.=<<___;
  319. .globl poly1305_emit
  320. .align 32
  321. poly1305_emit:
  322. save %sp,-STACK_FRAME,%sp
  323. ld [$ctx+0],$h1 ! load hash value
  324. ld [$ctx+4],$h0
  325. ld [$ctx+8],$h3
  326. ld [$ctx+12],$h2
  327. ld [$ctx+16],$h4
  328. addcc $h0,5,$r0 ! compare to modulus
  329. addccc $h1,0,$r1
  330. addccc $h2,0,$r2
  331. addccc $h3,0,$r3
  332. addc $h4,0,$h4
  333. andcc $h4,4,%g0 ! did it carry/borrow?
  334. movnz %icc,$r0,$h0
  335. ld [$nonce+0],$r0 ! load nonce
  336. movnz %icc,$r1,$h1
  337. ld [$nonce+4],$r1
  338. movnz %icc,$r2,$h2
  339. ld [$nonce+8],$r2
  340. movnz %icc,$r3,$h3
  341. ld [$nonce+12],$r3
  342. addcc $r0,$h0,$h0 ! accumulate nonce
  343. addccc $r1,$h1,$h1
  344. addccc $r2,$h2,$h2
  345. addc $r3,$h3,$h3
  346. srl $h0,8,$r0
  347. stb $h0,[$mac+0] ! store little-endian result
  348. srl $h0,16,$r1
  349. stb $r0,[$mac+1]
  350. srl $h0,24,$r2
  351. stb $r1,[$mac+2]
  352. stb $r2,[$mac+3]
  353. srl $h1,8,$r0
  354. stb $h1,[$mac+4]
  355. srl $h1,16,$r1
  356. stb $r0,[$mac+5]
  357. srl $h1,24,$r2
  358. stb $r1,[$mac+6]
  359. stb $r2,[$mac+7]
  360. srl $h2,8,$r0
  361. stb $h2,[$mac+8]
  362. srl $h2,16,$r1
  363. stb $r0,[$mac+9]
  364. srl $h2,24,$r2
  365. stb $r1,[$mac+10]
  366. stb $r2,[$mac+11]
  367. srl $h3,8,$r0
  368. stb $h3,[$mac+12]
  369. srl $h3,16,$r1
  370. stb $r0,[$mac+13]
  371. srl $h3,24,$r2
  372. stb $r1,[$mac+14]
  373. stb $r2,[$mac+15]
  374. ret
  375. restore
  376. .type poly1305_emit,#function
  377. .size poly1305_emit,.-poly1305_emit
  378. ___
  379. {
  380. my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3));
  381. my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4));
  382. my ($i1,$step,$shr,$shl) = map("%l$_",(0..7));
  383. my $i2=$step;
  384. my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
  385. $two0,$two32,$two64,$two96,$two130,$five_two130,
  386. $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
  387. $s2lo,$s2hi,$s3lo,$s3hi,
  388. $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31));
  389. # borrowings
  390. my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
  391. my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
  392. my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo);
  393. $code.=<<___;
  394. .align 32
  395. poly1305_init_fma:
  396. save %sp,-STACK_FRAME-16,%sp
  397. nop
  398. .Lpoly1305_init_fma:
  399. 1: call .+8
  400. add %o7,.Lconsts_fma-1b,%o7
  401. ldd [%o7+8*0],$two0 ! load constants
  402. ldd [%o7+8*1],$two32
  403. ldd [%o7+8*2],$two64
  404. ldd [%o7+8*3],$two96
  405. ldd [%o7+8*5],$five_two130
  406. std $two0,[$ctx+8*0] ! initial hash value, biased 0
  407. std $two32,[$ctx+8*1]
  408. std $two64,[$ctx+8*2]
  409. std $two96,[$ctx+8*3]
  410. brz,pn $inp,.Lno_key_fma
  411. nop
  412. stx %fsr,[%sp+LOCALS] ! save original %fsr
  413. ldx [%o7+8*6],%fsr ! load new %fsr
  414. std $two0,[$ctx+8*4] ! key "template"
  415. std $two32,[$ctx+8*5]
  416. std $two64,[$ctx+8*6]
  417. std $two96,[$ctx+8*7]
  418. and $inp,7,$shr
  419. andn $inp,7,$inp ! align pointer
  420. mov 8,$i1
  421. sll $shr,3,$shr
  422. mov 16,$i2
  423. neg $shr,$shl
  424. ldxa [$inp+%g0]0x88,$in0 ! load little-endian key
  425. ldxa [$inp+$i1]0x88,$in2
  426. brz $shr,.Lkey_aligned_fma
  427. sethi %hi(0xf0000000),$i1 ! 0xf0000000
  428. ldxa [$inp+$i2]0x88,$in4
  429. srlx $in0,$shr,$in0 ! align data
  430. sllx $in2,$shl,$in1
  431. srlx $in2,$shr,$in2
  432. or $in1,$in0,$in0
  433. sllx $in4,$shl,$in3
  434. or $in3,$in2,$in2
  435. .Lkey_aligned_fma:
  436. or $i1,3,$i2 ! 0xf0000003
  437. srlx $in0,32,$in1
  438. andn $in0,$i1,$in0 ! &=0x0fffffff
  439. andn $in1,$i2,$in1 ! &=0x0ffffffc
  440. srlx $in2,32,$in3
  441. andn $in2,$i2,$in2
  442. andn $in3,$i2,$in3
  443. st $in0,[$ctx+`8*4+4`] ! fill "template"
  444. st $in1,[$ctx+`8*5+4`]
  445. st $in2,[$ctx+`8*6+4`]
  446. st $in3,[$ctx+`8*7+4`]
  447. ldd [$ctx+8*4],$h0lo ! load [biased] key
  448. ldd [$ctx+8*5],$h1lo
  449. ldd [$ctx+8*6],$h2lo
  450. ldd [$ctx+8*7],$h3lo
  451. fsubd $h0lo,$two0, $h0lo ! r0
  452. ldd [%o7+8*7],$two0 ! more constants
  453. fsubd $h1lo,$two32,$h1lo ! r1
  454. ldd [%o7+8*8],$two32
  455. fsubd $h2lo,$two64,$h2lo ! r2
  456. ldd [%o7+8*9],$two64
  457. fsubd $h3lo,$two96,$h3lo ! r3
  458. ldd [%o7+8*10],$two96
  459. fmuld $five_two130,$h1lo,$s1lo ! s1
  460. fmuld $five_two130,$h2lo,$s2lo ! s2
  461. fmuld $five_two130,$h3lo,$s3lo ! s3
  462. faddd $h0lo,$two0, $h0hi
  463. faddd $h1lo,$two32,$h1hi
  464. faddd $h2lo,$two64,$h2hi
  465. faddd $h3lo,$two96,$h3hi
  466. fsubd $h0hi,$two0, $h0hi
  467. ldd [%o7+8*11],$two0 ! more constants
  468. fsubd $h1hi,$two32,$h1hi
  469. ldd [%o7+8*12],$two32
  470. fsubd $h2hi,$two64,$h2hi
  471. ldd [%o7+8*13],$two64
  472. fsubd $h3hi,$two96,$h3hi
  473. fsubd $h0lo,$h0hi,$h0lo
  474. std $h0hi,[$ctx+8*5] ! r0hi
  475. fsubd $h1lo,$h1hi,$h1lo
  476. std $h1hi,[$ctx+8*7] ! r1hi
  477. fsubd $h2lo,$h2hi,$h2lo
  478. std $h2hi,[$ctx+8*9] ! r2hi
  479. fsubd $h3lo,$h3hi,$h3lo
  480. std $h3hi,[$ctx+8*11] ! r3hi
  481. faddd $s1lo,$two0, $s1hi
  482. faddd $s2lo,$two32,$s2hi
  483. faddd $s3lo,$two64,$s3hi
  484. fsubd $s1hi,$two0, $s1hi
  485. fsubd $s2hi,$two32,$s2hi
  486. fsubd $s3hi,$two64,$s3hi
  487. fsubd $s1lo,$s1hi,$s1lo
  488. fsubd $s2lo,$s2hi,$s2lo
  489. fsubd $s3lo,$s3hi,$s3lo
  490. ldx [%sp+LOCALS],%fsr ! restore %fsr
  491. std $h0lo,[$ctx+8*4] ! r0lo
  492. std $h1lo,[$ctx+8*6] ! r1lo
  493. std $h2lo,[$ctx+8*8] ! r2lo
  494. std $h3lo,[$ctx+8*10] ! r3lo
  495. std $s1hi,[$ctx+8*13]
  496. std $s2hi,[$ctx+8*15]
  497. std $s3hi,[$ctx+8*17]
  498. std $s1lo,[$ctx+8*12]
  499. std $s2lo,[$ctx+8*14]
  500. std $s3lo,[$ctx+8*16]
  501. add %o7,poly1305_blocks_fma-.Lconsts_fma,%o0
  502. add %o7,poly1305_emit_fma-.Lconsts_fma,%o1
  503. STPTR %o0,[%i2]
  504. STPTR %o1,[%i2+SIZE_T]
  505. ret
  506. restore %g0,1,%o0 ! return 1
  507. .Lno_key_fma:
  508. ret
  509. restore %g0,%g0,%o0 ! return 0
  510. .type poly1305_init_fma,#function
  511. .size poly1305_init_fma,.-poly1305_init_fma
  512. .align 32
  513. poly1305_blocks_fma:
  514. save %sp,-STACK_FRAME-48,%sp
  515. srln $len,4,$len
  516. brz,pn $len,.Labort
  517. sub $len,1,$len
  518. 1: call .+8
  519. add %o7,.Lconsts_fma-1b,%o7
  520. ldd [%o7+8*0],$two0 ! load constants
  521. ldd [%o7+8*1],$two32
  522. ldd [%o7+8*2],$two64
  523. ldd [%o7+8*3],$two96
  524. ldd [%o7+8*4],$two130
  525. ldd [%o7+8*5],$five_two130
  526. ldd [$ctx+8*0],$h0lo ! load [biased] hash value
  527. ldd [$ctx+8*1],$h1lo
  528. ldd [$ctx+8*2],$h2lo
  529. ldd [$ctx+8*3],$h3lo
  530. std $two0,[%sp+LOCALS+8*0] ! input "template"
  531. sethi %hi((1023+52+96)<<20),$in3
  532. std $two32,[%sp+LOCALS+8*1]
  533. or $padbit,$in3,$in3
  534. std $two64,[%sp+LOCALS+8*2]
  535. st $in3,[%sp+LOCALS+8*3]
  536. and $inp,7,$shr
  537. andn $inp,7,$inp ! align pointer
  538. mov 8,$i1
  539. sll $shr,3,$shr
  540. mov 16,$step
  541. neg $shr,$shl
  542. ldxa [$inp+%g0]0x88,$in0 ! load little-endian input
  543. brz $shr,.Linp_aligned_fma
  544. ldxa [$inp+$i1]0x88,$in2
  545. ldxa [$inp+$step]0x88,$in4
  546. add $inp,8,$inp
  547. srlx $in0,$shr,$in0 ! align data
  548. sllx $in2,$shl,$in1
  549. srlx $in2,$shr,$in2
  550. or $in1,$in0,$in0
  551. sllx $in4,$shl,$in3
  552. srlx $in4,$shr,$in4 ! pre-shift
  553. or $in3,$in2,$in2
  554. .Linp_aligned_fma:
  555. srlx $in0,32,$in1
  556. movrz $len,0,$step
  557. srlx $in2,32,$in3
  558. add $step,$inp,$inp ! conditional advance
  559. st $in0,[%sp+LOCALS+8*0+4] ! fill "template"
  560. st $in1,[%sp+LOCALS+8*1+4]
  561. st $in2,[%sp+LOCALS+8*2+4]
  562. st $in3,[%sp+LOCALS+8*3+4]
  563. ldd [$ctx+8*4],$r0lo ! load key
  564. ldd [$ctx+8*5],$r0hi
  565. ldd [$ctx+8*6],$r1lo
  566. ldd [$ctx+8*7],$r1hi
  567. ldd [$ctx+8*8],$r2lo
  568. ldd [$ctx+8*9],$r2hi
  569. ldd [$ctx+8*10],$r3lo
  570. ldd [$ctx+8*11],$r3hi
  571. ldd [$ctx+8*12],$s1lo
  572. ldd [$ctx+8*13],$s1hi
  573. ldd [$ctx+8*14],$s2lo
  574. ldd [$ctx+8*15],$s2hi
  575. ldd [$ctx+8*16],$s3lo
  576. ldd [$ctx+8*17],$s3hi
  577. stx %fsr,[%sp+LOCALS+8*4] ! save original %fsr
  578. ldx [%o7+8*6],%fsr ! load new %fsr
  579. subcc $len,1,$len
  580. movrz $len,0,$step
  581. ldd [%sp+LOCALS+8*0],$x0 ! load biased input
  582. ldd [%sp+LOCALS+8*1],$x1
  583. ldd [%sp+LOCALS+8*2],$x2
  584. ldd [%sp+LOCALS+8*3],$x3
  585. fsubd $h0lo,$two0, $h0lo ! de-bias hash value
  586. fsubd $h1lo,$two32,$h1lo
  587. ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load
  588. fsubd $h2lo,$two64,$h2lo
  589. fsubd $h3lo,$two96,$h3lo
  590. ldxa [$inp+$i1]0x88,$in2
  591. fsubd $x0,$two0, $x0 ! de-bias input
  592. fsubd $x1,$two32,$x1
  593. fsubd $x2,$two64,$x2
  594. fsubd $x3,$two96,$x3
  595. brz $shr,.Linp_aligned_fma2
  596. add $step,$inp,$inp ! conditional advance
  597. sllx $in0,$shl,$in1 ! align data
  598. srlx $in0,$shr,$in3
  599. or $in1,$in4,$in0
  600. sllx $in2,$shl,$in1
  601. srlx $in2,$shr,$in4 ! pre-shift
  602. or $in3,$in1,$in2
  603. .Linp_aligned_fma2:
  604. srlx $in0,32,$in1
  605. srlx $in2,32,$in3
  606. faddd $h0lo,$x0,$x0 ! accumulate input
  607. stw $in0,[%sp+LOCALS+8*0+4]
  608. faddd $h1lo,$x1,$x1
  609. stw $in1,[%sp+LOCALS+8*1+4]
  610. faddd $h2lo,$x2,$x2
  611. stw $in2,[%sp+LOCALS+8*2+4]
  612. faddd $h3lo,$x3,$x3
  613. stw $in3,[%sp+LOCALS+8*3+4]
  614. b .Lentry_fma
  615. nop
  616. .align 16
  617. .Loop_fma:
  618. ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load
  619. ldxa [$inp+$i1]0x88,$in2
  620. movrz $len,0,$step
  621. faddd $y0,$h0lo,$h0lo ! accumulate input
  622. faddd $y1,$h0hi,$h0hi
  623. faddd $y2,$h2lo,$h2lo
  624. faddd $y3,$h2hi,$h2hi
  625. brz,pn $shr,.Linp_aligned_fma3
  626. add $step,$inp,$inp ! conditional advance
  627. sllx $in0,$shl,$in1 ! align data
  628. srlx $in0,$shr,$in3
  629. or $in1,$in4,$in0
  630. sllx $in2,$shl,$in1
  631. srlx $in2,$shr,$in4 ! pre-shift
  632. or $in3,$in1,$in2
  633. .Linp_aligned_fma3:
  634. !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
  635. faddd $two64,$h1lo,$c1lo
  636. srlx $in0,32,$in1
  637. faddd $two64,$h1hi,$c1hi
  638. srlx $in2,32,$in3
  639. faddd $two130,$h3lo,$c3lo
  640. st $in0,[%sp+LOCALS+8*0+4] ! fill "template"
  641. faddd $two130,$h3hi,$c3hi
  642. st $in1,[%sp+LOCALS+8*1+4]
  643. faddd $two32,$h0lo,$c0lo
  644. st $in2,[%sp+LOCALS+8*2+4]
  645. faddd $two32,$h0hi,$c0hi
  646. st $in3,[%sp+LOCALS+8*3+4]
  647. faddd $two96,$h2lo,$c2lo
  648. faddd $two96,$h2hi,$c2hi
  649. fsubd $c1lo,$two64,$c1lo
  650. fsubd $c1hi,$two64,$c1hi
  651. fsubd $c3lo,$two130,$c3lo
  652. fsubd $c3hi,$two130,$c3hi
  653. fsubd $c0lo,$two32,$c0lo
  654. fsubd $c0hi,$two32,$c0hi
  655. fsubd $c2lo,$two96,$c2lo
  656. fsubd $c2hi,$two96,$c2hi
  657. fsubd $h1lo,$c1lo,$h1lo
  658. fsubd $h1hi,$c1hi,$h1hi
  659. fsubd $h3lo,$c3lo,$h3lo
  660. fsubd $h3hi,$c3hi,$h3hi
  661. fsubd $h2lo,$c2lo,$h2lo
  662. fsubd $h2hi,$c2hi,$h2hi
  663. fsubd $h0lo,$c0lo,$h0lo
  664. fsubd $h0hi,$c0hi,$h0hi
  665. faddd $h1lo,$c0lo,$h1lo
  666. faddd $h1hi,$c0hi,$h1hi
  667. faddd $h3lo,$c2lo,$h3lo
  668. faddd $h3hi,$c2hi,$h3hi
  669. faddd $h2lo,$c1lo,$h2lo
  670. faddd $h2hi,$c1hi,$h2hi
  671. fmaddd $five_two130,$c3lo,$h0lo,$h0lo
  672. fmaddd $five_two130,$c3hi,$h0hi,$h0hi
  673. faddd $h1lo,$h1hi,$x1
  674. ldd [$ctx+8*12],$s1lo ! reload constants
  675. faddd $h3lo,$h3hi,$x3
  676. ldd [$ctx+8*13],$s1hi
  677. faddd $h2lo,$h2hi,$x2
  678. ldd [$ctx+8*10],$r3lo
  679. faddd $h0lo,$h0hi,$x0
  680. ldd [$ctx+8*11],$r3hi
  681. .Lentry_fma:
  682. fmuld $x1,$s3lo,$h0lo
  683. fmuld $x1,$s3hi,$h0hi
  684. fmuld $x1,$r1lo,$h2lo
  685. fmuld $x1,$r1hi,$h2hi
  686. fmuld $x1,$r0lo,$h1lo
  687. fmuld $x1,$r0hi,$h1hi
  688. fmuld $x1,$r2lo,$h3lo
  689. fmuld $x1,$r2hi,$h3hi
  690. fmaddd $x3,$s1lo,$h0lo,$h0lo
  691. fmaddd $x3,$s1hi,$h0hi,$h0hi
  692. fmaddd $x3,$s3lo,$h2lo,$h2lo
  693. fmaddd $x3,$s3hi,$h2hi,$h2hi
  694. fmaddd $x3,$s2lo,$h1lo,$h1lo
  695. fmaddd $x3,$s2hi,$h1hi,$h1hi
  696. fmaddd $x3,$r0lo,$h3lo,$h3lo
  697. fmaddd $x3,$r0hi,$h3hi,$h3hi
  698. fmaddd $x2,$s2lo,$h0lo,$h0lo
  699. fmaddd $x2,$s2hi,$h0hi,$h0hi
  700. fmaddd $x2,$r0lo,$h2lo,$h2lo
  701. fmaddd $x2,$r0hi,$h2hi,$h2hi
  702. fmaddd $x2,$s3lo,$h1lo,$h1lo
  703. ldd [%sp+LOCALS+8*0],$y0 ! load [biased] input
  704. fmaddd $x2,$s3hi,$h1hi,$h1hi
  705. ldd [%sp+LOCALS+8*1],$y1
  706. fmaddd $x2,$r1lo,$h3lo,$h3lo
  707. ldd [%sp+LOCALS+8*2],$y2
  708. fmaddd $x2,$r1hi,$h3hi,$h3hi
  709. ldd [%sp+LOCALS+8*3],$y3
  710. fmaddd $x0,$r0lo,$h0lo,$h0lo
  711. fsubd $y0,$two0, $y0 ! de-bias input
  712. fmaddd $x0,$r0hi,$h0hi,$h0hi
  713. fsubd $y1,$two32,$y1
  714. fmaddd $x0,$r2lo,$h2lo,$h2lo
  715. fsubd $y2,$two64,$y2
  716. fmaddd $x0,$r2hi,$h2hi,$h2hi
  717. fsubd $y3,$two96,$y3
  718. fmaddd $x0,$r1lo,$h1lo,$h1lo
  719. fmaddd $x0,$r1hi,$h1hi,$h1hi
  720. fmaddd $x0,$r3lo,$h3lo,$h3lo
  721. fmaddd $x0,$r3hi,$h3hi,$h3hi
  722. bcc SIZE_T_CC,.Loop_fma
  723. subcc $len,1,$len
  724. !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
  725. faddd $h0lo,$two32,$c0lo
  726. faddd $h0hi,$two32,$c0hi
  727. faddd $h2lo,$two96,$c2lo
  728. faddd $h2hi,$two96,$c2hi
  729. faddd $h1lo,$two64,$c1lo
  730. faddd $h1hi,$two64,$c1hi
  731. faddd $h3lo,$two130,$c3lo
  732. faddd $h3hi,$two130,$c3hi
  733. fsubd $c0lo,$two32,$c0lo
  734. fsubd $c0hi,$two32,$c0hi
  735. fsubd $c2lo,$two96,$c2lo
  736. fsubd $c2hi,$two96,$c2hi
  737. fsubd $c1lo,$two64,$c1lo
  738. fsubd $c1hi,$two64,$c1hi
  739. fsubd $c3lo,$two130,$c3lo
  740. fsubd $c3hi,$two130,$c3hi
  741. fsubd $h1lo,$c1lo,$h1lo
  742. fsubd $h1hi,$c1hi,$h1hi
  743. fsubd $h3lo,$c3lo,$h3lo
  744. fsubd $h3hi,$c3hi,$h3hi
  745. fsubd $h2lo,$c2lo,$h2lo
  746. fsubd $h2hi,$c2hi,$h2hi
  747. fsubd $h0lo,$c0lo,$h0lo
  748. fsubd $h0hi,$c0hi,$h0hi
  749. faddd $h1lo,$c0lo,$h1lo
  750. faddd $h1hi,$c0hi,$h1hi
  751. faddd $h3lo,$c2lo,$h3lo
  752. faddd $h3hi,$c2hi,$h3hi
  753. faddd $h2lo,$c1lo,$h2lo
  754. faddd $h2hi,$c1hi,$h2hi
  755. fmaddd $five_two130,$c3lo,$h0lo,$h0lo
  756. fmaddd $five_two130,$c3hi,$h0hi,$h0hi
  757. faddd $h1lo,$h1hi,$x1
  758. faddd $h3lo,$h3hi,$x3
  759. faddd $h2lo,$h2hi,$x2
  760. faddd $h0lo,$h0hi,$x0
  761. faddd $x1,$two32,$x1 ! bias
  762. faddd $x3,$two96,$x3
  763. faddd $x2,$two64,$x2
  764. faddd $x0,$two0, $x0
  765. ldx [%sp+LOCALS+8*4],%fsr ! restore saved %fsr
  766. std $x1,[$ctx+8*1] ! store [biased] hash value
  767. std $x3,[$ctx+8*3]
  768. std $x2,[$ctx+8*2]
  769. std $x0,[$ctx+8*0]
  770. .Labort:
  771. ret
  772. restore
  773. .type poly1305_blocks_fma,#function
  774. .size poly1305_blocks_fma,.-poly1305_blocks_fma
  775. ___
  776. {
  777. my ($mac,$nonce)=($inp,$len);
  778. my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $mask
  779. ) = (map("%l$_",(0..5)),map("%o$_",(0..4)));
  780. $code.=<<___;
  781. .align 32
  782. poly1305_emit_fma:
  783. save %sp,-STACK_FRAME,%sp
  784. ld [$ctx+8*0+0],$d0 ! load hash
  785. ld [$ctx+8*0+4],$h0
  786. ld [$ctx+8*1+0],$d1
  787. ld [$ctx+8*1+4],$h1
  788. ld [$ctx+8*2+0],$d2
  789. ld [$ctx+8*2+4],$h2
  790. ld [$ctx+8*3+0],$d3
  791. ld [$ctx+8*3+4],$h3
  792. sethi %hi(0xfff00000),$mask
  793. andn $d0,$mask,$d0 ! mask exponent
  794. andn $d1,$mask,$d1
  795. andn $d2,$mask,$d2
  796. andn $d3,$mask,$d3 ! can be partially reduced...
  797. mov 3,$mask
  798. srl $d3,2,$padbit ! ... so reduce
  799. and $d3,$mask,$h4
  800. andn $d3,$mask,$d3
  801. add $padbit,$d3,$d3
  802. addcc $d3,$h0,$h0
  803. addccc $d0,$h1,$h1
  804. addccc $d1,$h2,$h2
  805. addccc $d2,$h3,$h3
  806. addc %g0,$h4,$h4
  807. addcc $h0,5,$d0 ! compare to modulus
  808. addccc $h1,0,$d1
  809. addccc $h2,0,$d2
  810. addccc $h3,0,$d3
  811. addc $h4,0,$mask
  812. srl $mask,2,$mask ! did it carry/borrow?
  813. neg $mask,$mask
  814. sra $mask,31,$mask ! mask
  815. andn $h0,$mask,$h0
  816. and $d0,$mask,$d0
  817. andn $h1,$mask,$h1
  818. and $d1,$mask,$d1
  819. or $d0,$h0,$h0
  820. ld [$nonce+0],$d0 ! load nonce
  821. andn $h2,$mask,$h2
  822. and $d2,$mask,$d2
  823. or $d1,$h1,$h1
  824. ld [$nonce+4],$d1
  825. andn $h3,$mask,$h3
  826. and $d3,$mask,$d3
  827. or $d2,$h2,$h2
  828. ld [$nonce+8],$d2
  829. or $d3,$h3,$h3
  830. ld [$nonce+12],$d3
  831. addcc $d0,$h0,$h0 ! accumulate nonce
  832. addccc $d1,$h1,$h1
  833. addccc $d2,$h2,$h2
  834. addc $d3,$h3,$h3
  835. stb $h0,[$mac+0] ! write little-endian result
  836. srl $h0,8,$h0
  837. stb $h1,[$mac+4]
  838. srl $h1,8,$h1
  839. stb $h2,[$mac+8]
  840. srl $h2,8,$h2
  841. stb $h3,[$mac+12]
  842. srl $h3,8,$h3
  843. stb $h0,[$mac+1]
  844. srl $h0,8,$h0
  845. stb $h1,[$mac+5]
  846. srl $h1,8,$h1
  847. stb $h2,[$mac+9]
  848. srl $h2,8,$h2
  849. stb $h3,[$mac+13]
  850. srl $h3,8,$h3
  851. stb $h0,[$mac+2]
  852. srl $h0,8,$h0
  853. stb $h1,[$mac+6]
  854. srl $h1,8,$h1
  855. stb $h2,[$mac+10]
  856. srl $h2,8,$h2
  857. stb $h3,[$mac+14]
  858. srl $h3,8,$h3
  859. stb $h0,[$mac+3]
  860. stb $h1,[$mac+7]
  861. stb $h2,[$mac+11]
  862. stb $h3,[$mac+15]
  863. ret
  864. restore
  865. .type poly1305_emit_fma,#function
  866. .size poly1305_emit_fma,.-poly1305_emit_fma
  867. ___
  868. }
  869. $code.=<<___;
  870. .align 64
  871. .Lconsts_fma:
  872. .word 0x43300000,0x00000000 ! 2^(52+0)
  873. .word 0x45300000,0x00000000 ! 2^(52+32)
  874. .word 0x47300000,0x00000000 ! 2^(52+64)
  875. .word 0x49300000,0x00000000 ! 2^(52+96)
  876. .word 0x4b500000,0x00000000 ! 2^(52+130)
  877. .word 0x37f40000,0x00000000 ! 5/2^130
  878. .word 0,1<<30 ! fsr: truncate, no exceptions
  879. .word 0x44300000,0x00000000 ! 2^(52+16+0)
  880. .word 0x46300000,0x00000000 ! 2^(52+16+32)
  881. .word 0x48300000,0x00000000 ! 2^(52+16+64)
  882. .word 0x4a300000,0x00000000 ! 2^(52+16+96)
  883. .word 0x3e300000,0x00000000 ! 2^(52+16+0-96)
  884. .word 0x40300000,0x00000000 ! 2^(52+16+32-96)
  885. .word 0x42300000,0x00000000 ! 2^(52+16+64-96)
  886. .asciz "Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>"
  887. .align 4
  888. ___
  889. }
  890. # Purpose of these subroutines is to explicitly encode VIS instructions,
  891. # so that one can compile the module without having to specify VIS
  892. # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
  893. # Idea is to reserve for option to produce "universal" binary and let
  894. # programmer detect if current CPU is VIS capable at run-time.
  895. sub unvis3 {
  896. my ($mnemonic,$rs1,$rs2,$rd)=@_;
  897. my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
  898. my ($ref,$opf);
  899. my %visopf = ( "addxc" => 0x011,
  900. "addxccc" => 0x013,
  901. "umulxhi" => 0x016 );
  902. $ref = "$mnemonic\t$rs1,$rs2,$rd";
  903. if ($opf=$visopf{$mnemonic}) {
  904. foreach ($rs1,$rs2,$rd) {
  905. return $ref if (!/%([goli])([0-9])/);
  906. $_=$bias{$1}+$2;
  907. }
  908. return sprintf ".word\t0x%08x !%s",
  909. 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
  910. $ref;
  911. } else {
  912. return $ref;
  913. }
  914. }
  915. sub unfma {
  916. my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
  917. my ($ref,$opf);
  918. my %fmaopf = ( "fmadds" => 0x1,
  919. "fmaddd" => 0x2,
  920. "fmsubs" => 0x5,
  921. "fmsubd" => 0x6 );
  922. $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
  923. if ($opf=$fmaopf{$mnemonic}) {
  924. foreach ($rs1,$rs2,$rs3,$rd) {
  925. return $ref if (!/%f([0-9]{1,2})/);
  926. $_=$1;
  927. if ($1>=32) {
  928. return $ref if ($1&1);
  929. # re-encode for upper double register addressing
  930. $_=($1|$1>>5)&31;
  931. }
  932. }
  933. return sprintf ".word\t0x%08x !%s",
  934. 0x81b80000|$rd<<25|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
  935. $ref;
  936. } else {
  937. return $ref;
  938. }
  939. }
  940. foreach (split("\n",$code)) {
  941. s/\`([^\`]*)\`/eval $1/ge;
  942. s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
  943. &unvis3($1,$2,$3,$4)
  944. /ge or
  945. s/\b(fmadd[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/
  946. &unfma($1,$2,$3,$4,$5)
  947. /ge;
  948. print $_,"\n";
  949. }
  950. close STDOUT or die "error closing STDOUT: $!";