ghash-parisc.pl 17 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751
  1. #! /usr/bin/env perl
  2. # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # April 2010
  17. #
  18. # The module implements "4-bit" GCM GHASH function and underlying
  19. # single multiplication operation in GF(2^128). "4-bit" means that it
  20. # uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
  21. # it processes one byte in 19.6 cycles, which is more than twice as
  22. # fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
  23. # 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
  24. # processed byte. This is ~2.2x faster than 64-bit code generated by
  25. # vendor compiler (which used to be very hard to beat:-).
  26. #
  27. # Special thanks to polarhome.com for providing HP-UX account.
  28. # $output is the last argument if it looks like a file (it has an extension)
  29. # $flavour is the first argument if it doesn't look like a file
  # Command line: [flavour] [output-file].  The last argument is taken as
  # the output file when it carries an extension; the first one is taken
  # as the flavour when it contains no dot.
  $output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop   : undef;
  $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.|          ? shift : undef;

  # Redirect STDOUT to the requested file.  The three-argument form keeps
  # the file name from being interpreted as part of the mode, and a failed
  # open now aborts instead of silently writing the generated assembly to
  # the inherited STDOUT.
  if ($output) {
      open(STDOUT, '>', $output) or die "can't open $output: $!";
  }

  # ABI-dependent parameters, selected by a "64" anywhere in the flavour.
  if ($flavour =~ /64/) {
      $LEVEL        = "2.0W";
      $SIZE_T       = 8;
      $FRAME_MARKER = 80;
      $SAVED_RP     = 16;
      $PUSH         = "std";
      $PUSHMA       = "std,ma";
      $POP          = "ldd";
      $POPMB        = "ldd,mb";
      $NREGS        = 6;
  } else {
      $LEVEL        = "1.0";    #"\n\t.ALLOW\t2.0";
      $SIZE_T       = 4;
      $FRAME_MARKER = 48;
      $SAVED_RP     = 20;
      $PUSH         = "stw";
      $PUSHMA       = "stwm";
      $POP          = "ldw";
      $POPMB        = "ldwm";
      $NREGS        = 11;
  }

  # Frame size: ten register-sized save slots plus the frame marker
  # [+ argument transfer].
  $FRAME = 10*$SIZE_T + $FRAME_MARKER;
  ################# volatile registers
  # Argument block, in the order named by the .EXPORT directives below.
  ($Xi, $Htbl, $inp, $len) = ("%r26", "%r25", "%r24", "%r23");

  # Variables.  $Hhh is the same register as the $Htbl argument.
  ($Hhh, $Hll)   = ($Htbl, "%r22");
  ($Zhh, $Zll)   = ("%r21", "%r20");
  $cnt           = "%r19";
  ($rem_4bit, $rem, $mask0xf0) = ("%r28", "%r29", "%r31");

  ################# preserved registers
  ($Thh, $Tll)         = ("%r1", "%r2");
  ($nlo, $nhi, $byte)  = ("%r3", "%r4", "%r5");

  # The 32-bit build keeps every 128-bit quantity in four words and so
  # needs the additional middle-word registers.
  if ($SIZE_T == 4) {
      ($Zhl, $Zlh) = ("%r6",  "%r7");
      ($Hhl, $Hlh) = ("%r8",  "%r9");
      ($Thl, $Tlh) = ("%r10", "%r11");
  }

  $rem2 = "%r6";    # used in PA-RISC 2.0 code
  # Assembler preamble and entry sequence of gcm_gmult_4bit.  The text
  # selects the architecture level ($LEVEL), opens the code subspace,
  # exports the symbol (two general-register arguments) and emits the
  # prologue: %r2 is stored at -$SAVED_RP(%sp), %r3 is stored with the
  # "modify" form of the store using a displacement of $FRAME, and
  # %r4-%r6 go to slots addressed relative to -$FRAME.
  # Backticked expressions are evaluated by the output loop at the end
  # of this script, not here.
  84. $code.=<<___;
  85. .LEVEL $LEVEL
  86. .SPACE \$TEXT\$
  87. .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
  88. .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
  89. .ALIGN 64
  90. gcm_gmult_4bit
  91. .PROC
  92. .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
  93. .ENTRY
  94. $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
  95. $PUSHMA %r3,$FRAME(%sp)
  96. $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
  97. $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
  98. $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
  99. ___
  # 32-bit build only: %r7-%r11 are assigned to variables in the register
  # map for $SIZE_T==4, so they are saved as well.
  100. $code.=<<___ if ($SIZE_T==4);
  101. $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
  102. $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
  103. $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
  104. $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
  105. $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
  106. ___
  # Position-independent address of the shared L$rem_4bit table: blr
  # writes a code address into $rem_4bit, andcm with 3 clears its two
  # low bits, and ldo adds the assembly-time distance
  # L$rem_4bit-L$pic_gmult.  $mask0xf0 is loaded with 0xf0 for the
  # nibble extraction below.  The addl turns $len into $inp+$len; the
  # gmult code that follows never reads $len or $inp.
  107. $code.=<<___;
  108. blr %r0,$rem_4bit
  109. ldi 3,$rem
  110. L\$pic_gmult
  111. andcm $rem_4bit,$rem,$rem_4bit
  112. addl $inp,$len,$len
  113. ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
  114. ldi 0xf0,$mask0xf0
  115. ___
  # 32-bit build only: choose between the two code paths at run time.
  # The conditional extrd is emitted as a raw .WORD by the $extrd encoder
  # defined near the end of this script; the branch after it enters the
  # 32-bit-register code at L$parisc1_gmult.
  # NOTE(review): presumably a PA-RISC 2.0 CPU nullifies that branch and
  # falls through to the 64-bit-register code -- confirm against the ISA.
  116. $code.=<<___ if ($SIZE_T==4);
  117. ldi 31,$rem
  118. mtctl $rem,%cr11
  119. extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
  120. b L\$parisc1_gmult
  121. nop
  122. ___
  # gcm_gmult_4bit, 64-bit-register (PA-RISC 2.0) path.
  # Xi is consumed one byte at a time, starting with byte 15.  Each byte
  # yields two table offsets: $nhi (the byte masked with 0xf0) and $nlo
  # (the low four bits deposited four bit positions up).  Both are used
  # as byte offsets into the table at $Hhh (= Htbl) and $Hll (= Htbl+8).
  # Per nibble the 128-bit value Zhh:Zll is shifted right by four bits,
  # the four bits about to be shifted out are turned into an offset into
  # L$rem_4bit (8-byte entries) whose value is XORed into Zhh, and the
  # table entry Thh:Tll is XORed in.
  # $cnt starts at 13 and is decremented by addib; it is the byte index
  # of the next Xi byte loaded inside the loop.  The code after the loop
  # finishes the last byte and stores Zhh:Zll back to Xi.
  # The instruction order is scheduled by hand (see the header notes on
  # cycle counts) -- do not reorder.
  123. $code.=<<___;
  124. ldb 15($Xi),$nlo
  125. ldo 8($Htbl),$Hll
  126. and $mask0xf0,$nlo,$nhi
  127. depd,z $nlo,59,4,$nlo
  128. ldd $nlo($Hll),$Zll
  129. ldd $nlo($Hhh),$Zhh
  130. depd,z $Zll,60,4,$rem
  131. shrpd $Zhh,$Zll,4,$Zll
  132. extrd,u $Zhh,59,60,$Zhh
  133. ldb 14($Xi),$nlo
  134. ldd $nhi($Hll),$Tll
  135. ldd $nhi($Hhh),$Thh
  136. and $mask0xf0,$nlo,$nhi
  137. depd,z $nlo,59,4,$nlo
  138. xor $Tll,$Zll,$Zll
  139. xor $Thh,$Zhh,$Zhh
  140. ldd $rem($rem_4bit),$rem
  141. b L\$oop_gmult_pa2
  142. ldi 13,$cnt
  143. .ALIGN 8
  144. L\$oop_gmult_pa2
  145. xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
  146. depd,z $Zll,60,4,$rem
  147. shrpd $Zhh,$Zll,4,$Zll
  148. extrd,u $Zhh,59,60,$Zhh
  149. ldd $nlo($Hll),$Tll
  150. ldd $nlo($Hhh),$Thh
  151. xor $Tll,$Zll,$Zll
  152. xor $Thh,$Zhh,$Zhh
  153. ldd $rem($rem_4bit),$rem
  154. xor $rem,$Zhh,$Zhh
  155. depd,z $Zll,60,4,$rem
  156. ldbx $cnt($Xi),$nlo
  157. shrpd $Zhh,$Zll,4,$Zll
  158. extrd,u $Zhh,59,60,$Zhh
  159. ldd $nhi($Hll),$Tll
  160. ldd $nhi($Hhh),$Thh
  161. and $mask0xf0,$nlo,$nhi
  162. depd,z $nlo,59,4,$nlo
  163. ldd $rem($rem_4bit),$rem
  164. xor $Tll,$Zll,$Zll
  165. addib,uv -1,$cnt,L\$oop_gmult_pa2
  166. xor $Thh,$Zhh,$Zhh
  167. xor $rem,$Zhh,$Zhh
  168. depd,z $Zll,60,4,$rem
  169. shrpd $Zhh,$Zll,4,$Zll
  170. extrd,u $Zhh,59,60,$Zhh
  171. ldd $nlo($Hll),$Tll
  172. ldd $nlo($Hhh),$Thh
  173. xor $Tll,$Zll,$Zll
  174. xor $Thh,$Zhh,$Zhh
  175. ldd $rem($rem_4bit),$rem
  176. xor $rem,$Zhh,$Zhh
  177. depd,z $Zll,60,4,$rem
  178. shrpd $Zhh,$Zll,4,$Zll
  179. extrd,u $Zhh,59,60,$Zhh
  180. ldd $nhi($Hll),$Tll
  181. ldd $nhi($Hhh),$Thh
  182. xor $Tll,$Zll,$Zll
  183. xor $Thh,$Zhh,$Zhh
  184. ldd $rem($rem_4bit),$rem
  185. xor $rem,$Zhh,$Zhh
  186. std $Zll,8($Xi)
  187. std $Zhh,0($Xi)
  188. ___
  # gcm_gmult_4bit, 32-bit-register path (emitted for $SIZE_T==4 only).
  # The leading branch ends the 64-bit-register code above by jumping to
  # the common exit, so that code does not fall into L$parisc1_gmult.
  # Same nibble-at-a-time scheme, but every 128-bit quantity is held in
  # four words (Zhh:Zhl:Zlh:Zll, Thh:Thl:Tlh:Tll) and the table is read
  # with ldwx through four bases: $Hhh (= Htbl), $Hhl (+4), $Hlh (+8) and
  # $Hll (+12).  The four-bit right shift is a chain of shrpw operations
  # closed by extru on the top word; the L$rem_4bit entry is loaded as a
  # single word and XORed into Zhh only.
  # Loads and XORs of neighbouring steps are interleaved by hand -- do
  # not reorder.
  189. $code.=<<___ if ($SIZE_T==4);
  190. b L\$done_gmult
  191. nop
  192. L\$parisc1_gmult
  193. ldb 15($Xi),$nlo
  194. ldo 12($Htbl),$Hll
  195. ldo 8($Htbl),$Hlh
  196. ldo 4($Htbl),$Hhl
  197. and $mask0xf0,$nlo,$nhi
  198. zdep $nlo,27,4,$nlo
  199. ldwx $nlo($Hll),$Zll
  200. ldwx $nlo($Hlh),$Zlh
  201. ldwx $nlo($Hhl),$Zhl
  202. ldwx $nlo($Hhh),$Zhh
  203. zdep $Zll,28,4,$rem
  204. ldb 14($Xi),$nlo
  205. ldwx $rem($rem_4bit),$rem
  206. shrpw $Zlh,$Zll,4,$Zll
  207. ldwx $nhi($Hll),$Tll
  208. shrpw $Zhl,$Zlh,4,$Zlh
  209. ldwx $nhi($Hlh),$Tlh
  210. shrpw $Zhh,$Zhl,4,$Zhl
  211. ldwx $nhi($Hhl),$Thl
  212. extru $Zhh,27,28,$Zhh
  213. ldwx $nhi($Hhh),$Thh
  214. xor $rem,$Zhh,$Zhh
  215. and $mask0xf0,$nlo,$nhi
  216. zdep $nlo,27,4,$nlo
  217. xor $Tll,$Zll,$Zll
  218. ldwx $nlo($Hll),$Tll
  219. xor $Tlh,$Zlh,$Zlh
  220. ldwx $nlo($Hlh),$Tlh
  221. xor $Thl,$Zhl,$Zhl
  222. b L\$oop_gmult_pa1
  223. ldi 13,$cnt
  224. .ALIGN 8
  225. L\$oop_gmult_pa1
  226. zdep $Zll,28,4,$rem
  227. ldwx $nlo($Hhl),$Thl
  228. xor $Thh,$Zhh,$Zhh
  229. ldwx $rem($rem_4bit),$rem
  230. shrpw $Zlh,$Zll,4,$Zll
  231. ldwx $nlo($Hhh),$Thh
  232. shrpw $Zhl,$Zlh,4,$Zlh
  233. ldbx $cnt($Xi),$nlo
  234. xor $Tll,$Zll,$Zll
  235. ldwx $nhi($Hll),$Tll
  236. shrpw $Zhh,$Zhl,4,$Zhl
  237. xor $Tlh,$Zlh,$Zlh
  238. ldwx $nhi($Hlh),$Tlh
  239. extru $Zhh,27,28,$Zhh
  240. xor $Thl,$Zhl,$Zhl
  241. ldwx $nhi($Hhl),$Thl
  242. xor $rem,$Zhh,$Zhh
  243. zdep $Zll,28,4,$rem
  244. xor $Thh,$Zhh,$Zhh
  245. ldwx $nhi($Hhh),$Thh
  246. shrpw $Zlh,$Zll,4,$Zll
  247. ldwx $rem($rem_4bit),$rem
  248. shrpw $Zhl,$Zlh,4,$Zlh
  249. shrpw $Zhh,$Zhl,4,$Zhl
  250. and $mask0xf0,$nlo,$nhi
  251. extru $Zhh,27,28,$Zhh
  252. zdep $nlo,27,4,$nlo
  253. xor $Tll,$Zll,$Zll
  254. ldwx $nlo($Hll),$Tll
  255. xor $Tlh,$Zlh,$Zlh
  256. ldwx $nlo($Hlh),$Tlh
  257. xor $rem,$Zhh,$Zhh
  258. addib,uv -1,$cnt,L\$oop_gmult_pa1
  259. xor $Thl,$Zhl,$Zhl
  260. zdep $Zll,28,4,$rem
  261. ldwx $nlo($Hhl),$Thl
  262. xor $Thh,$Zhh,$Zhh
  263. ldwx $rem($rem_4bit),$rem
  264. shrpw $Zlh,$Zll,4,$Zll
  265. ldwx $nlo($Hhh),$Thh
  266. shrpw $Zhl,$Zlh,4,$Zlh
  267. xor $Tll,$Zll,$Zll
  268. ldwx $nhi($Hll),$Tll
  269. shrpw $Zhh,$Zhl,4,$Zhl
  270. xor $Tlh,$Zlh,$Zlh
  271. ldwx $nhi($Hlh),$Tlh
  272. extru $Zhh,27,28,$Zhh
  273. xor $rem,$Zhh,$Zhh
  274. xor $Thl,$Zhl,$Zhl
  275. ldwx $nhi($Hhl),$Thl
  276. xor $Thh,$Zhh,$Zhh
  277. ldwx $nhi($Hhh),$Thh
  278. zdep $Zll,28,4,$rem
  279. ldwx $rem($rem_4bit),$rem
  280. shrpw $Zlh,$Zll,4,$Zll
  281. shrpw $Zhl,$Zlh,4,$Zlh
  282. shrpw $Zhh,$Zhl,4,$Zhl
  283. extru $Zhh,27,28,$Zhh
  284. xor $Tll,$Zll,$Zll
  285. xor $Tlh,$Zlh,$Zlh
  286. xor $rem,$Zhh,$Zhh
  287. stw $Zll,12($Xi)
  288. xor $Thl,$Zhl,$Zhl
  289. stw $Zlh,8($Xi)
  290. xor $Thh,$Zhh,$Zhh
  291. stw $Zhl,4($Xi)
  292. stw $Zhh,0($Xi)
  293. ___
  # Common exit of gcm_gmult_4bit: reload the return pointer and %r4-%r6
  # from the slots written by the prologue.  %r3 is reloaded by the
  # return sequence emitted in the next chunk.
  294. $code.=<<___;
  295. L\$done_gmult
  296. $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
  297. $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
  298. $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
  299. $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
  300. ___
  # 32-bit build only: reload the additional registers saved on entry.
  301. $code.=<<___ if ($SIZE_T==4);
  302. $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
  303. $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
  304. $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
  305. $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
  306. $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
  307. ___
  # Return sequence of gcm_gmult_4bit (branch through %r2; %r3 is reloaded
  # by the "modify-before" load with displacement -$FRAME that follows the
  # branch), then the header and standard prologue of gcm_ghash_4bit,
  # which takes four general-register arguments.
  # ENTRY_GR is $NREGS, as for gcm_gmult_4bit: the prologue saves %r3-%r6
  # here and %r7-%r11 only when $SIZE_T==4, so a hard-coded 11 described
  # registers that the 64-bit build never saves.
  308. $code.=<<___;
  309. bv (%r2)
  310. .EXIT
  311. $POPMB -$FRAME(%sp),%r3
  312. .PROCEND
  313. .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
  314. .ALIGN 64
  315. gcm_ghash_4bit
  316. .PROC
  317. .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
  318. .ENTRY
  319. $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
  320. $PUSHMA %r3,$FRAME(%sp)
  321. $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
  322. $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
  323. $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
  324. ___
  # gcm_ghash_4bit, 32-bit build only: save %r7-%r11, which the register
  # map assigns to variables when $SIZE_T==4.
  325. $code.=<<___ if ($SIZE_T==4);
  326. $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
  327. $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
  328. $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
  329. $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
  330. $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
  331. ___
  # Position-independent address of L$rem_4bit (blr writes a code address
  # into $rem_4bit, andcm with 3 clears its two low bits, ldo adds the
  # distance L$rem_4bit-L$pic_ghash).  addl turns $len into $inp+$len,
  # the end-of-input pointer that the outer loops compare $inp against.
  # $mask0xf0 is loaded with 0xf0 for nibble extraction.
  332. $code.=<<___;
  333. blr %r0,$rem_4bit
  334. ldi 3,$rem
  335. L\$pic_ghash
  336. andcm $rem_4bit,$rem,$rem_4bit
  337. addl $inp,$len,$len
  338. ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
  339. ldi 0xf0,$mask0xf0
  340. ___
  # 32-bit build only: run-time choice between the two code paths; the
  # branch enters the 32-bit-register code at L$parisc1_ghash.
  # NOTE(review): presumably a PA-RISC 2.0 CPU nullifies that branch via
  # the conditional extrd and falls through -- confirm against the ISA.
  341. $code.=<<___ if ($SIZE_T==4);
  342. ldi 31,$rem
  343. mtctl $rem,%cr11
  344. extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
  345. b L\$parisc1_ghash
  346. nop
  347. ___
  # gcm_ghash_4bit, 64-bit-register (PA-RISC 2.0) path.
  # The outer loop at L$outer_ghash_pa2 handles one 16-byte input block
  # per pass: each Xi byte is XORed with the input byte at the same
  # offset before being split into the $nhi/$nlo table offsets, and the
  # result is then folded in by the same shift/reduce/XOR steps as in
  # gcm_gmult_4bit.  $rem2 serves as a second remainder register, so two
  # L$rem_4bit look-ups can be in flight at once.
  # After Zhh:Zll is stored to Xi, $inp advances by 16 and is compared
  # with $len (set to inp+len on entry).  The copy in the branch delay
  # slot leaves Zll in $nlo, which is why the outer loop label sits after
  # the initial "ldb 15($Xi)" instead of before it.
  # The output loop rewrites "cmpb,*" to "comb," for the 32-bit build.
  # The instruction order is scheduled by hand -- do not reorder.
  348. $code.=<<___;
  349. ldb 15($Xi),$nlo
  350. ldo 8($Htbl),$Hll
  351. L\$outer_ghash_pa2
  352. ldb 15($inp),$nhi
  353. xor $nhi,$nlo,$nlo
  354. and $mask0xf0,$nlo,$nhi
  355. depd,z $nlo,59,4,$nlo
  356. ldd $nlo($Hll),$Zll
  357. ldd $nlo($Hhh),$Zhh
  358. depd,z $Zll,60,4,$rem
  359. shrpd $Zhh,$Zll,4,$Zll
  360. extrd,u $Zhh,59,60,$Zhh
  361. ldb 14($Xi),$nlo
  362. ldb 14($inp),$byte
  363. ldd $nhi($Hll),$Tll
  364. ldd $nhi($Hhh),$Thh
  365. xor $byte,$nlo,$nlo
  366. and $mask0xf0,$nlo,$nhi
  367. depd,z $nlo,59,4,$nlo
  368. xor $Tll,$Zll,$Zll
  369. xor $Thh,$Zhh,$Zhh
  370. ldd $rem($rem_4bit),$rem
  371. b L\$oop_ghash_pa2
  372. ldi 13,$cnt
  373. .ALIGN 8
  374. L\$oop_ghash_pa2
  375. xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
  376. depd,z $Zll,60,4,$rem2
  377. shrpd $Zhh,$Zll,4,$Zll
  378. extrd,u $Zhh,59,60,$Zhh
  379. ldd $nlo($Hll),$Tll
  380. ldd $nlo($Hhh),$Thh
  381. xor $Tll,$Zll,$Zll
  382. xor $Thh,$Zhh,$Zhh
  383. ldbx $cnt($Xi),$nlo
  384. ldbx $cnt($inp),$byte
  385. depd,z $Zll,60,4,$rem
  386. shrpd $Zhh,$Zll,4,$Zll
  387. ldd $rem2($rem_4bit),$rem2
  388. xor $rem2,$Zhh,$Zhh
  389. xor $byte,$nlo,$nlo
  390. ldd $nhi($Hll),$Tll
  391. ldd $nhi($Hhh),$Thh
  392. and $mask0xf0,$nlo,$nhi
  393. depd,z $nlo,59,4,$nlo
  394. extrd,u $Zhh,59,60,$Zhh
  395. xor $Tll,$Zll,$Zll
  396. ldd $rem($rem_4bit),$rem
  397. addib,uv -1,$cnt,L\$oop_ghash_pa2
  398. xor $Thh,$Zhh,$Zhh
  399. xor $rem,$Zhh,$Zhh
  400. depd,z $Zll,60,4,$rem2
  401. shrpd $Zhh,$Zll,4,$Zll
  402. extrd,u $Zhh,59,60,$Zhh
  403. ldd $nlo($Hll),$Tll
  404. ldd $nlo($Hhh),$Thh
  405. xor $Tll,$Zll,$Zll
  406. xor $Thh,$Zhh,$Zhh
  407. depd,z $Zll,60,4,$rem
  408. shrpd $Zhh,$Zll,4,$Zll
  409. ldd $rem2($rem_4bit),$rem2
  410. xor $rem2,$Zhh,$Zhh
  411. ldd $nhi($Hll),$Tll
  412. ldd $nhi($Hhh),$Thh
  413. extrd,u $Zhh,59,60,$Zhh
  414. xor $Tll,$Zll,$Zll
  415. xor $Thh,$Zhh,$Zhh
  416. ldd $rem($rem_4bit),$rem
  417. xor $rem,$Zhh,$Zhh
  418. std $Zll,8($Xi)
  419. ldo 16($inp),$inp
  420. std $Zhh,0($Xi)
  421. cmpb,*<> $inp,$len,L\$outer_ghash_pa2
  422. copy $Zll,$nlo
  423. ___
  # gcm_ghash_4bit, 32-bit-register path (emitted for $SIZE_T==4 only).
  # The leading branch ends the 64-bit-register code above by jumping to
  # the common exit, so that code does not fall into L$parisc1_ghash.
  # One pass of L$outer_ghash_pa1 handles one 16-byte input block: each
  # Xi byte is XORed with the input byte at the same offset ($byte), then
  # processed nibble by nibble with the four-word representation
  # (Zhh:Zhl:Zlh:Zll, table bases $Hhh/$Hhl/$Hlh/$Hll at offsets
  # 0/4/8/12).  After the four result words are stored, $inp advances by
  # 16 and is compared with the end pointer in $len; the copy in the
  # branch delay slot leaves Zll in $nlo for the next pass, whose entry
  # point lies after the initial "ldb 15($Xi)".
  # Loads and XORs of neighbouring steps are interleaved by hand -- do
  # not reorder.
  424. $code.=<<___ if ($SIZE_T==4);
  425. b L\$done_ghash
  426. nop
  427. L\$parisc1_ghash
  428. ldb 15($Xi),$nlo
  429. ldo 12($Htbl),$Hll
  430. ldo 8($Htbl),$Hlh
  431. ldo 4($Htbl),$Hhl
  432. L\$outer_ghash_pa1
  433. ldb 15($inp),$byte
  434. xor $byte,$nlo,$nlo
  435. and $mask0xf0,$nlo,$nhi
  436. zdep $nlo,27,4,$nlo
  437. ldwx $nlo($Hll),$Zll
  438. ldwx $nlo($Hlh),$Zlh
  439. ldwx $nlo($Hhl),$Zhl
  440. ldwx $nlo($Hhh),$Zhh
  441. zdep $Zll,28,4,$rem
  442. ldb 14($Xi),$nlo
  443. ldb 14($inp),$byte
  444. ldwx $rem($rem_4bit),$rem
  445. shrpw $Zlh,$Zll,4,$Zll
  446. ldwx $nhi($Hll),$Tll
  447. shrpw $Zhl,$Zlh,4,$Zlh
  448. ldwx $nhi($Hlh),$Tlh
  449. shrpw $Zhh,$Zhl,4,$Zhl
  450. ldwx $nhi($Hhl),$Thl
  451. extru $Zhh,27,28,$Zhh
  452. ldwx $nhi($Hhh),$Thh
  453. xor $byte,$nlo,$nlo
  454. xor $rem,$Zhh,$Zhh
  455. and $mask0xf0,$nlo,$nhi
  456. zdep $nlo,27,4,$nlo
  457. xor $Tll,$Zll,$Zll
  458. ldwx $nlo($Hll),$Tll
  459. xor $Tlh,$Zlh,$Zlh
  460. ldwx $nlo($Hlh),$Tlh
  461. xor $Thl,$Zhl,$Zhl
  462. b L\$oop_ghash_pa1
  463. ldi 13,$cnt
  464. .ALIGN 8
  465. L\$oop_ghash_pa1
  466. zdep $Zll,28,4,$rem
  467. ldwx $nlo($Hhl),$Thl
  468. xor $Thh,$Zhh,$Zhh
  469. ldwx $rem($rem_4bit),$rem
  470. shrpw $Zlh,$Zll,4,$Zll
  471. ldwx $nlo($Hhh),$Thh
  472. shrpw $Zhl,$Zlh,4,$Zlh
  473. ldbx $cnt($Xi),$nlo
  474. xor $Tll,$Zll,$Zll
  475. ldwx $nhi($Hll),$Tll
  476. shrpw $Zhh,$Zhl,4,$Zhl
  477. ldbx $cnt($inp),$byte
  478. xor $Tlh,$Zlh,$Zlh
  479. ldwx $nhi($Hlh),$Tlh
  480. extru $Zhh,27,28,$Zhh
  481. xor $Thl,$Zhl,$Zhl
  482. ldwx $nhi($Hhl),$Thl
  483. xor $rem,$Zhh,$Zhh
  484. zdep $Zll,28,4,$rem
  485. xor $Thh,$Zhh,$Zhh
  486. ldwx $nhi($Hhh),$Thh
  487. shrpw $Zlh,$Zll,4,$Zll
  488. ldwx $rem($rem_4bit),$rem
  489. shrpw $Zhl,$Zlh,4,$Zlh
  490. xor $byte,$nlo,$nlo
  491. shrpw $Zhh,$Zhl,4,$Zhl
  492. and $mask0xf0,$nlo,$nhi
  493. extru $Zhh,27,28,$Zhh
  494. zdep $nlo,27,4,$nlo
  495. xor $Tll,$Zll,$Zll
  496. ldwx $nlo($Hll),$Tll
  497. xor $Tlh,$Zlh,$Zlh
  498. ldwx $nlo($Hlh),$Tlh
  499. xor $rem,$Zhh,$Zhh
  500. addib,uv -1,$cnt,L\$oop_ghash_pa1
  501. xor $Thl,$Zhl,$Zhl
  502. zdep $Zll,28,4,$rem
  503. ldwx $nlo($Hhl),$Thl
  504. xor $Thh,$Zhh,$Zhh
  505. ldwx $rem($rem_4bit),$rem
  506. shrpw $Zlh,$Zll,4,$Zll
  507. ldwx $nlo($Hhh),$Thh
  508. shrpw $Zhl,$Zlh,4,$Zlh
  509. xor $Tll,$Zll,$Zll
  510. ldwx $nhi($Hll),$Tll
  511. shrpw $Zhh,$Zhl,4,$Zhl
  512. xor $Tlh,$Zlh,$Zlh
  513. ldwx $nhi($Hlh),$Tlh
  514. extru $Zhh,27,28,$Zhh
  515. xor $rem,$Zhh,$Zhh
  516. xor $Thl,$Zhl,$Zhl
  517. ldwx $nhi($Hhl),$Thl
  518. xor $Thh,$Zhh,$Zhh
  519. ldwx $nhi($Hhh),$Thh
  520. zdep $Zll,28,4,$rem
  521. ldwx $rem($rem_4bit),$rem
  522. shrpw $Zlh,$Zll,4,$Zll
  523. shrpw $Zhl,$Zlh,4,$Zlh
  524. shrpw $Zhh,$Zhl,4,$Zhl
  525. extru $Zhh,27,28,$Zhh
  526. xor $Tll,$Zll,$Zll
  527. xor $Tlh,$Zlh,$Zlh
  528. xor $rem,$Zhh,$Zhh
  529. stw $Zll,12($Xi)
  530. xor $Thl,$Zhl,$Zhl
  531. stw $Zlh,8($Xi)
  532. xor $Thh,$Zhh,$Zhh
  533. stw $Zhl,4($Xi)
  534. ldo 16($inp),$inp
  535. stw $Zhh,0($Xi)
  536. comb,<> $inp,$len,L\$outer_ghash_pa1
  537. copy $Zll,$nlo
  538. ___
  # Common exit of gcm_ghash_4bit: reload the return pointer and %r4-%r6
  # from the slots written by the prologue.  %r3 is reloaded by the
  # return sequence emitted in the next chunk.
  539. $code.=<<___;
  540. L\$done_ghash
  541. $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
  542. $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
  543. $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
  544. $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
  545. ___
  # 32-bit build only: reload the additional registers saved on entry.
  546. $code.=<<___ if ($SIZE_T==4);
  547. $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
  548. $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
  549. $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
  550. $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
  551. $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
  552. ___
  # Return sequence of gcm_ghash_4bit, followed by the shared data.
  # L$rem_4bit has sixteen 8-byte entries, each written as two words: a
  # 16-bit constant shifted into the upper half of the first word, then
  # a zero word.  Both routines index it with the four bits shifted out
  # of Z, scaled by 8.  The shifts are backticked, so the output loop at
  # the end of this script evaluates them.
  # The identification string now reads "CRYPTOGAMS", matching the
  # project name in the file header; it previously said "GRYPTOGAMS".
  # The length is unchanged, so the layout is not affected.
  553. $code.=<<___;
  554. bv (%r2)
  555. .EXIT
  556. $POPMB -$FRAME(%sp),%r3
  557. .PROCEND
  558. .ALIGN 64
  559. L\$rem_4bit
  560. .WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
  561. .WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
  562. .WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
  563. .WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
  564. .STRINGZ "GHASH for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
  565. .ALIGN 64
  566. ___
  567. # Explicitly encode PA-RISC 2.0 instructions used in this module, so
  568. # that it can be compiled with .LEVEL 1.0. It should be noted that I
  569. # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
  570. # directive...
  # Encoder for the PA-RISC 2.0 "ldd" instruction.
  #   $mod  - completer text following the mnemonic ("" or e.g. ",mb")
  #   $args - operand text
  # Returns a ".WORD 0x........" line carrying the original instruction
  # as a trailing comment, or the instruction unchanged (tab-prefixed)
  # when the operands match neither supported form.
  my $ldd = sub {
      my ($mod, $args) = @_;
      my $orig = "ldd$mod\t$args";
      my $word;

      if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {        # format 4
          my ($index, $base, $target) = ($1, $2, $3);
          $word = (0x03<<26) | ($base<<21) | ($index<<16) | (3<<6) | $target;
      }
      elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {    # format 5
          my ($disp, $base, $target) = ($1, $2, $3);
          $word  = (0x03<<26) | ($base<<21) | (1<<12) | (3<<6) | $target;
          # encode offset: low four bits at bit 17, bit 4 at bit 16
          $word |= (($disp & 0xF)<<17) | (($disp & 0x10)<<12);
          $word |= (1<<5)  if ($mod =~ /^,m/);
          $word |= (1<<13) if ($mod =~ /^,mb/);
      }

      return "\t".$orig unless defined $word;
      return sprintf "\t.WORD\t0x%08x\t; %s", $word, $orig;
  };
  # Encoder for the PA-RISC 2.0 "std" instruction.
  #   $mod  - completer text following the mnemonic
  #   $args - operand text, "%rS,disp(%rB)"
  # Returns a ".WORD 0x........" line carrying the original instruction
  # as a trailing comment, or the instruction unchanged (tab-prefixed)
  # when the operands do not match.
  my $std = sub {
      my ($mod, $args) = @_;
      my $orig = "std$mod\t$args";

      # format 3 suffices
      return "\t".$orig
          unless $args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/;

      my ($source, $disp, $base) = ($1, $2, $3);
      my $word = (0x1c<<26) | ($base<<21) | ($source<<16)
               | (($disp & 0x1FF8)<<1) | (($disp>>13) & 1);
      return sprintf "\t.WORD\t0x%08x\t; %s", $word, $orig;
  };
  # Encoder for the PA-RISC 2.0 "extrd" instruction, with either a fixed
  # bit position or %sar as the position operand.
  #   $mod  - completer text following the mnemonic
  #   $args - operand text
  # Returns a ".WORD 0x........" line carrying the original instruction
  # as a trailing comment, or the instruction unchanged (tab-prefixed)
  # when the operands match neither supported form.
  my $extrd = sub {
      my ($mod, $args) = @_;
      my $orig = "extrd$mod\t$args";
      my $word;

      # I only have ",u" completer, it's implicitly encoded...
      if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {    # format 15
          my ($source, $pos, $width, $target) = ($1, $2, $3, $4);
          my $len = 32 - $width;
          $word  = (0x36<<26) | ($source<<21) | ($target<<16);
          $word |= (($pos & 0x20)<<6) | (($pos & 0x1f)<<5);        # encode pos
          $word |= (($len & 0x20)<<7) | ($len & 0x1f);             # encode len
      }
      elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {     # format 12
          my ($source, $width, $target) = ($1, $2, $3);
          my $len = 32 - $width;
          $word  = (0x34<<26) | ($source<<21) | ($target<<16) | (2<<11) | (1<<9);
          $word |= (($len & 0x20)<<3) | ($len & 0x1f);             # encode len
          $word |= (1<<13) if ($mod =~ /,\**=/);
      }

      return "\t".$orig unless defined $word;
      return sprintf "\t.WORD\t0x%08x\t; %s", $word, $orig;
  };
  # Encoder for the PA-RISC 2.0 "shrpd" instruction, with either a fixed
  # shift amount or %sar as the shift operand.
  #   $mod  - completer text following the mnemonic
  #   $args - operand text
  # Returns a ".WORD 0x........" line carrying the original instruction
  # as a trailing comment, or the instruction unchanged (tab-prefixed)
  # when the operands match neither supported form.
  my $shrpd = sub {
      my ($mod, $args) = @_;
      my $orig = "shrpd$mod\t$args";
      my $word;

      if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) {  # format 14
          my ($first, $second, $shift, $target) = ($1, $2, $3, $4);
          my $cpos = 63 - $shift;
          $word  = (0x34<<26) | ($second<<21) | ($first<<16) | (1<<10) | $target;
          $word |= (($cpos & 0x20)<<6) | (($cpos & 0x1f)<<5);      # encode sa
      }
      elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) {   # format 11
          my ($first, $second, $target) = ($1, $2, $3);
          $word = (0x34<<26) | ($second<<21) | ($first<<16) | (1<<9) | $target;
      }

      return "\t".$orig unless defined $word;
      return sprintf "\t.WORD\t0x%08x\t; %s", $word, $orig;
  };
  # Encoder for the PA-RISC 2.0 "depd" instruction with a fixed position.
  #   $mod  - completer text following the mnemonic
  #   $args - operand text, "%rS,pos,len,%rT"
  # Returns a ".WORD 0x........" line carrying the original instruction
  # as a trailing comment, or the instruction unchanged (tab-prefixed)
  # when the operands do not match.
  my $depd = sub {
      my ($mod, $args) = @_;
      my $orig = "depd$mod\t$args";

      # I only have ",z" completer, it's implicitly encoded...
      return "\t".$orig
          unless $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/;  # format 16

      my ($source, $pos, $width, $target) = ($1, $2, $3, $4);
      my $cpos = 63 - $pos;
      my $len  = 32 - $width;
      my $word = (0x3c<<26) | ($target<<21) | ($source<<16);
      $word |= (($cpos & 0x20)<<6) | (($cpos & 0x1f)<<5);          # encode pos
      $word |= (($len  & 0x20)<<7) | ($len & 0x1f);                # encode len
      return sprintf "\t.WORD\t0x%08x\t; %s", $word, $orig;
  };
  # Turn one parsed instruction into an output line.
  #   $mnemonic - instruction name
  #   $mod      - completer text directly following the mnemonic
  #   $args     - operand text
  # If a variable named after the mnemonic holds a code reference (the
  # encoder closures defined above), its result is returned; otherwise
  # the instruction is re-emitted as "\t<mnemonic><mod>\t<args>".
  sub assemble {
      my ($mnemonic, $mod, $args) = @_;

      # String eval: the encoder is looked up by variable name.
      my $encoder = eval("\$$mnemonic");

      return $encoder->($mod, $args) if ref($encoder) eq 'CODE';
      return "\t$mnemonic$mod\t$args";
  }
  # Ask the compiler driver named by $ENV{CC} which assembler it runs;
  # $gnuas is set only when the reply mentions "GNU assembler".
  $gnuas = 1
      if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
          =~ /GNU assembler/);

  # Post-process the generated text line by line and print it.
  foreach my $line (split("\n", $code)) {
      # Evaluate `...` expressions (frame offsets, table constants).
      $line =~ s/\`([^\`]*)\`/eval $1/ge;

      if ($SIZE_T == 4) {
          # 32-bit build: pass every indented instruction through
          # assemble(), then strip the 64-bit-only condition syntax.
          $line =~ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e;
          $line =~ s/cmpb,\*/comb,/;
          $line =~ s/,\*/,/;
      }

      if ($gnuas && $SIZE_T == 8) {
          # 64-bit build with GNU as: adapt the directives.
          $line =~ s/(\.LEVEL\s+2\.0)W/$1w/;
          $line =~ s/\.SPACE\s+\$TEXT\$/.text/;
          $line =~ s/\.SUBSPA.*//;
      }

      $line =~ s/\bbv\b/bve/ if ($SIZE_T == 8);

      print $line, "\n";
  }

  close STDOUT or die "error closing STDOUT: $!";