ghash-parisc.pl 16 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730
  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  5. # project. The module is, however, dual licensed under OpenSSL and
  6. # CRYPTOGAMS licenses depending on where you obtain it. For further
  7. # details see http://www.openssl.org/~appro/cryptogams/.
  8. # ====================================================================
  9. #
  10. # April 2010
  11. #
  12. # The module implements "4-bit" GCM GHASH function and underlying
  13. # single multiplication operation in GF(2^128). "4-bit" means that it
  14. # uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
  15. # it processes one byte in 19.6 cycles, which is more than twice as
  16. # fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
  17. # 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
  18. # processed byte. This is ~2.2x faster than 64-bit code generated by
  19. # vendor compiler (which used to be very hard to beat:-).
  20. #
  21. # Special thanks to polarhome.com for providing HP-UX account.
  # Command line: <flavour> [<output file>].
  # $flavour selects the 32- or 64-bit ABI parameters below; $output, when
  # given, names the file that receives the generated assembly.
  $flavour = shift;
  $output  = shift;

  # Redirect STDOUT with the three-argument form so that characters in the
  # file name can never be taken for an open mode, and stop on failure
  # instead of silently printing the assembly somewhere else.  Without an
  # output name the generated code simply goes to the existing STDOUT.
  if (defined $output && length $output) {
      open STDOUT, '>', $output
          or die "can't open $output: $!";
  }
  # ABI-dependent parameters, chosen by whether the flavour string contains
  # "64".  In order: assembler .LEVEL, register size in bytes, frame marker
  # size, distance below %sp at which the return pointer is saved, the
  # store / store-and-modify / load / load-and-modify mnemonics, and the
  # ENTRY_GR value announced in .CALLINFO.
  # (The 32-bit level string once had "\n\t.ALLOW\t2.0" appended; that
  # variant is not used.)
  ($LEVEL, $SIZE_T, $FRAME_MARKER, $SAVED_RP,
   $PUSH,  $PUSHMA, $POP,          $POPMB,    $NREGS) =
      ($flavour =~ /64/)
          ? ("2.0W", 8, 80, 16, "std", "std,ma", "ldd", "ldd,mb", 6)
          : ("1.0",  4, 48, 20, "stw", "stwm",   "ldw", "ldwm",   11);

  # Frame size: room for ten registers plus the frame marker
  # [+ argument transfer].
  $FRAME = $FRAME_MARKER + 10*$SIZE_T;
  # Symbolic register names interpolated into the assembly templates.

  # Argument block.
  ($Xi, $Htbl, $inp, $len) = ("%r26", "%r25", "%r24", "%r23");

  # Working variables; $Hhh is an alias for the $Htbl argument register.
  $Hhh = $Htbl;
  ($Hll, $Zhh, $Zll, $cnt)     = ("%r22", "%r21", "%r20", "%r19");
  ($rem_4bit, $rem, $mask0xf0) = ("%r28", "%r29", "%r31");

  # Further working registers, %r1 through %r5.
  ($Thh, $Tll, $nlo, $nhi, $byte) = map { "%r$_" } 1 .. 5;

  # The 32-bit code keeps the middle words of Z, H and T in %r6-%r11.
  ($Zhl, $Zlh, $Hhl, $Hlh, $Thl, $Tlh) = map { "%r$_" } 6 .. 11
      if ($SIZE_T==4);

  # Second remainder register, used in PA-RISC 2.0 code.
  $rem2 = "%r6";
  # Start of the generated assembly: level/space directives, the export of
  # gcm_gmult_4bit (two general-register arguments) and its prologue.  The
  # prologue stores %r2 at -$SAVED_RP(%sp), stores %r3 with the modify form
  # ($PUSHMA) using displacement $FRAME, and places %r4-%r6 at
  # -$FRAME+k*$SIZE_T(%sp).
  76. $code.=<<___;
  77. .LEVEL $LEVEL
  78. .SPACE \$TEXT\$
  79. .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
  80. .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
  81. .ALIGN 64
  82. gcm_gmult_4bit
  83. .PROC
  84. .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
  85. .ENTRY
  86. $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
  87. $PUSHMA %r3,$FRAME(%sp)
  88. $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
  89. $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
  90. $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
  91. ___
  # 32-bit flavour only: also store %r7-%r11, which the SIZE_T==4 register
  # map hands out for the middle words of Z, H and T.
  92. $code.=<<___ if ($SIZE_T==4);
  93. $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
  94. $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
  95. $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
  96. $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
  97. $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
  98. ___
  # Position-independent address of L$rem_4bit: blr links an address into
  # $rem_4bit, andcm clears the bits selected by the mask 3 held in $rem,
  # and ldo adds the L$rem_4bit-L$pic_gmult distance.  $mask0xf0 is loaded
  # with 0xf0.  The addl forms $inp+$len in $len; neither register is
  # referenced again before L$done_gmult.
  99. $code.=<<___;
  100. blr %r0,$rem_4bit
  101. ldi 3,$rem
  102. L\$pic_gmult
  103. andcm $rem_4bit,$rem,$rem_4bit
  104. addl $inp,$len,$len
  105. ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
  106. ldi 0xf0,$mask0xf0
  107. ___
  # 32-bit flavour only: run-time choice between the two code paths.  %cr11
  # is set to 31 and a conditional extrd (which, per the inline note, also
  # executes on PA-RISC 1.0) decides whether the branch to L$parisc1_gmult
  # is taken.
  # NOTE(review): presumably the extract yields zero on PA-RISC 2.0 so the
  # branch is nullified there - confirm against the architecture manual.
  108. $code.=<<___ if ($SIZE_T==4);
  109. ldi 31,$rem
  110. mtctl $rem,%cr11
  111. extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
  112. b L\$parisc1_gmult
  113. nop
  114. ___
  # PA-RISC 2.0 path of gcm_gmult_4bit (emitted for both flavours).
  # $Hll is set to $Htbl+8 and table entries are fetched as 64-bit halves
  # with ldd, indexed by the low nibble of an Xi byte (moved up by depd,z
  # ...,59,4) and by its high nibble (masked with $mask0xf0).  Bytes 15 and
  # 14 of Xi are loaded explicitly; the loop then loads Xi bytes with ldbx
  # using $cnt, which starts at 13 and is decremented by addib.  Every step
  # shifts the Zhh:Zll pair right by 4 (shrpd / extrd,u) and XORs in an
  # L$rem_4bit entry indexed by the low 4 bits of $Zll (depd,z ...,60,4).
  # The result is written back to Xi with two std.
  115. $code.=<<___;
  116. ldb 15($Xi),$nlo
  117. ldo 8($Htbl),$Hll
  118. and $mask0xf0,$nlo,$nhi
  119. depd,z $nlo,59,4,$nlo
  120. ldd $nlo($Hll),$Zll
  121. ldd $nlo($Hhh),$Zhh
  122. depd,z $Zll,60,4,$rem
  123. shrpd $Zhh,$Zll,4,$Zll
  124. extrd,u $Zhh,59,60,$Zhh
  125. ldb 14($Xi),$nlo
  126. ldd $nhi($Hll),$Tll
  127. ldd $nhi($Hhh),$Thh
  128. and $mask0xf0,$nlo,$nhi
  129. depd,z $nlo,59,4,$nlo
  130. xor $Tll,$Zll,$Zll
  131. xor $Thh,$Zhh,$Zhh
  132. ldd $rem($rem_4bit),$rem
  133. b L\$oop_gmult_pa2
  134. ldi 13,$cnt
  135. .ALIGN 8
  136. L\$oop_gmult_pa2
  137. xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
  138. depd,z $Zll,60,4,$rem
  139. shrpd $Zhh,$Zll,4,$Zll
  140. extrd,u $Zhh,59,60,$Zhh
  141. ldd $nlo($Hll),$Tll
  142. ldd $nlo($Hhh),$Thh
  143. xor $Tll,$Zll,$Zll
  144. xor $Thh,$Zhh,$Zhh
  145. ldd $rem($rem_4bit),$rem
  146. xor $rem,$Zhh,$Zhh
  147. depd,z $Zll,60,4,$rem
  148. ldbx $cnt($Xi),$nlo
  149. shrpd $Zhh,$Zll,4,$Zll
  150. extrd,u $Zhh,59,60,$Zhh
  151. ldd $nhi($Hll),$Tll
  152. ldd $nhi($Hhh),$Thh
  153. and $mask0xf0,$nlo,$nhi
  154. depd,z $nlo,59,4,$nlo
  155. ldd $rem($rem_4bit),$rem
  156. xor $Tll,$Zll,$Zll
  157. addib,uv -1,$cnt,L\$oop_gmult_pa2
  158. xor $Thh,$Zhh,$Zhh
  159. xor $rem,$Zhh,$Zhh
  160. depd,z $Zll,60,4,$rem
  161. shrpd $Zhh,$Zll,4,$Zll
  162. extrd,u $Zhh,59,60,$Zhh
  163. ldd $nlo($Hll),$Tll
  164. ldd $nlo($Hhh),$Thh
  165. xor $Tll,$Zll,$Zll
  166. xor $Thh,$Zhh,$Zhh
  167. ldd $rem($rem_4bit),$rem
  168. xor $rem,$Zhh,$Zhh
  169. depd,z $Zll,60,4,$rem
  170. shrpd $Zhh,$Zll,4,$Zll
  171. extrd,u $Zhh,59,60,$Zhh
  172. ldd $nhi($Hll),$Tll
  173. ldd $nhi($Hhh),$Thh
  174. xor $Tll,$Zll,$Zll
  175. xor $Thh,$Zhh,$Zhh
  176. ldd $rem($rem_4bit),$rem
  177. xor $rem,$Zhh,$Zhh
  178. std $Zll,8($Xi)
  179. std $Zhh,0($Xi)
  180. ___
  # 32-bit flavour only.  First a branch that takes the 2.0 path above to
  # L$done_gmult, then the L$parisc1_gmult path: the same nibble-by-nibble
  # scheme, but with Z and T held as four 32-bit words each and three extra
  # table pointers ($Hll, $Hlh, $Hhl at $Htbl+12, +8 and +4).  Loads use
  # ldwx, the 4-bit shift is done word by word with shrpw/extru, and the
  # result is written to Xi with four stw.  As in the 2.0 path, Xi bytes 15
  # and 14 are loaded explicitly and the loop counter $cnt starts at 13.
  181. $code.=<<___ if ($SIZE_T==4);
  182. b L\$done_gmult
  183. nop
  184. L\$parisc1_gmult
  185. ldb 15($Xi),$nlo
  186. ldo 12($Htbl),$Hll
  187. ldo 8($Htbl),$Hlh
  188. ldo 4($Htbl),$Hhl
  189. and $mask0xf0,$nlo,$nhi
  190. zdep $nlo,27,4,$nlo
  191. ldwx $nlo($Hll),$Zll
  192. ldwx $nlo($Hlh),$Zlh
  193. ldwx $nlo($Hhl),$Zhl
  194. ldwx $nlo($Hhh),$Zhh
  195. zdep $Zll,28,4,$rem
  196. ldb 14($Xi),$nlo
  197. ldwx $rem($rem_4bit),$rem
  198. shrpw $Zlh,$Zll,4,$Zll
  199. ldwx $nhi($Hll),$Tll
  200. shrpw $Zhl,$Zlh,4,$Zlh
  201. ldwx $nhi($Hlh),$Tlh
  202. shrpw $Zhh,$Zhl,4,$Zhl
  203. ldwx $nhi($Hhl),$Thl
  204. extru $Zhh,27,28,$Zhh
  205. ldwx $nhi($Hhh),$Thh
  206. xor $rem,$Zhh,$Zhh
  207. and $mask0xf0,$nlo,$nhi
  208. zdep $nlo,27,4,$nlo
  209. xor $Tll,$Zll,$Zll
  210. ldwx $nlo($Hll),$Tll
  211. xor $Tlh,$Zlh,$Zlh
  212. ldwx $nlo($Hlh),$Tlh
  213. xor $Thl,$Zhl,$Zhl
  214. b L\$oop_gmult_pa1
  215. ldi 13,$cnt
  216. .ALIGN 8
  217. L\$oop_gmult_pa1
  218. zdep $Zll,28,4,$rem
  219. ldwx $nlo($Hhl),$Thl
  220. xor $Thh,$Zhh,$Zhh
  221. ldwx $rem($rem_4bit),$rem
  222. shrpw $Zlh,$Zll,4,$Zll
  223. ldwx $nlo($Hhh),$Thh
  224. shrpw $Zhl,$Zlh,4,$Zlh
  225. ldbx $cnt($Xi),$nlo
  226. xor $Tll,$Zll,$Zll
  227. ldwx $nhi($Hll),$Tll
  228. shrpw $Zhh,$Zhl,4,$Zhl
  229. xor $Tlh,$Zlh,$Zlh
  230. ldwx $nhi($Hlh),$Tlh
  231. extru $Zhh,27,28,$Zhh
  232. xor $Thl,$Zhl,$Zhl
  233. ldwx $nhi($Hhl),$Thl
  234. xor $rem,$Zhh,$Zhh
  235. zdep $Zll,28,4,$rem
  236. xor $Thh,$Zhh,$Zhh
  237. ldwx $nhi($Hhh),$Thh
  238. shrpw $Zlh,$Zll,4,$Zll
  239. ldwx $rem($rem_4bit),$rem
  240. shrpw $Zhl,$Zlh,4,$Zlh
  241. shrpw $Zhh,$Zhl,4,$Zhl
  242. and $mask0xf0,$nlo,$nhi
  243. extru $Zhh,27,28,$Zhh
  244. zdep $nlo,27,4,$nlo
  245. xor $Tll,$Zll,$Zll
  246. ldwx $nlo($Hll),$Tll
  247. xor $Tlh,$Zlh,$Zlh
  248. ldwx $nlo($Hlh),$Tlh
  249. xor $rem,$Zhh,$Zhh
  250. addib,uv -1,$cnt,L\$oop_gmult_pa1
  251. xor $Thl,$Zhl,$Zhl
  252. zdep $Zll,28,4,$rem
  253. ldwx $nlo($Hhl),$Thl
  254. xor $Thh,$Zhh,$Zhh
  255. ldwx $rem($rem_4bit),$rem
  256. shrpw $Zlh,$Zll,4,$Zll
  257. ldwx $nlo($Hhh),$Thh
  258. shrpw $Zhl,$Zlh,4,$Zlh
  259. xor $Tll,$Zll,$Zll
  260. ldwx $nhi($Hll),$Tll
  261. shrpw $Zhh,$Zhl,4,$Zhl
  262. xor $Tlh,$Zlh,$Zlh
  263. ldwx $nhi($Hlh),$Tlh
  264. extru $Zhh,27,28,$Zhh
  265. xor $rem,$Zhh,$Zhh
  266. xor $Thl,$Zhl,$Zhl
  267. ldwx $nhi($Hhl),$Thl
  268. xor $Thh,$Zhh,$Zhh
  269. ldwx $nhi($Hhh),$Thh
  270. zdep $Zll,28,4,$rem
  271. ldwx $rem($rem_4bit),$rem
  272. shrpw $Zlh,$Zll,4,$Zll
  273. shrpw $Zhl,$Zlh,4,$Zlh
  274. shrpw $Zhh,$Zhl,4,$Zhl
  275. extru $Zhh,27,28,$Zhh
  276. xor $Tll,$Zll,$Zll
  277. xor $Tlh,$Zlh,$Zlh
  278. xor $rem,$Zhh,$Zhh
  279. stw $Zll,12($Xi)
  280. xor $Thl,$Zhl,$Zhl
  281. stw $Zlh,8($Xi)
  282. xor $Thh,$Zhh,$Zhh
  283. stw $Zhl,4($Xi)
  284. stw $Zhh,0($Xi)
  285. ___
  # Common exit of gcm_gmult_4bit: reload %r2 from -$FRAME-$SAVED_RP(%sp)
  # and %r4-%r6 from the slots the prologue stored them in.
  286. $code.=<<___;
  287. L\$done_gmult
  288. $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
  289. $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
  290. $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
  291. $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
  292. ___
  # 32-bit flavour only: reload %r7-%r11 as well.
  293. $code.=<<___ if ($SIZE_T==4);
  294. $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
  295. $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
  296. $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
  297. $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
  298. $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
  299. ___
  # Return from gcm_gmult_4bit (%r3 is reloaded with the modify-before form
  # right after the bv) and open gcm_ghash_4bit, which takes four
  # general-register arguments.  Its prologue mirrors gcm_gmult_4bit: %r2 is
  # stored at -$SAVED_RP(%sp), %r3 with the modify form, %r4-%r6 into the
  # frame.
  #
  # ENTRY_GR is taken from $NREGS, exactly as for gcm_gmult_4bit, instead of
  # being hard-coded to 11: the 64-bit flavour stores only %r3-%r6, so the
  # procedure descriptor must not announce registers up to %r11 as saved.
  # For the 32-bit flavour $NREGS is 11 and the output is unchanged.
  #
  # Labels start in column 0 and every instruction or directive is indented:
  # the post-processing substitution only recognises instructions that are
  # preceded by whitespace.
  $code.=<<___;
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND
	.EXPORT	gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
	.ALIGN	64
gcm_ghash_4bit
	.PROC
	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
___
  # 32-bit flavour only: gcm_ghash_4bit also stores %r7-%r11 in its frame.
  317. $code.=<<___ if ($SIZE_T==4);
  318. $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
  319. $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
  320. $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
  321. $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
  322. $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
  323. ___
  # Position-independent address of L$rem_4bit, computed the same way as in
  # gcm_gmult_4bit but relative to L$pic_ghash.  The addl turns $len into
  # $inp+$len, the value the outer loops compare $inp against; $mask0xf0 is
  # loaded with 0xf0.
  324. $code.=<<___;
  325. blr %r0,$rem_4bit
  326. ldi 3,$rem
  327. L\$pic_ghash
  328. andcm $rem_4bit,$rem,$rem_4bit
  329. addl $inp,$len,$len
  330. ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
  331. ldi 0xf0,$mask0xf0
  332. ___
  # 32-bit flavour only: run-time choice between the two code paths, using
  # the same %cr11 / conditional extrd sequence as gcm_gmult_4bit, with
  # L$parisc1_ghash as the branch target.
  # NOTE(review): presumably the branch is nullified on PA-RISC 2.0
  # processors - confirm against the architecture manual.
  333. $code.=<<___ if ($SIZE_T==4);
  334. ldi 31,$rem
  335. mtctl $rem,%cr11
  336. extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
  337. b L\$parisc1_ghash
  338. nop
  339. ___
  # PA-RISC 2.0 path of gcm_ghash_4bit (emitted for both flavours).
  # Each pass of L$outer_ghash_pa2 handles one 16-byte block: every Xi byte
  # is XORed with the $inp byte at the same offset before being split into
  # its two nibbles, and the table lookups / 4-bit shifts then follow the
  # pattern of the gcm_gmult_4bit 2.0 path, with $rem2 as a second remainder
  # register.  Bytes 15 and 14 are loaded explicitly; the inner loop loads
  # the rest with ldbx using $cnt, which starts at 13.  After the two std to
  # Xi, $inp is advanced by 16 and cmpb loops while $inp differs from $len.
  # The copy in the branch delay slot puts $Zll into $nlo.
  # NOTE(review): presumably so that its low byte stands in for Xi[15] on
  # the next pass, which does not reload that byte - confirm.
  340. $code.=<<___;
  341. ldb 15($Xi),$nlo
  342. ldo 8($Htbl),$Hll
  343. L\$outer_ghash_pa2
  344. ldb 15($inp),$nhi
  345. xor $nhi,$nlo,$nlo
  346. and $mask0xf0,$nlo,$nhi
  347. depd,z $nlo,59,4,$nlo
  348. ldd $nlo($Hll),$Zll
  349. ldd $nlo($Hhh),$Zhh
  350. depd,z $Zll,60,4,$rem
  351. shrpd $Zhh,$Zll,4,$Zll
  352. extrd,u $Zhh,59,60,$Zhh
  353. ldb 14($Xi),$nlo
  354. ldb 14($inp),$byte
  355. ldd $nhi($Hll),$Tll
  356. ldd $nhi($Hhh),$Thh
  357. xor $byte,$nlo,$nlo
  358. and $mask0xf0,$nlo,$nhi
  359. depd,z $nlo,59,4,$nlo
  360. xor $Tll,$Zll,$Zll
  361. xor $Thh,$Zhh,$Zhh
  362. ldd $rem($rem_4bit),$rem
  363. b L\$oop_ghash_pa2
  364. ldi 13,$cnt
  365. .ALIGN 8
  366. L\$oop_ghash_pa2
  367. xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
  368. depd,z $Zll,60,4,$rem2
  369. shrpd $Zhh,$Zll,4,$Zll
  370. extrd,u $Zhh,59,60,$Zhh
  371. ldd $nlo($Hll),$Tll
  372. ldd $nlo($Hhh),$Thh
  373. xor $Tll,$Zll,$Zll
  374. xor $Thh,$Zhh,$Zhh
  375. ldbx $cnt($Xi),$nlo
  376. ldbx $cnt($inp),$byte
  377. depd,z $Zll,60,4,$rem
  378. shrpd $Zhh,$Zll,4,$Zll
  379. ldd $rem2($rem_4bit),$rem2
  380. xor $rem2,$Zhh,$Zhh
  381. xor $byte,$nlo,$nlo
  382. ldd $nhi($Hll),$Tll
  383. ldd $nhi($Hhh),$Thh
  384. and $mask0xf0,$nlo,$nhi
  385. depd,z $nlo,59,4,$nlo
  386. extrd,u $Zhh,59,60,$Zhh
  387. xor $Tll,$Zll,$Zll
  388. ldd $rem($rem_4bit),$rem
  389. addib,uv -1,$cnt,L\$oop_ghash_pa2
  390. xor $Thh,$Zhh,$Zhh
  391. xor $rem,$Zhh,$Zhh
  392. depd,z $Zll,60,4,$rem2
  393. shrpd $Zhh,$Zll,4,$Zll
  394. extrd,u $Zhh,59,60,$Zhh
  395. ldd $nlo($Hll),$Tll
  396. ldd $nlo($Hhh),$Thh
  397. xor $Tll,$Zll,$Zll
  398. xor $Thh,$Zhh,$Zhh
  399. depd,z $Zll,60,4,$rem
  400. shrpd $Zhh,$Zll,4,$Zll
  401. ldd $rem2($rem_4bit),$rem2
  402. xor $rem2,$Zhh,$Zhh
  403. ldd $nhi($Hll),$Tll
  404. ldd $nhi($Hhh),$Thh
  405. extrd,u $Zhh,59,60,$Zhh
  406. xor $Tll,$Zll,$Zll
  407. xor $Thh,$Zhh,$Zhh
  408. ldd $rem($rem_4bit),$rem
  409. xor $rem,$Zhh,$Zhh
  410. std $Zll,8($Xi)
  411. ldo 16($inp),$inp
  412. std $Zhh,0($Xi)
  413. cmpb,*<> $inp,$len,L\$outer_ghash_pa2
  414. copy $Zll,$nlo
  415. ___
  # 32-bit flavour only.  First a branch that takes the 2.0 path above to
  # L$done_ghash, then the L$parisc1_ghash path: Z and T are held as four
  # 32-bit words each, with table pointers $Hll, $Hlh, $Hhl at $Htbl+12, +8
  # and +4.  Each pass of L$outer_ghash_pa1 handles one 16-byte block,
  # XORing every Xi byte with the matching $inp byte before the nibble
  # lookups.  After the four stw to Xi, $inp is advanced by 16 and comb
  # loops while $inp differs from $len; the delay slot copies $Zll to $nlo.
  416. $code.=<<___ if ($SIZE_T==4);
  417. b L\$done_ghash
  418. nop
  419. L\$parisc1_ghash
  420. ldb 15($Xi),$nlo
  421. ldo 12($Htbl),$Hll
  422. ldo 8($Htbl),$Hlh
  423. ldo 4($Htbl),$Hhl
  424. L\$outer_ghash_pa1
  425. ldb 15($inp),$byte
  426. xor $byte,$nlo,$nlo
  427. and $mask0xf0,$nlo,$nhi
  428. zdep $nlo,27,4,$nlo
  429. ldwx $nlo($Hll),$Zll
  430. ldwx $nlo($Hlh),$Zlh
  431. ldwx $nlo($Hhl),$Zhl
  432. ldwx $nlo($Hhh),$Zhh
  433. zdep $Zll,28,4,$rem
  434. ldb 14($Xi),$nlo
  435. ldb 14($inp),$byte
  436. ldwx $rem($rem_4bit),$rem
  437. shrpw $Zlh,$Zll,4,$Zll
  438. ldwx $nhi($Hll),$Tll
  439. shrpw $Zhl,$Zlh,4,$Zlh
  440. ldwx $nhi($Hlh),$Tlh
  441. shrpw $Zhh,$Zhl,4,$Zhl
  442. ldwx $nhi($Hhl),$Thl
  443. extru $Zhh,27,28,$Zhh
  444. ldwx $nhi($Hhh),$Thh
  445. xor $byte,$nlo,$nlo
  446. xor $rem,$Zhh,$Zhh
  447. and $mask0xf0,$nlo,$nhi
  448. zdep $nlo,27,4,$nlo
  449. xor $Tll,$Zll,$Zll
  450. ldwx $nlo($Hll),$Tll
  451. xor $Tlh,$Zlh,$Zlh
  452. ldwx $nlo($Hlh),$Tlh
  453. xor $Thl,$Zhl,$Zhl
  454. b L\$oop_ghash_pa1
  455. ldi 13,$cnt
  456. .ALIGN 8
  457. L\$oop_ghash_pa1
  458. zdep $Zll,28,4,$rem
  459. ldwx $nlo($Hhl),$Thl
  460. xor $Thh,$Zhh,$Zhh
  461. ldwx $rem($rem_4bit),$rem
  462. shrpw $Zlh,$Zll,4,$Zll
  463. ldwx $nlo($Hhh),$Thh
  464. shrpw $Zhl,$Zlh,4,$Zlh
  465. ldbx $cnt($Xi),$nlo
  466. xor $Tll,$Zll,$Zll
  467. ldwx $nhi($Hll),$Tll
  468. shrpw $Zhh,$Zhl,4,$Zhl
  469. ldbx $cnt($inp),$byte
  470. xor $Tlh,$Zlh,$Zlh
  471. ldwx $nhi($Hlh),$Tlh
  472. extru $Zhh,27,28,$Zhh
  473. xor $Thl,$Zhl,$Zhl
  474. ldwx $nhi($Hhl),$Thl
  475. xor $rem,$Zhh,$Zhh
  476. zdep $Zll,28,4,$rem
  477. xor $Thh,$Zhh,$Zhh
  478. ldwx $nhi($Hhh),$Thh
  479. shrpw $Zlh,$Zll,4,$Zll
  480. ldwx $rem($rem_4bit),$rem
  481. shrpw $Zhl,$Zlh,4,$Zlh
  482. xor $byte,$nlo,$nlo
  483. shrpw $Zhh,$Zhl,4,$Zhl
  484. and $mask0xf0,$nlo,$nhi
  485. extru $Zhh,27,28,$Zhh
  486. zdep $nlo,27,4,$nlo
  487. xor $Tll,$Zll,$Zll
  488. ldwx $nlo($Hll),$Tll
  489. xor $Tlh,$Zlh,$Zlh
  490. ldwx $nlo($Hlh),$Tlh
  491. xor $rem,$Zhh,$Zhh
  492. addib,uv -1,$cnt,L\$oop_ghash_pa1
  493. xor $Thl,$Zhl,$Zhl
  494. zdep $Zll,28,4,$rem
  495. ldwx $nlo($Hhl),$Thl
  496. xor $Thh,$Zhh,$Zhh
  497. ldwx $rem($rem_4bit),$rem
  498. shrpw $Zlh,$Zll,4,$Zll
  499. ldwx $nlo($Hhh),$Thh
  500. shrpw $Zhl,$Zlh,4,$Zlh
  501. xor $Tll,$Zll,$Zll
  502. ldwx $nhi($Hll),$Tll
  503. shrpw $Zhh,$Zhl,4,$Zhl
  504. xor $Tlh,$Zlh,$Zlh
  505. ldwx $nhi($Hlh),$Tlh
  506. extru $Zhh,27,28,$Zhh
  507. xor $rem,$Zhh,$Zhh
  508. xor $Thl,$Zhl,$Zhl
  509. ldwx $nhi($Hhl),$Thl
  510. xor $Thh,$Zhh,$Zhh
  511. ldwx $nhi($Hhh),$Thh
  512. zdep $Zll,28,4,$rem
  513. ldwx $rem($rem_4bit),$rem
  514. shrpw $Zlh,$Zll,4,$Zll
  515. shrpw $Zhl,$Zlh,4,$Zlh
  516. shrpw $Zhh,$Zhl,4,$Zhl
  517. extru $Zhh,27,28,$Zhh
  518. xor $Tll,$Zll,$Zll
  519. xor $Tlh,$Zlh,$Zlh
  520. xor $rem,$Zhh,$Zhh
  521. stw $Zll,12($Xi)
  522. xor $Thl,$Zhl,$Zhl
  523. stw $Zlh,8($Xi)
  524. xor $Thh,$Zhh,$Zhh
  525. stw $Zhl,4($Xi)
  526. ldo 16($inp),$inp
  527. stw $Zhh,0($Xi)
  528. comb,<> $inp,$len,L\$outer_ghash_pa1
  529. copy $Zll,$nlo
  530. ___
  # Common exit of gcm_ghash_4bit: reload %r2 from -$FRAME-$SAVED_RP(%sp)
  # and %r4-%r6 from the slots the prologue stored them in.
  531. $code.=<<___;
  532. L\$done_ghash
  533. $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
  534. $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
  535. $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
  536. $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
  537. ___
  # 32-bit flavour only: reload %r7-%r11 as well.
  538. $code.=<<___ if ($SIZE_T==4);
  539. $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
  540. $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
  541. $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
  542. $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
  543. $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
  544. ___
  # Return from gcm_ghash_4bit (%r3 is reloaded right after the bv), then
  # emit the shared, 64-byte aligned L$rem_4bit table: 16 entries of two
  # .WORDs each, the first being a 16-bit constant shifted left by 16 and
  # the second 0.  Both functions index it with a 4-bit value scaled to the
  # 8-byte entry size.  An identification string and final alignment follow.
  545. $code.=<<___;
  546. bv (%r2)
  547. .EXIT
  548. $POPMB -$FRAME(%sp),%r3
  549. .PROCEND
  550. .ALIGN 64
  551. L\$rem_4bit
  552. .WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
  553. .WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
  554. .WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
  555. .WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
  556. .STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>"
  557. .ALIGN 64
  558. ___
  559. # Explicitly encode PA-RISC 2.0 instructions used in this module, so
  560. # that it can be compiled with .LEVEL 1.0. It should be noted that I
  561. # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
  562. # directive...
  # Encoder for "ldd".  Takes the completer string (e.g. "" or ",mb") and
  # the operand string; returns a ".WORD" line holding the hand-assembled
  # opcode, with the original instruction kept as a trailing comment.
  # Operands that match neither pattern are returned as plain text.
  my $ldd = sub {
      my ($mod, $args) = @_;
      my $text = "ldd$mod\t$args";

      # format 4: register-indexed, %rX(%rB),%rT
      if (my ($index, $base, $target) =
              $args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
          my $word = (0x03<<26) | ($base<<21) | ($index<<16) | (3<<6) | $target;
          return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
      }

      # format 5: short displacement, d(%rB),%rT
      if (my ($disp, $base, $target) =
              $args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
          my $word = (0x03<<26) | ($base<<21) | (1<<12) | (3<<6) | $target;
          $word |= (($disp & 0xF)<<17) | (($disp & 0x10)<<12);   # encode offset
          $word |= (1<<5)  if ($mod =~ /^,m/);
          $word |= (1<<13) if ($mod =~ /^,mb/);
          return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
      }

      return "\t".$text;
  };
  # Encoder for "std".  Takes the completer string and the operand string
  # "%rS,d(%rB)"; returns a ".WORD" line with the hand-assembled opcode and
  # the original instruction as a trailing comment, or the instruction as
  # plain text when the operands do not match.
  my $std = sub {
      my ($mod, $args) = @_;
      my $text = "std$mod\t$args";

      # format 3 suffices
      my ($source, $disp, $base) =
          $args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/
              or return "\t".$text;

      my $word = (0x1c<<26) | ($base<<21) | ($source<<16);
      $word |= (($disp & 0x1FF8)<<1) | (($disp>>13) & 1);
      return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
  };
  # Encoder for "extrd".  Takes the completer string and the operand string;
  # returns a ".WORD" line with the hand-assembled opcode and the original
  # instruction as a trailing comment, or the instruction as plain text when
  # the operands match neither form.  Only the ",u" completer occurs in this
  # module, so it is implicitly encoded.
  my $extrd = sub {
      my ($mod, $args) = @_;
      my $text = "extrd$mod\t$args";
      my $word;

      if (my ($source, $pos, $width, $target) =
              $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {    # format 15
          my $len = 32 - $width;
          $word  = (0x36<<26) | ($source<<21) | ($target<<16);
          $word |= (($pos & 0x20)<<6) | (($pos & 0x1f)<<5);             # encode pos
          $word |= (($len & 0x20)<<7) | ($len & 0x1f);                  # encode len
      }
      elsif (my ($src, $bits, $dst) =
              $args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {         # format 12
          my $len = 32 - $bits;
          $word  = (0x34<<26) | ($src<<21) | ($dst<<16) | (2<<11) | (1<<9);
          $word |= (($len & 0x20)<<3) | ($len & 0x1f);                  # encode len
          $word |= (1<<13) if ($mod =~ /,\**=/);
      }

      return defined $word
          ? sprintf("\t.WORD\t0x%08x\t; %s", $word, $text)
          : "\t".$text;
  };
  # Encoder for "shrpd".  Takes the completer string and the operand string;
  # returns a ".WORD" line with the hand-assembled opcode and the original
  # instruction as a trailing comment, or the instruction as plain text when
  # the operands match neither form.
  my $shrpd = sub {
      my ($mod, $args) = @_;
      my $text = "shrpd$mod\t$args";
      my $word;

      if (my ($first, $second, $shift, $target) =
              $args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) {   # format 14
          my $cpos = 63 - $shift;
          $word  = (0x34<<26) | ($second<<21) | ($first<<16) | (1<<10) | $target;
          $word |= (($cpos & 0x20)<<6) | (($cpos & 0x1f)<<5);           # encode sa
      }
      elsif (my ($r1, $r2, $dst) =
              $args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) {       # format 11
          $word = (0x34<<26) | ($r2<<21) | ($r1<<16) | (1<<9) | $dst;
      }

      return defined $word
          ? sprintf("\t.WORD\t0x%08x\t; %s", $word, $text)
          : "\t".$text;
  };
  # Encoder for "depd".  Takes the completer string and the operand string
  # "%rS,pos,len,%rT"; returns a ".WORD" line with the hand-assembled opcode
  # and the original instruction as a trailing comment, or the instruction
  # as plain text when the operands do not match.  Only the ",z" completer
  # occurs in this module, so it is implicitly encoded.
  my $depd = sub {
      my ($mod, $args) = @_;
      my $text = "depd$mod\t$args";

      # format 16
      my ($source, $pos, $width, $target) =
          $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/
              or return "\t".$text;

      my $cpos = 63 - $pos;
      my $len  = 32 - $width;
      my $word = (0x3c<<26) | ($target<<21) | ($source<<16);
      $word |= (($cpos & 0x20)<<6) | (($cpos & 0x1f)<<5);    # encode pos
      $word |= (($len  & 0x20)<<7) | ($len & 0x1f);          # encode len
      return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
  };
  # Translate one instruction for the 32-bit build.
  #
  #   $mnemonic - instruction name (lower-case letters only)
  #   $mod      - completer string, possibly empty (e.g. ",u" or ",mb")
  #   $args     - operand string
  #
  # Instructions that have a hand encoder ($ldd, $std, $extrd, $shrpd,
  # $depd) are passed to it; everything else is returned unchanged as
  # "\t<mnemonic><mod>\t<args>".
  #
  # The encoders are looked up in an explicit table rather than by
  # string-evaluating "\$<mnemonic>" for every line, so no code is compiled
  # at run time and an unrelated variable that happens to share its name
  # with a mnemonic can never be picked up.
  sub assemble {
      my ($mnemonic, $mod, $args) = @_;

      my %encoder_for = (
          ldd   => $ldd,
          std   => $std,
          extrd => $extrd,
          shrpd => $shrpd,
          depd  => $depd,
      );
      my $encoder = $encoder_for{$mnemonic};

      return ref($encoder) eq 'CODE'
          ? $encoder->($mod, $args)
          : "\t$mnemonic$mod\t$args";
  }
  # Post-process the accumulated assembly one line at a time and print it.
  #  - Every back-quoted expression is replaced by its evaluated value.
  #  - For the 32-bit build only: an instruction preceded by whitespace is
  #    handed to assemble(), "cmpb,*" is rewritten to "comb,", and the first
  #    remaining ",*" is reduced to ",".
  foreach (split("\n",$code)) {
      s/\`([^\`]*)\`/eval $1/ge;

      if ($SIZE_T==4) {
          s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e;
          s/cmpb,\*/comb,/;
          s/,\*/,/;
      }

      print $_,"\n";
  }

  # Output is buffered, so a failed write (full disk, closed pipe) may only
  # be reported when the handle is closed; do not let it pass silently.
  close STDOUT or die "error closing STDOUT: $!";