ghash-ia64.pl 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463
  1. #!/usr/bin/env perl
  2. # ====================================================================
  3. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  4. # project. The module is, however, dual licensed under OpenSSL and
  5. # CRYPTOGAMS licenses depending on where you obtain it. For further
  6. # details see http://www.openssl.org/~appro/cryptogams/.
  7. # ====================================================================
  8. #
  9. # March 2010
  10. #
  11. # The module implements "4-bit" GCM GHASH function and underlying
  12. # single multiplication operation in GF(2^128). "4-bit" means that it
  13. # uses 256 bytes per-key table [+128 bytes shared table]. Streamed
  14. # GHASH performance was measured to be 6.67 cycles per processed byte
  15. # on Itanium 2, which is >90% better than Microsoft compiler generated
  16. # code. To anchor to something else sha1-ia64.pl module processes one
  17. # byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
  18. # byte.
  19. # September 2010
  20. #
  21. # It was originally thought that it made little sense to implement the
  22. # "528B" variant on Itanium 2, for the following reason. Because the number of
  23. # functional units is naturally limited, it appeared impossible to
  24. # implement "528B" loop in 4 cycles, only in 5. This would mean that
  25. # theoretically performance improvement couldn't be more than 20%.
  26. # But occasionally you prove yourself wrong:-) I figured out a way to
  27. # fold couple of instructions and having freed yet another instruction
  28. # slot by unrolling the loop... Resulting performance is 4.45 cycles
  29. # per processed byte and 50% better than "256B" version. On original
  30. # Itanium performance should remain the same as the "256B" version,
  31. # i.e. ~8.5 cycles.
  # Command line: an optional first argument names the output file, in which
  # case STDOUT is redirected to it; the remaining arguments are compiler
  # flags that are scanned below.  Three-argument open keeps characters in
  # the file name from being interpreted as part of the open mode.
  $output=shift and (open STDOUT,'>',$output or die "can't open $output: $!");

  # Returns 1 when the given compiler flag is "+DD64" or "-mlp64", 0
  # otherwise.  The two spellings are a real alternation here; a bracketed
  # character class would accept any single one of those characters
  # followed by "64" (for instance "-DFOO_D64").
  sub _is_lp64_flag { return $_[0] =~ /(?:\+DD|-mlp)64/ ? 1 : 0; }

  # $ADDP is the mnemonic the templates use when forming pointers from the
  # incoming arguments: "addp4" on HP-UX unless one of the 64-bit flags
  # above is present, plain "add" everywhere else.
  if ($^O eq "hpux") {
      $ADDP="addp4";
      for (@ARGV) { $ADDP="add" if (_is_lp64_flag($_)); }
  } else { $ADDP="add"; }

  # Target byte order: -DB_ENDIAN / -DL_ENDIAN on the command line decide
  # (the last one seen wins); without either, the byte order of the machine
  # running this script is used.
  for (@ARGV) {
      $big_endian=1 if (/\-DB_ENDIAN/);
      $big_endian=0 if (/\-DL_ENDIAN/);
  }
  if (!defined($big_endian)) {
      $big_endian=(unpack('L',pack('N',1))==1);
  }
  # loop(LABEL, MASK_INP) appends one software-pipelined loop to $code.
  #   LABEL    - assembler label placed at the top of the loop; it is also
  #              the target of the closing br.ctop.
  #   MASK_INP - when true, the two instructions that touch inp/in[] are
  #              predicated on p63 instead of p16/p17 (the "mask references
  #              to inp" case); the call that emits .Loop1 passes 1.
  # The (p16)..(p19) qualifiers in the template are pipeline-stage
  # predicates; the code emitted before the call is expected to have set up
  # pr.rot, ar.lc and ar.ec.  The empty prototype is bypassed because the
  # sub is invoked with the &loop(...) form.
  41. sub loop() {
  42. my $label=shift;
  43. my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
  44. # Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
  45. # in scalable manner;-) Naturally assuming data in L1 cache...
  46. # Special note about 'dep' instruction, which is used to construct
  47. # &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
  48. # bytes boundary and lower 7 bits of its address are guaranteed to
  49. # be zero.
  50. $code.=<<___;
  51. $label:
  52. { .mfi; (p18) ld8 Hlo=[Hi[1]],-8
  53. (p19) dep rem=Zlo,rem_4bitp,3,4 }
  54. { .mfi; (p19) xor Zhi=Zhi,Hhi
  55. ($p17) xor xi[1]=xi[1],in[1] };;
  56. { .mfi; (p18) ld8 Hhi=[Hi[1]]
  57. (p19) shrp Zlo=Zhi,Zlo,4 }
  58. { .mfi; (p19) ld8 rem=[rem]
  59. (p18) and Hi[1]=mask0xf0,xi[2] };;
  60. { .mmi; ($p16) ld1 in[0]=[inp],-1
  61. (p18) xor Zlo=Zlo,Hlo
  62. (p19) shr.u Zhi=Zhi,4 }
  63. { .mib; (p19) xor Hhi=Hhi,rem
  64. (p18) add Hi[1]=Htbl,Hi[1] };;
  65. { .mfi; (p18) ld8 Hlo=[Hi[1]],-8
  66. (p18) dep rem=Zlo,rem_4bitp,3,4 }
  67. { .mfi; (p17) shladd Hi[0]=xi[1],4,r0
  68. (p18) xor Zhi=Zhi,Hhi };;
  69. { .mfi; (p18) ld8 Hhi=[Hi[1]]
  70. (p18) shrp Zlo=Zhi,Zlo,4 }
  71. { .mfi; (p18) ld8 rem=[rem]
  72. (p17) and Hi[0]=mask0xf0,Hi[0] };;
  73. { .mmi; (p16) ld1 xi[0]=[Xi],-1
  74. (p18) xor Zlo=Zlo,Hlo
  75. (p18) shr.u Zhi=Zhi,4 }
  76. { .mib; (p18) xor Hhi=Hhi,rem
  77. (p17) add Hi[0]=Htbl,Hi[0]
  78. br.ctop.sptk $label };;
  79. ___
  80. }
  # Start the generated assembly with gcm_gmult_4bit.  This first template
  # (note the plain "=", which initialises $code) defines the register
  # aliases used by loop(), then emits the prologue: in0 becomes Xi
  # (pointing at Xi[15]), in1 becomes Htbl, ip is captured so that the
  # address of rem_4bit can be formed relative to the function, the first
  # two bytes of Xi are loaded, Zlo/Zhi are zeroed, and pr.rot/ar.lc/ar.ec
  # are programmed for the loop that follows.
  81. $code=<<___;
  82. .explicit
  83. .text
  84. prevfs=r2; prevlc=r3; prevpr=r8;
  85. mask0xf0=r21;
  86. rem=r22; rem_4bitp=r23;
  87. Xi=r24; Htbl=r25;
  88. inp=r26; end=r27;
  89. Hhi=r28; Hlo=r29;
  90. Zhi=r30; Zlo=r31;
  91. .align 128
  92. .skip 16 // aligns loop body
  93. .global gcm_gmult_4bit#
  94. .proc gcm_gmult_4bit#
  95. gcm_gmult_4bit:
  96. .prologue
  97. { .mmi; .save ar.pfs,prevfs
  98. alloc prevfs=ar.pfs,2,6,0,8
  99. $ADDP Xi=15,in0 // &Xi[15]
  100. mov rem_4bitp=ip }
  101. { .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
  102. .save ar.lc,prevlc
  103. mov prevlc=ar.lc
  104. .save pr,prevpr
  105. mov prevpr=pr };;
  106. .body
  107. .rotr in[3],xi[3],Hi[2]
  108. { .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
  109. mov mask0xf0=0xf0
  110. brp.loop.imp .Loop1,.Lend1-16};;
  111. { .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
  112. };;
  113. { .mii; shladd Hi[1]=xi[2],4,r0
  114. mov pr.rot=0x7<<16
  115. mov ar.lc=13
  116. { .mii; and Hi[1]=mask0xf0,Hi[1]
  117. mov ar.ec=3
  118. xor Zlo=Zlo,Zlo };;
  119. { .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
  120. add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
  121. xor Zhi=Zhi,Zhi };;
  122. ___
  # The loop body itself; the second argument (1) masks the instructions
  # that reference inp, which gcm_gmult_4bit does not set up.
  123. &loop (".Loop1",1);
  # Epilogue: fold in the last Hhi, byte-reverse Zlo/Zhi with mux1 @rev
  # (rewritten to a nop for big-endian targets at the end of this script),
  # store both halves through addresses derived from Xi, restore pr and
  # ar.lc, and return.
  124. $code.=<<___;
  125. .Lend1:
  126. { .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
  127. { .mib; mux1 Zlo=Zlo,\@rev };;
  128. { .mib; mux1 Zhi=Zhi,\@rev };;
  129. { .mmi; add Hlo=9,Xi;; // ;; is here to prevent
  130. add Hhi=1,Xi };; // pipeline flush on Itanium
  131. { .mib; st8 [Hlo]=Zlo
  132. mov pr=prevpr,0x1ffff };;
  133. { .mib; st8 [Hhi]=Zhi
  134. mov ar.lc=prevlc
  135. br.ret.sptk.many b0 };;
  136. .endp gcm_gmult_4bit#
  137. ___
  138. ######################################################################
  139. # "528B" (well, "512B" actually) streamed GHASH
  140. #
  # Perl-side names for the registers used by gcm_ghash_4bit: the four
  # incoming arguments (in0..in3) and two locals (loc0, loc1).
  141. $Xip="in0";
  142. $Htbl="in1";
  143. $inp="in2";
  144. $len="in3";
  145. $rem_8bit="loc0";
  146. $mask0xff="loc1";
  # $sum/$rum are the mnemonics used at the "go big-endian" and "return to
  # little-endian" points of the function; for a big-endian target both are
  # replaced by nop.m so that the bundles keep their shape.
  147. ($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
  # load_htable(INSN, ...) appends eight pairs of bundles to $code.  Each
  # pair loads one 16-byte entry through r8/r9 into general registers
  # r16..r31 and one entry through r10/r11 into floating-point registers
  # f32..f47 (the template comments label them Htable[i] and Htable[8+i]).
  # The second bundle of each pair is left open by the template so that one
  # caller-supplied instruction string can be placed in its third slot
  # before the closing "};;" is appended.  The empty prototype is bypassed
  # because the sub is invoked with the &load_htable(...) form.
  148. sub load_htable() {
  149. for (my $i=0;$i<8;$i++) {
  150. $code.=<<___;
  151. { .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi
  152. ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo
  153. { .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi
  154. ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo
  155. ___
  # $#_ is the index of the last remaining argument, so this consumes one
  # argument per iteration exactly when the arguments left equal the
  # iterations left: eight arguments give one per pair, fewer arguments
  # end up in the last pairs.
  156. $code.=shift if (($i+$#_)==7);
  157. $code.="\t};;\n"
  158. }
  159. }
  # gcm_ghash_4bit prologue: allocate the register frame, remember the
  # incoming sp in prevsp, capture ip (later turned into the address of
  # rem_8bit), and point r8..r11 at offsets 0, 8, 128 and 136 of the table
  # passed in $Htbl.
  160. $code.=<<___;
  161. prevsp=r3;
  162. .align 32
  163. .skip 16 // aligns loop body
  164. .global gcm_ghash_4bit#
  165. .proc gcm_ghash_4bit#
  166. gcm_ghash_4bit:
  167. .prologue
  168. { .mmi; .save ar.pfs,prevfs
  169. alloc prevfs=ar.pfs,4,2,0,0
  170. .vframe prevsp
  171. mov prevsp=sp
  172. mov $rem_8bit=ip };;
  173. .body
  174. { .mfi; $ADDP r8=0+0,$Htbl
  175. $ADDP r9=0+8,$Htbl }
  176. { .mfi; $ADDP r10=128+0,$Htbl
  177. $ADDP r11=128+8,$Htbl };;
  178. ___
  # Load the whole table into registers; the eight strings are interleaved
  # with the loads, one per bundle pair.  Between them they advance $Xip to
  # Xi[15], turn $len into the end-of-input address, advance $inp to
  # inp[15], set up the 0xff mask, and carve a 512-byte stack area whose
  # address has its low eight bits cleared (r14/r15 point at its start).
  179. &load_htable(
  180. " $ADDP $Xip=15,$Xip", # &Xi[15]
  181. " $ADDP $len=$len,$inp", # &inp[len]
  182. " $ADDP $inp=15,$inp", # &inp[15]
  183. " mov $mask0xff=0xff",
  184. " add sp=-512,sp",
  185. " andcm sp=sp,$mask0xff", # align stack frame
  186. " add r14=0,sp",
  187. " add r15=8,sp");
  # Switch to big-endian data access (a nop.m on big-endian targets), point
  # r8..r11 at the upper 256 bytes of the stack area, and bias $len by -17
  # for the end-of-input comparison made at the bottom of the main loop.
  188. $code.=<<___;
  189. { .mmi; $sum 1<<1 // go big-endian
  190. add r8=256+0,sp
  191. add r9=256+8,sp }
  192. { .mmi; add r10=256+128+0,sp
  193. add r11=256+128+8,sp
  194. add $len=-17,$len };;
  195. ___
  # For each of the eight entries held in general registers: store the
  # entry to the upper half of the stack area (via r8/r9), store the
  # matching floating-point-held entry via r10/r11, shift the 128-bit
  # general-register value right by four bits (shrp + shr.u) and store the
  # shifted copy to the lower half (via r14/r15).
  196. for($i=0;$i<8;$i++) { # generate first half of Hshr4[]
  197. my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
  198. $code.=<<___;
  199. { .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo
  200. st8 [r9]=$rhi,16 // Htable[$i].hi
  201. shrp $rlo=$rhi,$rlo,4 }//;;
  202. { .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo
  203. stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi
  204. shr.u $rhi=$rhi,4 };;
  205. { .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4
  206. st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4
  207. ___
  208. }
  # Second half of Hshr4[]: entries 8..15 were stored from floating-point
  # registers above, so they are read back into general registers through
  # r8/r9, shifted right by four bits, and stored through r14/r15.  The
  # literal "rum 1<<5" is emitted for both byte orders (it is not the $rum
  # variable).
  209. $code.=<<___;
  210. { .mmi; ld8 r16=[r8],16 // Htable[8].lo
  211. ld8 r17=[r9],16 };; // Htable[8].hi
  212. { .mmi; ld8 r18=[r8],16 // Htable[9].lo
  213. ld8 r19=[r9],16 } // Htable[9].hi
  214. { .mmi; rum 1<<5 // clear um.mfh
  215. shrp r16=r17,r16,4 };;
  216. ___
  # $i is deliberately not lexical here: its final value (6) is still
  # needed by the `...` expressions in the template that follows the loop.
  217. for($i=0;$i<6;$i++) { # generate second half of Hshr4[]
  218. $code.=<<___;
  219. { .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo
  220. ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi
  221. shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
  222. { .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
  223. st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
  224. shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
  225. ___
  226. }
  # Tail of the sequence above (uses $i == 6): finish the last two entries,
  # repoint $Htbl at the copy of the table on the stack (sp+256), and turn
  # the captured ip into the address of rem_8bit.
  227. $code.=<<___;
  228. { .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
  229. { .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
  230. st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
  231. shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
  232. { .mmi; add $Htbl=256,sp // &Htable[0]
  233. add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
  234. shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };;
  235. { .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4
  236. st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4
  237. ___
  # Register names for the streamed loop.  @xi and @rem each hold two
  # names; the push/shift pairs below swap them, which emulates register
  # rotation at generation time (the main loop is unrolled rather than
  # rotated by hardware).  The (pNN) tags in the template comments name the
  # pipeline stage an instruction belongs to; they are annotations, not
  # predicates.
  238. $in="r15";
  239. @xi=("r16","r17");
  240. @rem=("r18","r19");
  241. ($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
  242. ($Atbl,$Btbl)=("r26","r27");
  # Warm-up, stage (p16) only: load the last byte of the input block and of
  # Xi.  p6 is cleared so that the "(p6) st8" at the top of .LOOP is skipped
  # on the first pass.
  243. $code.=<<___; # (p16)
  244. { .mmi; ld1 $in=[$inp],-1 //(p16) *inp--
  245. ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
  246. cmp.eq p0,p6=r0,r0 };; // clear p6
  247. ___
  248. push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
  # Warm-up, stages (p16),(p17), followed by the .LOOP label.  On later
  # passes p6 is set and the first bundle of .LOOP stores the Zhi left over
  # from the previous block.
  249. $code.=<<___; # (p16),(p17)
  250. { .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
  251. xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
  252. { .mii; ld1 $in=[$inp],-1 //(p16) *inp--
  253. dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo
  254. and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
  255. .align 32
  256. .LOOP:
  257. { .mmi;
  258. (p6) st8 [$Xip]=$Zhi,13
  259. xor $Zlo=$Zlo,$Zlo
  260. add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo
  261. ___
  # Stages (p16),(p17),(p18): the first byte of a block reaches the table
  # look-up stage.  There is no (p19) work yet, so Z.hi is initialised with
  # a plain mov instead of being xor-ed into.
  262. push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
  263. $code.=<<___; # (p16),(p17),(p18)
  264. { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
  265. ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
  266. xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
  267. { .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
  268. dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
  269. { .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
  270. xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo
  271. { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
  272. ld1 $in=[$inp],-1 } //(p16) *inp--
  273. { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
  274. mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi
  275. and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
  276. { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
  277. ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
  278. shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
  279. { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
  280. add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
  281. ___
  # Steady state: thirteen copies ($i = 1..13) of the full four-stage body,
  # with the @xi/@rem names swapped after every copy.  $i is only a
  # repetition counter here; it is not interpolated into the template.
  282. push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
  283. for ($i=1;$i<14;$i++) {
  284. # Above and below fragments are derived from this one by removing
  285. # unsuitable (p??) instructions.
  286. $code.=<<___; # (p16),(p17),(p18),(p19)
  287. { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
  288. ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
  289. shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
  290. { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
  291. xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
  292. xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
  293. { .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
  294. ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
  295. dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
  296. { .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
  297. xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
  298. xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
  299. { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
  300. ld1 $in=[$inp],-1 //(p16) *inp--
  301. shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
  302. { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
  303. xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
  304. and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
  305. { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
  306. ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
  307. shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
  308. { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
  309. xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
  310. add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
  311. ___
  312. push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
  313. }
  # Drain, stages (p17),(p18),(p19): the same body as the steady state with
  # the (p16) loads of inp and Xi removed, since every byte of the current
  # block has been fetched.
  314. $code.=<<___; # (p17),(p18),(p19)
  315. { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
  316. ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
  317. shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
  318. { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
  319. xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
  320. xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
  321. { .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
  322. ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
  323. dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo
  324. { .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
  325. xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
  326. xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
  327. { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
  328. shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
  329. { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
  330. xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
  331. and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
  332. { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
  333. shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
  334. { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
  335. xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
  336. add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
  337. ___
  # Drain, stages (p18),(p19): the last byte of the block.  Unlike the
  # earlier fragments, $Btbl is not moved back by 256 here, the remainder is
  # taken from Z.lo<<4 alone, and Z is shifted by 4 bits rather than 8.
  338. push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
  339. $code.=<<___; # (p18),(p19)
  340. { .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
  341. shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
  342. { .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
  343. xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo
  344. { .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
  345. xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo
  346. { .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
  347. xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
  348. { .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi
  349. shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
  350. { .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4
  351. xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi
  352. { .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi
  353. shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
  354. { .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
  355. xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48
  356. ___
  # Final (p19) stage, loop control and epilogue.  p6 is set while $inp is
  # still below the biased end address in $len; $inp is then advanced by 32
  # and $Xip by 9 (to Xi.lo).  When p6 is set, the instructions tagged
  # [p16]/[p17] restart the pipeline for the next block, taking the Xi
  # bytes straight from Zlo, and control branches back to .LOOP, where the
  # pending Zhi is stored.  Otherwise Zhi is stored here, the byte order is
  # switched back (a nop.m on big-endian targets), sp is restored from
  # prevsp and the function returns.
  357. push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
  358. $code.=<<___; # (p19)
  359. { .mmi; cmp.ltu p6,p0=$inp,$len
  360. add $inp=32,$inp
  361. shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4
  362. { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
  363. xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
  364. add $Xip=9,$Xip };; // &Xi.lo
  365. { .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
  366. (p6) ld1 $in=[$inp],-1 //[p16] *inp--
  367. (p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14]
  368. { .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi
  369. (p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15]
  370. { .mmi; st8 [$Xip]=$Zlo,-8
  371. (p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i]
  372. shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48
  373. { .mmi;
  374. (p6) ld1 $in=[$inp],-1 //[p16] *inp--
  375. xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
  376. (p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo
  377. { .mib;
  378. (p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0
  379. (p6) br.cond.dptk.many .LOOP };;
  380. { .mib; st8 [$Xip]=$Zhi };;
  381. { .mib; $rum 1<<1 // return to little-endian
  382. .restore sp
  383. mov sp=prevsp
  384. br.ret.sptk.many b0 };;
  385. .endp gcm_ghash_4bit#
  386. ___
  # Constant tables shared by the two functions, plus the identification
  # string.  rem_4bit holds sixteen 8-byte entries and must stay aligned at
  # 128 bytes, because gcm_gmult_4bit forms entry addresses with a "dep"
  # into the table address.  rem_8bit holds 256 two-byte entries (512 bytes)
  # that gcm_ghash_4bit indexes with "shladd ...,1" and reads with ld2.
  387. $code.=<<___;
  388. .align 128
  389. .type rem_4bit#,\@object
  390. rem_4bit:
  391. data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
  392. data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
  393. data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
  394. data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
  395. .size rem_4bit#,128
  396. .type rem_8bit#,\@object
  397. rem_8bit:
  398. data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
  399. data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
  400. data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
  401. data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
  402. data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
  403. data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
  404. data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
  405. data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
  406. data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
  407. data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
  408. data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
  409. data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
  410. data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
  411. data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
  412. data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
  413. data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
  414. data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
  415. data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
  416. data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
  417. data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
  418. data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
  419. data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
  420. data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
  421. data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
  422. data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
  423. data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
  424. data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
  425. data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
  426. data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
  427. data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
  428. data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
  429. data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
  430. .size rem_8bit#,512
  431. stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
  432. ___
  # Post-process the accumulated assembly text and write it out.

  # A big-endian target needs no byte reversal: every "mux1 <operands>@rev"
  # becomes a nop.i, so the bundle keeps an instruction in that slot.
  $code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);

  # Replace each `...` span in the templates with the value of the Perl
  # expression it contains (register-number and index arithmetic).
  $code =~ s/\`([^\`]*)\`/eval $1/gem;

  print $code;

  # Buffered output is only flushed for certain at close, so a write failure
  # (full disk, closed pipe) can first show up here.  Fail loudly instead of
  # leaving a truncated assembly file behind.
  close STDOUT or die "error closing STDOUT: $!";