#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+128 bytes shared
# table]. Performance results are for the streamed GHASH subroutine
# on an UltraSPARC pre-Tx CPU and are expressed in cycles per
# processed byte, less is better:
#
#		gcc 3.3.x	cc 5.2		this assembler
#
# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
#
# Here is data collected on an UltraSPARC T1 system running Linux:
#
#		gcc 4.4.1	this assembler
#
# 32-bit build	566		50	(+1000%)
# 64-bit build	56		50	(+12%)
#
# I don't quite understand why the difference between 32-bit and
# 64-bit compiler-generated code is so big. The compilers *were*
# instructed to generate code for UltraSPARC and should have used
# 64-bit registers for the Z vector (see the C code) even in the
# 32-bit build... Oh well, it only means more impressive improvement
# coefficients for this assembler module;-) Loops are aggressively
# modulo-scheduled with respect to references to input data and Z.hi
# updates to achieve the 12-cycle timing. For comparison,
# sha1-sparcv9.pl spends 11.6 cycles to process one byte on an
# UltraSPARC pre-Tx CPU and ~24 on T1.
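#
# The per-key table is Htable[16]: entry i holds the 128-bit product
# of the key H by the 4-bit polynomial i, 16 bytes per entry, which is
# why nibble indices below are scaled by 16 and why $Htblo points
# 8 bytes into the table, at the low halves. As a rough, illustrative
# C-style restatement of the per-nibble step the assembler unrolls
# (names are not from this file; compare the generic C implementation;
# Z starts at zero for each 16-byte block):
#
#	nlo  = Xi[cnt] ^ inp[cnt];	/* plain Xi[cnt] in gcm_gmult */
#	nhi  = nlo >> 4;  nlo &= 0xf;
#	for n in (nlo, nhi):
#		rem   = Z.lo & 0xf;
#		Z.lo  = (Z.hi << 60) | (Z.lo >> 4);
#		Z.hi  = (Z.hi >> 4) ^ rem_4bit[rem];
#		Z.hi ^= Htable[n].hi;
#		Z.lo ^= Htable[n].lo;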

$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else           { $bias=0;    $frame=112; }
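# 2047 is the SPARC V9 ABI stack bias: in 64-bit mode %sp and %fp
# point 2047 bytes below the actual frame. $frame only has to cover
# the ABI-mandated register window save area and reserved slots, as
# the code below keeps all of its state in registers.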

$output=shift;
open STDOUT,">$output" or die "can't open $output: $!";

$Zhi="%o0";	# 64-bit values
$Zlo="%o1";
$Thi="%o2";
$Tlo="%o3";
$rem="%o4";
$tmp="%o5";

$nhi="%l0";	# small values and pointers
$nlo="%l1";
$xi0="%l2";
$xi1="%l3";
$rem_4bit="%l4";
$remi="%l5";
$Htblo="%l6";
$cnt="%l7";

$Xi="%i0";	# input argument block
$Htbl="%i1";
$inp="%i2";
$len="%i3";
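
# rem_4bit below is the 128-byte shared reduction table mentioned
# above: entry i is the correction for the 4 bits shifted out of the
# low end of Z, pre-positioned in the top 16 bits of a big-endian
# 64-bit word (hence the `<<16`, evaluated at build time). The
# constants are the same 16 values used by the generic C code.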
$code.=<<___;
.section	".text",#alloc,#execinstr
.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit
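	! %o7 now holds the address of the "call" above (the "add" ran in
	! its delay slot); adding the link-time distance rem_4bit-1b
	! yields the table address PC-relative, keeping the module
	! position-independent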

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi
	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
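	! bytes 15 and 14 are consumed by this prologue; $cnt indexes the
	! remaining bytes, 13 down to 0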
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
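	! the "sll" above sits in the branch delay slot and executes
	! before the first .Lghash_inner iteration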
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi
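	! "blu" is a synonym for "bcs": iterate until the "addcc" above
	! wraps $cnt below zero; the "and" fills the delay slot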

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	`$bits==64?"%xcc":"%icc"`,.Ldone
	and	$Zlo,0xf,$remi
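	! pointers are 64 bits wide in the 64-bit build, so the compare
	! above must test %xcc there and %icc in the 32-bit build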

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
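	! the srl/and pair above peels the new Xi[15] and Xi[14] off
	! $Zlo, so .Louter finds $xi0/$xi1 (and $nlo, reloaded above)
	! already in place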

.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___

undef $inp;
undef $len;
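
# gcm_gmult_4bit multiplies Xi by H in place: the same nibble-by-nibble
# walk as above, only over the 16 bytes of Xi alone, with no input
# pointer or length argument.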
$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi
	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo

.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)

.asciz	"GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
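
# Substitute every backticked expression with its value at build time,
# e.g. the shifted rem_4bit constants and the %xcc/%icc selection.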
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;