#! /usr/bin/env perl
# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# RC4 for PA-RISC.

# June 2009.
#
# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
# For reference, [4x] unrolled loop is >40% faster than folded one.
# It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement
# is believed to be not sufficient to justify the effort...
#
# Special thanks to polarhome.com for providing HP-UX account.

  23. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  24. # $output is the last argument if it looks like a file (it has an extension)
  25. # $flavour is the first argument if it doesn't look like a file
  26. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  27. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  28. $output and open STDOUT,">$output";
  29. if ($flavour =~ /64/) {
  30. $LEVEL ="2.0W";
  31. $SIZE_T =8;
  32. $FRAME_MARKER =80;
  33. $SAVED_RP =16;
  34. $PUSH ="std";
  35. $PUSHMA ="std,ma";
  36. $POP ="ldd";
  37. $POPMB ="ldd,mb";
  38. } else {
  39. $LEVEL ="1.0";
  40. $SIZE_T =4;
  41. $FRAME_MARKER =48;
  42. $SAVED_RP =20;
  43. $PUSH ="stw";
  44. $PUSHMA ="stwm";
  45. $POP ="ldw";
  46. $POPMB ="ldwm";
  47. }
  48. $FRAME=4*$SIZE_T+$FRAME_MARKER; # 4 saved regs + frame marker
  49. # [+ argument transfer]
  50. $SZ=1; # defaults to RC4_CHAR
  51. if (open CONF,"<${dir}../../opensslconf.h") {
  52. while(<CONF>) {
  53. if (m/#\s*define\s+RC4_INT\s+(.*)/) {
  54. $SZ = ($1=~/char$/) ? 1 : 4;
  55. last;
  56. }
  57. }
  58. close CONF;
  59. }
  60. if ($SZ==1) { # RC4_CHAR
  61. $LD="ldb";
  62. $LDX="ldbx";
  63. $MKX="addl";
  64. $ST="stb";
  65. } else { # RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
  66. $LD="ldw";
  67. $LDX="ldwx,s";
  68. $MKX="sh2addl";
  69. $ST="stw";
  70. }
  71. $key="%r26";
  72. $len="%r25";
  73. $inp="%r24";
  74. $out="%r23";
  75. @XX=("%r19","%r20");
  76. @TX=("%r21","%r22");
  77. $YY="%r28";
  78. $TY="%r29";
  79. $acc="%r1";
  80. $ix="%r2";
  81. $iy="%r3";
  82. $dat0="%r4";
  83. $dat1="%r5";
  84. $rem="%r6";
  85. $mask="%r31";
  86. sub unrolledloopbody {
  87. for ($i=0;$i<4;$i++) {
  88. $code.=<<___;
  89. ldo 1($XX[0]),$XX[1]
  90. `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)`
  91. and $mask,$XX[1],$XX[1]
  92. $LDX $YY($key),$TY
  93. $MKX $YY,$key,$ix
  94. $LDX $XX[1]($key),$TX[1]
  95. $MKX $XX[0],$key,$iy
  96. $ST $TX[0],0($ix)
  97. comclr,<> $XX[1],$YY,%r0 ; conditional
  98. copy $TX[0],$TX[1] ; move
  99. `sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
  100. $ST $TY,0($iy)
  101. addl $TX[0],$TY,$TY
  102. addl $TX[1],$YY,$YY
  103. and $mask,$TY,$TY
  104. and $mask,$YY,$YY
  105. ___
  106. push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
  107. } }
  108. sub foldedloop {
  109. my ($label,$count)=@_;
  110. $code.=<<___;
  111. $label
  112. $MKX $YY,$key,$iy
  113. $LDX $YY($key),$TY
  114. $MKX $XX[0],$key,$ix
  115. $ST $TX[0],0($iy)
  116. ldo 1($XX[0]),$XX[0]
  117. $ST $TY,0($ix)
  118. addl $TX[0],$TY,$TY
  119. ldbx $inp($out),$dat1
  120. and $mask,$TY,$TY
  121. and $mask,$XX[0],$XX[0]
  122. $LDX $TY($key),$acc
  123. $LDX $XX[0]($key),$TX[0]
  124. ldo 1($out),$out
  125. xor $dat1,$acc,$acc
  126. addl $TX[0],$YY,$YY
  127. stb $acc,-1($out)
  128. addib,<> -1,$count,$label ; $count is always small
  129. and $mask,$YY,$YY
  130. ___
  131. }
  132. $code=<<___;
  133. .LEVEL $LEVEL
  134. .SPACE \$TEXT\$
  135. .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
  136. .EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
  137. RC4
  138. .PROC
  139. .CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
  140. .ENTRY
  141. $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
  142. $PUSHMA %r3,$FRAME(%sp)
  143. $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
  144. $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
  145. $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
  146. cmpib,*= 0,$len,L\$abort
  147. sub $inp,$out,$inp ; distance between $inp and $out
  148. $LD `0*$SZ`($key),$XX[0]
  149. $LD `1*$SZ`($key),$YY
  150. ldo `2*$SZ`($key),$key
  151. ldi 0xff,$mask
  152. ldi 3,$dat0
  153. ldo 1($XX[0]),$XX[0] ; warm up loop
  154. and $mask,$XX[0],$XX[0]
  155. $LDX $XX[0]($key),$TX[0]
  156. addl $TX[0],$YY,$YY
  157. cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother?
  158. and $mask,$YY,$YY
  159. and,<> $out,$dat0,$rem ; is $out aligned?
  160. b L\$alignedout
  161. subi 4,$rem,$rem
  162. sub $len,$rem,$len
  163. ___
  164. &foldedloop("L\$alignout",$rem); # process till $out is aligned
  165. $code.=<<___;
  166. L\$alignedout ; $len is at least 4 here
  167. and,<> $inp,$dat0,$acc ; is $inp aligned?
  168. b L\$oop4
  169. sub $inp,$acc,$rem ; align $inp
  170. sh3addl $acc,%r0,$acc
  171. subi 32,$acc,$acc
  172. mtctl $acc,%cr11 ; load %sar with vshd align factor
  173. ldwx $rem($out),$dat0
  174. ldo 4($rem),$rem
  175. L\$oop4misalignedinp
  176. ___
  177. &unrolledloopbody();
  178. $code.=<<___;
  179. $LDX $TY($key),$ix
  180. ldwx $rem($out),$dat1
  181. ldo -4($len),$len
  182. or $ix,$acc,$acc ; last piece, no need to dep
  183. vshd $dat0,$dat1,$iy ; align data
  184. copy $dat1,$dat0
  185. xor $iy,$acc,$acc
  186. stw $acc,0($out)
  187. cmpib,*<< 3,$len,L\$oop4misalignedinp
  188. ldo 4($out),$out
  189. cmpib,*= 0,$len,L\$done
  190. nop
  191. b L\$oop1
  192. nop
  193. .ALIGN 8
  194. L\$oop4
  195. ___
  196. &unrolledloopbody();
  197. $code.=<<___;
  198. $LDX $TY($key),$ix
  199. ldwx $inp($out),$dat0
  200. ldo -4($len),$len
  201. or $ix,$acc,$acc ; last piece, no need to dep
  202. xor $dat0,$acc,$acc
  203. stw $acc,0($out)
  204. cmpib,*<< 3,$len,L\$oop4
  205. ldo 4($out),$out
  206. cmpib,*= 0,$len,L\$done
  207. nop
  208. ___
  209. &foldedloop("L\$oop1",$len);
  210. $code.=<<___;
  211. L\$done
  212. $POP `-$FRAME-$SAVED_RP`(%sp),%r2
  213. ldo -1($XX[0]),$XX[0] ; chill out loop
  214. sub $YY,$TX[0],$YY
  215. and $mask,$XX[0],$XX[0]
  216. and $mask,$YY,$YY
  217. $ST $XX[0],`-2*$SZ`($key)
  218. $ST $YY,`-1*$SZ`($key)
  219. $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
  220. $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
  221. $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
  222. L\$abort
  223. bv (%r2)
  224. .EXIT
  225. $POPMB -$FRAME(%sp),%r3
  226. .PROCEND
  227. ___
  228. $code.=<<___;
  229. .EXPORT RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
  230. .ALIGN 8
  231. RC4_set_key
  232. .PROC
  233. .CALLINFO NO_CALLS
  234. .ENTRY
  235. $ST %r0,`0*$SZ`($key)
  236. $ST %r0,`1*$SZ`($key)
  237. ldo `2*$SZ`($key),$key
  238. copy %r0,@XX[0]
  239. L\$1st
  240. $ST @XX[0],0($key)
  241. ldo 1(@XX[0]),@XX[0]
  242. bb,>= @XX[0],`31-8`,L\$1st ; @XX[0]<256
  243. ldo $SZ($key),$key
  244. ldo `-256*$SZ`($key),$key ; rewind $key
  245. addl $len,$inp,$inp ; $inp to point at the end
  246. sub %r0,$len,%r23 ; inverse index
  247. copy %r0,@XX[0]
  248. copy %r0,@XX[1]
  249. ldi 0xff,$mask
  250. L\$2nd
  251. $LDX @XX[0]($key),@TX[0]
  252. ldbx %r23($inp),@TX[1]
  253. addi,nuv 1,%r23,%r23 ; increment and conditional
  254. sub %r0,$len,%r23 ; inverse index
  255. addl @TX[0],@XX[1],@XX[1]
  256. addl @TX[1],@XX[1],@XX[1]
  257. and $mask,@XX[1],@XX[1]
  258. $MKX @XX[0],$key,$TY
  259. $LDX @XX[1]($key),@TX[1]
  260. $MKX @XX[1],$key,$YY
  261. ldo 1(@XX[0]),@XX[0]
  262. $ST @TX[0],0($YY)
  263. bb,>= @XX[0],`31-8`,L\$2nd ; @XX[0]<256
  264. $ST @TX[1],0($TY)
  265. bv,n (%r2)
  266. .EXIT
  267. nop
  268. .PROCEND
  269. .EXPORT RC4_options,ENTRY
  270. .ALIGN 8
  271. RC4_options
  272. .PROC
  273. .CALLINFO NO_CALLS
  274. .ENTRY
  275. blr %r0,%r28
  276. ldi 3,%r1
  277. L\$pic
  278. andcm %r28,%r1,%r28
  279. bv (%r2)
  280. .EXIT
  281. ldo L\$opts-L\$pic(%r28),%r28
  282. .PROCEND
  283. .ALIGN 8
  284. L\$opts
  285. .STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
  286. .STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
  287. ___
  288. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  289. =~ /GNU assembler/) {
  290. $gnuas = 1;
  291. }
  292. foreach(split("\n",$code)) {
  293. s/\`([^\`]*)\`/eval $1/ge;
  294. s/(\.LEVEL\s+2\.0)W/$1w/ if ($gnuas && $SIZE_T==8);
  295. s/\.SPACE\s+\$TEXT\$/.text/ if ($gnuas && $SIZE_T==8);
  296. s/\.SUBSPA.*// if ($gnuas && $SIZE_T==8);
  297. s/cmpib,\*/comib,/ if ($SIZE_T==4);
  298. s/\bbv\b/bve/ if ($SIZE_T==8);
  299. print $_,"\n";
  300. }
  301. close STDOUT or die "error closing STDOUT: $!";