2
0

rc4-parisc.pl 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333
  1. #! /usr/bin/env perl
  2. # Copyright 2009-2018 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. # ====================================================================
  14. # RC4 for PA-RISC.
  15. # June 2009.
  16. #
  17. # Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
  18. # For reference, [4x] unrolled loop is >40% faster than folded one.
  19. # It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement
  20. # is believed to be not sufficient to justify the effort...
  21. #
  22. # Special thanks to polarhome.com for providing HP-UX account.
  # Command line: <flavour> [<output>].
  #   <flavour>  selects the variant; anything containing "64" means 64-bit.
  #   <output>   file that receives the generated assembly.
  #
  # $dir is the directory part of the script path, trailing separator
  # included, or the empty string when the script was invoked without one.
  $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
  $flavour = shift;
  $output = shift;
  # Redirect STDOUT only when an output file was actually named, and stop if
  # it cannot be created: an unchecked failure would leave the assembly going
  # to the inherited STDOUT while the caller believes the file was written.
  # The three-argument form keeps characters in the file name from being
  # interpreted as part of the open mode.
  if (defined($output) && $output ne "") {
      open(STDOUT, ">", $output) or die "can't open $output: $!";
  }
  # Per-ABI parameters.  A flavour containing "64" selects the 2.0W column,
  # anything else (including no flavour at all) the 1.0 column.
  #   $LEVEL          operand of the .LEVEL directive
  #   $SIZE_T         size in bytes of one saved-register slot
  #   $FRAME_MARKER   size in bytes of the frame marker
  #   $SAVED_RP       distance below %sp at which %r2 is stored on entry
  #   $PUSH, $POP     store/load mnemonics used for %r2 and %r4-%r6
  #   $PUSHMA, $POPMB store/load mnemonics used for %r3 at offset +/-$FRAME
  ($LEVEL, $SIZE_T, $FRAME_MARKER, $SAVED_RP, $PUSH, $PUSHMA, $POP, $POPMB) =
      ($flavour =~ /64/)
          ? ("2.0W", 8, 80, 16, "std", "std,ma", "ldd", "ldd,mb")
          : ("1.0",  4, 48, 20, "stw", "stwm",   "ldw", "ldwm");
  # Frame size: 4 saved regs + frame marker [+ argument transfer].
  $FRAME = $FRAME_MARKER + 4*$SIZE_T;
  # Width in bytes of one RC4 state element: 1 for RC4_CHAR, 4 for RC4_INT.
  # It is read from the RC4_INT definition in opensslconf.h two directories
  # above the script; when the header cannot be opened, or holds no such
  # definition, the RC4_CHAR default of 1 stands.
  $SZ = _rc4_elem_size("${dir}../../opensslconf.h");

  # Return the element width implied by the first "#define RC4_INT ..." line
  # of the file $path: 1 when the definition ends in "char", 4 for any other
  # definition, and 1 when the file is unreadable or has no RC4_INT line.
  sub _rc4_elem_size {
      my ($path) = @_;
      my $size = 1;                         # defaults to RC4_CHAR
      if (open(my $conf, "<", $path)) {
          while (my $line = <$conf>) {
              next unless $line =~ m/#\s*define\s+RC4_INT\s+(.*)/;
              # Allow trailing blanks or the CR of a CRLF line ending, so
              # that "unsigned char" is not mistaken for the 4-byte case.
              $size = ($1 =~ /char\s*$/) ? 1 : 4;
              last;
          }
          close($conf);
      }
      return $size;
  }
  # Mnemonics for working with state-array elements, chosen by element width:
  #   $LD   load an element at a fixed displacement
  #   $LDX  load an element through an index register
  #   $MKX  form the address of an indexed element
  #   $ST   store an element
  # RC4_INT is ~5% faster than RC4_CHAR on PA-7100LC.
  ($LD, $LDX, $MKX, $ST) = ($SZ == 1)
      ? ("ldb", "ldbx",   "addl",    "stb")      # RC4_CHAR
      : ("ldw", "ldwx,s", "sh2addl", "stw");     # RC4_INT
  # Symbolic names for the registers used by the generated code.
  #
  # The four argument registers of RC4 (RC4_set_key reuses the first three
  # names for its own arguments).
  ($key, $len, $inp, $out) = map { "%r$_" } (26, 25, 24, 23);
  # Two-element pools for the x index and for the state element loaded
  # through it; unrolledloopbody rotates both pools after every step.
  @XX = map { "%r$_" } (19, 20);
  @TX = map { "%r$_" } (21, 22);
  # The y index and the state element loaded through it.
  ($YY, $TY) = map { "%r$_" } (28, 29);
  # $acc collects output bytes, $ix/$iy hold element addresses made by $MKX,
  # $dat0/$dat1 hold input data and $rem a residual count or offset.
  ($acc, $ix, $iy, $dat0, $dat1, $rem) = map { "%r$_" } (1 .. 6);
  # Loaded with 0xff and used to reduce indices modulo 256.
  $mask = "%r31";
  # unrolledloopbody: append four consecutive RC4 steps to $code.
  #
  # Takes no arguments.  Each of the four passes appends one copy of the
  # heredoc below.  Variables are interpolated at that moment, so every copy
  # is written in terms of the registers currently at the front of @XX and
  # @TX, and the value of $i is substituted as a literal digit.
  #
  # The two backquoted sprintf lines are not run here.  They stay in the text
  # and are evaluated by the `...` substitution in the output loop at the end
  # of the file.  Register names already start with '%', so "%$TY" becomes
  # "%%r29", which sprintf turns back into "%r29".  Because of the trailing
  # "if ($i>0)" they yield an instruction only in passes 1..3:
  #   - a $LDX load of the element indexed by $TY (as left by the previous
  #     pass) into $dat1;
  #   - a dep (zdep in pass 1) of $dat1 into $acc, 8 bits wide, at bit
  #     position 8*($i-1)+7, i.e. 7, 15 and 23.
  #
  # Side effects: $code grows, @TX and @XX are rotated once per pass, and the
  # package variable $i is left at 4.
  84. sub unrolledloopbody {
  85. for ($i=0;$i<4;$i++) {
  86. $code.=<<___;
  87. ldo 1($XX[0]),$XX[1]
  88. `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)`
  89. and $mask,$XX[1],$XX[1]
  90. $LDX $YY($key),$TY
  91. $MKX $YY,$key,$ix
  92. $LDX $XX[1]($key),$TX[1]
  93. $MKX $XX[0],$key,$iy
  94. $ST $TX[0],0($ix)
  95. comclr,<> $XX[1],$YY,%r0 ; conditional
  96. copy $TX[0],$TX[1] ; move
  97. `sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
  98. $ST $TY,0($iy)
  99. addl $TX[0],$TY,$TY
  100. addl $TX[1],$YY,$YY
  101. and $mask,$TY,$TY
  102. and $mask,$YY,$YY
  103. ___
  # The next pass must see this pass's "$XX[1]"/"$TX[1]" as its "$XX[0]"/
  # "$TX[0]", hence the swap of both two-element pools.
  104. push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
  105. } }
  # foldedloop: append a one-byte-per-iteration RC4 loop to $code.
  #
  # Arguments:
  #   $label  text emitted on a line of its own as the loop label and used as
  #           the target of the closing branch
  #   $count  name of the register that the closing "addib,<> -1" decrements
  #           and tests
  #
  # Each iteration stores $TX[0] and $TY to each other's slots, loads one
  # input byte with "ldbx $inp($out)", xors it with the state element indexed
  # by the masked sum of $TX[0] and $TY, and stores the result at -1($out)
  # after $out has been advanced by one.  Only $XX[0] and $TX[0] are used;
  # the register pools are not rotated.
  # NOTE(review): the "and $mask,$YY,$YY" after the branch is presumably its
  # delay-slot instruction -- confirm against the PA-RISC manual.
  106. sub foldedloop {
  107. my ($label,$count)=@_;
  108. $code.=<<___;
  109. $label
  110. $MKX $YY,$key,$iy
  111. $LDX $YY($key),$TY
  112. $MKX $XX[0],$key,$ix
  113. $ST $TX[0],0($iy)
  114. ldo 1($XX[0]),$XX[0]
  115. $ST $TY,0($ix)
  116. addl $TX[0],$TY,$TY
  117. ldbx $inp($out),$dat1
  118. and $mask,$TY,$TY
  119. and $mask,$XX[0],$XX[0]
  120. $LDX $TY($key),$acc
  121. $LDX $XX[0]($key),$TX[0]
  122. ldo 1($out),$out
  123. xor $dat1,$acc,$acc
  124. addl $TX[0],$YY,$YY
  125. stb $acc,-1($out)
  126. addib,<> -1,$count,$label ; $count is always small
  127. and $mask,$YY,$YY
  128. ___
  129. }
  # Start the assembly text with the RC4 entry point.  Variables are
  # interpolated now; backquoted expressions stay in the text until the
  # output loop at the end of the file evaluates them.
  #
  # This first piece holds the section directives, the prologue that saves
  # %r2-%r6, an early branch to L$abort when $len is 0, the loads of the two
  # index values kept in elements 0 and 1 of the key structure (after which
  # $key is advanced past them), the warm-up load of the first state element,
  # and the choice between the byte loop (L$oop1, taken when $len is too
  # small) and the word-at-a-time paths.  $inp is turned into the distance
  # from $out so that one register addresses both buffers.
  130. $code=<<___;
  131. .LEVEL $LEVEL
  132. .SPACE \$TEXT\$
  133. .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
  134. .EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
  135. RC4
  136. .PROC
  137. .CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
  138. .ENTRY
  139. $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
  140. $PUSHMA %r3,$FRAME(%sp)
  141. $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
  142. $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
  143. $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
  144. cmpib,*= 0,$len,L\$abort
  145. sub $inp,$out,$inp ; distance between $inp and $out
  146. $LD `0*$SZ`($key),$XX[0]
  147. $LD `1*$SZ`($key),$YY
  148. ldo `2*$SZ`($key),$key
  149. ldi 0xff,$mask
  150. ldi 3,$dat0
  151. ldo 1($XX[0]),$XX[0] ; warm up loop
  152. and $mask,$XX[0],$XX[0]
  153. $LDX $XX[0]($key),$TX[0]
  154. addl $TX[0],$YY,$YY
  155. cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother?
  156. and $mask,$YY,$YY
  157. and,<> $out,$dat0,$rem ; is $out aligned?
  158. b L\$alignedout
  159. subi 4,$rem,$rem
  160. sub $len,$rem,$len
  161. ___
  # Byte loop counted by $rem, which the code above set to 4 minus the low
  # two bits of $out and already subtracted from $len.
  162. &foldedloop("L\$alignout",$rem); # process till $out is aligned
  163. $code.=<<___;
  164. L\$alignedout ; $len is at least 4 here
  165. and,<> $inp,$dat0,$acc ; is $inp aligned?
  166. b L\$oop4
  167. sub $inp,$acc,$rem ; align $inp
  168. sh3addl $acc,%r0,$acc
  169. subi 32,$acc,$acc
  170. mtctl $acc,%cr11 ; load %sar with vshd align factor
  171. ldwx $rem($out),$dat0
  172. ldo 4($rem),$rem
  173. L\$oop4misalignedinp
  174. ___
  # Four RC4 steps for the misaligned-input loop.  The tail that follows
  # fetches the next input word, merges the fourth key-stream element into
  # $acc, realigns the input with vshd and stores one output word.
  175. &unrolledloopbody();
  176. $code.=<<___;
  177. $LDX $TY($key),$ix
  178. ldwx $rem($out),$dat1
  179. ldo -4($len),$len
  180. or $ix,$acc,$acc ; last piece, no need to dep
  181. vshd $dat0,$dat1,$iy ; align data
  182. copy $dat1,$dat0
  183. xor $iy,$acc,$acc
  184. stw $acc,0($out)
  185. cmpib,*<< 3,$len,L\$oop4misalignedinp
  186. ldo 4($out),$out
  187. cmpib,*= 0,$len,L\$done
  188. nop
  189. b L\$oop1
  190. nop
  191. .ALIGN 8
  192. L\$oop4
  193. ___
  # Four RC4 steps for L$oop4, the loop reached when the input needs no
  # realignment; its tail xors a whole input word and stores one output word.
  194. &unrolledloopbody();
  195. $code.=<<___;
  196. $LDX $TY($key),$ix
  197. ldwx $inp($out),$dat0
  198. ldo -4($len),$len
  199. or $ix,$acc,$acc ; last piece, no need to dep
  200. xor $dat0,$acc,$acc
  201. stw $acc,0($out)
  202. cmpib,*<< 3,$len,L\$oop4
  203. ldo 4($out),$out
  204. cmpib,*= 0,$len,L\$done
  205. nop
  206. ___
  # Byte loop counted by $len: handles the bytes left over by the word loops
  # and the whole input when the length test at the top branched here.
  207. &foldedloop("L\$oop1",$len);
  # Epilogue: undo the warm-up on the two index values, store them back into
  # the two elements in front of the state array, restore the saved registers
  # and return.  L$abort skips everything except the return and the %r3
  # restore.
  208. $code.=<<___;
  209. L\$done
  210. $POP `-$FRAME-$SAVED_RP`(%sp),%r2
  211. ldo -1($XX[0]),$XX[0] ; chill out loop
  212. sub $YY,$TX[0],$YY
  213. and $mask,$XX[0],$XX[0]
  214. and $mask,$YY,$YY
  215. $ST $XX[0],`-2*$SZ`($key)
  216. $ST $YY,`-1*$SZ`($key)
  217. $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
  218. $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
  219. $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
  220. L\$abort
  221. bv (%r2)
  222. .EXIT
  223. $POPMB -$FRAME(%sp),%r3
  224. .PROCEND
  225. ___
  # Append RC4_set_key, RC4_options and the strings they refer to.
  #
  # RC4_set_key stores zero into elements 0 and 1 of the key structure,
  # advances $key past them, fills the following 256 elements with 0..255
  # (L$1st), rewinds $key and then runs 256 swap steps (L$2nd).  Each step
  # takes one key byte through the negative index in %r23, which is reset to
  # -$len by the instruction after "addi,nuv".
  # RC4_options leaves the address of the L$opts string in %r28.
  #
  # "@XX[0]", "@XX[1]", "@TX[0]" and "@TX[1]" are one-element array slices;
  # inside this heredoc they interpolate to the same text as "$XX[0]" etc.
  226. $code.=<<___;
  227. .EXPORT RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
  228. .ALIGN 8
  229. RC4_set_key
  230. .PROC
  231. .CALLINFO NO_CALLS
  232. .ENTRY
  233. $ST %r0,`0*$SZ`($key)
  234. $ST %r0,`1*$SZ`($key)
  235. ldo `2*$SZ`($key),$key
  236. copy %r0,@XX[0]
  237. L\$1st
  238. $ST @XX[0],0($key)
  239. ldo 1(@XX[0]),@XX[0]
  240. bb,>= @XX[0],`31-8`,L\$1st ; @XX[0]<256
  241. ldo $SZ($key),$key
  242. ldo `-256*$SZ`($key),$key ; rewind $key
  243. addl $len,$inp,$inp ; $inp to point at the end
  244. sub %r0,$len,%r23 ; inverse index
  245. copy %r0,@XX[0]
  246. copy %r0,@XX[1]
  247. ldi 0xff,$mask
  248. L\$2nd
  249. $LDX @XX[0]($key),@TX[0]
  250. ldbx %r23($inp),@TX[1]
  251. addi,nuv 1,%r23,%r23 ; increment and conditional
  252. sub %r0,$len,%r23 ; inverse index
  253. addl @TX[0],@XX[1],@XX[1]
  254. addl @TX[1],@XX[1],@XX[1]
  255. and $mask,@XX[1],@XX[1]
  256. $MKX @XX[0],$key,$TY
  257. $LDX @XX[1]($key),@TX[1]
  258. $MKX @XX[1],$key,$YY
  259. ldo 1(@XX[0]),@XX[0]
  260. $ST @TX[0],0($YY)
  261. bb,>= @XX[0],`31-8`,L\$2nd ; @XX[0]<256
  262. $ST @TX[1],0($TY)
  263. bv,n (%r2)
  264. .EXIT
  265. nop
  266. .PROCEND
  267. .EXPORT RC4_options,ENTRY
  268. .ALIGN 8
  269. RC4_options
  270. .PROC
  271. .CALLINFO NO_CALLS
  272. .ENTRY
  273. blr %r0,%r28
  274. ldi 3,%r1
  275. L\$pic
  276. andcm %r28,%r1,%r28
  277. bv (%r2)
  278. .EXIT
  279. ldo L\$opts-L\$pic(%r28),%r28
  280. .PROCEND
  281. .ALIGN 8
  282. L\$opts
  283. .STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
  284. .STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
  285. ___
  # Ask the compiler driver named by $ENV{CC} which assembler it runs.  When
  # the answer mentions "GNU assembler", 64-bit output has its HP-style
  # directives rewritten by _finalize_line.
  # NOTE(review): $ENV{CC} is interpolated into a shell command line; it is
  # assumed to come from a trusted build environment.
  if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
      =~ /GNU assembler/) {
      $gnuas = 1;
  }

  # Emit the generated text one line at a time.
  foreach my $line (split("\n",$code)) {
      print _finalize_line($line),"\n";
  }
  # Buffered write errors (a full disk, for instance) only surface here, so
  # the result of close must be checked or a truncated file would be taken
  # for a good one.
  close(STDOUT) or die "error closing STDOUT: $!";

  # Return one line of generated text in its final form:
  #   - every `...` span is replaced by the result of evaluating it as Perl
  #     (the expressions refer to package variables such as $FRAME and $SZ);
  #   - 64-bit output for GNU as gets ".LEVEL 2.0w", ".text" in place of
  #     ".SPACE $TEXT$", and has its .SUBSPA directive removed;
  #   - 64-bit output uses "bve" where the text says "bv";
  #   - 32-bit output uses "comib," where the text says "cmpib,*".
  sub _finalize_line {
      my ($line) = @_;
      $line =~ s/\`([^\`]*)\`/eval $1/ge;
      if ($SIZE_T==8) {
          if ($gnuas) {
              $line =~ s/(\.LEVEL\s+2\.0)W/$1w/;
              $line =~ s/\.SPACE\s+\$TEXT\$/.text/;
              $line =~ s/\.SUBSPA.*//;
          }
          $line =~ s/\bbv\b/bve/;
      }
      $line =~ s/cmpib,\*/comib,/ if ($SIZE_T==4);
      return $line;
  }