rc4-586.pl 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. #!/usr/local/bin/perl
  2. # At some point it became apparent that the original SSLeay RC4
  3. # assembler implementation performs suboptimally on latest IA-32
  4. # microarchitectures. After re-tuning, performance has changed as
  5. # follows:
  6. #
  7. # Pentium +0%
  8. # Pentium III +17%
  9. # AMD +52%(*)
  10. # P4 +180%(**)
  11. #
  12. # (*) This number is actually a trade-off:-) It's possible to
  13. # achieve +72%, but at the cost of -48% off PIII performance.
  14. # In other words code performing further 13% faster on AMD
  15. # would perform almost 2 times slower on Intel PIII...
  16. # For reference! This code delivers ~80% of rc4-amd64.pl
  17. # performance on the same Opteron machine.
  18. # (**) This number requires compressed key schedule set up by
  19. # RC4_set_key and therefore doesn't apply to 0.9.7 [option for
  20. # compressed key schedule is implemented in 0.9.8 and later,
  21. # see commentary section in rc4_skey.c for further details].
  22. #
  23. # <appro@fy.chalmers.se>
  # Driver: runs at load time and emits the whole assembly module.
  # The statements below are order-dependent: the helper library must be
  # loaded before asm_init, and the register globals must be set before
  # &RC4 is called, because RC4 and RC4_loop read them by name.
  #
  # Make x86asm.pl findable from either perlasm/ or ../../perlasm.
  24. push(@INC,"perlasm","../../perlasm");
  25. require "x86asm.pl";
  # First command-line argument and this script's name are handed to the
  # helper library (presumably selecting the output flavour -- see x86asm.pl).
  26. &asm_init($ARGV[0],"rc4-586.pl");
  # Register allocation shared by RC4 and RC4_loop (package globals):
  #   $x, $y   - the two RC4 indices into the table addressed by $d
  #   $tx, $ty - scratch registers holding table entries / intermediate sums
  #   $in      - input pointer  (loaded from wparam(2) in RC4)
  #   $out     - output pointer (loaded from wparam(3) in RC4)
  #   $d       - key schedule pointer (loaded from wparam(0) in RC4)
  27. $x="eax";
  28. $y="ebx";
  29. $tx="ecx";
  30. $ty="edx";
  31. $in="esi";
  32. $out="edi";
  33. $d="ebp";
  # Emit the function named "RC4", then let the helper library finish up.
  34. &RC4("RC4");
  35. &asm_finish();
  # RC4_loop($n, $p, $char) -- emit the instructions for one RC4 round.
  #
  #   $n    - round index within its group; appears in the emitted
  #           "Round $n" comment and is the byte offset at which this
  #           round's result is stored (relative to esp when $char is
  #           false, relative to $out when $char is true).
  #   $p    - position of the round in its group as passed by RC4:
  #           -1 for the first round, 0 for middle rounds, 1 for the last.
  #   $char - false: round of the 8-byte unrolled loop, the keystream byte
  #           is parked on the stack and XORed later by the caller;
  #           true: tail round that first checks for end of input, then
  #           XORs and stores a single byte itself.
  #
  # Uses the global register names $x,$y,$tx,$ty,$in,$out,$d and expects
  # $tx to already hold the table entry at index $x (the caller, or the
  # previous round, preloads it).  Each statement emits one instruction,
  # so statement order here is the emitted instruction order.
  36. sub RC4_loop
  37. {
  38. local($n,$p,$char)=@_;
  39. &comment("Round $n");
  # Tail rounds only: stop as soon as the input is exhausted.
  40. if ($char)
  41. {
  42. if ($p >= 0)
  43. {
  # Later tail rounds: reload the end pointer saved by the first tail
  # round, leave if it is <= $in, otherwise consume one input byte.
  44. &mov($ty, &swtmp(2));
  45. &cmp($ty, $in);
  46. &jbe(&label("finished"));
  47. &inc($in);
  48. }
  49. else
  50. {
  # First tail round: RC4 arrives here with $ty equal to the value it
  # stored in swtmp(2) (input pointer + len - 8).  Adding 8 turns it into
  # the real end-of-input pointer, which is then written back to swtmp(2)
  # for the following rounds.
  51. &add($ty, 8);
  52. &inc($in);
  53. &cmp($ty, $in);
  54. &jb(&label("finished"));
  55. &mov(&swtmp(2), $ty);
  56. }
  57. }
  58. # Moved out
  59. # &mov( $tx, &DWP(0,$d,$x,4)) if $p < 0;
  # y += table[x], byte-wide so it wraps within the 8-bit index.
  60. &add( &LB($y), &LB($tx));
  # $ty = table[y]
  61. &mov( $ty, &DWP(0,$d,$y,4));
  62. # XXX
  # table[x] = old table[y]
  63. &mov( &DWP(0,$d,$x,4),$ty);
  # $ty = table[x] + table[y] (old values)
  64. &add( $ty, $tx);
  # table[y] = old table[x]; the swap is now complete
  65. &mov( &DWP(0,$d,$y,4),$tx);
  # reduce the sum to a byte-sized table index
  66. &and( $ty, 0xff);
  67. &inc( &LB($x)); # NEXT ROUND
  # Preload table[x] for the following round; skipped when $p is 1 (last
  # round of a group), where the caller reloads $tx itself if needed.
  68. &mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND
  # $ty = table[sum]; its low byte is this round's keystream byte
  69. &mov( $ty, &DWP(0,$d,$ty,4));
  70. if (!$char)
  71. {
  72. #moved up into last round
  73. if ($p >= 1)
  74. {
  75. &add( $out, 8)
  76. }
  # Park the keystream byte at byte offset $n from esp; RC4 XORs the
  # collected bytes with the input after the eighth round.
  77. &movb( &BP($n,"esp","",0), &LB($ty));
  78. }
  79. else
  80. {
  # Note: $in has already been incremented by the end-of-input check at the
  # top of this round, hence the -1 offset to reach the current input byte.
  81. # (original note: "in+=8 has occurred")
  82. &movb( &HB($ty), &BP(-1,$in,"",0));
  83. # XXX
  # keystream byte XOR input byte
  84. &xorb(&LB($ty), &HB($ty));
  85. # XXX
  # $out is not advanced in the tail; $n selects the output byte
  86. &movb(&BP($n,$out,"",0),&LB($ty));
  87. }
  88. }
  # RC4($name) -- emit the complete assembly function called $name.
  #
  # The emitted function reads four stack arguments through wparam():
  #   wparam(0) key schedule, wparam(1) length,
  #   wparam(2) input pointer, wparam(3) output pointer.
  # The x and y indices are read from byte offsets 0 and 4 of the key
  # schedule and written back there on exit; the table starts at offset 8.
  #
  # Emitted layout:
  #   - early return when the length is zero (before any register is pushed)
  #   - "start"/"end": dword-table path, 8 bytes per iteration plus a tail
  #     of up to 7 single-byte rounds
  #   - "RC4_CHAR": byte-table path for the compressed key schedule
  #   - "finished": common epilogue storing x and y back
  # Each statement emits one instruction; their order is the instruction
  # schedule and must not be rearranged.
  89. sub RC4
  90. {
  91. local($name)=@_;
  # NOTE(review): the registers are pushed by hand below, so function_begin_B
  # presumably emits only the function label -- confirm in x86asm.pl.
  92. &function_begin_B($name,"");
  93. &mov($ty,&wparam(1)); # len
  94. &cmp($ty,0);
  95. &jne(&label("proceed"));
  # zero length: return right away, nothing has been pushed yet
  96. &ret();
  97. &set_label("proceed");
  98. &comment("");
  99. &push("ebp");
  100. &push("ebx");
  101. &push("esi");
  102. &xor( $x, $x); # avoid partial register stalls
  103. &push("edi");
  104. &xor( $y, $y); # avoid partial register stalls
  105. &mov( $d, &wparam(0)); # key
  106. &mov( $in, &wparam(2));
  # load the saved x and y indices (low bytes at offsets 0 and 4 of the key)
  107. &movb( &LB($x), &BP(0,$d,"",1));
  108. &movb( &LB($y), &BP(4,$d,"",1));
  109. &mov( $out, &wparam(3));
  # x is kept pre-incremented while running; undone by the dec at "finished"
  110. &inc( &LB($x));
  111. &stack_push(3); # 3 temp variables
  # step $d over the x/y fields so it addresses the table directly
  112. &add( $d, 8);
  113. # detect compressed schedule, see commentary section in rc4_skey.c...
  114. # in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant,
  115. # as compressed key schedule is set up in 0.9.8 and later.
  116. &cmp(&DWP(256,$d),-1);
  117. &je(&label("RC4_CHAR"));
  # $ty = input pointer + len - 8: last position from which a full
  # 8-byte group can still be processed
  118. &lea( $ty, &DWP(-8,$ty,$in));
  119. # check for 0 length input
  120. &mov( &swtmp(2), $ty); # this is now address to exit at
  # preload table[x] for the first round
  121. &mov( $tx, &DWP(0,$d,$x,4));
  122. &cmp( $ty, $in);
  123. &jb( &label("end")); # less than 8 bytes
  124. &set_label("start");
  125. # filling DELAY SLOT
  126. &add( $in, 8);
  # eight rounds; each stores its keystream byte at esp+0 .. esp+7
  127. &RC4_loop(0,-1,0);
  128. &RC4_loop(1,0,0);
  129. &RC4_loop(2,0,0);
  130. &RC4_loop(3,0,0);
  131. &RC4_loop(4,0,0);
  132. &RC4_loop(5,0,0);
  133. &RC4_loop(6,0,0);
  134. &RC4_loop(7,1,0);
  135. &comment("apply the cipher text");
  136. # xor the cipher data with input
  137. #&add( $out, 8); #moved up into last round
  # $in and $out were both advanced by 8 already, hence the -8/-4 offsets.
  # NOTE(review): swtmp(0)/swtmp(1) are presumably the two dwords filled
  # byte-by-byte through BP($n,"esp") above -- confirm against x86asm.pl.
  138. &mov( $tx, &swtmp(0));
  139. &mov( $ty, &DWP(-8,$in,"",0));
  140. &xor( $tx, $ty);
  141. &mov( $ty, &DWP(-4,$in,"",0));
  142. &mov( &DWP(-8,$out,"",0), $tx);
  143. &mov( $tx, &swtmp(1));
  144. &xor( $tx, $ty);
  145. &mov( $ty, &swtmp(2)); # load end ptr;
  146. &mov( &DWP(-4,$out,"",0), $tx);
  # reload table[x]: the last round (p == 1) skipped its preload
  147. &mov( $tx, &DWP(0,$d,$x,4));
  # loop while another full 8-byte group fits
  148. &cmp($in, $ty);
  149. &jbe(&label("start"));
  150. &set_label("end");
  151. # There is quite a bit of extra crap in RC4_loop() for this
  152. # first round
  # Tail: at most seven single-byte rounds; every round checks for end of
  # input itself and jumps to "finished" when done.
  153. &RC4_loop(0,-1,1);
  154. &RC4_loop(1,0,1);
  155. &RC4_loop(2,0,1);
  156. &RC4_loop(3,0,1);
  157. &RC4_loop(4,0,1);
  158. &RC4_loop(5,0,1);
  159. &RC4_loop(6,1,1);
  160. &jmp(&label("finished"));
  161. &align(16);
  162. # this is essentially Intel P4 specific codepath, see rc4_skey.c,
  163. # and is engaged in 0.9.8 and later context...
  164. &set_label("RC4_CHAR");
  # $ty still holds len here (the lea above was skipped by the je), so
  # this computes the end-of-input pointer and saves it in swtmp(2)
  165. &lea ($ty,&DWP(0,$in,$ty));
  166. &mov (&swtmp(2),$ty);
  # table entries are single bytes on this path: $tx = table[x]
  167. &movz ($tx,&BP(0,$d,$x));
  168. # strangely enough unrolled loop performs over 20% slower...
  169. &set_label("RC4_CHAR_loop");
  # one byte per iteration: y += table[x]; swap table[x] and table[y];
  # output = input XOR table[(table[x] + table[y]) & 0xff]
  170. &add (&LB($y),&LB($tx));
  171. &movz ($ty,&BP(0,$d,$y));
  172. &movb (&BP(0,$d,$y),&LB($tx));
  173. &movb (&BP(0,$d,$x),&LB($ty));
  174. &add (&LB($ty),&LB($tx));
  175. &movz ($ty,&BP(0,$d,$ty));
  176. &add (&LB($x),1);
  177. &xorb (&LB($ty),&BP(0,$in));
  178. &lea ($in,&DWP(1,$in));
  # preload table[x] for the next iteration
  179. &movz ($tx,&BP(0,$d,$x));
  180. &cmp ($in,&swtmp(2));
  181. &movb (&BP(0,$out),&LB($ty));
  # lea (not add/inc) advances $out without touching the flags set by the
  # cmp above, which the jb below consumes
  182. &lea ($out,&DWP(1,$out));
  183. &jb (&label("RC4_CHAR_loop"));
  184. &set_label("finished");
  # undo the pre-increment of x before saving it
  185. &dec( $x);
  186. &stack_pop(3);
  # $d was advanced by 8 on entry, so -4/-8 are the y and x fields
  187. &movb( &BP(-4,$d,"",0),&LB($y));
  188. &movb( &BP(-8,$d,"",0),&LB($x));
  189. &function_end($name);
  190. }