rc4-ia64.pl 22 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755
  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by David Mosberger <David.Mosberger@acm.org> based on the
  5. # Itanium optimized Crypto code which was released by HP Labs at
  6. # http://www.hpl.hp.com/research/linux/crypto/.
  7. #
  8. # Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
  9. #
  10. # Permission is hereby granted, free of charge, to any person obtaining
  11. # a copy of this software and associated documentation files (the
  12. # "Software"), to deal in the Software without restriction, including
  13. # without limitation the rights to use, copy, modify, merge, publish,
  14. # distribute, sublicense, and/or sell copies of the Software, and to
  15. # permit persons to whom the Software is furnished to do so, subject to
  16. # the following conditions:
  17. #
  18. # The above copyright notice and this permission notice shall be
  19. # included in all copies or substantial portions of the Software.
  20. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  21. # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  22. # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  23. # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  24. # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  25. # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  26. # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
  27. # This is a little helper program which generates a software-pipelined
  28. # loop for RC4 encryption. The basic algorithm looks like this:
  29. #
  30. # for (counter = 0; counter < len; ++counter)
  31. # {
  32. # in = inp[counter];
  33. # SI = S[I];
  34. # J = (SI + J) & 0xff;
  35. # SJ = S[J];
  36. # T = (SI + SJ) & 0xff;
  37. # S[I] = SJ, S[J] = SI;
  38. # ST = S[T];
  39. # outp[counter] = in ^ ST;
  40. # I = (I + 1) & 0xff;
  41. # }
  42. #
  43. # Pipelining this loop isn't easy, because the stores to the S[] array
  44. # need to be observed in the right order. The loop generated by the
  45. # code below has the following pipeline diagram:
  46. #
  47. #           cycle
  48. #     | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 |
  49. # iter
  50. #   1: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
  51. #   2:             xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
  52. #   3:                         xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
  53. #
  54. # where:
  55. # LDI = load of S[I]
  56. # LDJ = load of S[J]
  57. # SWP = swap of S[I] and S[J]
  58. # LDT = load of S[T]
  59. #
  60. # Note that in the above diagram, the major trouble-spot is that LDI
  61. # of the 2nd iteration is performed BEFORE the SWP of the first
  62. # iteration. Fortunately, this is easy to detect (I of the 1st
  63. # iteration will be equal to J of the 2nd iteration) and when this
  64. # happens, we simply forward the proper value from the 1st iteration
  65. # to the 2nd one. The proper value in this case is simply the value
  66. # of S[I] from the first iteration (thanks to the fact that SWP
  67. # simply swaps the contents of S[I] and S[J]).
  68. #
  69. # Another potential trouble-spot is in cycle 7, where SWP of the 1st
  70. # iteration issues at the same time as the LDI of the 3rd iteration.
  71. # However, thanks to IA-64 execution semantics, this can be taken
  72. # care of simply by placing LDI later in the instruction-group than
  73. # SWP. IA-64 CPUs will automatically forward the value if they
  74. # detect that the SWP and LDI are accessing the same memory-location.
  75. # The core-loop that can be pipelined then looks like this (annotated
  76. # with McKinley/Madison issue port & latency numbers, assuming L1
  77. # cache hits for the most part):
  78. # operation: instruction: issue-ports: latency
  79. # ------------------ ----------------------------- ------------- -------
  80. # Data = *inp++ ld1 data = [inp], 1 M0-M1 1 cyc c0
  81. # shladd Iptr = I, KeyTable, 3 M0-M3, I0, I1 1 cyc
  82. # I = (I + 1) & 0xff padd1 nextI = I, one M0-M3, I0, I1 3 cyc
  83. # ;;
  84. # SI = S[I] ld8 SI = [Iptr] M0-M1 1 cyc c1 * after SWAP!
  85. # ;;
  86. # cmp.eq.unc pBypass = I, J * after J is valid!
  87. # J = SI + J add J = J, SI M0-M3, I0, I1 1 cyc c2
  88. # (pBypass) br.cond.spnt Bypass
  89. # ;;
  90. # ---------------------------------------------------------------------------------------
  91. # J = J & 0xff zxt1 J = J I0, I1, 1 cyc c3
  92. # ;;
  93. # shladd Jptr = J, KeyTable, 3 M0-M3, I0, I1 1 cyc c4
  94. # ;;
  95. # SJ = S[J] ld8 SJ = [Jptr] M0-M1 1 cyc c5
  96. # ;;
  97. # ---------------------------------------------------------------------------------------
  98. # T = (SI + SJ) add T = SI, SJ M0-M3, I0, I1 1 cyc c6
  99. # ;;
  100. # T = T & 0xff zxt1 T = T I0, I1 1 cyc
  101. # S[I] = SJ st8 [Iptr] = SJ M2-M3 c7
  102. # S[J] = SI st8 [Jptr] = SI M2-M3
  103. # ;;
  104. # shladd Tptr = T, KeyTable, 3 M0-M3, I0, I1 1 cyc c8
  105. # ;;
  106. # ---------------------------------------------------------------------------------------
  107. # T = S[T] ld8 T = [Tptr] M0-M1 1 cyc c9
  108. # ;;
  109. # data ^= T xor data = data, T M0-M3, I0, I1 1 cyc c10
  110. # ;;
  111. # *out++ = Data ^ T dep word = word, data, 8, POS I0, I1 1 cyc c11
  112. # ;;
  113. # ---------------------------------------------------------------------------------------
  114. # There are several points worth making here:
  115. # - Note that due to the bypass/forwarding-path, the first two
  116. # phases of the loop are strangely mingled together. In
  117. # particular, note that the first stage of the pipeline is
  118. # using the value of "J", as calculated by the second stage.
  119. # - Each bundle-pair will have exactly 6 instructions.
  120. # - Pipelined, the loop can execute in 3 cycles/iteration and
  121. # 4 stages. However, McKinley/Madison can issue "st1" to
  122. # the same bank at a rate of at most one per 4 cycles. Thus,
  123. # instead of storing each byte, we accumulate them in a word
  124. # and then write them back at once with a single "st8" (this
  125. # implies that the setup code needs to ensure that the output
  126. # buffer is properly aligned, if need be, by encoding the
  127. # first few bytes separately).
  128. # - There is no space for a "br.ctop" instruction. For this
  129. # reason we can't use modulo-loop support in IA-64 and have
  130. # to do a traditional, purely software-pipelined loop.
  131. # - We can't replace any of the remaining "add/zxt1" pairs with
  132. # "padd1" because the latency for that instruction is too high
  133. # and would push the loop to the point where more bypasses
  134. # would be needed, which we don't have space for.
  135. # - The above loop runs at around 3.26 cycles/byte, or roughly
  136. # 440 MByte/sec on a 1.5GHz Madison. This is well below the
  137. # system bus bandwidth and hence with judicious use of
  138. # "lfetch" this loop can run at (almost) peak speed even when
  139. # the input and output data reside in memory. The
  140. # max. latency that can be tolerated is (PREFETCH_DISTANCE *
  141. # L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at
  142. # least) 1-ahead prefetching of 128 byte cache-lines. Note
  143. # that we do NOT prefetch into L1, since that would only
  144. # interfere with the S[] table values stored there. This is
  145. # acceptable because there is a 10 cycle latency between
  146. # load and first use of the input data.
  147. # - We use a branch to out-of-line bypass-code because of cycle-pressure:
  148. # we calculate the next J, check for the need to activate the
  149. # bypass path, and activate the bypass path ALL IN THE SAME
  150. # CYCLE. If we didn't have these constraints, we could do
  151. # the bypass with a simple conditional move instruction.
  152. # Fortunately, the bypass paths get activated relatively
  153. # infrequently, so the extra branches don't cost all that much
  154. # (about 0.04 cycles/byte, measured on a 16396 byte file with
  155. # random input data).
  156. #
  # Shape of the generated software-pipelined loop.
  $phases       = 4;    # number of stages/phases in the pipelined-loop
  $unroll_count = 6;    # number of times we unrolled it

  # One bit per pipeline stage.  emit_body() tests its predicate mask
  # against these to decide which instructions belong to an iteration.
  ($pComI, $pComJ, $pComT, $pOut) = (0x1, 0x2, 0x4, 0x8);

  # Modulus applied to the iteration number when emit_body() picks a slot
  # in each register set; the values match the ".rotr" declaration in the
  # assembly template (Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2],
  # OutWord[2]).
  ($NData, $NIP, $NJP, $NI)    = (4, 3, 2, 2);
  ($NSI, $NSJ, $NT, $NOutWord) = (3, 2, 2, 2);

  #
  # $threshold is the minimum length before we attempt to use the
  # big software-pipelined loop.  It MUST be greater-or-equal to:
  #
  #     PHASES * (UNROLL_COUNT + 1) + 7
  #
  # The "+ 7" comes from the fact we may have to encode up to 7 bytes
  # separately before the output pointer is aligned.  The value used
  # here is three times the pipeline term, plus those 7 bytes.
  #
  $threshold = 3 * $phases * ($unroll_count + 1) + 7;
  # I(\$buf, $format, @args)
  #
  # Append one instruction line to the string referenced by the first
  # argument.  The line is built with sprintf from $format and @args,
  # prefixed with two tabs and terminated with a newline.
  sub I {
      my ($buf, $fmt, @args) = @_;
      $$buf .= sprintf("\t\t" . $fmt . "\n", @args);
  }
  # P(\$buf, $format, @args)
  #
  # Append one unindented line (used for labels and predicated branches)
  # to the string referenced by the first argument.  The line is built
  # with sprintf from $format and @args and terminated with a newline.
  sub P {
      my ($buf, $fmt, @args) = @_;
      $$buf .= sprintf($fmt . "\n", @args);
  }
  # STOP(\$buf)
  #
  # Append an IA-64 stop (";;" on a line of its own) to the string
  # referenced by the argument, ending the current instruction group.
  sub STOP {
      my $buf = shift;
      $$buf .= ";;\n";
  }
  # emit_body(\$asm, \$bypass, $iteration, $p)
  #
  # Append the assembly for one iteration of the unrolled, software-
  # pipelined loop to the string referenced by the first argument.  When
  # a bypass check is generated, the matching out-of-line fix-up code is
  # appended to the string referenced by the second argument.
  #
  #   $iteration  zero-based iteration number.  It selects the slot used
  #               in each register set (iteration modulo the $N*
  #               constants), the byte position inside the output word
  #               and the numeric suffix of the bypass/resume labels.
  #   $p          bit mask of active stages ($pComI, $pComJ, $pComT,
  #               $pOut).
  #
  # If none of the four stage bits is set, only the final 4-byte store of
  # the accumulated output word is emitted.
  sub emit_body {
      my ($asm, $fixups, $iter, $p) = @_;

      # Iteration numbers whose values are consumed at this point.
      my ($i0, $i1, $i2, $i3) = ($iter, $iter - 1, $iter - 2, $iter - 3);

      # Output-word bookkeeping: the byte produced here belongs to
      # iteration $iter - 3, eight bytes per word.  The "%" operator works
      # on integers, so the fractional part of the quotients is dropped.
      my $word_cur  = ($iter - 3) / 8;
      my $word_prev = ($iter > 3) ? ($iter - 4) / 8 : 1;
      my $byte_num  = ($iter - 3) % 8;
      my $ow_cur    = $word_cur % $NOutWord;
      my $ow_prev   = $word_prev % $NOutWord;
      my $label     = $iter + 1;

      # Which stages are active for this iteration.
      my $stages = $p & 0xf;
      my $full   = ($stages == 0xf);   # bundle braces only with all stages on
      my $doI    = $p & $pComI;
      my $doJ    = $p & $pComJ;
      my $doT    = $p & $pComT;
      my $doOut  = $p & $pOut;
      my $check  = ($doI && ($iter > 0));   # I == J test needs a previous I

      $$asm .= "//////////////////////////////////////////////////\n";

      # All stages drained: flush the last (half) output word and stop.
      if ($stages == 0) {
          $$asm .= "#ifdef HOST_IS_BIG_ENDIAN\n";
          &I($asm, "shr.u OutWord[%u] = OutWord[%u], 32;;", $ow_prev, $ow_prev);
          $$asm .= "#endif\n";
          &I($asm, "st4 [OutPtr] = OutWord[%u], 4", $ow_prev);
          return;
      }

      # Each entry is [ condition, emitter, arguments... ].  The entries
      # are emitted strictly in this order; the order is the instruction
      # schedule and must not be changed.
      my @schedule = (
          # ---- Cycle 0 ----
          [ $full,  \&I, "{ .mmi" ],
          [ $doI,   \&I, "ld1 Data[%u] = [InPtr], 1", $i0 % $NData ],
          [ $doI,   \&I, "padd1 I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI ],
          [ $doJ,   \&I, "zxt1 J = J" ],
          [ $full,  \&I, "}" ],
          [ $full,  \&I, "{ .mmi" ],
          [ $doOut, \&I, "LKEY T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT ],
          [ $doT,   \&I, "add T[%u] = SI[%u], SJ[%u]",
                         $i0 % $NT, $i2 % $NSI, $i1 % $NSJ ],
          [ $doI,   \&I, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI ],
          [ $full,  \&I, "}" ],
          [ 1,      \&STOP ],

          # ---- Cycle 1 ----
          [ $full,  \&I, "{ .mmi" ],
          [ $doT,   \&I, "SKEY [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1 % $NSJ ],
          [ $doT,   \&I, "SKEY [JP[%u]] = SI[%u]", $i1 % $NJP, $i2 % $NSI ],
          [ $doT,   \&I, "zxt1 T[%u] = T[%u]", $i0 % $NT, $i0 % $NT ],
          [ $full,  \&I, "}" ],
          [ $full,  \&I, "{ .mmi" ],
          [ $doI,   \&I, "LKEY SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0 % $NIP ],
          [ $doJ,   \&I, "KEYADDR(JP[%u], J)", $i0 % $NJP ],
          [ $doOut, \&I, "xor Data[%u] = Data[%u], T[%u]",
                         $i3 % $NData, $i3 % $NData, $i1 % $NT ],
          [ $full,  \&I, "}" ],
          [ 1,      \&STOP ],

          # ---- Cycle 2 ----
          [ $full,  \&I, "{ .mmi" ],
          [ $doJ,   \&I, "LKEY SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0 % $NJP ],
          [ $check, \&I, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI ],
          [ $doOut, \&I, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8",
                         $ow_cur, $i3 % $NData, $ow_prev, $byte_num ],
          [ $full,  \&I, "}" ],
          [ $full,  \&I, "{ .mmb" ],
          [ $doI,   \&I, "add J = J, SI[%u]", $i0 % $NSI ],
          [ $doT,   \&I, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT ],
          [ $check, \&P, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u", $label ],
          [ $full,  \&I, "}" ],
          [ 1,      \&STOP ],

          # The bypass code branches back to this label.
          [ $check, \&P, ".rc4Resume%u:", $label ],
      );

      foreach my $slot (@schedule) {
          my ($wanted, $emit, @args) = @$slot;
          &$emit($asm, @args) if $wanted;
      }

      # A full output word is complete: store it.  On the last iteration
      # of the unrolled body also issue the prefetches and close the loop.
      if ($byte_num == 0 && $iter >= $phases) {
          &I($asm, "st8 [OutPtr] = OutWord[%u], 8", $ow_prev) if $doOut;
          if ($iter == (1 + $unroll_count) * $phases - 1) {
              &I($asm, "mov OutWord[%u] = OutWord[%u]", $ow_prev, $ow_cur)
                  if $unroll_count == 6;
              my $stride = $unroll_count * $phases;
              &I($asm, "lfetch.nt1 [InPrefetch], %u", $stride);
              &I($asm, "lfetch.excl.nt1 [OutPrefetch], %u", $stride);
              &I($asm, "br.cloop.sptk.few .rc4Loop");
          }
      }

      # Out-of-line bypass: undo the addition of the SI value loaded for
      # this iteration, redo it with the previous iteration's SI value,
      # copy that value into this iteration's SI slot and resume.
      if ($check) {
          my $si_now  = $i0 % $NSI;
          my $si_prev = $i1 % $NSI;
          &P($fixups, ".rc4Bypass%u:", $label);
          &I($fixups, "sub J = J, SI[%u]", $si_now);
          &I($fixups, "nop 0");
          &I($fixups, "nop 0");
          &I($fixups, ";;");
          &I($fixups, "add J = J, SI[%u]", $si_prev);
          &I($fixups, "mov SI[%u] = SI[%u]", $si_now, $si_prev);
          &I($fixups, "br.sptk.many .rc4Resume%u\n", $label);
          &I($fixups, ";;");
      }
  }
  # The fixed part of the generated assembly is built from consecutive
  # here-documents; their concatenation is the text emitted before the
  # unrolled loop.  Here-document contents are emitted verbatim (after
  # variable interpolation), so they are deliberately left unindented.

  # Part 1: identification strings, symbolic names for the registers and
  # predicates used by RC4, and the register-frame sizing constants.
  $code = <<___;
.ident \"rc4-ia64.s, version 3.0\"
.ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\"
#define LCSave r8
#define PRSave r9
/* Inputs become invalid once rotation begins! */
#define StateTable in0
#define DataLen in1
#define InputBuffer in2
#define OutputBuffer in3
#define KTable r14
#define J r15
#define InPtr r16
#define OutPtr r17
#define InPrefetch r18
#define OutPrefetch r19
#define One r20
#define LoopCount r21
#define Remainder r22
#define IFinal r23
#define EndPtr r24
#define tmp0 r25
#define tmp1 r26
#define pBypass p6
#define pDone p7
#define pSmall p8
#define pAligned p9
#define pUnaligned p10
#define pComputeI pPhase[0]
#define pComputeJ pPhase[1]
#define pComputeT pPhase[2]
#define pOutput pPhase[3]
#define RetVal r8
#define L_OK p7
#define L_NOK p8
#define _NINPUTS 4
#define _NOUTPUT 0
#define _NROTATE 24
#define _NLOCALS (_NROTATE - _NINPUTS - _NOUTPUT)
___

  # Part 2: macros that depend on the state-table element size SZ
  # (default 4), the pointer-add macro ADDP and the BYTE_POS() macro.
  $code .= <<___;
#ifndef SZ
# define SZ 4 // this must be set to sizeof(RC4_INT)
#endif
#if SZ == 1
# define LKEY ld1
# define SKEY st1
# define KEYADDR(dst, i) add dst = i, KTable
#elif SZ == 2
# define LKEY ld2
# define SKEY st2
# define KEYADDR(dst, i) shladd dst = i, 1, KTable
#elif SZ == 4
# define LKEY ld4
# define SKEY st4
# define KEYADDR(dst, i) shladd dst = i, 2, KTable
#else
# define LKEY ld8
# define SKEY st8
# define KEYADDR(dst, i) shladd dst = i, 3, KTable
#endif
#if defined(_HPUX_SOURCE) && !defined(_LP64)
# define ADDP addp4
#else
# define ADDP add
#endif
/* Define a macro for the bit number of the n-th byte: */
#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
# define HOST_IS_BIG_ENDIAN
# define BYTE_POS(n) (56 - (8 * (n)))
#else
# define BYTE_POS(n) (8 * (n))
#endif
___

  # Part 3: the compact modulo-scheduled loop (one byte per iteration,
  # closed with br.ctop), split into a prologue macro and a loop macro.
  $code .= <<___;
/*
We must perform the first phase of the pipeline explicitly since
we will always load from the stable the first time. The br.cexit
will never be taken since regardless of the number of bytes because
the epilogue count is 4.
*/
/* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX
assembler failed on original macro with syntax error. <appro> */
#define MODSCHED_RC4_PROLOGUE \\
{ \\
ld1 Data[0] = [InPtr], 1; \\
add IFinal = 1, I[1]; \\
KEYADDR(IPr[0], I[1]); \\
} ;; \\
{ \\
LKEY SI[0] = [IPr[0]]; \\
mov pr.rot = 0x10000; \\
mov ar.ec = 4; \\
} ;; \\
{ \\
add J = J, SI[0]; \\
zxt1 I[0] = IFinal; \\
br.cexit.spnt.few .+16; /* never taken */ \\
} ;;
#define MODSCHED_RC4_LOOP(label) \\
label: \\
{ .mmi; \\
(pComputeI) ld1 Data[0] = [InPtr], 1; \\
(pComputeI) add IFinal = 1, I[1]; \\
(pComputeJ) zxt1 J = J; \\
}{ .mmi; \\
(pOutput) LKEY T[1] = [T[1]]; \\
(pComputeT) add T[0] = SI[2], SJ[1]; \\
(pComputeI) KEYADDR(IPr[0], I[1]); \\
} ;; \\
{ .mmi; \\
(pComputeT) SKEY [IPr[2]] = SJ[1]; \\
(pComputeT) SKEY [JP[1]] = SI[2]; \\
(pComputeT) zxt1 T[0] = T[0]; \\
}{ .mmi; \\
(pComputeI) LKEY SI[0] = [IPr[0]]; \\
(pComputeJ) KEYADDR(JP[0], J); \\
(pComputeI) cmp.eq.unc pBypass, p0 = I[1], J; \\
} ;; \\
{ .mmi; \\
(pComputeJ) LKEY SJ[0] = [JP[0]]; \\
(pOutput) xor Data[3] = Data[3], T[1]; \\
nop 0x0; \\
}{ .mmi; \\
(pComputeT) KEYADDR(T[0], T[0]); \\
(pBypass) mov SI[0] = SI[1]; \\
(pComputeI) zxt1 I[0] = IFinal; \\
} ;; \\
{ .mmb; \\
(pOutput) st1 [OutPtr] = Data[3], 1; \\
(pComputeI) add J = J, SI[0]; \\
br.ctop.sptk.few label; \\
} ;;
___

  # Part 4: entry point of RC4.  Allocates the register frame, copies the
  # arguments into working registers, and returns with RetVal = 0 when
  # DataLen is not positive or any of the three pointers is zero.
  $code .= <<___;
.text
.align 32
.type RC4, \@function
.global RC4
.proc RC4
.prologue
RC4:
{
.mmi
alloc r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
.rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\
OutWord[2]
.rotp pPhase[4]
ADDP InPrefetch = 0, InputBuffer
ADDP KTable = 0, StateTable
}
{
.mmi
ADDP InPtr = 0, InputBuffer
ADDP OutPtr = 0, OutputBuffer
mov RetVal = r0
}
;;
{
.mmi
lfetch.nt1 [InPrefetch], 0x80
ADDP OutPrefetch = 0, OutputBuffer
}
{ // Return 0 if the input length is nonsensical
.mib
ADDP StateTable = 0, StateTable
cmp.ge.unc L_NOK, L_OK = r0, DataLen
(L_NOK) br.ret.sptk.few rp
}
;;
{
.mib
cmp.eq.or L_NOK, L_OK = r0, InPtr
cmp.eq.or L_NOK, L_OK = r0, OutPtr
nop 0x0
}
{
.mib
cmp.eq.or L_NOK, L_OK = r0, StateTable
nop 0x0
(L_NOK) br.ret.sptk.few rp
}
;;
___

  # Part 5: loads I and J from the first two state-table elements,
  # prefetches the table and the buffers, saves pr and ar.lc, and
  # branches to .rc4Remainder when DataLen is below $threshold.
  $code .= <<___;
LKEY I[1] = [KTable], SZ
/* Prefetch the state-table. It contains 256 elements of size SZ */
#if SZ == 1
ADDP tmp0 = 1*128, StateTable
#elif SZ == 2
ADDP tmp0 = 3*128, StateTable
ADDP tmp1 = 2*128, StateTable
#elif SZ == 4
ADDP tmp0 = 7*128, StateTable
ADDP tmp1 = 6*128, StateTable
#elif SZ == 8
ADDP tmp0 = 15*128, StateTable
ADDP tmp1 = 14*128, StateTable
#endif
;;
#if SZ >= 8
lfetch.fault.nt1 [tmp0], -256 // 15
lfetch.fault.nt1 [tmp1], -256;;
lfetch.fault.nt1 [tmp0], -256 // 13
lfetch.fault.nt1 [tmp1], -256;;
lfetch.fault.nt1 [tmp0], -256 // 11
lfetch.fault.nt1 [tmp1], -256;;
lfetch.fault.nt1 [tmp0], -256 // 9
lfetch.fault.nt1 [tmp1], -256;;
#endif
#if SZ >= 4
lfetch.fault.nt1 [tmp0], -256 // 7
lfetch.fault.nt1 [tmp1], -256;;
lfetch.fault.nt1 [tmp0], -256 // 5
lfetch.fault.nt1 [tmp1], -256;;
#endif
#if SZ >= 2
lfetch.fault.nt1 [tmp0], -256 // 3
lfetch.fault.nt1 [tmp1], -256;;
#endif
{
.mii
lfetch.fault.nt1 [tmp0] // 1
add I[1]=1,I[1];;
zxt1 I[1]=I[1]
}
{
.mmi
lfetch.nt1 [InPrefetch], 0x80
lfetch.excl.nt1 [OutPrefetch], 0x80
.save pr, PRSave
mov PRSave = pr
} ;;
{
.mmi
lfetch.excl.nt1 [OutPrefetch], 0x80
LKEY J = [KTable], SZ
ADDP EndPtr = DataLen, InPtr
} ;;
{
.mmi
ADDP EndPtr = -1, EndPtr // Make it point to
// last data byte.
mov One = 1
.save ar.lc, LCSave
mov LCSave = ar.lc
.body
} ;;
{
.mmb
sub Remainder = 0, OutPtr
cmp.gtu pSmall, p0 = $threshold, DataLen
(pSmall) br.cond.dpnt .rc4Remainder // Data too small for
// big loop.
} ;;
___

  # Part 6: when the output pointer is not 8-byte aligned, run the
  # modulo-scheduled loop over the bytes up to the next 8-byte boundary;
  # otherwise branch straight to .rc4Aligned.
  $code .= <<___;
{
.mmi
and Remainder = 0x7, Remainder
;;
cmp.eq pAligned, pUnaligned = Remainder, r0
nop 0x0
} ;;
{
.mmb
.pred.rel "mutex",pUnaligned,pAligned
(pUnaligned) add Remainder = -1, Remainder
(pAligned) sub Remainder = EndPtr, InPtr
(pAligned) br.cond.dptk.many .rc4Aligned
} ;;
{
.mmi
nop 0x0
nop 0x0
mov.i ar.lc = Remainder
}
/* Do the initial few bytes via the compact, modulo-scheduled loop
until the output pointer is 8-byte-aligned. */
MODSCHED_RC4_PROLOGUE
MODSCHED_RC4_LOOP(.RC4AlignLoop)
{
.mib
sub Remainder = EndPtr, InPtr
zxt1 IFinal = IFinal
clrrrb // Clear CFM.rrb.pr so
;; // next "mov pr.rot = N"
// does the right thing.
}
{
.mmi
mov I[1] = IFinal
nop 0x0
nop 0x0
} ;;
___

  # Part 7: compute the trip count of the unrolled loop and load ar.lc.
  # NOTE(review): the multiply by 0xaaaaaaaaaaaaaaab followed by the
  # 4-bit right shift looks like a division by 24, i.e. it appears to
  # assume $unroll_count * $phases == 24 -- confirm before changing
  # either value.
  $code .= <<___;
.rc4Aligned:
/*
Unrolled loop count = (Remainder - ($unroll_count+1)*$phases)/($unroll_count*$phases)
*/
{
.mlx
add LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder
movl Remainder = 0xaaaaaaaaaaaaaaab
} ;;
{
.mmi
setf.sig f6 = LoopCount // M2, M3 6 cyc
setf.sig f7 = Remainder // M2, M3 6 cyc
nop 0x0
} ;;
{
.mfb
nop 0x0
xmpy.hu f6 = f6, f7
nop 0x0
} ;;
{
.mmi
getf.sig LoopCount = f6;; // M2 5 cyc
nop 0x0
shr.u LoopCount = LoopCount, 4
} ;;
{
.mmi
nop 0x0
nop 0x0
mov.i ar.lc = LoopCount
} ;;
/* Now comes the unrolled loop: */
.rc4Prologue:
___
  # Prologue: fill the pipeline.  Each iteration enables one more stage,
  # so emit_body() sees the masks 0x1, 0x3, 0x7 and 0xf in turn.
  $iteration  = 0;
  $predicates = 1;
  foreach my $stage (1 .. $phases) {
      &emit_body(\$code, \$bypass, $iteration++, $predicates);
      $predicates = ($predicates << 1) | 1;
  }

  # Steady state: $unroll_count * $phases iterations with every stage
  # active.
  $code .= ".rc4Loop:\n";
  foreach my $n (1 .. $unroll_count * $phases) {
      &emit_body(\$code, \$bypass, $iteration++, $predicates);
  }

  # Epilogue: drain the pipeline.  Shifting the mask left clears the
  # low stage bits one at a time before each call.
  $code .= ".rc4Epilogue:\n";
  foreach my $stage (1 .. $phases) {
      $predicates <<= 1;
      &emit_body(\$code, \$bypass, $iteration++, $predicates);
  }
  # After the unrolled loop: remember the current I, then process any
  # bytes that are left with the compact modulo-scheduled loop.  Short
  # inputs enter directly at .rc4Remainder.  Here-document contents are
  # emitted verbatim, so they are deliberately left unindented.
  $code .= <<___;
{
.mmi
lfetch.nt1 [EndPtr] // fetch line with last byte
mov IFinal = I[1]
nop 0x0
}
.rc4Remainder:
{
.mmi
sub Remainder = EndPtr, InPtr // Calculate
// # of bytes
// left - 1
nop 0x0
nop 0x0
} ;;
{
.mib
cmp.eq pDone, p0 = -1, Remainder // done already?
mov.i ar.lc = Remainder
(pDone) br.cond.dptk.few .rc4Complete
}
/* Do the remaining bytes via the compact, modulo-scheduled loop */
MODSCHED_RC4_PROLOGUE
MODSCHED_RC4_LOOP(.RC4RestLoop)
___

  # Completion: store J and the adjusted I back into the first two
  # state-table elements, restore ar.lc and the predicates, and return
  # with RetVal = 1.
  $code .= <<___;
.rc4Complete:
{
.mmi
add KTable = -SZ, KTable
add IFinal = -1, IFinal
mov ar.lc = LCSave
} ;;
{
.mii
SKEY [KTable] = J,-SZ
zxt1 IFinal = IFinal
mov pr = PRSave, 0x1FFFF
} ;;
{
.mib
SKEY [KTable] = IFinal
add RetVal = 1, r0
br.ret.sptk.few rp
} ;;
___

  # Last but not least, emit the out-of-line bypass code collected while
  # generating the unrolled loop, close the procedure and print the file.
  $code .= $bypass;
  $code .= ".endp RC4\n";

  print $code;