md5-ia64.S 21 KB


  1. /* Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
  2. Permission is hereby granted, free of charge, to any person obtaining
  3. a copy of this software and associated documentation files (the
  4. "Software"), to deal in the Software without restriction, including
  5. without limitation the rights to use, copy, modify, merge, publish,
  6. distribute, sublicense, and/or sell copies of the Software, and to
  7. permit persons to whom the Software is furnished to do so, subject to
  8. the following conditions:
  9. The above copyright notice and this permission notice shall be
  10. included in all copies or substantial portions of the Software.
  11. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  12. EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  13. MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  14. NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  15. LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  16. OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  17. WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
  18. // Common registers are assigned as follows:
  19. //
  20. // COMMON
  21. //
  22. // t0 Const Tbl Ptr TPtr
  23. // t1 Round Constant TRound
  24. // t4 Block residual LenResid
  25. // t5 Residual Data DTmp
  26. //
  27. // {in,out}0 Block 0 Cycle RotateM0
  28. // {in,out}1 Block Value 12 M12
  29. // {in,out}2 Block Value 8 M8
  30. // {in,out}3 Block Value 4 M4
  31. // {in,out}4 Block Value 0 M0
  32. // {in,out}5 Block 1 Cycle RotateM1
  33. // {in,out}6 Block Value 13 M13
  34. // {in,out}7 Block Value 9 M9
  35. // {in,out}8 Block Value 5 M5
  36. // {in,out}9 Block Value 1 M1
  37. // {in,out}10 Block 2 Cycle RotateM2
  38. // {in,out}11 Block Value 14 M14
  39. // {in,out}12 Block Value 10 M10
  40. // {in,out}13 Block Value 6 M6
  41. // {in,out}14 Block Value 2 M2
  42. // {in,out}15 Block 3 Cycle RotateM3
  43. // {in,out}16 Block Value 15 M15
  44. // {in,out}17 Block Value 11 M11
  45. // {in,out}18 Block Value 7 M7
  46. // {in,out}19 Block Value 3 M3
  47. // {in,out}20 Scratch Z
  48. // {in,out}21 Scratch Y
  49. // {in,out}22 Scratch X
  50. // {in,out}23 Scratch W
  51. // {in,out}24 Digest A A
  52. // {in,out}25 Digest B B
  53. // {in,out}26 Digest C C
  54. // {in,out}27 Digest D D
  55. // {in,out}28 Active Data Ptr DPtr
  56. // in28 Dummy Value -
  57. // out28 Dummy Value -
  58. // bt0 Coroutine Link QUICK_RTN
  59. //
  60. /// These predicates are used for computing the padding block(s) and
  61. /// are shared between the driver and digest co-routines
  62. //
  63. // pt0 Extra Pad Block pExtra
  64. // pt1 Load next word pLoad
  65. // pt2 Skip next word pSkip
  66. // pt3 Search for Pad pNoPad
  67. // pt4 Pad Word 0 pPad0
  68. // pt5 Pad Word 1 pPad1
  69. // pt6 Pad Word 2 pPad2
  70. // pt7 Pad Word 3 pPad3
  71. #define DTmp r19 /* unaligned paths: leftover high bytes of the previously loaded word */
  72. #define LenResid r18 /* not referenced by the code visible in this file */
  73. #define QUICK_RTN b6 /* return link for the digest helpers (br.call / br.ret) */
  74. #define TPtr r14 /* walks the round-constant table, 4 bytes per step */
  75. #define TRound r15 /* round constant for the current step */
  76. #define pExtra p6 /* not referenced by the visible code; p6 is also pMore */
  77. #define pLoad p7 /* named only in the .pred.rel mutex annotation */
  78. #define pNoPad p9 /* not referenced by the visible code */
  79. #define pPad0 p10 /* not referenced by the visible code */
  80. #define pPad1 p11 /* not referenced by the visible code */
  81. #define pPad2 p12 /* not referenced by the visible code */
  82. #define pPad3 p13 /* not referenced by the visible code */
  83. #define pSkip p8 /* named only in the .pred.rel mutex annotation */
  84. #define A_ out24 /* names ending in "_" are the driver-side (outN) view of a shared register */
  85. #define B_ out25
  86. #define C_ out26
  87. #define D_ out27
  88. #define DPtr_ out28
  89. #define M0_ out4
  90. #define M1_ out9
  91. #define M10_ out12
  92. #define M11_ out17
  93. #define M12_ out1
  94. #define M13_ out6
  95. #define M14_ out11
  96. #define M15_ out16
  97. #define M2_ out14
  98. #define M3_ out19
  99. #define M4_ out3
  100. #define M5_ out8
  101. #define M6_ out13
  102. #define M7_ out18
  103. #define M8_ out2
  104. #define M9_ out7
  105. #define RotateM0_ out0
  106. #define RotateM1_ out5
  107. #define RotateM2_ out10
  108. #define RotateM3_ out15
  109. #define W_ out23
  110. #define X_ out22
  111. #define Y_ out21
  112. #define Z_ out20
  113. #define A in24 /* unsuffixed names are the helper-side (inN) view, same N as the "_" name */
  114. #define B in25
  115. #define C in26
  116. #define D in27
  117. #define DPtr in28
  118. #define M0 in4
  119. #define M1 in9
  120. #define M10 in12
  121. #define M11 in17
  122. #define M12 in1
  123. #define M13 in6
  124. #define M14 in11
  125. #define M15 in16
  126. #define M2 in14
  127. #define M3 in19
  128. #define M4 in3
  129. #define M5 in8
  130. #define M6 in13
  131. #define M7 in18
  132. #define M8 in2
  133. #define M9 in7
  134. #define RotateM0 in0
  135. #define RotateM1 in5
  136. #define RotateM2 in10
  137. #define RotateM3 in15
  138. #define W in23
  139. #define X in22
  140. #define Y in21
  141. #define Z in20
  142. /* register stack configuration for md5_block_asm_data_order(): */
  143. #define MD5_NINP 3 /* c, data, num */
  144. #define MD5_NLOC 0
  145. #define MD5_NOUT 29 /* out0..out28, received by the helpers as in0..in28 */
  146. #define MD5_NROT 0
  147. /* register stack configuration for helpers: */
  148. #define _NINPUTS MD5_NOUT
  149. #define _NLOCALS 0
  150. #define _NOUTPUT 0
  151. #define _NROTATE 24 /* in0..in23 rotate; A..D and DPtr (in24..in28) do not. This must be <= _NINPUTS */
  152. #if defined(_HPUX_SOURCE) && !defined(_LP64)
  153. #define ADDP addp4 /* HP-UX without _LP64: addp4 for pointer arithmetic (presumably 32-bit pointers) */
  154. #else
  155. #define ADDP add
  156. #endif
  157. #if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
  158. #define HOST_IS_BIG_ENDIAN
  159. #endif
  160. // GETLW: low 8*align bits of src moved up to end at bit 31 (rest zero); GETRW: the other 32-8*align bits of src moved down to bit 0
  161. #define GETLW(dst, src, align) dep.z dst = src, 32 - 8 * align, 8 * align
  162. #define GETRW(dst, src, align) extr.u dst = src, 8 * align, 32 - 8 * align
  163. // MD5 driver
  164. //
  165. // Reads an input block, then calls the digest block
  166. // subroutine and adds the results to the accumulated
  167. // digest. It allocates MD5_NOUT (29) outs which the subroutine
  168. // uses as its inputs and rotating
  169. // registers. Initializes the round constant pointer and
  170. // takes care of saving/restoring ar.lc
  171. //
  172. /// INPUT
  173. //
  174. // in0 Context Ptr CtxPtr0
  175. // in1 Input Data Ptr DPtrIn
  176. // in2 Integral Blocks BlockCount
  177. // rp Return Address -
  178. //
  179. /// CODE
  180. //
  181. // v2 Input Align InAlign
  182. // t0 Shared w/digest -
  183. // t1 Shared w/digest -
  184. // t2 Shared w/digest -
  185. // t3 Shared w/digest -
  186. // t4 Shared w/digest -
  187. // t5 Shared w/digest -
  188. // t6 PFS Save PFSSave
  189. // t7 ar.lc Save LCSave
  190. // t8 Saved PR PRSave
  191. // t9 2nd CtxPtr CtxPtr1
  192. // t10 Table Base CTable
  193. // t11 Table[0] CTable0
  194. // t13 Accumulator A AccumA
  195. // t14 Accumulator B AccumB
  196. // t15 Accumulator C AccumC
  197. // t16 Accumulator D AccumD
  198. // pt0 Shared w/digest -
  199. // pt1 Shared w/digest -
  200. // pt2 Shared w/digest -
  201. // pt3 Shared w/digest -
  202. // pt4 Shared w/digest -
  203. // pt5 Shared w/digest -
  204. // pt6 Shared w/digest -
  205. // pt7 Shared w/digest -
  206. // pt8 Not Aligned pOff
  207. // pt8 Blocks Left pAgain
  208. #define AccumA r27 /* running digest word A (B, C, D follow in r28..r30) */
  209. #define AccumB r28
  210. #define AccumC r29
  211. #define AccumD r30
  212. #define CTable r24 /* table address; points at the second entry once CTable0 is loaded */
  213. #define CTable0 r25 /* first table entry, copied into TRound at the start of every block */
  214. #define CtxPtr0 in0
  215. #define CtxPtr1 r23
  216. #define DPtrIn in1
  217. #define BlockCount in2
  218. #define InAlign r10 /* data pointer & 3; r10 is reused as U by the G/H/I code */
  219. #define LCSave r21
  220. #define PFSSave r20
  221. #define PRSave r22
  222. #define pAgain p63 /* more blocks to do; same predicate as pOff */
  223. #define pOff p63 /* used while dispatching on InAlign, before any block loop runs */
  224. .text
  225. /* md5_block_asm_data_order(MD5_CTX *c, const void *data, size_t num)
  226. where:
  227. c: a pointer to a structure of this type:
  228. typedef struct MD5state_st
  229. {
  230. MD5_LONG A,B,C,D;
  231. MD5_LONG Nl,Nh;
  232. MD5_LONG data[MD5_LBLOCK];
  233. unsigned int num;
  234. }
  235. MD5_CTX;
  236. data: a pointer to the input data (may be misaligned)
  237. num: the number of 64-byte blocks to hash (i.e., the length
  238. of DATA is 64*NUM).
  239. */
  240. .type md5_block_asm_data_order, @function
  241. .global md5_block_asm_data_order
  242. .align 32
  243. .proc md5_block_asm_data_order
  244. md5_block_asm_data_order: // driver: loads each block, calls a digest helper, adds the result into A..D
  245. .md5_block:
  246. .prologue
  247. { .mmi
  248. .save ar.pfs, PFSSave
  249. alloc PFSSave = ar.pfs, MD5_NINP, MD5_NLOC, MD5_NOUT, MD5_NROT // 3 ins, no locals, 29 outs (out0..out28)
  250. ADDP CtxPtr1 = 8, CtxPtr0 // CtxPtr1 -> c->C (c + 8)
  251. mov CTable = ip // address of this bundle, i.e. of .md5_block
  252. }
  253. { .mmi
  254. ADDP DPtrIn = 0, DPtrIn // adds 0: only has an effect when ADDP is addp4
  255. ADDP CtxPtr0 = 0, CtxPtr0
  256. .save ar.lc, LCSave
  257. mov LCSave = ar.lc // the helpers overwrite ar.lc for their br.ctop loops
  258. }
  259. ;;
  260. { .mmi
  261. add CTable = .md5_tbl_data_order#-.md5_block#, CTable // CTable -> round-constant table, located relative to ip
  262. and InAlign = 0x3, DPtrIn // byte offset of the data within its 32-bit word
  263. }
  264. { .mmi
  265. ld4 AccumA = [CtxPtr0], 4 // load c->A; CtxPtr0 now -> c->B
  266. ld4 AccumC = [CtxPtr1], 4 // load c->C; CtxPtr1 now -> c->D
  267. .save pr, PRSave
  268. mov PRSave = pr
  269. .body
  270. }
  271. ;;
  272. { .mmi
  273. ld4 AccumB = [CtxPtr0]
  274. ld4 AccumD = [CtxPtr1]
  275. dep DPtr_ = 0, DPtrIn, 0, 2 // DPtr_ = DPtrIn rounded down to a 4-byte boundary
  276. } ;;
  277. #ifdef HOST_IS_BIG_ENDIAN
  278. rum psr.be;; // switch to little-endian: the context words were loaded before this point, the table and message words are read after it
  279. #endif
  280. { .mmb
  281. ld4 CTable0 = [CTable], 4 // CTable0 = first constant; CTable now -> second constant
  282. cmp.ne pOff, p0 = 0, InAlign
  283. (pOff) br.cond.spnt.many .md5_unaligned
  284. } ;;
  285. // The FF load/compute loop rotates values three times, so that
  286. // loading into M12 here produces the M0 value, M13 -> M1, etc.
  287. .md5_block_loop0:
  288. { .mmi
  289. ld4 M12_ = [DPtr_], 4
  290. mov TPtr = CTable // restart the constant walk for this block
  291. mov TRound = CTable0
  292. } ;;
  293. { .mmi
  294. ld4 M13_ = [DPtr_], 4
  295. mov A_ = AccumA // hand the current digest to the helper in out24..out27
  296. mov B_ = AccumB
  297. } ;;
  298. { .mmi
  299. ld4 M14_ = [DPtr_], 4
  300. mov C_ = AccumC
  301. mov D_ = AccumD
  302. } ;;
  303. { .mmb
  304. ld4 M15_ = [DPtr_], 4
  305. add BlockCount = -1, BlockCount // decremented before it is tested: one block is always processed, a count of 0 is not handled
  306. br.call.sptk.many QUICK_RTN = md5_digest_block0 // return link goes to b6; rp is left untouched
  307. } ;;
  308. // Now, we add the new digest values and do some clean-up
  309. // before checking if there's another full block to process
  310. { .mmi
  311. add AccumA = AccumA, A_ // A_..D_ sit outside the helper's 24 rotating registers
  312. add AccumB = AccumB, B_
  313. cmp.ne pAgain, p0 = 0, BlockCount
  314. }
  315. { .mib
  316. add AccumC = AccumC, C_
  317. add AccumD = AccumD, D_
  318. (pAgain) br.cond.dptk.many .md5_block_loop0
  319. } ;;
  320. .md5_exit:
  321. #ifdef HOST_IS_BIG_ENDIAN
  322. sum psr.be;; // switch back to big-endian mode (before the context is stored)
  323. #endif
  324. { .mmi
  325. st4 [CtxPtr0] = AccumB, -4 // CtxPtr0 was left at c->B; step back to c->A
  326. st4 [CtxPtr1] = AccumD, -4 // CtxPtr1 was left at c->D; step back to c->C
  327. mov pr = PRSave, 0x1ffff ;;
  328. }
  329. { .mmi
  330. st4 [CtxPtr0] = AccumA
  331. st4 [CtxPtr1] = AccumC
  332. mov ar.lc = LCSave
  333. } ;;
  334. { .mib
  335. mov ar.pfs = PFSSave
  336. br.ret.sptk.few rp
  337. } ;;
  338. #define MD5UNALIGNED(offset) /* driver loop for data starting offset bytes into a word: builds the first message word from DTmp and the next aligned load, then calls md5_digest_block<offset> */ \
  339. .md5_process##offset: \
  340. { .mib ; \
  341. nop 0x0 ; \
  342. GETRW(DTmp, DTmp, offset) ; \
  343. } ;; \
  344. .md5_block_loop##offset: \
  345. { .mmi ; \
  346. ld4 Y_ = [DPtr_], 4 ; \
  347. mov TPtr = CTable ; \
  348. mov TRound = CTable0 ; \
  349. } ;; \
  350. { .mmi ; \
  351. ld4 M13_ = [DPtr_], 4 ; \
  352. mov A_ = AccumA ; \
  353. mov B_ = AccumB ; \
  354. } ;; \
  355. { .mii ; \
  356. ld4 M14_ = [DPtr_], 4 ; \
  357. GETLW(W_, Y_, offset) ; \
  358. mov C_ = AccumC ; \
  359. } \
  360. { .mmi ; \
  361. mov D_ = AccumD ;; \
  362. or M12_ = W_, DTmp ; \
  363. GETRW(DTmp, Y_, offset) ; \
  364. } \
  365. { .mib ; \
  366. ld4 M15_ = [DPtr_], 4 ; \
  367. add BlockCount = -1, BlockCount ; \
  368. br.call.sptk.many QUICK_RTN = md5_digest_block##offset; \
  369. } ;; \
  370. { .mmi ; \
  371. add AccumA = AccumA, A_ ; \
  372. add AccumB = AccumB, B_ ; \
  373. cmp.ne pAgain, p0 = 0, BlockCount ; \
  374. } \
  375. { .mib ; \
  376. add AccumC = AccumC, C_ ; \
  377. add AccumD = AccumD, D_ ; \
  378. (pAgain) br.cond.dptk.many .md5_block_loop##offset ; \
  379. } ;; \
  380. { .mib ; \
  381. nop 0x0 ; \
  382. nop 0x0 ; \
  383. br.cond.sptk.many .md5_exit ; \
  384. } ;;
  385. .align 32
  386. .md5_unaligned:
  387. //
  388. // Because variable shifts are expensive, we special case each of
  389. // the four alignments. In practice, this won't hurt too much
  390. // since only one working set of code will be loaded.
  391. //
  392. { .mib
  393. ld4 DTmp = [DPtr_], 4 // first aligned word; only its upper bytes belong to the message
  394. cmp.eq pOff, p0 = 1, InAlign
  395. (pOff) br.cond.dpnt.many .md5_process1
  396. } ;;
  397. { .mib
  398. cmp.eq pOff, p0 = 2, InAlign
  399. nop 0x0
  400. (pOff) br.cond.dpnt.many .md5_process2 // otherwise InAlign is 3: fall through into the MD5UNALIGNED(3) code
  401. } ;;
  402. MD5UNALIGNED(3)
  403. MD5UNALIGNED(1)
  404. MD5UNALIGNED(2)
  405. .endp md5_block_asm_data_order
  406. // MD5 Perform the F function and load
  407. //
  408. // Passed the first 4 words (M0 - M3) and initial (A, B, C, D) values,
  409. // computes the FF() round of functions, then branches to the common
  410. // digest code to finish up with GG(), HH, and II().
  411. //
  412. // INPUT
  413. //
  414. // rp Return Address -
  415. //
  416. // CODE
  417. //
  418. // v0 PFS bit bucket PFS
  419. // v1 Loop Trip Count LTrip
  420. // pt0 Load next word pMore
  421. /* For F round: */
  422. #define LTrip r9 /* countdown that decides when pMore is cleared */
  423. #define PFS r8 /* alloc destination; not read anywhere in the visible code */
  424. #define pMore p6 /* true while message words remain to be loaded */
  425. /* For GHI rounds: */
  426. #define T r9 /* shuffle temporary; same register as LTrip */
  427. #define U r10 /* shuffle temporary; same register as the driver's InAlign */
  428. #define V r11 /* shuffle temporary */
  429. #define COMPUTE(a, b, s, M, R) /* finish one step: a = b + rotl32(Z, s) in the low 32 bits; fetch the next TRound; copy M into R */ \
  430. { \
  431. .mii ; \
  432. ld4 TRound = [TPtr], 4 ; \
  433. dep.z Y = Z, 32, 32 ;; \
  434. shrp Z = Z, Y, 64 - s ; \
  435. } ;; \
  436. { \
  437. .mmi ; \
  438. add a = Z, b ; \
  439. mov R = M ; \
  440. nop 0x0 ; \
  441. } ;;
  442. #define LOOP(a, b, s, M, R, label) /* same work as COMPUTE, then br.ctop back to label */ \
  443. { .mii ; \
  444. ld4 TRound = [TPtr], 4 ; \
  445. dep.z Y = Z, 32, 32 ;; \
  446. shrp Z = Z, Y, 64 - s ; \
  447. } ;; \
  448. { .mib ; \
  449. add a = Z, b ; \
  450. mov R = M ; \
  451. br.ctop.sptk.many label ; \
  452. } ;;
  453. // G(B, C, D) = (B & D) | (C & ~D); the macro leaves Z = a + M + TRound + G(b, c, d)
  454. #define G(a, b, c, d, M) \
  455. { .mmi ; \
  456. add Z = M, TRound ; \
  457. and Y = b, d ; \
  458. andcm X = c, d ; \
  459. } ;; \
  460. { .mii ; \
  461. add Z = Z, a ; \
  462. or Y = Y, X ;; \
  463. add Z = Z, Y ; \
  464. } ;;
  465. // H(B, C, D) = B ^ C ^ D; the macro leaves Z = a + M + TRound + H(b, c, d)
  466. #define H(a, b, c, d, M) \
  467. { .mmi ; \
  468. add Z = M, TRound ; \
  469. xor Y = b, c ; \
  470. nop 0x0 ; \
  471. } ;; \
  472. { .mii ; \
  473. add Z = Z, a ; \
  474. xor Y = Y, d ;; \
  475. add Z = Z, Y ; \
  476. } ;;
  477. // I(B, C, D) = C ^ (B | ~D)
  478. //
  479. // However, since we have an andcm operator, we use the fact that
  480. //
  481. // Y ^ Z == ~Y ^ ~Z
  482. //
  483. // to rewrite the expression as
  484. //
  485. // I(B, C, D) = ~C ^ (~B & D); the macro leaves Z = a + M + TRound + I(b, c, d)
  486. #define I(a, b, c, d, M) \
  487. { .mmi ; \
  488. add Z = M, TRound ; \
  489. andcm Y = d, b ; \
  490. andcm X = -1, c ; \
  491. } ;; \
  492. { .mii ; \
  493. add Z = Z, a ; \
  494. xor Y = Y, X ;; \
  495. add Z = Z, Y ; \
  496. } ;;
  497. #define GG4(label) /* four G steps (rotate counts 5, 9, 14, 20); the last one loops back to label via br.ctop */ \
  498. G(A, B, C, D, M0) \
  499. COMPUTE(A, B, 5, M0, RotateM0) \
  500. G(D, A, B, C, M1) \
  501. COMPUTE(D, A, 9, M1, RotateM1) \
  502. G(C, D, A, B, M2) \
  503. COMPUTE(C, D, 14, M2, RotateM2) \
  504. G(B, C, D, A, M3) \
  505. LOOP(B, C, 20, M3, RotateM3, label)
  506. #define HH4(label) /* four H steps (rotate counts 4, 11, 16, 23); the last one loops back to label via br.ctop */ \
  507. H(A, B, C, D, M0) \
  508. COMPUTE(A, B, 4, M0, RotateM0) \
  509. H(D, A, B, C, M1) \
  510. COMPUTE(D, A, 11, M1, RotateM1) \
  511. H(C, D, A, B, M2) \
  512. COMPUTE(C, D, 16, M2, RotateM2) \
  513. H(B, C, D, A, M3) \
  514. LOOP(B, C, 23, M3, RotateM3, label)
  515. #define II4(label) /* four I steps (rotate counts 6, 10, 15, 21); the last one loops back to label via br.ctop */ \
  516. I(A, B, C, D, M0) \
  517. COMPUTE(A, B, 6, M0, RotateM0) \
  518. I(D, A, B, C, M1) \
  519. COMPUTE(D, A, 10, M1, RotateM1) \
  520. I(C, D, A, B, M2) \
  521. COMPUTE(C, D, 15, M2, RotateM2) \
  522. I(B, C, D, A, M3) \
  523. LOOP(B, C, 21, M3, RotateM3, label)
  524. #define FFLOAD(a, b, c, d, M, N, s) /* one F step: a = b + rotl32(a + M + TRound + F, s) with F = (c & b) | (d & ~b); while pMore is set it also loads the next message word into N */ \
  525. { .mii ; \
  526. (pMore) ld4 N = [DPtr], 4 ; \
  527. add Z = M, TRound ; \
  528. and Y = c, b ; \
  529. } \
  530. { .mmi ; \
  531. andcm X = d, b ;; \
  532. add Z = Z, a ; \
  533. or Y = Y, X ; \
  534. } ;; \
  535. { .mii ; \
  536. ld4 TRound = [TPtr], 4 ; \
  537. add Z = Z, Y ;; \
  538. dep.z Y = Z, 32, 32 ; \
  539. } ;; \
  540. { .mii ; \
  541. nop 0x0 ; \
  542. shrp Z = Z, Y, 64 - s ;; \
  543. add a = Z, b ; \
  544. } ;;
  545. #define FFLOOP(a, b, c, d, M, N, s, dest) /* FFLOAD plus the loop tail: pMore = (LTrip != 0), LTrip = LTrip - 1, br.ctop to dest */ \
  546. { .mii ; \
  547. (pMore) ld4 N = [DPtr], 4 ; \
  548. add Z = M, TRound ; \
  549. and Y = c, b ; \
  550. } \
  551. { .mmi ; \
  552. andcm X = d, b ;; \
  553. add Z = Z, a ; \
  554. or Y = Y, X ; \
  555. } ;; \
  556. { .mii ; \
  557. ld4 TRound = [TPtr], 4 ; \
  558. add Z = Z, Y ;; \
  559. dep.z Y = Z, 32, 32 ; \
  560. } ;; \
  561. { .mii ; \
  562. nop 0x0 ; \
  563. shrp Z = Z, Y, 64 - s ;; \
  564. add a = Z, b ; \
  565. } \
  566. { .mib ; \
  567. cmp.ne pMore, p0 = 0, LTrip ; \
  568. add LTrip = -1, LTrip ; \
  569. br.ctop.dptk.many dest ; \
  570. } ;;
  571. .type md5_digest_block0, @function
  572. .align 32
  573. .proc md5_digest_block0
  574. .prologue
  575. md5_digest_block0: // F steps for word-aligned input; falls through into md5_digest_GHI
  576. .altrp QUICK_RTN
  577. .body
  578. { .mmi
  579. alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE // 29 ins, the first 24 of them rotating
  580. mov LTrip = 2 // pMore is cleared at the end of the third pass, so passes 1-3 load the remaining 12 words
  581. mov ar.lc = 3 // br.ctop branches back three times: the four-step body runs four times
  582. } ;;
  583. { .mii
  584. cmp.eq pMore, p0 = r0, r0 // pMore = 1
  585. mov ar.ec = 0
  586. nop 0x0
  587. } ;;
  588. .md5_FF_round0:
  589. FFLOAD(A, B, C, D, M12, RotateM0, 7)
  590. FFLOAD(D, A, B, C, M13, RotateM1, 12)
  591. FFLOAD(C, D, A, B, M14, RotateM2, 17)
  592. FFLOOP(B, C, D, A, M15, RotateM3, 22, .md5_FF_round0)
  593. //
  594. // !!! Fall through to md5_digest_GHI
  595. //
  596. .endp md5_digest_block0
  597. .type md5_digest_GHI, @function
  598. .align 32
  599. .proc md5_digest_GHI
  600. .prologue
  601. .regstk _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
  602. md5_digest_GHI: // G, H and I steps; entered by fall-through or branch from the F-step helpers, returns through QUICK_RTN
  603. .altrp QUICK_RTN
  604. .body
  605. //
  606. // The following sequence shuffles the block constants round for the
  607. // next round:
  608. // (message-word index held in M0..M15: first row before, second row after)
  609. // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  610. // 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12
  611. //
  612. { .mmi
  613. mov Z = M0
  614. mov Y = M15
  615. mov ar.lc = 3 // four passes of GG4 = 16 G steps
  616. }
  617. { .mmi
  618. mov X = M2
  619. mov W = M9
  620. mov V = M4
  621. } ;;
  622. { .mmi
  623. mov M0 = M1
  624. mov M15 = M12
  625. mov ar.ec = 1
  626. }
  627. { .mmi
  628. mov M2 = M11
  629. mov M9 = M14
  630. mov M4 = M5
  631. } ;;
  632. { .mmi
  633. mov M1 = M6
  634. mov M12 = M13
  635. mov U = M3
  636. }
  637. { .mmi
  638. mov M11 = M8
  639. mov M14 = M7
  640. mov M5 = M10
  641. } ;;
  642. { .mmi
  643. mov M6 = Y
  644. mov M13 = X
  645. mov M3 = Z
  646. }
  647. { .mmi
  648. mov M8 = W
  649. mov M7 = V
  650. mov M10 = U
  651. } ;;
  652. .md5_GG_round:
  653. GG4(.md5_GG_round)
  654. // The following sequence shuffles the block constants round for the
  655. // next round:
  656. // (message-word index held in M0..M15: first row before, second row after)
  657. // 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12
  658. // 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
  659. { .mmi
  660. mov Z = M0
  661. mov Y = M1
  662. mov ar.lc = 3 // four passes of HH4 = 16 H steps
  663. }
  664. { .mmi
  665. mov X = M3
  666. mov W = M5
  667. mov V = M6
  668. } ;;
  669. { .mmi
  670. mov M0 = M4
  671. mov M1 = M11
  672. mov ar.ec = 1
  673. }
  674. { .mmi
  675. mov M3 = M9
  676. mov U = M8
  677. mov T = M13
  678. } ;;
  679. { .mmi
  680. mov M4 = Z
  681. mov M11 = Y
  682. mov M5 = M7
  683. }
  684. { .mmi
  685. mov M6 = M14
  686. mov M8 = M12
  687. mov M13 = M15
  688. } ;;
  689. { .mmi
  690. mov M7 = W
  691. mov M14 = V
  692. nop 0x0
  693. }
  694. { .mmi
  695. mov M9 = X
  696. mov M12 = U
  697. mov M15 = T
  698. } ;;
  699. .md5_HH_round:
  700. HH4(.md5_HH_round)
  701. // The following sequence shuffles the block constants round for the
  702. // next round:
  703. // (message-word index held in M0..M15: first row before, second row after)
  704. // 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
  705. // 0 7 14 5 12 3 10 1 8 15 6 13 4 11 2 9
  706. { .mmi
  707. mov Z = M0
  708. mov Y = M15
  709. mov ar.lc = 3 // four passes of II4 = 16 I steps
  710. }
  711. { .mmi
  712. mov X = M10
  713. mov W = M1
  714. mov V = M4
  715. } ;;
  716. { .mmi
  717. mov M0 = M9
  718. mov M15 = M12
  719. mov ar.ec = 1
  720. }
  721. { .mmi
  722. mov M10 = M11
  723. mov M1 = M6
  724. mov M4 = M13
  725. } ;;
  726. { .mmi
  727. mov M9 = M14
  728. mov M12 = M5
  729. mov U = M3
  730. }
  731. { .mmi
  732. mov M11 = M8
  733. mov M6 = M7
  734. mov M13 = M2
  735. } ;;
  736. { .mmi
  737. mov M14 = Y
  738. mov M5 = X
  739. mov M3 = Z
  740. }
  741. { .mmi
  742. mov M8 = W
  743. mov M7 = V
  744. mov M2 = U
  745. } ;;
  746. .md5_II_round:
  747. II4(.md5_II_round)
  748. { .mib
  749. nop 0x0
  750. nop 0x0
  751. br.ret.sptk.many QUICK_RTN // back to the driver, which adds A..D into its accumulators
  752. } ;;
  753. .endp md5_digest_GHI
  754. #define FFLOADU(a, b, c, d, M, P, N, s, offset) /* FFLOAD for unaligned input: also rebuilds P as GETLW(P) | DTmp and leaves GETRW of the raw P in DTmp for the next word */ \
  755. { .mii ; \
  756. (pMore) ld4 N = [DPtr], 4 ; \
  757. add Z = M, TRound ; \
  758. and Y = c, b ; \
  759. } \
  760. { .mmi ; \
  761. andcm X = d, b ;; \
  762. add Z = Z, a ; \
  763. or Y = Y, X ; \
  764. } ;; \
  765. { .mii ; \
  766. ld4 TRound = [TPtr], 4 ; \
  767. GETLW(W, P, offset) ; \
  768. add Z = Z, Y ; \
  769. } ;; \
  770. { .mii ; \
  771. or W = W, DTmp ; \
  772. dep.z Y = Z, 32, 32 ;; \
  773. shrp Z = Z, Y, 64 - s ; \
  774. } ;; \
  775. { .mii ; \
  776. add a = Z, b ; \
  777. GETRW(DTmp, P, offset) ; \
  778. mov P = W ; \
  779. } ;;
  780. #define FFLOOPU(a, b, c, d, M, P, N, s, offset) /* FFLOADU with the P fix-up predicated on pMore, plus the loop tail: pMore = (LTrip != 0), LTrip = LTrip - 1, br.ctop to .md5_FF_round<offset> */ \
  781. { .mii ; \
  782. (pMore) ld4 N = [DPtr], 4 ; \
  783. add Z = M, TRound ; \
  784. and Y = c, b ; \
  785. } \
  786. { .mmi ; \
  787. andcm X = d, b ;; \
  788. add Z = Z, a ; \
  789. or Y = Y, X ; \
  790. } ;; \
  791. { .mii ; \
  792. ld4 TRound = [TPtr], 4 ; \
  793. (pMore) GETLW(W, P, offset) ; \
  794. add Z = Z, Y ; \
  795. } ;; \
  796. { .mii ; \
  797. (pMore) or W = W, DTmp ; \
  798. dep.z Y = Z, 32, 32 ;; \
  799. shrp Z = Z, Y, 64 - s ; \
  800. } ;; \
  801. { .mii ; \
  802. add a = Z, b ; \
  803. (pMore) GETRW(DTmp, P, offset) ; \
  804. (pMore) mov P = W ; \
  805. } \
  806. { .mib ; \
  807. cmp.ne pMore, p0 = 0, LTrip ; \
  808. add LTrip = -1, LTrip ; \
  809. br.ctop.sptk.many .md5_FF_round##offset ; \
  810. } ;;
  811. #define MD5FBLOCK(offset) /* emits md5_digest_block<offset>: the F steps for input starting offset bytes into a word, followed by a branch to md5_digest_GHI */ \
  812. .type md5_digest_block##offset, @function ; \
  813. \
  814. .align 32 ; \
  815. .proc md5_digest_block##offset ; \
  816. .prologue ; \
  817. .altrp QUICK_RTN ; \
  818. .body ; \
  819. md5_digest_block##offset: \
  820. { .mmi ; \
  821. alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE ; \
  822. mov LTrip = 2 ; \
  823. mov ar.lc = 3 ; \
  824. } ;; \
  825. { .mii ; \
  826. cmp.eq pMore, p0 = r0, r0 ; \
  827. mov ar.ec = 0 ; \
  828. nop 0x0 ; \
  829. } ;; \
  830. \
  831. .pred.rel "mutex", pLoad, pSkip ; \
  832. .md5_FF_round##offset: \
  833. FFLOADU(A, B, C, D, M12, M13, RotateM0, 7, offset) \
  834. FFLOADU(D, A, B, C, M13, M14, RotateM1, 12, offset) \
  835. FFLOADU(C, D, A, B, M14, M15, RotateM2, 17, offset) \
  836. FFLOOPU(B, C, D, A, M15, RotateM0, RotateM3, 22, offset) \
  837. \
  838. { .mib ; \
  839. nop 0x0 ; \
  840. nop 0x0 ; \
  841. br.cond.sptk.many md5_digest_GHI ; \
  842. } ;; \
  843. .endp md5_digest_block##offset
  844. MD5FBLOCK(1)
  845. MD5FBLOCK(2)
  846. MD5FBLOCK(3)
  847. .align 64
  848. .type md5_constants, @object
  849. md5_constants:
  850. .md5_tbl_data_order: // To ensure little-endian data
  851. // order, code as bytes.
  852. data1 0x78, 0xa4, 0x6a, 0xd7 // 0 (F steps use entries 0-15)
  853. data1 0x56, 0xb7, 0xc7, 0xe8 // 1
  854. data1 0xdb, 0x70, 0x20, 0x24 // 2
  855. data1 0xee, 0xce, 0xbd, 0xc1 // 3
  856. data1 0xaf, 0x0f, 0x7c, 0xf5 // 4
  857. data1 0x2a, 0xc6, 0x87, 0x47 // 5
  858. data1 0x13, 0x46, 0x30, 0xa8 // 6
  859. data1 0x01, 0x95, 0x46, 0xfd // 7
  860. data1 0xd8, 0x98, 0x80, 0x69 // 8
  861. data1 0xaf, 0xf7, 0x44, 0x8b // 9
  862. data1 0xb1, 0x5b, 0xff, 0xff // 10
  863. data1 0xbe, 0xd7, 0x5c, 0x89 // 11
  864. data1 0x22, 0x11, 0x90, 0x6b // 12
  865. data1 0x93, 0x71, 0x98, 0xfd // 13
  866. data1 0x8e, 0x43, 0x79, 0xa6 // 14
  867. data1 0x21, 0x08, 0xb4, 0x49 // 15
  868. data1 0x62, 0x25, 0x1e, 0xf6 // 16 (G steps use entries 16-31)
  869. data1 0x40, 0xb3, 0x40, 0xc0 // 17
  870. data1 0x51, 0x5a, 0x5e, 0x26 // 18
  871. data1 0xaa, 0xc7, 0xb6, 0xe9 // 19
  872. data1 0x5d, 0x10, 0x2f, 0xd6 // 20
  873. data1 0x53, 0x14, 0x44, 0x02 // 21
  874. data1 0x81, 0xe6, 0xa1, 0xd8 // 22
  875. data1 0xc8, 0xfb, 0xd3, 0xe7 // 23
  876. data1 0xe6, 0xcd, 0xe1, 0x21 // 24
  877. data1 0xd6, 0x07, 0x37, 0xc3 // 25
  878. data1 0x87, 0x0d, 0xd5, 0xf4 // 26
  879. data1 0xed, 0x14, 0x5a, 0x45 // 27
  880. data1 0x05, 0xe9, 0xe3, 0xa9 // 28
  881. data1 0xf8, 0xa3, 0xef, 0xfc // 29
  882. data1 0xd9, 0x02, 0x6f, 0x67 // 30
  883. data1 0x8a, 0x4c, 0x2a, 0x8d // 31
  884. data1 0x42, 0x39, 0xfa, 0xff // 32 (H steps use entries 32-47)
  885. data1 0x81, 0xf6, 0x71, 0x87 // 33
  886. data1 0x22, 0x61, 0x9d, 0x6d // 34
  887. data1 0x0c, 0x38, 0xe5, 0xfd // 35
  888. data1 0x44, 0xea, 0xbe, 0xa4 // 36
  889. data1 0xa9, 0xcf, 0xde, 0x4b // 37
  890. data1 0x60, 0x4b, 0xbb, 0xf6 // 38
  891. data1 0x70, 0xbc, 0xbf, 0xbe // 39
  892. data1 0xc6, 0x7e, 0x9b, 0x28 // 40
  893. data1 0xfa, 0x27, 0xa1, 0xea // 41
  894. data1 0x85, 0x30, 0xef, 0xd4 // 42
  895. data1 0x05, 0x1d, 0x88, 0x04 // 43
  896. data1 0x39, 0xd0, 0xd4, 0xd9 // 44
  897. data1 0xe5, 0x99, 0xdb, 0xe6 // 45
  898. data1 0xf8, 0x7c, 0xa2, 0x1f // 46
  899. data1 0x65, 0x56, 0xac, 0xc4 // 47
  900. data1 0x44, 0x22, 0x29, 0xf4 // 48 (I steps use entries 48-63)
  901. data1 0x97, 0xff, 0x2a, 0x43 // 49
  902. data1 0xa7, 0x23, 0x94, 0xab // 50
  903. data1 0x39, 0xa0, 0x93, 0xfc // 51
  904. data1 0xc3, 0x59, 0x5b, 0x65 // 52
  905. data1 0x92, 0xcc, 0x0c, 0x8f // 53
  906. data1 0x7d, 0xf4, 0xef, 0xff // 54
  907. data1 0xd1, 0x5d, 0x84, 0x85 // 55
  908. data1 0x4f, 0x7e, 0xa8, 0x6f // 56
  909. data1 0xe0, 0xe6, 0x2c, 0xfe // 57
  910. data1 0x14, 0x43, 0x01, 0xa3 // 58
  911. data1 0xa1, 0x11, 0x08, 0x4e // 59
  912. data1 0x82, 0x7e, 0x53, 0xf7 // 60
  913. data1 0x35, 0xf2, 0x3a, 0xbd // 61
  914. data1 0xbb, 0xd2, 0xd7, 0x2a // 62
  915. data1 0x91, 0xd3, 0x86, 0xeb // 63
  916. data1 0x00, 0x00, 0x00, 0x00 // 64: padding only. Every step prefetches the next entry, so the last I step reads this slot; its value is never used
  917. .size md5_constants#,65*4