/* memmove.s */

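/*
 * memmove/memcpy for 32-bit ARM, in Plan 9 assembler syntax.
 *
 * The first argument arrives in R0 and is spilled to to+0(FP) so it
 * can be reloaded as the return value.  If the destination is at or
 * below the source the copy runs forward, otherwise backward, so
 * overlapping regions are copied correctly in either direction.
 * Each direction aligns the destination to a word boundary, moves
 * 32-byte blocks, then single words, then trailing bytes; when the
 * source cannot be word-aligned along with the destination, a
 * shift-and-merge path (_bunaligned/_funaligned) is used instead.
 */
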
TS = 0					/* to start (R0) */
TE = 1					/* to end (R1) */
FROM = 2				/* from (R2) */
N = 3					/* byte count (R3) */
TMP = 3					/* N and TMP don't overlap */
TMP1 = 4

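/*
 * memcpy is an alias for memmove: both entry points reach _memmove,
 * so this memcpy also handles overlapping regions.
 */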
TEXT memcpy(SB), $0
	B	_memmove

TEXT memmove(SB), $0
_memmove:
	MOVW	R(TS), to+0(FP)		/* need to save for return value */
	MOVW	from+4(FP), R(FROM)
	MOVW	n+8(FP), R(N)

	ADD	R(N), R(TS), R(TE)	/* to end pointer */

	CMP	R(FROM), R(TS)
	BLS	_forward
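
/*
 * Backward copy: the destination lies above the source, so a
 * low-to-high copy could overwrite source bytes before they are
 * read.  Copy from the high ends downward instead, with
 * pre-decrement addressing throughout.
 */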
_back:
	ADD	R(N), R(FROM)		/* from end pointer */

	CMP	$4, R(N)		/* need at least 4 bytes to copy */
	BLT	_b1tail

_b4align:				/* align destination on 4 */
	AND.S	$3, R(TE), R(TMP)
	BEQ	_b4aligned

	MOVBU.W	-1(R(FROM)), R(TMP)	/* pre-indexed */
	MOVBU.W	R(TMP), -1(R(TE))	/* pre-indexed */
	B	_b4align

_b4aligned:				/* is source now aligned? */
	AND.S	$3, R(FROM), R(TMP)
	BNE	_bunaligned

	ADD	$31, R(TS), R(TMP)	/* do 32-byte chunks if possible */
_b32loop:
	CMP	R(TMP), R(TE)
	BLS	_b4tail

	MOVM.DB.W (R(FROM)), [R4-R7]	/* 16 bytes, descending, write back */
	MOVM.DB.W [R4-R7], (R(TE))
	MOVM.DB.W (R(FROM)), [R4-R7]	/* twice: 32 bytes per iteration */
	MOVM.DB.W [R4-R7], (R(TE))
	B	_b32loop

_b4tail:				/* do remaining words if possible */
	ADD	$3, R(TS), R(TMP)
_b4loop:
	CMP	R(TMP), R(TE)
	BLS	_b1tail

	MOVW.W	-4(R(FROM)), R(TMP1)	/* pre-indexed */
	MOVW.W	R(TMP1), -4(R(TE))	/* pre-indexed */
	B	_b4loop

_b1tail:				/* remaining bytes */
	CMP	R(TE), R(TS)
	BEQ	_return

	MOVBU.W	-1(R(FROM)), R(TMP)	/* pre-indexed */
	MOVBU.W	R(TMP), -1(R(TE))	/* pre-indexed */
	B	_b1tail
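
/*
 * Forward copy: the destination is at or below the source, so a
 * low-to-high copy never clobbers source bytes that have yet to
 * be read.
 */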
_forward:
	CMP	$4, R(N)		/* need at least 4 bytes to copy */
	BLT	_f1tail

_f4align:				/* align destination on 4 */
	AND.S	$3, R(TS), R(TMP)
	BEQ	_f4aligned

	MOVBU.P	1(R(FROM)), R(TMP)	/* implicit write back */
	MOVBU.P	R(TMP), 1(R(TS))	/* implicit write back */
	B	_f4align

_f4aligned:				/* is source now aligned? */
	AND.S	$3, R(FROM), R(TMP)
	BNE	_funaligned

	SUB	$31, R(TE), R(TMP)	/* do 32-byte chunks if possible */
_f32loop:
	CMP	R(TMP), R(TS)
	BHS	_f4tail

	MOVM.IA.W (R(FROM)), [R4-R7]	/* 16 bytes, ascending, write back */
	MOVM.IA.W [R4-R7], (R(TS))
	MOVM.IA.W (R(FROM)), [R4-R7]	/* twice: 32 bytes per iteration */
	MOVM.IA.W [R4-R7], (R(TS))
	B	_f32loop

_f4tail:
	SUB	$3, R(TE), R(TMP)	/* do remaining words if possible */
_f4loop:
	CMP	R(TMP), R(TS)
	BHS	_f1tail

	MOVW.P	4(R(FROM)), R(TMP1)	/* implicit write back */
	MOVW.P	R(TMP1), 4(R(TS))	/* implicit write back */
	B	_f4loop

_f1tail:
	CMP	R(TS), R(TE)
	BEQ	_return

	MOVBU.P	1(R(FROM)), R(TMP)	/* implicit write back */
	MOVBU.P	R(TMP), 1(R(TS))	/* implicit write back */
	B	_f1tail

_return:
	MOVW	to+0(FP), R0		/* return the original to pointer */
	RET
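
/*
 * Unaligned case: the destination has been word-aligned but the
 * source is offset by 1, 2 or 3 bytes (the residue is in R(TMP)).
 * The source is rounded down to a word boundary and read a whole
 * word at a time; each output word is merged from two consecutive
 * input words, one shifted left by R(LSHIFT) and the other shifted
 * right by R(RSHIFT) (LSHIFT + RSHIFT = 32).  R(OFFSET) is the
 * byte adjustment that restores the true source pointer for the
 * byte-copy tail.  The read and write register ranges overlap
 * (BW0 == BR1); every value is consumed before it is overwritten.
 */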
RSHIFT = 4
LSHIFT = 5
OFFSET = 11

BR0 = 6					/* block read registers: R6-R7 */
BW0 = 7					/* block write registers: R7-R8 */
BR1 = 7
BW1 = 8

_bunaligned:
	CMP	$2, R(TMP)		/* is R(TMP) < 2 ? */

	MOVW.LT	$8, R(RSHIFT)		/* (R(n)<<24)|(R(n-1)>>8) */
	MOVW.LT	$24, R(LSHIFT)
	MOVW.LT	$1, R(OFFSET)

	MOVW.EQ	$16, R(RSHIFT)		/* (R(n)<<16)|(R(n-1)>>16) */
	MOVW.EQ	$16, R(LSHIFT)
	MOVW.EQ	$2, R(OFFSET)

	MOVW.GT	$24, R(RSHIFT)		/* (R(n)<<8)|(R(n-1)>>24) */
	MOVW.GT	$8, R(LSHIFT)
	MOVW.GT	$3, R(OFFSET)

	ADD	$8, R(TS), R(TMP)	/* do 8-byte chunks if possible */
	CMP	R(TMP), R(TE)
	BLS	_b1tail

	BIC	$3, R(FROM)		/* align source */
	MOVW	(R(FROM)), R(BR0)	/* prime first block register */

_bu8loop:
	CMP	R(TMP), R(TE)
	BLS	_bu1tail

	MOVW	R(BR0)<<R(LSHIFT), R(BW1)
	MOVM.DB.W (R(FROM)), [R(BR0)-R(BR1)]
	ORR	R(BR1)>>R(RSHIFT), R(BW1)

	MOVW	R(BR1)<<R(LSHIFT), R(BW0)
	ORR	R(BR0)>>R(RSHIFT), R(BW0)

	MOVM.DB.W [R(BW0)-R(BW1)], (R(TE))
	B	_bu8loop

_bu1tail:
	ADD	R(OFFSET), R(FROM)
	B	_b1tail
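
/*
 * Forward version of the unaligned copy: the same shift-and-merge
 * scheme walking upward.  The shift amounts match the backward
 * case; only R(OFFSET) is mirrored, because the source pointer has
 * advanced past the aligned words and must be stepped back for the
 * byte tail.
 */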
RSHIFT = 4				/* reuse the shift registers defined above */
LSHIFT = 5
OFFSET = 11

FW0 = 6					/* block write registers: R6-R7 */
FR0 = 7					/* block read registers: R7-R8 */
FW1 = 7
FR1 = 8

_funaligned:
	CMP	$2, R(TMP)

	MOVW.LT	$8, R(RSHIFT)		/* (R(n+1)<<24)|(R(n)>>8) */
	MOVW.LT	$24, R(LSHIFT)
	MOVW.LT	$3, R(OFFSET)

	MOVW.EQ	$16, R(RSHIFT)		/* (R(n+1)<<16)|(R(n)>>16) */
	MOVW.EQ	$16, R(LSHIFT)
	MOVW.EQ	$2, R(OFFSET)

	MOVW.GT	$24, R(RSHIFT)		/* (R(n+1)<<8)|(R(n)>>24) */
	MOVW.GT	$8, R(LSHIFT)
	MOVW.GT	$1, R(OFFSET)

	SUB	$8, R(TE), R(TMP)	/* do 8-byte chunks if possible */
	CMP	R(TMP), R(TS)
	BHS	_f1tail

	BIC	$3, R(FROM)		/* align source */
	MOVW.P	4(R(FROM)), R(FR1)	/* prime last block register, implicit write back */

_fu8loop:
	CMP	R(TMP), R(TS)
	BHS	_fu1tail

	MOVW	R(FR1)>>R(RSHIFT), R(FW0)
	MOVM.IA.W (R(FROM)), [R(FR0)-R(FR1)]
	ORR	R(FR0)<<R(LSHIFT), R(FW0)

	MOVW	R(FR0)>>R(RSHIFT), R(FW1)
	ORR	R(FR1)<<R(LSHIFT), R(FW1)

	MOVM.IA.W [R(FW0)-R(FW1)], (R(TS))
	B	_fu8loop

_fu1tail:
	SUB	R(OFFSET), R(FROM)
	B	_f1tail