memmove-thumb.s

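/*
 * memmove/memcpy for ARM: copies n bytes from 'from' to 'to' and
 * returns the original 'to'.  Overlap is handled by direction:
 * copy backward when the destination starts above the source,
 * forward otherwise.  Both directions byte-align the destination,
 * then move 32-byte blocks with MOVM when the source is also
 * word-aligned, or fall into the shift-and-merge loops below when
 * it is not.
 */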
TS = 0
TE = 1
FROM = 2
N = 3
TMP = 3					/* N and TMP don't overlap */
TMP1 = 4
TEXT memcpy(SB), $0
TEXT memmove(SB), $-4
_memmove:
	MOVW	R(TS), to+0(FP)		/* need to save for return value */
	MOVW	from+4(FP), R(FROM)
	MOVW	n+8(FP), R(N)

	ADD	R(N), R(TS), R(TE)	/* to end pointer */

	CMP	R(FROM), R(TS)
	BLS	_forward
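
/*
 * Destination starts above the source, so the regions may overlap
 * with 'to' covering the tail of 'from': copy from the high end
 * downward so no source byte is overwritten before it is read.
 */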
_back:
	ADD	R(N), R(FROM)		/* from end pointer */

	CMP	$4, R(N)		/* need at least 4 bytes to copy */
	BLT	_b1tail

_b4align:				/* align destination on 4 */
	AND.S	$3, R(TE), R(TMP)
	BEQ	_b4aligned

	MOVBU.W	-1(R(FROM)), R(TMP)	/* pre-indexed */
	MOVBU.W	R(TMP), -1(R(TE))	/* pre-indexed */
	B	_b4align

_b4aligned:				/* is source now aligned? */
	AND.S	$3, R(FROM), R(TMP)
	BNE	_bunaligned
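
/*
 * Both pointers are now word-aligned.  MOVM.DB.W transfers R4-R11,
 * eight words at a time, with decrement-before addressing and
 * writeback, so each pass moves 32 bytes and steps both pointers
 * down automatically.
 */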
	ADD	$31, R(TS), R(TMP)	/* do 32-byte chunks if possible */
_b32loop:
	CMP	R(TMP), R(TE)
	BLS	_b4tail

	MOVM.DB.W (R(FROM)), [R4-R11]
	MOVM.DB.W [R4-R11], (R(TE))
	B	_b32loop

_b4tail:				/* do remaining words if possible */
	ADD	$3, R(TS), R(TMP)
_b4loop:
	CMP	R(TMP), R(TE)
	BLS	_b1tail

	MOVW.W	-4(R(FROM)), R(TMP1)	/* pre-indexed */
	MOVW.W	R(TMP1), -4(R(TE))	/* pre-indexed */
	B	_b4loop

_b1tail:				/* remaining bytes */
	CMP	R(TE), R(TS)
	BEQ	_return

	MOVBU.W	-1(R(FROM)), R(TMP)	/* pre-indexed */
	MOVBU.W	R(TMP), -1(R(TE))	/* pre-indexed */
	B	_b1tail
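
/*
 * Forward copy: destination at or below the source, so copying from
 * the low end upward is safe even when the regions overlap.  The
 * structure mirrors the backward path: byte-align the destination,
 * then 32-byte MOVM blocks, then word and byte tails.
 */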
_forward:
	CMP	$4, R(N)		/* need at least 4 bytes to copy */
	BLT	_f1tail

_f4align:				/* align destination on 4 */
	AND.S	$3, R(TS), R(TMP)
	BEQ	_f4aligned

	MOVBU.P	1(R(FROM)), R(TMP)	/* implicit write back */
	MOVBU.P	R(TMP), 1(R(TS))	/* implicit write back */
	B	_f4align

_f4aligned:				/* is source now aligned? */
	AND.S	$3, R(FROM), R(TMP)
	BNE	_funaligned
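
/*
 * MOVM.IA.W is the forward counterpart: increment-after addressing
 * with writeback, moving R4-R11 (32 bytes) per pass and advancing
 * both pointers.
 */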
	SUB	$31, R(TE), R(TMP)	/* do 32-byte chunks if possible */
_f32loop:
	CMP	R(TMP), R(TS)
	BHS	_f4tail

	MOVM.IA.W (R(FROM)), [R4-R11]
	MOVM.IA.W [R4-R11], (R(TS))
	B	_f32loop

_f4tail:
	SUB	$3, R(TE), R(TMP)	/* do remaining words if possible */
_f4loop:
	CMP	R(TMP), R(TS)
	BHS	_f1tail

	MOVW.P	4(R(FROM)), R(TMP1)	/* implicit write back */
	MOVW.P	R(TMP1), 4(R(TS))	/* implicit write back */
	B	_f4loop

_f1tail:
	CMP	R(TS), R(TE)
	BEQ	_return

	MOVBU.P	1(R(FROM)), R(TMP)	/* implicit write back */
	MOVBU.P	R(TMP), 1(R(TS))	/* implicit write back */
	B	_f1tail

_return:
	MOVW	to+0(FP), R0
	RET
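
/*
 * Unaligned source: the destination is word-aligned but the source is
 * off by 1, 2 or 3 bytes.  Whole words are read from the aligned-down
 * source, and each destination word is assembled from two neighbouring
 * source words with a complementary shift pair (RSHIFT + LSHIFT = 32).
 * A sketch of the forward case in C, assuming little-endian words
 * (the names here are illustrative only, not part of this file):
 *
 *	o = from & 3;
 *	rs = 8*o;
 *	ls = 32 - rs;
 *	s = (ulong*)(from - o);
 *	w = *s++;
 *	while(a full word remains to fill){
 *		w1 = *s++;
 *		*dst++ = (w >> rs) | (w1 << ls);
 *		w = w1;
 *	}
 */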
RSHIFT = 4
LSHIFT = 5
OFFSET = 6

BR0 = 7
BW0 = 8
BR1 = 8
BW1 = 9
BR2 = 9
BW2 = 10
BR3 = 10
BW3 = 11
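
/*
 * The read and write registers alias: BW(n) is the same register as
 * BR(n+1), so after each block load the merged output words can be
 * built in place without extra scratch registers.
 */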
_bunaligned:
	CMP	$2, R(TMP)		/* is R(TMP) < 2 ? */

	MOVW.LT	$8, R(RSHIFT)		/* (R(n)<<24)|(R(n-1)>>8) */
	MOVW.LT	$24, R(LSHIFT)
	MOVW.LT	$1, R(OFFSET)

	MOVW.EQ	$16, R(RSHIFT)		/* (R(n)<<16)|(R(n-1)>>16) */
	MOVW.EQ	$16, R(LSHIFT)
	MOVW.EQ	$2, R(OFFSET)

	MOVW.GT	$24, R(RSHIFT)		/* (R(n)<<8)|(R(n-1)>>24) */
	MOVW.GT	$8, R(LSHIFT)
	MOVW.GT	$3, R(OFFSET)

	ADD	$16, R(TS), R(TMP)	/* do 16-byte chunks if possible */
	CMP	R(TMP), R(TE)
	BLS	_b1tail

	AND	$~0x03, R(FROM)		/* align source */
	MOVW	(R(FROM)), R(BR0)	/* prime first block register */
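
/*
 * Loop invariant: R(BR0) holds the lowest-addressed source word
 * already fetched; its low bytes become the high bytes of the
 * destination word just below, so one source word is carried
 * across block boundaries.
 */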
_bu16loop:
	CMP	R(TMP), R(TE)
	BLS	_bu1tail

	MOVW	R(BR0)<<R(LSHIFT), R(BW3)
	MOVM.DB.W (R(FROM)), [R(BR0)-R(BR3)]
	ORR	R(BR3)>>R(RSHIFT), R(BW3)

	MOVW	R(BR3)<<R(LSHIFT), R(BW2)
	ORR	R(BR2)>>R(RSHIFT), R(BW2)

	MOVW	R(BR2)<<R(LSHIFT), R(BW1)
	ORR	R(BR1)>>R(RSHIFT), R(BW1)

	MOVW	R(BR1)<<R(LSHIFT), R(BW0)
	ORR	R(BR0)>>R(RSHIFT), R(BW0)

	MOVM.DB.W [R(BW0)-R(BW3)], (R(TE))
	B	_bu16loop
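
/*
 * Restore R(FROM) from the aligned word stream to the exact source
 * byte position (the alignment rounded it down by OFFSET bytes)
 * before finishing with the byte loop.
 */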
_bu1tail:
	ADD	R(OFFSET), R(FROM)
	B	_b1tail

FW0 = 7
FR0 = 8
FW1 = 8
FR1 = 9
FW2 = 9
FR2 = 10
FW3 = 10
FR3 = 11
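
/*
 * Forward unaligned copy: same scheme with the roles reversed.
 * Here FW(n) shares a register with FR(n-1), and the word carried
 * between blocks is the highest one fetched, R(FR3).
 */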
_funaligned:
	CMP	$2, R(TMP)

	MOVW.LT	$8, R(RSHIFT)		/* (R(n+1)<<24)|(R(n)>>8) */
	MOVW.LT	$24, R(LSHIFT)
	MOVW.LT	$3, R(OFFSET)

	MOVW.EQ	$16, R(RSHIFT)		/* (R(n+1)<<16)|(R(n)>>16) */
	MOVW.EQ	$16, R(LSHIFT)
	MOVW.EQ	$2, R(OFFSET)

	MOVW.GT	$24, R(RSHIFT)		/* (R(n+1)<<8)|(R(n)>>24) */
	MOVW.GT	$8, R(LSHIFT)
	MOVW.GT	$1, R(OFFSET)

	SUB	$16, R(TE), R(TMP)	/* do 16-byte chunks if possible */
	CMP	R(TMP), R(TS)
	BHS	_f1tail

	AND	$~0x03, R(FROM)		/* align source */
	MOVW.P	4(R(FROM)), R(FR3)	/* prime last block register, implicit write back */
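
/*
 * Loop invariant: R(FR3) holds the highest-addressed source word
 * already fetched; its high bytes become the low bytes of the next
 * destination word.
 */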
_fu16loop:
	CMP	R(TMP), R(TS)
	BHS	_fu1tail

	MOVW	R(FR3)>>R(RSHIFT), R(FW0)
	MOVM.IA.W (R(FROM)), [R(FR0)-R(FR3)]
	ORR	R(FR0)<<R(LSHIFT), R(FW0)

	MOVW	R(FR0)>>R(RSHIFT), R(FW1)
	ORR	R(FR1)<<R(LSHIFT), R(FW1)

	MOVW	R(FR1)>>R(RSHIFT), R(FW2)
	ORR	R(FR2)<<R(LSHIFT), R(FW2)

	MOVW	R(FR2)>>R(RSHIFT), R(FW3)
	ORR	R(FR3)<<R(LSHIFT), R(FW3)

	MOVM.IA.W [R(FW0)-R(FW3)], (R(TS))
	B	_fu16loop

_fu1tail:
	SUB	R(OFFSET), R(FROM)
	B	_f1tail