memmove.s 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  /* Tuning constants for the copy loops in this file. */
  /* QUAD: the mask QUAD-1 tests whether the source pointer is 8-byte aligned, */
  /* and QUAD-1 is also the loop bound offset for the 8-byte tail loops. */
  1. #define QUAD 8
  /* ALIGN: copies of n <= ALIGN bytes go straight to the byte loops; longer copies */
  /* first advance bytewise until the destination is a multiple of ALIGN. */
  2. #define ALIGN 64
  /* BLOCK: BLOCK-1 is the loop bound offset for the unrolled block loops. */
  /* The block loop bodies hard-code 64 for their offsets and pointer steps. */
  3. #define BLOCK 64
  /* memmove: copy n bytes from `from` to `to`; the copy direction is chosen from the pointer order. */
  /* Inputs: `to` is taken from R0 (it is never loaded from the frame), from+4(FP), n+8(FP). */
  /* R0 is never written, so it still holds `to` at every RET. */
  /* Registers: R6 = destination cursor, R7 = source cursor, R8 = destination limit, */
  /* R9 = loop bound derived from R8, R10 = n, R1 = compare/mask result, */
  /* R2-R5, R11, R13, R22-R25 = data being moved (R5 also holds the direction test). */
  /* When n > ALIGN: bytes are copied until the destination is ALIGN-aligned; then a */
  /* quad-aligned source is moved in 64-byte blocks and 8-byte quads, while any other */
  /* source is moved 16 bytes at a time via MOVQU + EXTQL/EXTQH; leftovers go bytewise. */
  4. TEXT memmove(SB), $0
  5. _memmove: /* also the jump target used by memcpy */
  6. MOVL from+4(FP), R7 /* R7 = from */
  7. MOVL n+8(FP), R10 /* R10 = n */
  8. MOVQ R0, R6 /* R6 = to */
  9. CMPUGE R7, R0, R5 /* from >= to (unsigned)? */
  10. BNE R5, _forward /* yes: copy ascending */
  /* from < to: copy descending, starting just past the end of both buffers */
  11. MOVQ R6, R8 /* R8 = to: lowest destination address, where the descent stops */
  12. ADDL R10, R6, R6 /* to+n */
  13. ADDL R10, R7, R7 /* from+n */
  14. CMPUGE $ALIGN, R10, R1 /* ALIGN >= n? the aligned path needs more than ALIGN bytes */
  15. BNE R1, _b1tail /* short copy: bytes only */
  16. _balign: /* copy single bytes downward until R6 is a multiple of ALIGN */
  17. AND $(ALIGN-1), R6, R1
  18. BEQ R1, _baligned
  19. MOVBU -1(R7), R2
  20. ADDL $-1, R6, R6
  21. MOVB R2, (R6)
  22. ADDL $-1, R7, R7
  23. JMP _balign
  24. _baligned:
  25. AND $(QUAD-1), R7, R1 /* is the source quad-aligned */
  26. BNE R1, _bunaligned /* no: use the MOVQU/EXTQ path */
  27. ADDL $(BLOCK-1), R8, R9 /* R9 = to+BLOCK-1: a whole block fits only while R6 > R9 */
  28. _bblock: /* descending 64-byte blocks, source and destination both aligned */
  29. CMPUGE R9, R6, R1
  30. BNE R1, _b8tail /* fewer than 64 bytes remain above to */
  31. MOVQ -64(R7), R22 /* all eight quads are loaded before the first store */
  32. MOVQ -56(R7), R23
  33. MOVQ -48(R7), R24
  34. MOVQ -40(R7), R25
  35. MOVQ -32(R7), R2
  36. MOVQ -24(R7), R3
  37. MOVQ -16(R7), R4
  38. MOVQ -8(R7), R5
  39. SUBL $64, R6, R6 /* step both cursors down one block */
  40. SUBL $64, R7, R7
  41. MOVQ R22, (R6)
  42. MOVQ R23, 8(R6)
  43. MOVQ R24, 16(R6)
  44. MOVQ R25, 24(R6)
  45. MOVQ R2, 32(R6)
  46. MOVQ R3, 40(R6)
  47. MOVQ R4, 48(R6)
  48. MOVQ R5, 56(R6)
  49. JMP _bblock
  50. _b8tail:
  51. ADDL $(QUAD-1), R8, R9 /* R9 = to+QUAD-1: a whole quad fits only while R6 > R9 */
  52. _b8block: /* descending single quads */
  53. CMPUGE R9, R6, R1
  54. BNE R1, _b1tail /* fewer than 8 bytes remain */
  55. MOVQ -8(R7), R2
  56. SUBL $8, R6
  57. MOVQ R2, (R6)
  58. SUBL $8, R7
  59. JMP _b8block
  60. _b1tail: /* descending byte loop; every backward path finishes here */
  61. CMPUGE R8, R6, R1 /* to >= R6: the destination cursor has reached the start */
  62. BNE R1, _ret
  63. MOVBU -1(R7), R2
  64. SUBL $1, R6, R6
  65. MOVB R2, (R6)
  66. SUBL $1, R7, R7
  67. JMP _b1tail
  68. _ret:
  69. RET
  70. _bunaligned: /* descending copy, destination aligned, source not quad-aligned */
  71. ADDL $(16-1), R8, R9 /* R9 = to+15: 16 bytes fit only while R6 > R9 */
  72. _bu8block:
  73. CMPUGE R9, R6, R1
  74. BNE R1, _b1tail /* fewer than 16 bytes remain */
  /* NOTE(review): MOVQU is presumably the Alpha unaligned quad load (low address bits ignored), */
  /* which would keep the (R7) access inside the quad holding byte R7-1 — confirm in the assembler manual. */
  75. MOVQU -16(R7), R4 /* three loads cover the 16 source bytes below R7 */
  76. MOVQU -8(R7), R3
  77. MOVQU (R7), R2
  78. SUBL $16, R6
  79. EXTQH R7, R2, R2 /* EXTQH/EXTQL take their byte offset from R7 */
  80. EXTQL R7, R3, R5
  81. OR R5, R2, R11 /* R11 = upper destination quad, merged from the loads at -8(R7) and (R7) */
  82. EXTQH R7, R3, R3
  83. EXTQL R7, R4, R4
  84. OR R3, R4, R13 /* R13 = lower destination quad, merged from the loads at -16(R7) and -8(R7) */
  85. MOVQ R11, 8(R6)
  86. MOVQ R13, (R6)
  87. SUBL $16, R7 /* a 16-byte step leaves R7's offset within a quad unchanged */
  88. JMP _bu8block
  89. _forward: /* from >= to: copy ascending */
  90. ADDL R10, R6, R8 /* R8 = to+n: one past the last destination byte */
  91. CMPUGE $ALIGN, R10, R1 /* ALIGN >= n? the aligned path needs more than ALIGN bytes */
  92. BNE R1, _f1tail /* short copy: bytes only */
  93. _falign: /* copy single bytes upward until R6 is a multiple of ALIGN */
  94. AND $(ALIGN-1), R6, R1
  95. BEQ R1, _faligned
  96. MOVBU (R7), R2
  97. ADDL $1, R6, R6
  98. ADDL $1, R7, R7
  99. MOVB R2, -1(R6) /* R6 was already advanced, so -1 addresses the byte just vacated */
  100. JMP _falign
  101. _faligned:
  102. AND $(QUAD-1), R7, R1 /* is the source quad-aligned */
  103. BNE R1, _funaligned /* no: use the MOVQU/EXTQ path */
  104. SUBL $(BLOCK-1), R8, R9 /* R9 = end-(BLOCK-1): a whole block fits only while R9 > R6 */
  105. _fblock: /* ascending 64-byte blocks, source and destination both aligned */
  106. CMPUGT R9, R6, R1
  107. BEQ R1, _f8tail /* fewer than 64 bytes remain */
  108. MOVQ (R7), R2 /* all eight quads are loaded before the first store */
  109. MOVQ 8(R7), R3
  110. MOVQ 16(R7), R4
  111. MOVQ 24(R7), R5
  112. MOVQ 32(R7), R22
  113. MOVQ 40(R7), R23
  114. MOVQ 48(R7), R24
  115. MOVQ 56(R7), R25
  116. ADDL $64, R6, R6 /* cursors advance first, so the stores use negative offsets */
  117. ADDL $64, R7, R7
  118. MOVQ R2, -64(R6)
  119. MOVQ R3, -56(R6)
  120. MOVQ R4, -48(R6)
  121. MOVQ R5, -40(R6)
  122. MOVQ R22, -32(R6)
  123. MOVQ R23, -24(R6)
  124. MOVQ R24, -16(R6)
  125. MOVQ R25, -8(R6)
  126. JMP _fblock
  127. _f8tail:
  128. SUBL $(QUAD-1), R8, R9 /* R9 = end-(QUAD-1): a whole quad fits only while R9 > R6 */
  129. _f8block: /* ascending single quads */
  130. CMPUGT R9, R6, R1
  131. BEQ R1, _f1tail /* fewer than 8 bytes remain */
  132. MOVQ (R7), R2
  133. ADDL $8, R6
  134. ADDL $8, R7
  135. MOVQ R2, -8(R6)
  136. JMP _f8block
  137. _f1tail: /* ascending byte loop; every forward path finishes here */
  138. CMPUGT R8, R6, R1 /* end > R6: bytes still to copy */
  139. BEQ R1, _fret
  140. MOVBU (R7), R2
  141. ADDL $1, R6, R6
  142. ADDL $1, R7, R7
  143. MOVB R2, -1(R6)
  144. JMP _f1tail
  145. _fret:
  146. RET
  147. _funaligned: /* ascending copy, destination aligned, source not quad-aligned */
  148. SUBL $(16-1), R8, R9 /* R9 = end-15: 16 bytes fit only while R9 > R6 */
  149. _fu8block:
  150. CMPUGT R9, R6, R1
  151. BEQ R1, _f1tail /* fewer than 16 bytes remain */
  /* NOTE(review): MOVQU is presumably the Alpha unaligned quad load (low address bits ignored), */
  /* which would keep the 16(R7) access inside the quad holding byte R7+15 — confirm in the assembler manual. */
  152. MOVQU (R7), R2 /* three loads cover the 16 source bytes starting at R7 */
  153. MOVQU 8(R7), R3
  154. MOVQU 16(R7), R4
  155. EXTQL R7, R2, R2 /* EXTQL/EXTQH take their byte offset from R7 */
  156. EXTQH R7, R3, R5
  157. OR R5, R2, R11 /* first destination quad, merged from the loads at (R7) and 8(R7) */
  158. EXTQL R7, R3, R3
  159. MOVQ R11, (R6)
  160. EXTQH R7, R4, R4
  161. OR R3, R4, R11 /* second destination quad, merged from the loads at 8(R7) and 16(R7) */
  162. MOVQ R11, 8(R6)
  163. ADDL $16, R6
  164. ADDL $16, R7 /* a 16-byte step leaves R7's offset within a quad unchanged */
  165. JMP _fu8block
  /* memcpy: has no copy loop of its own; it jumps straight to the _memmove label, */
  /* so it runs the same code (including the direction test) as memmove. */
  166. TEXT memcpy(SB), $0
  167. JMP _memmove /* arguments are read by the code at _memmove */