memmove.s 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  /*
   * memmove: copy n bytes between two buffers that may overlap.
   *
   * Inputs, as the code below reads them:
   *	R0		destination pointer ("to"); this routine never
   *			writes R0, so it holds the same value at RET
   *	from+4(FP)	source pointer
   *	n+8(FP)		byte count
   *
   * Register roles:
   *	R6	destination cursor
   *	R7	source cursor
   *	R8	destination address at which the copy stops
   *	R9	stop bound for whichever multi-byte loop is running
   *	R10	n
   *	R1	compare/mask results (R5 serves once, for the direction test)
   *	R2-R5, R11, R13, R22-R25	data in flight
   *
   * Both directions follow the same plan:
   *	1. If n is not above ALIGN, use the byte loop for everything.
   *	2. Otherwise copy single bytes until the destination cursor is a
   *	   multiple of ALIGN.
   *	3. If the source cursor is then a multiple of QUAD, move 64-byte
   *	   blocks, then 8-byte quads, then the leftover bytes.
   *	4. If it is not, move 16 bytes per step using MOVQU loads merged
   *	   with EXTQL/EXTQH, then the leftover bytes.
   */
#define	QUAD	8		/* bytes per quadword */
#define	ALIGN	64		/* destination alignment reached before the wide loops */
#define	BLOCK	64		/* bytes moved per pass of the unrolled loops */

TEXT memmove(SB), $0
	MOVL	from+4(FP), R7
	MOVL	n+8(FP), R10
	MOVQ	R0, R6

	/* direction test on from vs. to; presumably "from >= to" selects the ascending copy */
	CMPUGE	R7, R0, R5
	BNE	R5, _fwd

/*
 * Descending copy: both cursors start one past the end of their
 * buffers and move down until the destination cursor reaches R8.
 */
	MOVQ	R6, R8			/* stop address: start of destination */
	ADDL	R10, R6, R6		/* to+n */
	ADDL	R10, R7, R7		/* from+n */

	CMPUGE	$ALIGN, R10, R1		/* need at least ALIGN bytes */
	BNE	R1, _bk_bytes

_bk_align:				/* byte steps until R6 is ALIGN-aligned */
	AND	$(ALIGN-1), R6, R1
	BEQ	R1, _bk_aligned
	MOVBU	-1(R7), R2
	ADDL	$-1, R6, R6
	MOVB	R2, (R6)
	ADDL	$-1, R7, R7
	JMP	_bk_align

_bk_aligned:
	AND	$(QUAD-1), R7, R1	/* is the source quad-aligned */
	BNE	R1, _bk_unaligned
	ADDL	$(BLOCK-1), R8, R9	/* bound for the 64-byte loop */

_bk_block64:				/* 8 quads loaded, then 8 quads stored */
	CMPUGE	R9, R6, R1
	BNE	R1, _bk_quads
	MOVQ	-64(R7), R22
	MOVQ	-56(R7), R23
	MOVQ	-48(R7), R24
	MOVQ	-40(R7), R25
	MOVQ	-32(R7), R2
	MOVQ	-24(R7), R3
	MOVQ	-16(R7), R4
	MOVQ	-8(R7), R5
	SUBL	$64, R6, R6
	SUBL	$64, R7, R7
	MOVQ	R22, (R6)
	MOVQ	R23, 8(R6)
	MOVQ	R24, 16(R6)
	MOVQ	R25, 24(R6)
	MOVQ	R2, 32(R6)
	MOVQ	R3, 40(R6)
	MOVQ	R4, 48(R6)
	MOVQ	R5, 56(R6)
	JMP	_bk_block64

_bk_quads:
	ADDL	$(QUAD-1), R8, R9	/* bound for the 8-byte loop */

_bk_quad:				/* one quad per pass */
	CMPUGE	R9, R6, R1
	BNE	R1, _bk_bytes
	MOVQ	-8(R7), R2
	SUBL	$8, R6, R6
	MOVQ	R2, (R6)
	SUBL	$8, R7, R7
	JMP	_bk_quad

_bk_bytes:				/* one byte per pass until R6 reaches R8 */
	CMPUGE	R8, R6, R1
	BNE	R1, _bk_done
	MOVBU	-1(R7), R2
	SUBL	$1, R6, R6
	MOVB	R2, (R6)
	SUBL	$1, R7, R7
	JMP	_bk_bytes

_bk_done:
	RET

/*
 * Descending copy, source not quad-aligned.  Each pass does three
 * MOVQU loads around the 16 source bytes below R7 and combines
 * neighbouring pairs with EXTQL/EXTQH + OR into two quads, which are
 * stored with plain MOVQ at the destination cursor.
 */
_bk_unaligned:
	ADDL	$(16-1), R8, R9		/* bound for the 16-byte loop */

_bk_shift16:
	CMPUGE	R9, R6, R1
	BNE	R1, _bk_bytes
	MOVQU	-16(R7), R4
	MOVQU	-8(R7), R3
	MOVQU	(R7), R2
	SUBL	$16, R6, R6
	EXTQH	R7, R2, R2
	EXTQL	R7, R3, R5
	OR	R5, R2, R11		/* upper quad of the pair */
	EXTQH	R7, R3, R3
	EXTQL	R7, R4, R4
	OR	R3, R4, R13		/* lower quad of the pair */
	MOVQ	R11, 8(R6)
	MOVQ	R13, (R6)
	SUBL	$16, R7, R7
	JMP	_bk_shift16

/*
 * Ascending copy: both cursors start at the beginning of their
 * buffers and move up until the destination cursor reaches R8.
 */
_fwd:
	ADDL	R10, R6, R8		/* stop address: to+n */

	CMPUGE	$ALIGN, R10, R1		/* need at least ALIGN bytes */
	BNE	R1, _fw_bytes

_fw_align:				/* byte steps until R6 is ALIGN-aligned */
	AND	$(ALIGN-1), R6, R1
	BEQ	R1, _fw_aligned
	MOVBU	(R7), R2
	ADDL	$1, R6, R6
	ADDL	$1, R7, R7
	MOVB	R2, -1(R6)
	JMP	_fw_align

_fw_aligned:
	AND	$(QUAD-1), R7, R1	/* is the source quad-aligned */
	BNE	R1, _fw_unaligned
	SUBL	$(BLOCK-1), R8, R9	/* bound for the 64-byte loop */

_fw_block64:				/* 8 quads loaded, then 8 quads stored */
	CMPUGT	R9, R6, R1
	BEQ	R1, _fw_quads
	MOVQ	(R7), R2
	MOVQ	8(R7), R3
	MOVQ	16(R7), R4
	MOVQ	24(R7), R5
	MOVQ	32(R7), R22
	MOVQ	40(R7), R23
	MOVQ	48(R7), R24
	MOVQ	56(R7), R25
	ADDL	$64, R6, R6
	ADDL	$64, R7, R7
	MOVQ	R2, -64(R6)
	MOVQ	R3, -56(R6)
	MOVQ	R4, -48(R6)
	MOVQ	R5, -40(R6)
	MOVQ	R22, -32(R6)
	MOVQ	R23, -24(R6)
	MOVQ	R24, -16(R6)
	MOVQ	R25, -8(R6)
	JMP	_fw_block64

_fw_quads:
	SUBL	$(QUAD-1), R8, R9	/* bound for the 8-byte loop */

_fw_quad:				/* one quad per pass */
	CMPUGT	R9, R6, R1
	BEQ	R1, _fw_bytes
	MOVQ	(R7), R2
	ADDL	$8, R6, R6
	ADDL	$8, R7, R7
	MOVQ	R2, -8(R6)
	JMP	_fw_quad

_fw_bytes:				/* one byte per pass until R6 reaches R8 */
	CMPUGT	R8, R6, R1
	BEQ	R1, _fw_done
	MOVBU	(R7), R2
	ADDL	$1, R6, R6
	ADDL	$1, R7, R7
	MOVB	R2, -1(R6)
	JMP	_fw_bytes

_fw_done:
	RET

/*
 * Ascending copy, source not quad-aligned.  Mirror image of the
 * descending case: three MOVQU loads around the 16 source bytes at R7,
 * combined pairwise with EXTQL/EXTQH + OR, stored with plain MOVQ.
 */
_fw_unaligned:
	SUBL	$(16-1), R8, R9		/* bound for the 16-byte loop */

_fw_shift16:
	CMPUGT	R9, R6, R1
	BEQ	R1, _fw_bytes
	MOVQU	(R7), R2
	MOVQU	8(R7), R3
	MOVQU	16(R7), R4
	EXTQL	R7, R2, R2
	EXTQH	R7, R3, R5
	OR	R5, R2, R11		/* lower quad of the pair */
	EXTQL	R7, R3, R3
	MOVQ	R11, (R6)
	EXTQH	R7, R4, R4
	OR	R3, R4, R11		/* upper quad of the pair */
	MOVQ	R11, 8(R6)
	ADDL	$16, R6, R6
	ADDL	$16, R7, R7
	JMP	_fw_shift16