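/*
 * memmove(to, from, n) and memcpy(to, from, n) for 32-bit ARM
 * (Plan 9 assembler syntax).  Both entry points copy n bytes from
 * 'from' to 'to' and return 'to' in R0; memcpy simply branches to
 * memmove, so overlapping buffers are handled either way.  The copy
 * runs backward when the destination starts above the source and
 * forward otherwise, so source bytes are always read before they
 * can be overwritten.
 */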
TS = 0
TE = 1
FROM = 2
N = 3
TMP = 3				/* N and TMP don't overlap */
TMP1 = 4
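
/*
 * The constants above name registers through the R(x) macro:
 * R(TS) is the destination ("to start"), R(TE) the destination end,
 * R(FROM) the source, and R(N) the byte count.  TMP shares R3 with
 * N; as the comment says, their uses don't overlap: N is last read
 * in the initial size checks, before TMP is first written.
 */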

TEXT memcpy(SB), $0
	B	_memmove

TEXT memmove(SB), $0
_memmove:
	MOVW	R(TS), to+0(FP)		/* need to save for return value */
	MOVW	from+4(FP), R(FROM)
	MOVW	n+8(FP), R(N)

	ADD	R(N), R(TS), R(TE)	/* to end pointer */

	CMP	R(FROM), R(TS)
	BLS	_forward
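
/*
 * Backward copy.  The destination begins above the source
 * (unsigned compare above), so an overlapping region must be
 * copied from the high end down.  The source pointer is advanced
 * to its end to match R(TE), and the loops below walk both
 * buffers downward with pre-indexed addressing.
 */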
_back:
	ADD	R(N), R(FROM)		/* from end pointer */

	CMP	$4, R(N)		/* need at least 4 bytes to copy */
	BLT	_b1tail

_b4align:				/* align destination on 4 */
	AND.S	$3, R(TE), R(TMP)
	BEQ	_b4aligned

	MOVBU.W	-1(R(FROM)), R(TMP)	/* pre-indexed */
	MOVBU.W	R(TMP), -1(R(TE))	/* pre-indexed */
	B	_b4align

_b4aligned:				/* is source now aligned? */
	AND.S	$3, R(FROM), R(TMP)
	BNE	_bunaligned
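
/*
 * Main backward loop.  R(TMP) = R(TS)+31 is the low-water mark:
 * while R(TE) remains above it, at least 32 bytes are left, and
 * each iteration moves them as two 16-byte MOVM load/store pairs
 * through R4-R7, decrementing both pointers as it goes.
 */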
	ADD	$31, R(TS), R(TMP)	/* do 32-byte chunks if possible */
_b32loop:
	CMP	R(TMP), R(TE)
	BLS	_b4tail

	MOVM.DB.W (R(FROM)), [R4-R7]
	MOVM.DB.W [R4-R7], (R(TE))
	MOVM.DB.W (R(FROM)), [R4-R7]
	MOVM.DB.W [R4-R7], (R(TE))
	B	_b32loop

_b4tail:				/* do remaining words if possible */
	ADD	$3, R(TS), R(TMP)
_b4loop:
	CMP	R(TMP), R(TE)
	BLS	_b1tail

	MOVW.W	-4(R(FROM)), R(TMP1)	/* pre-indexed */
	MOVW.W	R(TMP1), -4(R(TE))	/* pre-indexed */
	B	_b4loop

_b1tail:				/* remaining bytes */
	CMP	R(TE), R(TS)
	BEQ	_return

	MOVBU.W	-1(R(FROM)), R(TMP)	/* pre-indexed */
	MOVBU.W	R(TMP), -1(R(TE))	/* pre-indexed */
	B	_b1tail
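
/*
 * Forward copy.  The destination begins at or below the source,
 * so copying from the low end up never clobbers unread source
 * bytes.  The phases mirror the backward path: byte copies to
 * align the destination, 32-byte then word-sized chunks, and a
 * byte tail, all using post-indexed addressing to walk upward.
 */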
_forward:
	CMP	$4, R(N)		/* need at least 4 bytes to copy */
	BLT	_f1tail

_f4align:				/* align destination on 4 */
	AND.S	$3, R(TS), R(TMP)
	BEQ	_f4aligned

	MOVBU.P	1(R(FROM)), R(TMP)	/* implicit write back */
	MOVBU.P	R(TMP), 1(R(TS))	/* implicit write back */
	B	_f4align

_f4aligned:				/* is source now aligned? */
	AND.S	$3, R(FROM), R(TMP)
	BNE	_funaligned

	SUB	$31, R(TE), R(TMP)	/* do 32-byte chunks if possible */
_f32loop:
	CMP	R(TMP), R(TS)
	BHS	_f4tail

	MOVM.IA.W (R(FROM)), [R4-R7]
	MOVM.IA.W [R4-R7], (R(TS))
	MOVM.IA.W (R(FROM)), [R4-R7]
	MOVM.IA.W [R4-R7], (R(TS))
	B	_f32loop

_f4tail:
	SUB	$3, R(TE), R(TMP)	/* do remaining words if possible */
_f4loop:
	CMP	R(TMP), R(TS)
	BHS	_f1tail
	MOVW.P	4(R(FROM)), R(TMP1)	/* implicit write back */
	MOVW.P	R(TMP1), 4(R(TS))	/* implicit write back */
	B	_f4loop
_f1tail:
	CMP	R(TS), R(TE)
	BEQ	_return

	MOVBU.P	1(R(FROM)), R(TMP)	/* implicit write back */
	MOVBU.P	R(TMP), 1(R(TS))	/* implicit write back */
	B	_f1tail

_return:
	MOVW	to+0(FP), R0
	RET
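
/*
 * Unaligned backward copy: the destination is word-aligned but the
 * source is not; R(TMP) holds the source misalignment (1, 2, or 3
 * bytes).  The source is rounded down to a word boundary and read
 * whole words at a time, and each stored word is assembled from two
 * adjacent source words as (a<<LSHIFT)|(b>>RSHIFT), with the shift
 * pair (summing to 32) selected below.  BW0 shares a register with
 * BR1, whose raw value is dead by the time BW0 is composed.  OFFSET
 * records how far R(FROM) was rounded down so that _bu1tail can
 * restore the true byte address before finishing in _b1tail.
 */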
RSHIFT = 4
LSHIFT = 5
OFFSET = 11

BR0 = 6
BW0 = 7
BR1 = 7
BW1 = 8
_bunaligned:
	CMP	$2, R(TMP)		/* is R(TMP) < 2 ? */

	MOVW.LT	$8, R(RSHIFT)		/* (R(n)<<24)|(R(n-1)>>8) */
	MOVW.LT	$24, R(LSHIFT)
	MOVW.LT	$1, R(OFFSET)

	MOVW.EQ	$16, R(RSHIFT)		/* (R(n)<<16)|(R(n-1)>>16) */
	MOVW.EQ	$16, R(LSHIFT)
	MOVW.EQ	$2, R(OFFSET)

	MOVW.GT	$24, R(RSHIFT)		/* (R(n)<<8)|(R(n-1)>>24) */
	MOVW.GT	$8, R(LSHIFT)
	MOVW.GT	$3, R(OFFSET)

	ADD	$8, R(TS), R(TMP)	/* do 8-byte chunks if possible */
	CMP	R(TMP), R(TE)
	BLS	_b1tail

	BIC	$3, R(FROM)		/* align source */
	MOVW	(R(FROM)), R(BR0)	/* prime first block register */

_bu8loop:
	CMP	R(TMP), R(TE)
	BLS	_bu1tail

	MOVW	R(BR0)<<R(LSHIFT), R(BW1)
	MOVM.DB.W (R(FROM)), [R(BR0)-R(BR1)]
	ORR	R(BR1)>>R(RSHIFT), R(BW1)

	MOVW	R(BR1)<<R(LSHIFT), R(BW0)
	ORR	R(BR0)>>R(RSHIFT), R(BW0)

	MOVM.DB.W [R(BW0)-R(BW1)], (R(TE))
	B	_bu8loop

_bu1tail:
	ADD	R(OFFSET), R(FROM)
	B	_b1tail
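
/*
 * Unaligned forward copy: the mirror image of _bunaligned, reading
 * words upward with post-increment.  OFFSET runs the other way
 * (3, 2, 1: four minus the misalignment) because the priming load
 * consumes a whole word of which only the tail belongs to the copy;
 * _fu1tail steps R(FROM) back to the first unconsumed byte before
 * finishing in _f1tail.
 */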
RSHIFT = 4
LSHIFT = 5
OFFSET = 11

FW0 = 6
FR0 = 7
FW1 = 7
FR1 = 8
_funaligned:
	CMP	$2, R(TMP)

	MOVW.LT	$8, R(RSHIFT)		/* (R(n+1)<<24)|(R(n)>>8) */
	MOVW.LT	$24, R(LSHIFT)
	MOVW.LT	$3, R(OFFSET)

	MOVW.EQ	$16, R(RSHIFT)		/* (R(n+1)<<16)|(R(n)>>16) */
	MOVW.EQ	$16, R(LSHIFT)
	MOVW.EQ	$2, R(OFFSET)

	MOVW.GT	$24, R(RSHIFT)		/* (R(n+1)<<8)|(R(n)>>24) */
	MOVW.GT	$8, R(LSHIFT)
	MOVW.GT	$1, R(OFFSET)

	SUB	$8, R(TE), R(TMP)	/* do 8-byte chunks if possible */
	CMP	R(TMP), R(TS)
	BHS	_f1tail

	BIC	$3, R(FROM)		/* align source */
	MOVW.P	4(R(FROM)), R(FR1)	/* prime last block register, implicit write back */

_fu8loop:
	CMP	R(TMP), R(TS)
	BHS	_fu1tail

	MOVW	R(FR1)>>R(RSHIFT), R(FW0)
	MOVM.IA.W (R(FROM)), [R(FR0)-R(FR1)]
	ORR	R(FR0)<<R(LSHIFT), R(FW0)

	MOVW	R(FR0)>>R(RSHIFT), R(FW1)
	ORR	R(FR1)<<R(LSHIFT), R(FW1)

	MOVM.IA.W [R(FW0)-R(FW1)], (R(TS))
	B	_fu8loop

_fu1tail:
	SUB	R(OFFSET), R(FROM)
	B	_f1tail