/* planix.SVN, rev 2: memmove/memcpy in Plan 9 ARM assembler */

TS = 0
TE = 1
FROM = 2
N = 3
TMP = 3                                 /* N and TMP share R3; their uses don't overlap in time */
TMP1 = 4
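
/* R(TS) is R0: it holds the destination on entry and the return value */
/* on exit, so it is spilled to to+0(FP) and reloaded at _return.      */
/* R(TE) is the destination end pointer (to+n), R(FROM) the source and */
/* R(N) the byte count; R(TMP) and R(TMP1) are scratch.                */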

TEXT memcpy(SB), $0
        B       _memmove
TEXT memmove(SB), $0
_memmove:
        MOVW    R(TS), to+0(FP)         /* need to save for return value */
        MOVW    from+4(FP), R(FROM)
        MOVW    n+8(FP), R(N)

        ADD     R(N), R(TS), R(TE)      /* to end pointer */

        CMP     R(FROM), R(TS)
        BLS     _forward
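        /* to <= from: copying up is safe even if the regions overlap; */
        /* otherwise copy down from the end pointers */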

_back:
        ADD     R(N), R(FROM)           /* from end pointer */
        CMP     $4, R(N)                /* need at least 4 bytes to copy */
        BLT     _b1tail

_b4align:                               /* align destination on 4 */
        AND.S   $3, R(TE), R(TMP)
        BEQ     _b4aligned

        MOVBU.W -1(R(FROM)), R(TMP)     /* pre-indexed */
        MOVBU.W R(TMP), -1(R(TE))       /* pre-indexed */
        B       _b4align

_b4aligned:                             /* is source now aligned? */
        AND.S   $3, R(FROM), R(TMP)
        BNE     _bunaligned

        ADD     $31, R(TS), R(TMP)      /* do 32-byte chunks if possible */
_b32loop:
        CMP     R(TMP), R(TE)
        BLS     _b4tail
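        /* 32 bytes per pass: each MOVM.DB.W pre-decrements its base by 16 */
        /* and moves four words, so FROM and TE walk down in step */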

        MOVM.DB.W (R(FROM)), [R4-R7]
        MOVM.DB.W [R4-R7], (R(TE))
        MOVM.DB.W (R(FROM)), [R4-R7]
        MOVM.DB.W [R4-R7], (R(TE))
        B       _b32loop

_b4tail:                                /* do remaining words if possible */
        ADD     $3, R(TS), R(TMP)
_b4loop:
        CMP     R(TMP), R(TE)
        BLS     _b1tail

        MOVW.W  -4(R(FROM)), R(TMP1)    /* pre-indexed */
        MOVW.W  R(TMP1), -4(R(TE))      /* pre-indexed */
        B       _b4loop

_b1tail:                                /* remaining bytes */
        CMP     R(TE), R(TS)
        BEQ     _return

        MOVBU.W -1(R(FROM)), R(TMP)     /* pre-indexed */
        MOVBU.W R(TMP), -1(R(TE))       /* pre-indexed */
        B       _b1tail

_forward:
        CMP     $4, R(N)                /* need at least 4 bytes to copy */
        BLT     _f1tail

_f4align:                               /* align destination on 4 */
        AND.S   $3, R(TS), R(TMP)
        BEQ     _f4aligned

        MOVBU.P 1(R(FROM)), R(TMP)      /* implicit write back */
        MOVBU.P R(TMP), 1(R(TS))        /* implicit write back */
        B       _f4align

_f4aligned:                             /* is source now aligned? */
        AND.S   $3, R(FROM), R(TMP)
        BNE     _funaligned

        SUB     $31, R(TE), R(TMP)      /* do 32-byte chunks if possible */
_f32loop:
        CMP     R(TMP), R(TS)
        BHS     _f4tail
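        /* mirror of _b32loop: each MOVM.IA.W moves four words and then */
        /* advances its base by 16, so FROM and TS walk up in step */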

        MOVM.IA.W (R(FROM)), [R4-R7] 
        MOVM.IA.W [R4-R7], (R(TS))
        MOVM.IA.W (R(FROM)), [R4-R7] 
        MOVM.IA.W [R4-R7], (R(TS))
        B       _f32loop

_f4tail:
        SUB     $3, R(TE), R(TMP)       /* do remaining words if possible */
_f4loop:
        CMP     R(TMP), R(TS)
        BHS     _f1tail

        MOVW.P  4(R(FROM)), R(TMP1)     /* implicit write back */
        MOVW.P  R(TMP1), 4(R(TS))       /* implicit write back */
        B       _f4loop

_f1tail:
        CMP     R(TS), R(TE)
        BEQ     _return

        MOVBU.P 1(R(FROM)), R(TMP)      /* implicit write back */
        MOVBU.P R(TMP), 1(R(TS))        /* implicit write back */
        B       _f1tail

_return:
        MOVW    to+0(FP), R0
        RET

RSHIFT = 4
LSHIFT = 5
OFFSET = 11

BR0 = 6
BW0 = 7
BR1 = 7
BW1 = 8
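
/* BW0 shares R7 with BR1: the instruction that overwrites R7 with a */
/* shifted output word is also the last one to read BR1, so the read */
/* and write pairs can both stay contiguous ranges for MOVM */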

_bunaligned:
        CMP     $2, R(TMP)              /* is R(TMP) < 2 ? */
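        /* R(TMP) holds the source pointer's low two bits (1, 2 or 3): */
        /* pick the shifts that splice two adjacent aligned words into */
        /* one unaligned word, and the offset that later restores FROM */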

        MOVW.LT $8, R(RSHIFT)           /* (R(n)<<24)|(R(n-1)>>8) */
        MOVW.LT $24, R(LSHIFT)
        MOVW.LT $1, R(OFFSET)

        MOVW.EQ $16, R(RSHIFT)          /* (R(n)<<16)|(R(n-1)>>16) */
        MOVW.EQ $16, R(LSHIFT)
        MOVW.EQ $2, R(OFFSET)

        MOVW.GT $24, R(RSHIFT)          /* (R(n)<<8)|(R(n-1)>>24) */
        MOVW.GT $8, R(LSHIFT)
        MOVW.GT $3, R(OFFSET)

        ADD     $8, R(TS), R(TMP)       /* do 8-byte chunks if possible */
        CMP     R(TMP), R(TE)
        BLS     _b1tail

        BIC     $3, R(FROM)             /* align source */
        MOVW    (R(FROM)), R(BR0)       /* prime first block register */
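        /* FROM is now word aligned; BR0 holds the word whose low bytes */
        /* are the tail of the remaining source.  Each pass below loads */
        /* two more words and splices neighbouring pairs with LSHIFT and */
        /* RSHIFT into the two words stored down at TE */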

_bu8loop:
        CMP     R(TMP), R(TE)
        BLS     _bu1tail

        MOVW    R(BR0)<<R(LSHIFT), R(BW1)
        MOVM.DB.W (R(FROM)), [R(BR0)-R(BR1)]
        ORR     R(BR1)>>R(RSHIFT), R(BW1)

        MOVW    R(BR1)<<R(LSHIFT), R(BW0)
        ORR     R(BR0)>>R(RSHIFT), R(BW0)

        MOVM.DB.W [R(BW0)-R(BW1)], (R(TE))
        B       _bu8loop

_bu1tail:
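        /* put back the bytes BIC stripped from FROM so _b1tail resumes */
        /* at the true source position */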
        ADD     R(OFFSET), R(FROM)
        B       _b1tail

RSHIFT = 4
LSHIFT = 5
OFFSET = 11

FW0 = 6
FR0 = 7
FW1 = 7
FR1 = 8
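
/* forward mirror of the backward register set: FW1 shares R7 with FR0 */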

_funaligned:
        CMP     $2, R(TMP)

        MOVW.LT $8, R(RSHIFT)           /* (R(n+1)<<24)|(R(n)>>8) */
        MOVW.LT $24, R(LSHIFT)
        MOVW.LT $3, R(OFFSET)

        MOVW.EQ $16, R(RSHIFT)          /* (R(n+1)<<16)|(R(n)>>16) */
        MOVW.EQ $16, R(LSHIFT)
        MOVW.EQ $2, R(OFFSET)

        MOVW.GT $24, R(RSHIFT)          /* (R(n+1)<<8)|(R(n)>>24) */
        MOVW.GT $8, R(LSHIFT)
        MOVW.GT $1, R(OFFSET)

        SUB     $8, R(TE), R(TMP)       /* do 8-byte chunks if possible */
        CMP     R(TMP), R(TS)
        BHS     _f1tail

        BIC     $3, R(FROM)             /* align source */
        MOVW.P  4(R(FROM)), R(FR1)      /* prime last block register, implicit write back */
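        /* FR1 holds the aligned word containing the first source bytes; */
        /* the post-increment leaves FROM 4-(from&3) = OFFSET bytes past */
        /* the true position, which _fu1tail subtracts back off */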

_fu8loop:
        CMP     R(TMP), R(TS)
        BHS     _fu1tail

        MOVW    R(FR1)>>R(RSHIFT), R(FW0)
        MOVM.IA.W (R(FROM)), [R(FR0)-R(FR1)]
        ORR     R(FR0)<<R(LSHIFT), R(FW0)

        MOVW    R(FR0)>>R(RSHIFT), R(FW1)
        ORR     R(FR1)<<R(LSHIFT), R(FW1)

        MOVM.IA.W [R(FW0)-R(FW1)], (R(TS))
        B       _fu8loop

_fu1tail:
        SUB     R(OFFSET), R(FROM)
        B       _f1tail