; memcpy for the Renesas/Hitachi H8/300 family (H8/300H, H8S, H8SX).
; NOTE(review): this is H8 assembly, not ARM, despite any tooling label;
; ';' introduces a comment for this target.
#include "setarch.h"
#include "defines.h"

#ifdef __H8300SX__

.global _memcpy
|
|
_memcpy:
|
|
stm.l er4-er6,@-er7
|
|
|
|
; Set up source and destination pointers for movmd.
|
|
mov.l er0,er6
|
|
mov.l er1,er5
|
|
|
|
; See whether the copy is long enough to use the movmd.l code.
|
|
; Although the code can handle anything longer than 6 bytes,
|
|
; it can be more expensive than movmd.b for small moves.
|
|
; It's better to use a higher threshold to account for this.
|
|
;
|
|
; Note that the exact overhead of the movmd.l checks depends on
|
|
; the alignments of the length and pointers. They are faster when
|
|
; er0 & 3 == er1 & 3 == er2 & 3, faster still when these values
|
|
; are 0. This threshold is a compromise between the various cases.
|
|
cmp #16,LEN(r2)
|
|
blo simple
|
|
|
|
; movmd.l only works for even addresses. If one of the addresses
|
|
; is odd and the other is not, fall back on a simple move.
|
|
bld #0,r5l
|
|
bxor #0,r6l
|
|
bcs simple
|
|
|
|
; Make the addresses even.
|
|
bld #0,r5l
|
|
bcc word_aligned
|
|
mov.b @er5+,@er6+
|
|
sub #1,LEN(r2)
|
|
|
|
word_aligned:
|
|
; See if copying one word would make the first operand longword
|
|
; aligned. Although this is only really worthwhile if it aligns
|
|
; the second operand as well, it's no worse if doesn't, so it
|
|
; hardly seems worth the overhead of a "band" check.
|
|
bld #1,r6l
|
|
bcc fast_copy
|
|
mov.w @er5+,@er6+
|
|
sub #2,LEN(r2)
|
|
|
|
fast_copy:
|
|
; Set (e)r4 to the number of longwords to copy.
|
|
mov LEN(r2),LEN(r4)
|
|
shlr #2,LEN(r4)
|
|
|
|
#ifdef __NORMAL_MODE__
|
|
; 16-bit pointers and size_ts: one movmd.l is enough. This code
|
|
; is never reached with r4 == 0.
|
|
movmd.l
|
|
and.w #3,r2
|
|
simple:
|
|
mov.w r2,r4
|
|
beq quit
|
|
movmd.b
|
|
quit:
|
|
rts/l er4-er6
|
|
#else
|
|
; Skip the first iteration if the number of longwords is divisible
|
|
; by 0x10000.
|
|
mov.w r4,r4
|
|
beq fast_loop_next
|
|
|
|
; This loop copies r4 (!= 0) longwords the first time round and 65536
|
|
; longwords on each iteration after that.
|
|
fast_loop:
|
|
movmd.l
|
|
fast_loop_next:
|
|
sub.w #1,e4
|
|
bhs fast_loop
|
|
|
|
; Mop up any left-over bytes. We could just fall through to the
|
|
; simple code after the "and" but the version below is quicker
|
|
; and only takes 10 more bytes.
|
|
and.w #3,r2
|
|
beq quit
|
|
mov.w r2,r4
|
|
movmd.b
|
|
quit:
|
|
rts/l er4-er6
|
|
|
|
simple:
|
|
; Simple bytewise copy. We need to handle all lengths, including zero.
|
|
mov.w r2,r4
|
|
beq simple_loop_next
|
|
simple_loop:
|
|
movmd.b
|
|
simple_loop_next:
|
|
sub.w #1,e2
|
|
bhs simple_loop
|
|
rts/l er4-er6
|
|
#endif

#else

;-----------------------------------------------------------------------
; void *memcpy(void *dst, const void *src, size_t len)
; Generic H8/300(H) build: no movmd, copy by hand.
;
; A0P/A1P/A2P/A3P (and the A?L byte views) are register-name macros
; from defines.h -- presumably mapping to r0/er0 etc. per target mode;
; TODO confirm against defines.h.  The commented-out loads below show
; the old stack-argument layout; arguments now arrive in registers:
; A0P = dst, A1P = src, A2P = len.
;
; The copy runs BACKWARDS, from the end of each buffer down to its
; start, so A0P ends up pointing at dst again on return.
;-----------------------------------------------------------------------
.global _memcpy
_memcpy:
	; MOVP @(2/4,r7),A0P ; dst
	; MOVP @(4/8,r7),A1P ; src
	; MOVP @(6/12,r7),A2P ; len

	MOVP A0P,A3P ; keep copy of final dst
	ADDP A2P,A0P ; point to end of dst
	CMPP A0P,A3P ; see if anything to do (len == 0?)
	beq quit

	ADDP A2P,A1P ; point to end of src

	; Let's see if we can do this in words.  A2L starts as the low
	; byte of len; fold in the low bytes of end-of-dst, start-of-dst
	; and end-of-src.  If bit 0 of the result is clear, every pointer
	; involved is even (and the length too), so word moves are safe.
	or A0L,A2L ; fold in the end-of-dst address
	or A3L,A2L ; fold in the start-of-dst address
	or A1L,A2L ; fold in the end-of-src address
	btst #0,A2L ; is the combined lsb zero?
	bne byteloop ; something is odd-aligned: byte copy

wordloop:
#ifdef __NORMAL_MODE__
	sub #2,A1P ; point to previous source word
#else
	subs #2,A1P ; point to previous source word
#endif
	mov.w @A1P,A2 ; get word
	mov.w A2,@-A0P ; store word (predecrement dst pointer)
	CMPP A0P,A3P ; back at the front again?
	bne wordloop
	rts ; A0P == dst here

byteloop:
#ifdef __NORMAL_MODE__
	sub #1,A1P ; point to previous source byte
#else
	subs #1,A1P ; point to previous source byte
#endif
	mov.b @A1P,A2L ; get byte
	mov.b A2L,@-A0P ; store byte (predecrement dst pointer)
	CMPP A0P,A3P ; back at the front again?
	bne byteloop

	; return with A0 pointing to dst
quit:	rts

#endif