mirror of
git://sourceware.org/git/newlib-cygwin.git
synced 2025-01-28 10:00:23 +08:00
96bff22c5d
Also handle as single quad word when destination ends at last byte of first quad word. Fix byte selection in single quad code.
141 lines
2.6 KiB
ArmAsm
141 lines
2.6 KiB
ArmAsm
!
|
|
! Fast SH memset
|
|
!
|
|
! by Toshiyasu Morita (tm@netcom.com)
|
|
!
|
|
! SH5 code by J"orn Rennecke (joern.rennecke@superh.com)
|
|
! Copyright 2002 SuperH Ltd.
|
|
!
|
|
|
|
#include "asm.h"
|
|
|
|
ENTRY(memset)
#if __SHMEDIA__
	// SHmedia version.
	//   r2 = destination, r3 = fill value, r4 = byte count.
	//   r18 is loaded into tr2, which every exit path branches through.
	// r2 is not written by this code.
	pta/l	multiquad, tr0
	andi	r2, 7, r22		// r22 = offset of dst inside its quad word
	ptabs	r18, tr2
	mshflo.b r3, r3, r3
	add	r4, r22, r23		// r23 = offset + count
	mperm.w	r3, r63, r3		// Fill pattern now in every byte of r3

	movi	8, r9
	bgtu/u	r23, r9, tr0		// multiquad: range extends past the first quad word

	// Single quad word case: the whole range lies inside one aligned
	// quad word, so merge the fill pattern with the existing bytes.
	beqi/u	r4, 0, tr2		// count == 0: return without any memory access
	ldlo.q	r2, 0, r7		// r7 = current memory contents from dst onwards
	shlli	r4, 2, r4		// r4 = count * 4
	movi	-1, r8
	// Two shifts by count*4 bits each, i.e. count*8 bits in total.
	// NOTE(review): SHHI is defined in asm.h (not visible here);
	// presumably an endian-dependent shift, split in two so that
	// count == 8 does not need a shift amount of 64 - confirm.
	SHHI	r8, r4, r8
	SHHI	r8, r4, r8
	mcmv	r7, r8, r3		// keep old bytes where mask r8 is set, fill elsewhere
	stlo.q	r2, 0, r3
	blink	tr2, r63
|
|
|
|
multiquad:
	// Entered when offset + count (r23) is greater than 8, i.e. the
	// range touches more than one quad word.  Uses r22 (offset of dst
	// in its quad word), r9 (= 8) and the replicated fill value in r3,
	// all set up before the branch to this label.
	pta/l	lastquad, tr0
	stlo.q	r2, 0, r3		// first, possibly partial, quad word
	shlri	r23, 3, r24		// r24 = (offset + count) / 8
	add	r2, r4, r5		// r5 = end address (dst + count)
	beqi/u	r24, 1, tr0		// lastquad
	pta/l	loop, tr1
	sub	r2, r22, r25		// r25 = dst rounded down to a quad word boundary
	andi	r5, -8, r20		// calculate end address and
	addi	r20, -7*8, r8		// loop end address; This might overflow, so we need
					// to use a different test before we start the loop
	bge/u	r24, r9, tr1		// loop

	// 2 <= r24 <= 7: fill inwards from both ends, one pair of quad
	// words per step, until the stores from each side meet or overlap.
	st.q	r25, 8, r3
	st.q	r20, -8, r3
	shlri	r24, 1, r24		// r24 = number of pairs needed (1..3)
	beqi/u	r24, 1, tr0		// lastquad
	st.q	r25, 16, r3
	st.q	r20, -16, r3
	beqi/u	r24, 2, tr0		// lastquad
	st.q	r25, 24, r3
	st.q	r20, -24, r3
lastquad:
	// Store the bytes of the final quad word up to and including the
	// last byte of the range (address r5 - 1), then return.
	sthi.q	r5, -1, r3
	blink	tr2,r63
|
|
|
|
loop:
	// Bulk fill; reached only through the bge/u test r24 >= r9 (8),
	// so at least eight quad words separate r25 from r20.
	// Each pass stores the four quad words at r25+8 .. r25+32 and then
	// advances r25 by 32; it repeats while r25 <= r8 (r8 = r20 - 56).
	// NOTE(review): alloco is kept as in the original; its effect on
	// bytes of the allocated cache block that lie outside the filled
	// range has not been verified here.
	alloco	r25, 32
	st.q	r25, 8, r3
	st.q	r25, 16, r3
	st.q	r25, 24, r3
	st.q	r25, 32, r3
	addi	r25, 32, r25
	bgeu/l	r8, r25, tr1		// loop

	// On exit r25 lies in r20-48 .. r20-24 and every quad word up to
	// and including the one at r25 has been written.  That leaves up
	// to five whole quad words (r20-40 .. r20-8) unfilled, so all five
	// are stored here; storing only the last three would leave a gap
	// whenever r25 is r20-48 or r20-40.  Stores that hit an already
	// filled quad word are harmless.
	st.q	r20, -40, r3
	st.q	r20, -32, r3
	st.q	r20, -24, r3
	st.q	r20, -16, r3
	st.q	r20, -8, r3
	sthi.q	r5, -1, r3		// trailing bytes up to address r5 - 1
	blink	tr2,r63
|
|
#else /* ! SHMEDIA, i.e. SH1 .. SH4 / SHcompact */
|
|
! Entry: r4: destination pointer
|
|
! r5: fill value
|
|
! r6: byte count
|
|
!
|
|
! Exit: r0-r3: trashed
|
|
!
|
|
|
|
! This assumes that the first four bytes of the address space (0..3) are
|
|
! reserved - usually by the linker script. Otherwise, we would have to check
|
|
! for the case of objects of the size 12..15 at address 0..3 .
|
|
|
|
/* Register aliases.  The code below refers to its operands only through
   these names (plus r0 as the running store pointer), so the same
   instruction sequence serves both register assignments.  */
#ifdef __SH5__
#define DST r2
#define VAL r3
#define CNT r4
#define TMP r5
#else
#define DST r4
#define VAL r5
#define CNT r6
#define TMP r2
#endif

/* Outline:
     - fewer than 12 bytes: plain byte loop;
     - otherwise: byte stores until r0 is 4-byte aligned, replicate the
       fill byte across a longword, store two longwords per iteration,
       then finish the remaining bytes with the byte loop.
   After the first SL line CNT holds the end address (DST + count) on
   both paths.
   NOTE(review): SL comes from asm.h, which is not visible here; it is
   used as "branch plus one companion instruction that executes whether
   or not the branch is taken" - confirm against asm.h.  */

	mov	#12,r0	! Check for small number of bytes
	cmp/gt	CNT,r0	! T = (12 > count), signed compare
	mov	DST,r0	! r0 = running store pointer
	SL(bt, L_store_byte_loop_check0, add DST,CNT)

	tst	#3,r0	! Align destination
	/* Zero-extend the fill value through its alias: under __SH5__ the
	   value lives in r3, and r5 is only the scratch register.  */
	SL(bt,	L_dup_bytes, extu.b VAL,VAL)
	.balignw 4,0x0009	! pad with nop (opcode 0x0009)
L_align_loop:
	mov.b	VAL,@r0
	add	#1,r0
	tst	#3,r0
	bf	L_align_loop

L_dup_bytes:
	swap.b	VAL,TMP	! Duplicate bytes across longword
	or	TMP,VAL
	swap.w	VAL,TMP
	or	TMP,VAL

	add	#-16,CNT	! CNT = end - 16, the loop bound tested below

	.balignw 4,0x0009
L_store_long_loop:
	mov.l	VAL,@r0	! Store double longs to memory
	cmp/hs	CNT,r0	! T = (r0 >= end - 16), unsigned, tested before r0 advances
	mov.l	VAL,@(4,r0)
	SL(bf,	L_store_long_loop, add #8,r0)

	add	#16,CNT	! restore CNT to the end address

L_store_byte_loop_check0:
	cmp/eq	CNT,r0	! nothing left to store?
	bt	L_exit
	.balignw 4,0x0009
L_store_byte_loop:
	mov.b	VAL,@r0	! Store bytes to memory
	add	#1,r0
	cmp/eq	CNT,r0
	bf	L_store_byte_loop

L_exit:
	rts
	mov	r4,r0	! delay slot: r0 = r4, which is the destination alias when __SH5__ is not defined
|
|
#endif /* ! SHMEDIA */
|