165 lines
3.1 KiB
ArmAsm
165 lines
3.1 KiB
ArmAsm
!
|
|
! Fast SH memset
|
|
!
|
|
! by Toshiyasu Morita (tm@netcom.com)
|
|
!
|
|
! SH5 code by J"orn Rennecke (joern.rennecke@superh.com)
|
|
! Copyright 2002 SuperH Ltd.
|
|
!
|
|
|
|
#include "asm.h"
|
|
|
|
ENTRY(memset)
#if __SHMEDIA__
	// SHmedia variant.
	//
	// Registers as used by this code:
	//   r2   destination pointer (only read, never written)
	//   r3   fill value; the long paths replicate its low byte in place
	//   r4   byte count
	//   r18  return address, copied to tr2 and used by blink to return
	// Scratch: r5, r8, r9, r20, r24, r25, tr0, tr1, tr2.
	//
	// The /u and /l branch suffixes are the unlikely / likely hints.

	pta/l	L_multi_quad, tr0
	ptabs	r18, tr2		// tr2 = return target

	andi	r2, -8, r25		// r25 = quadword holding the first byte
	add	r2, r4, r5		// r5  = dst + count (one past the last byte)
	addi	r5, -1, r20		// address of the last byte ...
	andi	r20, -8, r20		// ... r20 = quadword holding the last byte
	cmveq	r4, r25, r20		// count == 0: force r20 = r25 (take the short path)
	bne/u	r25, r20, tr0		// fill spans several quadwords -> L_multi_quad

	! This sequence could clobber volatile objects that are in the same
	! quadword as a very short char array.
	!	ldlo.q	r2, 0, r7
	!	shlli	r4, 2, r4
	!	movi	-1, r8
	!	SHHI	r8, r4, r8
	!	SHHI	r8, r4, r8
	!	mcmv	r7, r8, r3
	!	stlo.q	r2, 0, r3

	// Short path: every byte to set lies in a single quadword.
	pta/l	L_set_longs, tr0
	movi	4, r8
	bgeu/u	r4, r8, tr0		// count >= 4 -> L_set_longs
	pta/l	L_done, tr0
	beqi/u	r4, 0, tr0		// count == 0: nothing to store
	st.b	r2, 0, r3		// first byte
	beqi/u	r4, 1, tr0
	nop
	st.b	r2, 1, r3		// second byte
	beqi/l	r4, 2, tr0
	st.b	r2, 2, r3		// third byte (count == 3)
L_done:
	blink	tr2, r63		// return

	// Count of 4 or more inside one quadword: two possibly overlapping
	// unaligned longword stores, one anchored at the first byte and one
	// anchored at the last byte.
L_set_longs:
	mshflo.b r3, r3, r3
	mperm.w	r3, r63, r3		// low byte of r3 now fills every byte of r3
	stlo.l	r2, 0, r3		// from dst up to the end of its longword
	nop
	nop
	sthi.l	r5, -1, r3		// from the start of the last longword up to the last byte
	blink	tr2, r63		// return

	// Long path: the first and the last byte are in different quadwords.
L_multi_quad:
	mshflo.b r3, r3, r3
	mperm.w	r3, r63, r3		// low byte of r3 now fills every byte of r3
	pta/l	L_last_quad, tr0
	stlo.q	r2, 0, r3		// head: from dst up to the end of the first quadword
	sub	r20, r25, r24		// r24 = byte distance between first and last quadword
	movi	64, r9
	beqi/u	r24, 8, tr0		// adjacent quadwords: only the tail is left -> L_last_quad
	pta/l	L_quad_loop, tr1
	addi	r20, -56, r8		// r8 = loop end address (last quadword - 7*8).
					// This subtraction might overflow, so the
					// loop entry test below uses r24 instead.
	bgeu/u	r24, r9, tr1		// distance >= 64 -> L_quad_loop

	// Distance is 16..56: fill the whole quadwords in between with
	// stores that work inwards from both ends (they may overlap).
	st.q	r25, 8, r3
	shlri	r24, 4, r24		// r24 = distance / 16 (1, 2 or 3)
	st.q	r20, -8, r3
	beqi/u	r24, 1, tr0		// distance 16 or 24 -> L_last_quad
	st.q	r25, 16, r3
	st.q	r20, -16, r3
	beqi/u	r24, 2, tr0		// distance 32 or 40 -> L_last_quad
	st.q	r25, 24, r3
	st.q	r20, -24, r3
L_last_quad:
	sthi.q	r5, -1, r3		// tail: start of the last quadword up to the last byte
	blink	tr2, r63		// return

	// Bulk loop: four quadwords (32 bytes) per iteration, starting just
	// above the quadword at r25.
L_quad_loop:
	// NOTE(review): alloco presumably claims the cache block at r25+32
	// so that it is not fetched before being overwritten - confirm
	// against the SH-5 manual.
	alloco	r25, 32
	st.q	r25, 8, r3
	st.q	r25, 16, r3
	st.q	r25, 24, r3
	st.q	r25, 32, r3
	addi	r25, 32, r25
	bgeu/l	r8, r25, tr1		// loop end address >= r25 -> L_quad_loop

	// Loop remainder: rewrite the five quadwords below the last one
	// (overlapping what the loop already stored), then the tail.
	st.q	r20, -40, r3
	st.q	r20, -32, r3
	st.q	r20, -24, r3
	st.q	r20, -16, r3
	st.q	r20, -8, r3
	sthi.q	r5, -1, r3		// tail: start of the last quadword up to the last byte
	blink	tr2, r63		// return
#else /* ! SHMEDIA, i.e. SH1 .. SH4 / SHcompact */
! Entry: r4: destination pointer
!        r5: fill value
!        r6: byte count
!        (when __SH5__ is defined these roles are held by r2, r3 and r4;
!         see the DST / VAL / CNT / TMP definitions below)
!
! Exit:  r0-r3: trashed
!        (r0 is loaded with the destination pointer in the rts delay slot)
!
! Outline: fills shorter than 12 bytes use a plain byte loop.  Longer
! fills store single bytes until the pointer is longword aligned, then
! store 8 bytes per iteration, and finish with the byte loop.
!
! This assumes that the first four bytes of the address space (0..3) are
! reserved - usually by the linker script.  Otherwise, we would have had to
! check for the case of objects of the size 12..15 at address 0..3 .

#ifdef __SH5__
#define DST r2
#define VAL r3
#define CNT r4
#define TMP r5
#else
#define DST r4
#define VAL r5
#define CNT r6
#define TMP r2
#endif

	mov	#12,r0		! Check for small number of bytes
	cmp/gt	CNT,r0		! T = 1 when the count is below 12 (signed)
	mov	DST,r0		! r0 is the running store pointer from here on
	! Short fills go straight to the byte loop.  The instruction paired
	! with the branch turns the count into the end address (dst + count);
	! the code below relies on that on both paths.
	SL(bt, L_store_byte_loop_check0, add DST,CNT)

	tst	#3,r0		! Align destination
	! Reduce the fill value to its low byte before it is replicated.
	! The register is named through the VAL macro on purpose: in the
	! __SH5__ register assignment r5 is the scratch register, so naming
	! r5 here would leave the high bits of the fill value in place and
	! corrupt the pattern built at L_dup_bytes.
	SL(bt,	L_dup_bytes, extu.b VAL,VAL)
	.balignw 4,0x0009
L_align_loop:
	mov.b	VAL,@r0		! Byte stores until r0 is longword aligned
	add	#1,r0
	tst	#3,r0
	bf	L_align_loop

L_dup_bytes:
	swap.b	VAL,TMP		! Duplicate bytes across longword
	or	TMP,VAL		! low 16 bits now hold the byte twice
	swap.w	VAL,TMP
	or	TMP,VAL		! all four bytes now hold the fill byte

	add	#-16,CNT	! Bias the end address for the loop test below

	.balignw 4,0x0009
L_store_long_loop:
	mov.l	VAL,@r0		! Store double longs to memory
	cmp/hs	CNT,r0		! T = 1 once r0 >= end - 16 (unsigned)
	mov.l	VAL,@(4,r0)
	SL(bf, L_store_long_loop, add #8,r0)

	add	#16,CNT		! Restore the true end address

L_store_byte_loop_check0:
	cmp/eq	CNT,r0		! Already at the end address?
	bt	L_exit
	.balignw 4,0x0009
L_store_byte_loop:
	mov.b	VAL,@r0		! Store bytes to memory
	add	#1,r0
	cmp/eq	CNT,r0
	bf	L_store_byte_loop

L_exit:
	rts
	mov	DST,r0		! Delay slot: r0 = destination pointer
#endif /* ! SHMEDIA */