322 lines
7.6 KiB
ArmAsm
322 lines
7.6 KiB
ArmAsm
|
/*
|
||
|
* Copyright (c) 2013 ARM Ltd
|
||
|
* All rights reserved.
|
||
|
*
|
||
|
* Redistribution and use in source and binary forms, with or without
|
||
|
* modification, are permitted provided that the following conditions
|
||
|
* are met:
|
||
|
* 1. Redistributions of source code must retain the above copyright
|
||
|
* notice, this list of conditions and the following disclaimer.
|
||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||
|
* notice, this list of conditions and the following disclaimer in the
|
||
|
* documentation and/or other materials provided with the distribution.
|
||
|
* 3. The name of the company may not be used to endorse or promote
|
||
|
* products derived from this software without specific prior written
|
||
|
* permission.
|
||
|
*
|
||
|
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
|
||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||
|
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||
|
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||
|
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||
|
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||
|
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||
|
*/
|
||
|
|
||
|
/* This memcpy routine is optimised for Cortex-M3/M4 cores with/without
|
||
|
unaligned access.
|
||
|
|
||
|
If compiled with GCC, this file should be enclosed within following
|
||
|
pre-processing check:
|
||
|
if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
|
||
|
|
||
|
Prototype: void *memcpy (void *dst, const void *src, size_t count);
|
||
|
|
||
|
The job will be done in 5 steps.
|
||
|
Step 1: Align the src/dest pointers; take the misaligned copy path if both cannot be aligned
|
||
|
Step 2: Repeatedly copy big blocks of __OPT_BIG_BLOCK_SIZE bytes
|
||
|
Step 3: Repeatedly copy mid-size blocks of __OPT_MID_BLOCK_SIZE bytes
|
||
|
Step 4: Copy word by word
|
||
|
Step 5: Copy byte-to-byte
|
||
|
|
||
|
Tunable options:
|
||
|
__OPT_BIG_BLOCK_SIZE: Size of big block in bytes (16, 32 or 64). Default to 64.
|
||
|
__OPT_MID_BLOCK_SIZE: Size of mid block in bytes (8 or 16). Default to 16.
|
||
|
*/
|
||
|
/* Tunable block sizes, expressed in bytes.  Either one may be predefined
   by the build; the values below are only the fallbacks.  */
#if !defined (__OPT_BIG_BLOCK_SIZE)
#define __OPT_BIG_BLOCK_SIZE (4 * 16)
#endif

#if !defined (__OPT_MID_BLOCK_SIZE)
#define __OPT_MID_BLOCK_SIZE (4 * 4)
#endif

/* BEGIN_UNROLL_BIG_BLOCK opens an assembler .irp repeat block whose
   `offset' symbol takes each word offset inside one big block.  Only
   64, 32 and 16 byte blocks are accepted.  */
#if __OPT_BIG_BLOCK_SIZE == 64
#define BEGIN_UNROLL_BIG_BLOCK \
	.irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60
#elif __OPT_BIG_BLOCK_SIZE == 32
#define BEGIN_UNROLL_BIG_BLOCK \
	.irp offset, 0,4,8,12,16,20,24,28
#elif __OPT_BIG_BLOCK_SIZE == 16
#define BEGIN_UNROLL_BIG_BLOCK \
	.irp offset, 0,4,8,12
#else
#error "Illegal __OPT_BIG_BLOCK_SIZE"
#endif

/* Same idea for the mid-size block: 16 or 8 bytes only.  */
#if __OPT_MID_BLOCK_SIZE == 16
#define BEGIN_UNROLL_MID_BLOCK \
	.irp offset, 0,4,8,12
#elif __OPT_MID_BLOCK_SIZE == 8
#define BEGIN_UNROLL_MID_BLOCK \
	.irp offset, 0,4
#else
#error "Illegal __OPT_MID_BLOCK_SIZE"
#endif

/* Closes the repeat block opened by either BEGIN_UNROLL_* macro.  */
#define END_UNROLL .endr
|
||
|
|
||
|
/* Assembler set-up: unified syntax, Thumb code, word-aligned entry.  */
	.syntax unified
	.text
	.p2align 2
	.global	memcpy
	.thumb
	.thumb_func
	.type	memcpy, %function

/* void *memcpy (void *dst, const void *src, size_t count)

   In:   r0 = dst, r1 = src, r2 = count in bytes.
   Out:  r0 = the dst value that was passed in.
   r3 carries the data being moved; r2 is the running byte counter.  */
memcpy:
	/* Remember the original dst for the return value.  The
	   unaligned-capable build keeps it in ip (that build has no other
	   use for ip); the other build parks it on the stack.  */
#ifndef __ARM_FEATURE_UNALIGNED
	push	{r0}
#else
	mov	ip, r0
#endif

	/* If either pointer has a non-zero low two bits, leave the fast
	   path.  */
	orr	r3, r1, r0
	ands	r3, r3, #3
	bne	.Lmisaligned_copy

	/* Big blocks.  The counter is pre-decremented by one block, so a
	   borrow (LO) means fewer than one big block is left.  */
.Lbig_block:
	subs	r2, r2, #__OPT_BIG_BLOCK_SIZE
	blo	.Lmid_block

	.p2align 2
.Lbig_block_loop:
	BEGIN_UNROLL_BIG_BLOCK
#ifdef __ARM_ARCH_7EM__
	ldr	r3, [r1], #4
	str	r3, [r0], #4
#else /* __ARM_ARCH_7M__ */
	ldr	r3, [r1, \offset]
	str	r3, [r0, \offset]
#endif
	END_UNROLL
#ifndef __ARM_ARCH_7EM__
	/* Offset addressing was used above, so advance both pointers by
	   one whole block here.  */
	adds	r0, r0, #__OPT_BIG_BLOCK_SIZE
	adds	r1, r1, #__OPT_BIG_BLOCK_SIZE
#endif
	subs	r2, r2, #__OPT_BIG_BLOCK_SIZE
	bhs	.Lbig_block_loop

	/* Mid blocks.  Re-bias the counter from "minus one big block" to
	   "minus one mid block"; no carry (LO) means less than a mid block
	   remains.  */
.Lmid_block:
	adds	r2, r2, #(__OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE)
	blo	.Lcopy_word_by_word

	.p2align 2
.Lmid_block_loop:
	BEGIN_UNROLL_MID_BLOCK
#ifdef __ARM_ARCH_7EM__
	ldr	r3, [r1], #4
	str	r3, [r0], #4
#else /* __ARM_ARCH_7M__ */
	ldr	r3, [r1, \offset]
	str	r3, [r0, \offset]
#endif
	END_UNROLL
#ifndef __ARM_ARCH_7EM__
	adds	r0, r0, #__OPT_MID_BLOCK_SIZE
	adds	r1, r1, #__OPT_MID_BLOCK_SIZE
#endif
	subs	r2, r2, #__OPT_MID_BLOCK_SIZE
	bhs	.Lmid_block_loop

	/* Single words.  Counter is re-biased to "minus four bytes".  */
.Lcopy_word_by_word:
	adds	r2, r2, #(__OPT_MID_BLOCK_SIZE - 4)
	blo	.Lcopy_less_than_4

	.p2align 2
.Lcopy_word_by_word_loop:
	ldr	r3, [r1], #4
	str	r3, [r0], #4
	subs	r2, r2, #4
	bhs	.Lcopy_word_by_word_loop

	/* Tail of 0..3 bytes.  Undo the bias; zero means nothing is left.  */
.Lcopy_less_than_4:
	adds	r2, r2, #4
	beq	.Ldone

	/* Shift the count left by 31: bit 0 ends up deciding Z (NE = one odd
	   byte to copy) and bit 1 lands in the carry (CS = two more bytes).
	   The byte load/store below leave the flags untouched.  */
	lsls	r2, r2, #31
	itt	ne
	ldrbne	r3, [r1], #1
	strbne	r3, [r0], #1

	bcc	.Ldone
#ifdef __ARM_FEATURE_UNALIGNED
	ldrh	r3, [r1]
	strh	r3, [r0]
#else
	ldrb	r3, [r1]
	strb	r3, [r0]
	ldrb	r3, [r1, #1]
	strb	r3, [r0, #1]
#endif /* __ARM_FEATURE_UNALIGNED */

	/* Common exit: hand back the dst saved on entry.  */
.Ldone:
#ifndef __ARM_FEATURE_UNALIGNED
	pop	{r0}
#else
	mov	r0, ip
#endif
	bx	lr
|
||
|
|
||
|
/* Slow entry: at least one of dst (r0) / src (r1) is not word aligned.
   r2 = byte count.  Exits to .Lbig_block once aligned word copies are
   possible, or continues into .Lbyte_copy for the remaining bytes.  */
	.align 2
.Lmisaligned_copy:
#ifdef __ARM_FEATURE_UNALIGNED
	/* Define label DST_ALIGNED to BIG_BLOCK.  It will go to aligned copy
	   once destination is adjusted to aligned.  */
#define Ldst_aligned Lbig_block

	/* Copy word by word using LDR/STR when the hardware handles
	   unaligned accesses itself, i.e. when alignment trapping is
	   disabled (on ARMv7-M: CCR.UNALIGN_TRP is clear).  Counts below 8
	   go straight to the byte copy.  */
	cmp	r2, #8
	blo	.Lbyte_copy

	/* If src is already word aligned, go to the big block loop now.  */
	lsls	r3, r1, #30
	beq	.Ldst_aligned
#else
	/* If len < 12, the misalignment adjustment costs more than a plain
	   byte-by-byte copy.  Also, len must be >= 8 for the code below to
	   work correctly.  */
	cmp	r2, #12
	blo	.Lbyte_copy
#endif /* __ARM_FEATURE_UNALIGNED */

	/* Align dst only, without trying to align src, because handling an
	   aligned src with a misaligned dst costs more than the reverse.
	   The worst case is an initially aligned src, where up to a few
	   extra single-byte copies are executed, which is acceptable.  */
	ands	r3, r0, #3
	beq	.Ldst_aligned

	/* r3 = number of bytes (1..3) needed to align dst; account for them
	   in the counter up front.  */
	rsb	r3, r3, #4
	subs	r2, r2, r3

	/* Bit 0 of that number decides Z (NE = copy one byte), bit 1 goes
	   to the carry (CS = copy two more).  The byte load/store do not
	   alter the flags.  */
	lsls	r3, r3, #31
	itt	ne
	ldrbne	r3, [r1], #1
	strbne	r3, [r0], #1

	bcc	.Ldst_aligned

#ifdef __ARM_FEATURE_UNALIGNED
	ldrh	r3, [r1], #2
	strh	r3, [r0], #2
	b	.Ldst_aligned
#else
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	/* Now that dst is aligned */
.Ldst_aligned:
	/* If r1 is aligned now, r0/r1 had the same misalignment and both
	   are aligned.  Go to the aligned copy.  */
	ands	r3, r1, #3
	beq	.Lbig_block

	/* dst is aligned, but src isn't: misaligned copy.  r3 = src & 3.  */

	push	{r4, r5}
	subs	r2, r2, #4

	/* Move r1 back by the misaligned bytes so that it is aligned.
	   r1 has to be restored to the true (unaligned) position after the
	   loop, so keep the correction (4 - misalignment) in ip and
	   subtract it from r1 afterwards.  */
	subs	r1, r1, r3
	rsb	ip, r3, #4

	/* Pre-load one word.  */
	ldr	r4, [r1], #4

	cmp	r3, #2
	beq	.Lmisaligned_copy_2_2
	cmp	r3, #3
	beq	.Lmisaligned_copy_3_1

	/* mis_src_copy shift
	   Copy words to the aligned dst from aligned loads of src, merging
	   each pair of neighbouring words.  `shift' is 8 * (src & 3).
	   In:  r4 = previously loaded word, r1 = next aligned src word,
	        r0 = dst, r2 = bytes left minus 4.
	   Uses r3 and r5 as scratch; loops while r2 stays non-negative.

	   The shift directions depend on byte order: the bytes still wanted
	   from the old word sit in its high-order part on little-endian and
	   in its low-order part on big-endian, so they are moved down (LSR)
	   or up (LSL) respectively, and the new word is shifted the other
	   way to fill the gap.  */
	.macro mis_src_copy shift
1:
#if defined (__ARM_BIG_ENDIAN) || defined (__ARMEB__)
	lsls	r4, r4, \shift
#else
	lsrs	r4, r4, \shift
#endif
	ldr	r3, [r1], #4
#if defined (__ARM_BIG_ENDIAN) || defined (__ARMEB__)
	lsrs	r5, r3, 32-\shift
#else
	lsls	r5, r3, 32-\shift
#endif
	orr	r4, r4, r5
	str	r4, [r0], #4
	mov	r4, r3
	subs	r2, #4
	bhs	1b
	.endm

.Lmisaligned_copy_1_3:
	mis_src_copy shift=8
	b	.Lsrc_misaligned_tail

.Lmisaligned_copy_3_1:
	mis_src_copy shift=24
	b	.Lsrc_misaligned_tail

.Lmisaligned_copy_2_2:
	/* For 2_2 misalignment, ldr is still faster than 2 x ldrh.  */
	mis_src_copy shift=16

.Lsrc_misaligned_tail:
	/* Undo the -4 bias on the counter, point r1 back at the first
	   uncopied src byte and restore the saved registers.  */
	adds	r2, r2, #4
	subs	r1, r1, ip
	pop	{r4, r5}

#endif /* __ARM_FEATURE_UNALIGNED */
|
||
|
|
||
|
/* Byte-by-byte copy of r2 bytes from r1 to r0.  Counts below four are
   handed to .Lcopy_less_than_4 with the counter biased by -4.  */
.Lbyte_copy:
	subs	r2, r2, #4
	blo	.Lcopy_less_than_4

	/* r2 now holds count - 4.  The loop moves one byte per pass and
	   runs until the decrement borrows; the byte load/store do not
	   touch the flags, so BHS still sees the result of SUBS.  */
.Lbyte_copy_loop:
	subs	r2, r2, #1
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	bhs	.Lbyte_copy_loop

	/* Exactly three bytes are left at this point.  */
	ldrb	r3, [r1]
	strb	r3, [r0]
	ldrb	r3, [r1, #1]
	strb	r3, [r0, #1]
	ldrb	r3, [r1, #2]
	strb	r3, [r0, #2]

	/* Return the dst value saved on entry.  */
#ifndef __ARM_FEATURE_UNALIGNED
	pop	{r0}
#else
	mov	r0, ip
#endif
	bx	lr

	.size	memcpy, .-memcpy
|