280 lines
7.7 KiB
ArmAsm
280 lines
7.7 KiB
ArmAsm
|
/*
|
||
|
* Copyright 2022 NXP
|
||
|
* All rights reserved.
|
||
|
*
|
||
|
* SPDX-License-Identifier: BSD-3-Clause
|
||
|
*/
|
||
|
|
||
|
/*
 * Assembler state shared by everything in this file: unified ARM/Thumb
 * syntax, Thumb instruction encoding, code placed in the .text section.
 */
    .syntax unified
    .thumb
    .text
    .p2align 2          /* 2^2 = 4-byte boundary (same as `.align 2` on ARM). */
|
||
|
|
||
|
/*
 * Build-time switch for the memcpy implementation in this file. It defaults
 * to 1 unless the build defines it first; the implementation is assembled
 * only when the value is non-zero.
 */
#if !defined(MSDK_MISC_OVERRIDE_MEMCPY)
#define MSDK_MISC_OVERRIDE_MEMCPY 1
#endif
|
||
|
|
||
|
/*
|
||
|
This memcpy function is used to replace the GCC newlib function for these purposes:
|
||
|
1. The newlib-nano memcpy function copies byte by byte, which is slow.
|
||
|
2. The newlib memcpy function for CM4, CM7, CM33 doesn't check address alignment,
|
||
|
so it may run to fault when the address is unaligned, and the memory region
|
||
|
is device memory, which does not support unaligned access.
|
||
|
|
||
|
This function is manually optimized based on the assembly output of the C function.
|
||
|
The workflow is:
|
||
|
1. Return directly if length is 0.
|
||
|
2. If the source address is not 4-byte aligned, copy the unaligned part first byte by byte.
|
||
|
3. If the destination address is 4-byte aligned, then copy the 16-byte aligned part first,
|
||
|
copy 16-byte each loop, and then copy 8-byte, 4-byte, 2-byte and 1-byte.
|
||
|
4. If the destination address is not 4-byte aligned, load source data into register word
|
||
|
by word first, then store to memory based on the alignment requirement. For the remaining part,
|
||
|
copy them byte by byte.
|
||
|
|
||
|
The source code of the c function is:
|
||
|
|
||
|
#define __CPY_WORD(dst, src) \
|
||
|
*(uint32_t *)(dst) = *(uint32_t *)(src); \
|
||
|
(dst) = ((uint32_t *)dst) + 1; \
|
||
|
(src) = ((uint32_t *)src) + 1
|
||
|
|
||
|
#define __CPY_HWORD(dst, src) \
|
||
|
*(uint16_t *)(dst) = *(uint16_t *)(src); \
|
||
|
(dst) = ((uint16_t *)dst) + 1; \
|
||
|
(src) = ((uint16_t *)src) + 1
|
||
|
|
||
|
#define __CPY_BYTE(dst, src) \
|
||
|
*(uint8_t *)(dst) = *(uint8_t *)(src); \
|
||
|
(dst) = ((uint8_t *)dst) + 1; \
|
||
|
(src) = ((uint8_t *)src) + 1
|
||
|
|
||
|
void * memcpy(void *restrict dst, const void * restrict src, size_t n)
|
||
|
{
|
||
|
void *ret = dst;
|
||
|
uint32_t tmp;
|
||
|
|
||
|
if (0 == n) return ret;
|
||
|
|
||
|
while (((uintptr_t)src & 0x03UL) != 0UL)
|
||
|
{
|
||
|
__CPY_BYTE(dst, src);
|
||
|
n--;
|
||
|
|
||
|
if (0 == n) return ret;
|
||
|
}
|
||
|
|
||
|
if (((uintptr_t)dst & 0x03UL) == 0UL)
|
||
|
{
|
||
|
while (n >= 16UL)
|
||
|
{
|
||
|
__CPY_WORD(dst, src);
|
||
|
__CPY_WORD(dst, src);
|
||
|
__CPY_WORD(dst, src);
|
||
|
__CPY_WORD(dst, src);
|
||
|
n-= 16UL;
|
||
|
}
|
||
|
|
||
|
if ((n & 0x08UL) != 0UL)
|
||
|
{
|
||
|
__CPY_WORD(dst, src);
|
||
|
__CPY_WORD(dst, src);
|
||
|
}
|
||
|
|
||
|
if ((n & 0x04UL) != 0UL)
|
||
|
{
|
||
|
__CPY_WORD(dst, src);
|
||
|
}
|
||
|
|
||
|
if ((n & 0x02UL) != 0UL)
|
||
|
{
|
||
|
__CPY_HWORD(dst, src);
|
||
|
}
|
||
|
|
||
|
if ((n & 0x01UL) != 0UL)
|
||
|
{
|
||
|
__CPY_BYTE(dst, src);
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
if (((uintptr_t)dst & 1UL) == 0UL)
|
||
|
{
|
||
|
while (n >= 4)
|
||
|
{
|
||
|
tmp = *(uint32_t *)src;
|
||
|
src = ((uint32_t *)src) + 1;
|
||
|
|
||
|
*(volatile uint16_t *)dst = (uint16_t)tmp;
|
||
|
dst = ((uint16_t *)dst) + 1;
|
||
|
*(volatile uint16_t *)dst = (uint16_t)(tmp>>16U);
|
||
|
dst = ((uint16_t *)dst) + 1;
|
||
|
|
||
|
n-=4;
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
while (n >= 4)
|
||
|
{
|
||
|
tmp = *(uint32_t *)src;
|
||
|
src = ((uint32_t *)src) + 1;
|
||
|
|
||
|
*(volatile uint8_t *)dst = (uint8_t)tmp;
|
||
|
dst = ((uint8_t *)dst) + 1;
|
||
|
*(volatile uint16_t *)dst = (uint16_t)(tmp>>8U);
|
||
|
dst = ((uint16_t *)dst) + 1;
|
||
|
*(volatile uint8_t *)dst = (uint8_t)(tmp>>24U);
|
||
|
dst = ((uint8_t *)dst) + 1;
|
||
|
n-=4;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
while (n > 0)
|
||
|
{
|
||
|
__CPY_BYTE(dst, src);
|
||
|
n--;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
The test function is:
|
||
|
|
||
|
void test_memcpy(uint8_t *dst, const uint8_t * src, size_t n)
|
||
|
{
|
||
|
uint8_t * ds;
|
||
|
uint8_t * de;
|
||
|
const uint8_t *ss;
|
||
|
const uint8_t *se;
|
||
|
uint8_t * ret;
|
||
|
|
||
|
for (ss = src; ss < src+n; ss++)
|
||
|
{
|
||
|
for (se = ss; se < src + n; se ++)
|
||
|
{
|
||
|
size_t nn = (uintptr_t)se - (uintptr_t)ss;
|
||
|
|
||
|
for (ds = dst; ds + nn < dst+n; ds++)
|
||
|
{
|
||
|
de = ds + nn;
|
||
|
|
||
|
memset(dst, 0, n);
|
||
|
|
||
|
ret = memcpy(ds, ss, nn);
|
||
|
|
||
|
assert(ret == ds);
|
||
|
|
||
|
for (const uint8_t *data = dst; data < ds; data++)
|
||
|
{
|
||
|
assert(0 == *data);
|
||
|
}
|
||
|
|
||
|
for (const uint8_t *data = de; data < dst+n; data++)
|
||
|
{
|
||
|
assert(0 == *data);
|
||
|
}
|
||
|
|
||
|
assert(memcmp(ds, ss, nn) == 0);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
test_memcpy((uint8_t *)0x20240000, (const uint8_t *)0x202C0000, 48);
|
||
|
|
||
|
*/
|
||
|
|
||
|
#if MSDK_MISC_OVERRIDE_MEMCPY

/*
 * void *memcpy(void *restrict dst, const void *restrict src, size_t n)
 *
 * On entry r0 = dst, r1 = src, r2 = n. r0 and r1 are advanced as data is
 * copied and r2 holds the number of bytes still to copy. The incoming dst is
 * pushed together with r4-r7 and lr, and popped back into r0 at `ret`, so the
 * function returns the original destination pointer. r3 is only a scratch
 * target for the flag-setting alignment and size-bit tests; r4-r7 carry data.
 *
 * Flow:
 *   1. n == 0: return immediately.
 *   2. Copy single bytes until src is 4-byte aligned (or n reaches 0).
 *   3. dst 4-byte aligned: copy 16 bytes per iteration with ldm/stm, then
 *      the remaining 8/4/2/1 bytes selected by bits 3..0 of n.
 *   4. dst not 4-byte aligned: load one word at a time from src and store it
 *      as halfword+halfword (dst 2-byte aligned) or byte+halfword+byte
 *      (dst odd), then copy the last 0-3 bytes one by one.
 *
 * Every load and store issued here is naturally aligned for its access size.
 */
    .thumb_func
    .align 2
    .global memcpy
    .type memcpy, %function

memcpy:
    push    {r0, r4, r5, r6, r7, lr}    /* Saved r0 is the return value. */
    cmp     r2, #0
    beq     ret                         /* n == 0: nothing to copy. */

src_word_unaligned:
    ands    r3, r1, #3                  /* Is src 4-byte aligned yet? */
    beq.n   src_word_aligned
    ldrb    r4, [r1], #1                /* No: move one byte. */
    subs    r2, r2, #1                  /* n--; Z is consumed by beq below. */
    strb    r4, [r0], #1                /* strb does not change the flags. */
    beq.n   ret                         /* n == 0: done. */
    b.n     src_word_unaligned

src_word_aligned:
    ands    r3, r0, #3                  /* Is dst 4-byte aligned as well? */
    bne.n   dst_word_unaligned

dst_word_aligned:
    cmp     r2, #16
    /*
     * Unsigned compare: n is a size_t. A signed `blt` would treat
     * n >= 0x80000000 as negative, skip the 16-byte loop and copy only
     * n & 15 bytes.
     */
    bcc.n   size_ge_8
size_ge_16:                             /* n >= 16: move 16 bytes per pass. */
    subs    r2, r2, #16                 /* n -= 16 */
    ldmia   r1!, { r4, r5, r6, r7 }
    cmp     r2, #16                     /* Flags survive the stmia below. */
    stmia   r0!, { r4, r5, r6, r7 }
    bcs.n   size_ge_16                  /* Still >= 16 (unsigned): loop. */
size_ge_8:                              /* n < 16 here; bit 3 set => 8 bytes. */
    lsls    r3, r2, #28                 /* Bit 3 of n -> N flag. */
    itt     mi
    ldmiami r1!, { r4, r5 }
    stmiami r0!, { r4, r5 }
size_ge_4:                              /* Bit 2 set => one more word. */
    lsls    r3, r2, #29                 /* Bit 2 of n -> N flag. */
    itt     mi
    ldrmi   r4, [r1], #4
    strmi   r4, [r0], #4
size_ge_2:                              /* Bit 1 set => one more halfword. */
    lsls    r3, r2, #30                 /* Bit 1 of n -> N flag. */
    itt     mi
    ldrhmi  r4, [r1], #2
    strhmi  r4, [r0], #2
size_ge_1:                              /* Bit 0 set => one last byte. */
    lsls    r3, r2, #31                 /* Bit 0 of n -> N flag. */
    itt     mi
    ldrbmi  r4, [r1]
    strbmi  r4, [r0]
    b.n     ret

dst_word_unaligned:
    lsls    r3, r0, #31                 /* Bit 0 of dst -> N flag. */
    bmi.n   dst_half_word_unaligned     /* Odd dst address. */
dst_half_word_aligned:                  /* dst is 2-byte but not 4-byte aligned. */
    cmp     r2, #4
    bcc.n   size_lt_4
    ldr     r4, [r1], #4                /* Aligned word load from src. */
    subs    r2, r2, #4                  /* n -= 4 */
    strh    r4, [r0], #2                /* Low halfword. */
    lsrs    r5, r4, #16
    strh    r5, [r0], #2                /* High halfword. */
    b       dst_half_word_aligned
dst_half_word_unaligned:                /* dst is odd. */
    cmp     r2, #4
    bcc.n   size_lt_4
    ldr     r4, [r1], #4                /* Aligned word load from src. */
    subs    r2, r2, #4                  /* n -= 4 */
    strb    r4, [r0], #1                /* Byte 0; dst is now 2-byte aligned. */
    lsrs    r5, r4, #8
    strh    r5, [r0], #2                /* Bytes 1-2 as one halfword. */
    lsrs    r6, r4, #24
    strb    r6, [r0], #1                /* Byte 3; dst is odd again. */
    b       dst_half_word_unaligned
size_lt_4:                              /* Fewer than 4 bytes left: byte loop. */
    cmp     r2, #0
    ittt    ne
    ldrbne  r4, [r1], #1
    strbne  r4, [r0], #1
    subne   r2, r2, #1                  /* No flag update: bne still sees cmp. */
    bne     size_lt_4
ret:
    pop     {r0, r4, r5, r6, r7, pc}    /* r0 = original dst; return. */

    .size memcpy, . - memcpy

#endif /* MSDK_MISC_OVERRIDE_MEMCPY */
|