280 lines
7.7 KiB
ArmAsm

/*
* Copyright 2022 NXP
* All rights reserved.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
/* Use the unified ARM/Thumb assembler syntax for everything below. */
.syntax unified
.text
/* Emit Thumb encodings. */
.thumb
.align 2
/*
 * The memcpy override below is assembled only when MSDK_MISC_OVERRIDE_MEMCPY
 * is non-zero. It defaults to 1 here unless it has already been defined.
 */
#ifndef MSDK_MISC_OVERRIDE_MEMCPY
#define MSDK_MISC_OVERRIDE_MEMCPY 1
#endif
/*
This memcpy function is used to replace the GCC newlib function for these purposes:
1. The newlib-nano memcpy function copies byte by byte, which is slow.
2. The newlib memcpy function for CM4, CM7, CM33 doesn't check address alignment,
so it may fault when the address is unaligned and the memory region
is device memory, which does not support unaligned access.
This function is manually optimized based on the assembly output of the C function.
The workflow is:
1. Return directly if length is 0.
2. If the source address is not 4-byte aligned, copy the unaligned part first byte by byte.
3. If the destination address is 4-byte aligned, then copy the 16-byte aligned part first,
copy 16-byte each loop, and then copy 8-byte, 4-byte, 2-byte and 1-byte.
4. If the destination address is not 4-byte aligned, load source data into a register word
by word first, then store to memory based on the alignment requirement. For the remaining part,
copy byte by byte.
The source code of the c function is:
#define __CPY_WORD(dst, src) \
*(uint32_t *)(dst) = *(uint32_t *)(src); \
(dst) = ((uint32_t *)dst) + 1; \
(src) = ((uint32_t *)src) + 1
#define __CPY_HWORD(dst, src) \
*(uint16_t *)(dst) = *(uint16_t *)(src); \
(dst) = ((uint16_t *)dst) + 1; \
(src) = ((uint16_t *)src) + 1
#define __CPY_BYTE(dst, src) \
*(uint8_t *)(dst) = *(uint8_t *)(src); \
(dst) = ((uint8_t *)dst) + 1; \
(src) = ((uint8_t *)src) + 1
void * memcpy(void *restrict dst, const void * restrict src, size_t n)
{
void *ret = dst;
uint32_t tmp;
if (0 == n) return ret;
while (((uintptr_t)src & 0x03UL) != 0UL)
{
__CPY_BYTE(dst, src);
n--;
if (0 == n) return ret;
}
if (((uintptr_t)dst & 0x03UL) == 0UL)
{
while (n >= 16UL)
{
__CPY_WORD(dst, src);
__CPY_WORD(dst, src);
__CPY_WORD(dst, src);
__CPY_WORD(dst, src);
n-= 16UL;
}
if ((n & 0x08UL) != 0UL)
{
__CPY_WORD(dst, src);
__CPY_WORD(dst, src);
}
if ((n & 0x04UL) != 0UL)
{
__CPY_WORD(dst, src);
}
if ((n & 0x02UL) != 0UL)
{
__CPY_HWORD(dst, src);
}
if ((n & 0x01UL) != 0UL)
{
__CPY_BYTE(dst, src);
}
}
else
{
if (((uintptr_t)dst & 1UL) == 0UL)
{
while (n >= 4)
{
tmp = *(uint32_t *)src;
src = ((uint32_t *)src) + 1;
*(volatile uint16_t *)dst = (uint16_t)tmp;
dst = ((uint16_t *)dst) + 1;
*(volatile uint16_t *)dst = (uint16_t)(tmp>>16U);
dst = ((uint16_t *)dst) + 1;
n-=4;
}
}
else
{
while (n >= 4)
{
tmp = *(uint32_t *)src;
src = ((uint32_t *)src) + 1;
*(volatile uint8_t *)dst = (uint8_t)tmp;
dst = ((uint8_t *)dst) + 1;
*(volatile uint16_t *)dst = (uint16_t)(tmp>>8U);
dst = ((uint16_t *)dst) + 1;
*(volatile uint8_t *)dst = (uint8_t)(tmp>>24U);
dst = ((uint8_t *)dst) + 1;
n-=4;
}
}
while (n > 0)
{
__CPY_BYTE(dst, src);
n--;
}
}
return ret;
}
The test function is:
void test_memcpy(uint8_t *dst, const uint8_t * src, size_t n)
{
uint8_t * ds;
uint8_t * de;
const uint8_t *ss;
const uint8_t *se;
uint8_t * ret;
for (ss = src; ss < src+n; ss++)
{
for (se = ss; se < src + n; se ++)
{
size_t nn = (uintptr_t)se - (uintptr_t)ss;
for (ds = dst; ds + nn < dst+n; ds++)
{
de = ds + nn;
memset(dst, 0, n);
ret = memcpy(ds, ss, nn);
assert(ret == ds);
for (const uint8_t *data = dst; data < ds; data++)
{
assert(0 == *data);
}
for (const uint8_t *data = de; data < dst+n; data++)
{
assert(0 == *data);
}
assert(memcmp(ds, ss, nn) == 0);
}
}
}
}
test_memcpy((uint8_t *)0x20240000, (const uint8_t *)0x202C0000, 48);
*/
#if MSDK_MISC_OVERRIDE_MEMCPY
/*
 * void *memcpy(void *restrict dst, const void *restrict src, size_t n)
 *
 * memcpy that only issues naturally aligned loads and stores: src is first
 * brought to a 4-byte boundary byte by byte, after which every load is a
 * word (or an aligned halfword/byte tail) and every store is sized to the
 * alignment of dst.
 *
 * In:    r0 = dst, r1 = src, r2 = n (unsigned byte count)
 * Out:   r0 = original dst
 * Uses:  r3 (scratch, only written to produce flags), r4-r7 (data registers,
 *        saved and restored here), condition flags
 * Stack: 24 bytes (r0, r4-r7, lr)
 *
 * n is a size_t, so every comparison on r2 uses the unsigned conditions
 * (cc/cs), never the signed ones (lt/ge).
 */
.thumb_func
.align 2
.global memcpy
.type memcpy, %function
memcpy:
    push    {r0, r4, r5, r6, r7, lr}    /* r0 is saved so the final pop restores the return value. */
    cmp     r2, #0
    beq     ret                         /* If copy size is 0, return. */

src_word_unaligned:
    ands    r3, r1, #3                  /* Is src 4-byte aligned yet? */
    beq.n   src_word_aligned            /* src is 4-byte aligned, jump. */
    ldrb    r4, [r1], #1
    subs    r2, r2, #1                  /* n-- (flags are consumed by the beq below; strb leaves them alone). */
    strb    r4, [r0], #1
    beq.n   ret                         /* n == 0, return. */
    b.n     src_word_unaligned

src_word_aligned:
    ands    r3, r0, #3                  /* Check dst 4-byte alignment. */
    bne.n   dst_word_unaligned

dst_word_aligned:
    cmp     r2, #16
    bcc.n   size_ge_8                   /* Unsigned n < 16: skip the 16-byte loop. */

size_ge_16:                             /* n >= 16 here: copy 16 bytes per pass with ldm/stm. */
    subs    r2, r2, #16                 /* n -= 16 */
    ldmia   r1!, { r4, r5, r6, r7 }
    cmp     r2, #16
    stmia   r0!, { r4, r5, r6, r7 }     /* stm does not touch the flags set by the cmp above. */
    bcs.n   size_ge_16

    /*
     * n < 16 from here on. Each lsls moves one bit of n into the N flag,
     * so the following IT block copies 8, 4, 2 and 1 byte(s) only if the
     * matching bit of n is set.
     */
size_ge_8:                              /* Bit 3 of n: copy 8 bytes. */
    lsls    r3, r2, #28
    itt     mi
    ldmiami r1!, { r4, r5 }
    stmiami r0!, { r4, r5 }

size_ge_4:                              /* Bit 2 of n: copy 4 bytes. */
    lsls    r3, r2, #29
    itt     mi
    ldrmi   r4, [r1], #4
    strmi   r4, [r0], #4

size_ge_2:                              /* Bit 1 of n: copy 2 bytes. */
    lsls    r3, r2, #30
    itt     mi
    ldrhmi  r4, [r1], #2
    strhmi  r4, [r0], #2

size_ge_1:                              /* Bit 0 of n: copy the last byte. */
    lsls    r3, r2, #31
    itt     mi
    ldrbmi  r4, [r1]
    strbmi  r4, [r0]
    b.n     ret

dst_word_unaligned:
    lsls    r3, r0, #31                 /* Bit 0 of dst -> N flag: set means dst is odd. */
    bmi.n   dst_half_word_unaligned

dst_half_word_aligned:                  /* dst is 2-byte aligned: one word load, two halfword stores. */
    cmp     r2, #4
    bcc.n   size_lt_4
    ldr     r4, [r1], #4
    subs    r2, r2, #4
    strh    r4, [r0], #2                /* Low halfword. */
    lsrs    r5, r4, #16
    strh    r5, [r0], #2                /* High halfword. */
    b       dst_half_word_aligned

dst_half_word_unaligned:                /* dst is odd: store byte, halfword, byte so each store is aligned. */
    cmp     r2, #4
    bcc.n   size_lt_4
    ldr     r4, [r1], #4
    subs    r2, r2, #4
    strb    r4, [r0], #1                /* Byte 0; dst becomes 2-byte aligned. */
    lsrs    r5, r4, #8
    strh    r5, [r0], #2                /* Bytes 1-2. */
    lsrs    r6, r4, #24
    strb    r6, [r0], #1                /* Byte 3. */
    b       dst_half_word_unaligned

size_lt_4:                              /* Fewer than 4 bytes left: copy them one at a time. */
    cmp     r2, #0
    ittt    ne
    ldrbne  r4, [r1], #1
    strbne  r4, [r0], #1
    subne   r2, r2, #1                  /* Non-flag-setting, so the bne below still sees the cmp result. */
    bne     size_lt_4

ret:
    pop     {r0, r4, r5, r6, r7, pc}    /* Restore the original dst into r0 and return. */
.size memcpy, . - memcpy
#endif /* MSDK_MISC_OVERRIDE_MEMCPY */