newlib-cygwin/newlib/libc/machine/xtensa/memset.S

194 lines
4.2 KiB
ArmAsm
Raw Normal View History

2023-08-17 06:05:53 +08:00
/* ANSI C standard library function memset.
Copyright (c) 2001-2008 Tensilica Inc.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
#include "xtensa-asm.h"
/* void *memset (void *dst, int c, size_t length)
The algorithm is as follows:
Create a word with c in all byte positions.
If the destination is aligned, set 16B chunks with a loop, and then
finish up with 8B, 4B, 2B, and 1B stores conditional on the length.
If the destination is unaligned, align it by conditionally
setting 1B and/or 2B and then go to aligned case.
This code tries to use fall-through branches for the common
case of an aligned destination (except for the branches to
the alignment labels). */
/* Byte-by-byte set. */
.text
.begin schedule
.align XCHAL_INST_FETCH_WIDTH
.literal_position
__memset_aux:
/* Skip bytes to get proper alignment for three-byte loop */
.skip XCHAL_INST_FETCH_WIDTH - 3
.Lbyteset:
#if XCHAL_HAVE_LOOPS
loopnez a4, 2f
#else
beqz a4, 2f
add a6, a5, a4 // a6 = ending address
#endif
1: s8i a3, a5, 0
#if XTENSA_ESP32_PSRAM_CACHE_FIX
memw
#endif
addi a5, a5, 1
#if !XCHAL_HAVE_LOOPS
bltu a5, a6, 1b
#endif
2: leaf_return
/* Destination is unaligned. */
.align 4
.Ldst1mod2: // dst is only byte aligned
/* Do short sizes byte-by-byte. */
bltui a4, 8, .Lbyteset
/* Set 1 byte. */
s8i a3, a5, 0
addi a5, a5, 1
addi a4, a4, -1
#if XTENSA_ESP32_PSRAM_CACHE_FIX
memw
#endif
/* Now retest if dst is aligned. */
_bbci.l a5, 1, .Ldstaligned
.Ldst2mod4: // dst has 16-bit alignment
/* Do short sizes byte-by-byte. */
bltui a4, 8, .Lbyteset
/* Set 2 bytes. */
s16i a3, a5, 0
addi a5, a5, 2
addi a4, a4, -2
#if XTENSA_ESP32_PSRAM_CACHE_FIX
memw
#endif
/* dst is now aligned; return to main algorithm */
j .Ldstaligned
.align 4
.global memset
.type memset, @function
memset:
leaf_entry sp, 16
/* a2 = dst, a3 = c, a4 = length */
/* Duplicate character into all bytes of word. */
extui a3, a3, 0, 8
slli a7, a3, 8
or a3, a3, a7
slli a7, a3, 16
or a3, a3, a7
mov a5, a2 // copy dst so that a2 is return value
/* Check if dst is unaligned. */
_bbsi.l a2, 0, .Ldst1mod2
_bbsi.l a2, 1, .Ldst2mod4
.Ldstaligned:
/* Get number of loop iterations with 16B per iteration. */
srli a7, a4, 4
#if XTENSA_ESP32_PSRAM_CACHE_FIX
//do not do this if we have less than one iteration to do
beqz a7, 2f
//this seems to work to prefetch the cache line
s32i a3, a5, 0
nop
#endif
/* Destination is word-aligned. */
#if XCHAL_HAVE_LOOPS
loopnez a7, 2f
#else
beqz a7, 2f
slli a6, a7, 4
add a6, a6, a5 // a6 = end of last 16B chunk
#endif
/* Set 16 bytes per iteration. */
1: s32i a3, a5, 0
s32i a3, a5, 4
s32i a3, a5, 8
s32i a3, a5, 12
addi a5, a5, 16
#if !XCHAL_HAVE_LOOPS
bltu a5, a6, 1b
#endif
/* Set any leftover pieces smaller than 16B. */
2: bbci.l a4, 3, 3f
/* Set 8 bytes. */
s32i a3, a5, 0
s32i a3, a5, 4
addi a5, a5, 8
3: bbci.l a4, 2, 4f
/* Set 4 bytes. */
s32i a3, a5, 0
addi a5, a5, 4
4: bbci.l a4, 1, 5f
/* Set 2 bytes. */
s16i a3, a5, 0
addi a5, a5, 2
#if XTENSA_ESP32_PSRAM_CACHE_FIX
memw
#endif
5: bbci.l a4, 0, 6f
/* Set 1 byte. */
s8i a3, a5, 0
#if XTENSA_ESP32_PSRAM_CACHE_FIX
memw
#endif
6: leaf_return
.end schedule
.size memset, . - memset