newlib-cygwin/newlib/libc/machine/xtensa/strcmp.S

354 lines
9.6 KiB
ArmAsm
Raw Normal View History

2023-08-17 06:05:53 +08:00
/* ANSI C standard library function strcmp.
Copyright (c) 2001-2012 Tensilica Inc.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
#include "xtensa-asm.h"
/* Bit 6 of each of the four bytes in a 32-bit word.  strcmp uses this
   pattern in its fast word loop to recognise words that cannot contain
   a zero byte. */
#define MASK4 0x40404040
/* When the L32R option is configured, place the masks in the literal
   pool so the code can fetch them with l32r.
   MASK0..MASK3 are not defined in this file; presumably they are the
   per-byte masks supplied by xtensa-asm.h -- TODO confirm. */
#if XCHAL_HAVE_L32R
.literal .Lmask0, MASK0
.literal .Lmask1, MASK1
.literal .Lmask2, MASK2
.literal .Lmask3, MASK3
.literal .Lmask4, MASK4
#endif /* XCHAL_HAVE_L32R */
/* strcmp: compare the NUL-terminated strings s1 and s2.
   In:   a2 = s1, a3 = s2.
   Out:  a2 = 0 when the strings are equal; otherwise a nonzero value
         computed as (s1 data) minus (s2 data) at the first difference,
         or +1/-1 on the big-endian fast path.
   Registers written by the visible code: a2-a11.  a0 is only read (it
   is used as a hardware loop count).
   Register roles in the word loops:
     a4 = MASK0; a5 = scratch (word << 1), later MASK1; a6 = MASK2;
     a7 = MASK4 in the fast loop, later MASK3; a8/a9 = data from s1/s2.
   leaf_entry and leaf_return are macros that are not defined in this
   file (presumably from xtensa-asm.h -- TODO confirm).
   Strategy:
     1. Compare byte 0 and return at once if it differs.
     2. Both pointers word-aligned: word loop at .Laligned.
     3. Same misalignment: compare up to three more single bytes until
        the pointers are aligned, then continue at .Laligned.
     4. Different alignment: byte loop at .Lunaligned. */
.text
.align 4
.literal_position
.global strcmp
.type strcmp, @function
strcmp:
leaf_entry sp, 16
/* a2 = s1, a3 = s2 */
l8ui a8, a2, 0 // byte 0 from s1
l8ui a9, a3, 0 // byte 0 from s2
movi a10, 3 // mask for the low two address bits
bne a8, a9, .Lretdiff // first bytes differ: return a8 - a9
or a11, a2, a3 // low bits that are set in either pointer
bnone a11, a10, .Laligned // both word-aligned: use the word loop
xor a11, a2, a3 // compare low two bits of s1 and s2
bany a11, a10, .Lunaligned // if they have different alignment
/* s1 and s2 are not word-aligned but share the same misalignment.
   Byte 0 was already compared above (a8 == a9 here).  Step one byte
   at a time, at most three bytes, until both pointers are aligned. */
addi a2, a2, 1 // advance s1
beqz a8, .Leq // bytes equal, if zero, strings are equal
addi a3, a3, 1 // advance s2
bnone a2, a10, .Laligned // if s1/s2 now aligned
l8ui a8, a2, 0 // byte 1 from s1
l8ui a9, a3, 0 // byte 1 from s2
addi a2, a2, 1 // advance s1
bne a8, a9, .Lretdiff // if different, return difference
beqz a8, .Leq // bytes equal, if zero, strings are equal
addi a3, a3, 1 // advance s2
bnone a2, a10, .Laligned // if s1/s2 now aligned
l8ui a8, a2, 0 // byte 2 from s1
l8ui a9, a3, 0 // byte 2 from s2
addi a2, a2, 1 // advance s1
bne a8, a9, .Lretdiff // if different, return difference
beqz a8, .Leq // bytes equal, if zero, strings are equal
addi a3, a3, 1 // advance s2
j .Laligned // three bytes consumed: pointers are aligned now
/* s1 and s2 have different alignment.
If the zero-overhead loop option is available, use an (almost)
infinite zero-overhead loop with conditional exits so we only pay
for taken branches when exiting the loop.
Note: It is important for this unaligned case to come before the
code for aligned strings, because otherwise some of the branches
above cannot reach and have to be transformed to branches around
jumps. The unaligned code is smaller and the branches can reach
over it. */
.align 4
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
/* (2 mod 4) alignment for loop instruction */
#else
/* (1 mod 4) alignment for loop instruction */
/* Two pad bytes; together with the _movi below they position the
   loop instruction as stated above. */
.byte 0
.byte 0
#endif
#endif
.Lunaligned:
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
_movi.n a8, 0 // set up for the maximum loop count
#else
_movi a8, 0 // set up for the maximum loop count
#endif
loop a8, .Lretdiff // loop forever (almost anyway)
#endif
/* Byte-at-a-time comparison.  Byte 0 is loaded and compared again
   here because neither pointer was advanced before branching to
   .Lunaligned. */
.Lnextbyte:
l8ui a8, a2, 0 // next byte from s1
l8ui a9, a3, 0 // next byte from s2
addi a2, a2, 1 // advance s1
bne a8, a9, .Lretdiff // bytes differ: return the difference
addi a3, a3, 1 // advance s2
#if XCHAL_HAVE_LOOPS
beqz a8, .Lretdiff // both bytes are zero: strings are equal
#else
bnez a8, .Lnextbyte // bytes equal and nonzero: keep going
#endif
.Lretdiff:
sub a2, a8, a9 // s1 byte minus s2 byte (0 when both are zero)
leaf_return
/* Both s1 and s2 are word-aligned.
If the zero-overhead loop option is available, use an (almost)
infinite zero-overhead loop with conditional exits so we only pay
for taken branches when exiting the loop. */
/* New algorithm, relying on the fact that all normal ASCII is between
32 and 127.
Rather than check all bytes for zero:
Take one word (4 bytes). Call it w1.
Shift w1 left by one into w1'.
Or w1 and w1'. For all normal ASCII bit 6 will be 1; for zero it won't.
Check that all 4 bit 6's (one for each byte) are one:
If they are, we are definitely not done.
If they are not, we are probably done, but need to check for zero.
In other words, a byte passes the quick test when its bit 5 or its
bit 6 is set; any other byte value sends the word to the exact
per-byte check at .Lprobeq. */
.align 4
#if XCHAL_HAVE_LOOPS
#if !XCHAL_HAVE_L32R
/* (2 mod 4) alignment for loop instruction */
.byte 0
.byte 0
#endif
.Laligned:
#if XCHAL_HAVE_L32R
l32r a4, .Lmask0 // mask for byte 0
l32r a7, .Lmask4 // bit 6 of every byte (MASK4)
#else
const16 a4, MASK0@h // mask for byte 0, high half
const16 a4, MASK0@l // mask for byte 0, low half
const16 a7, MASK4@h // bit 6 of every byte, high half
const16 a7, MASK4@l // bit 6 of every byte, low half
#endif
/* Loop forever */
/* The loop count is taken from a0, which the visible code never
   writes.  If the count runs out, the j 1b at .Laligned_done simply
   starts the loop again. */
1:
loop a0, .Laligned_done
/* First unrolled loop body. */
l32i a8, a2, 0 // get word from s1
l32i a9, a3, 0 // get word from s2
slli a5, a8, 1 // a5 = s1 word shifted left by one
bne a8, a9, .Lwne2 // words differ: find the differing byte
or a9, a8, a5 // bit 6 of each byte = bit 6 | bit 5 of that byte
bnall a9, a7, .Lprobeq // some byte fails the quick test
/* Second unrolled loop body. */
l32i a8, a2, 4 // get word from s1+4
l32i a9, a3, 4 // get word from s2+4
slli a5, a8, 1 // a5 = s1 word shifted left by one
bne a8, a9, .Lwne2 // words differ: find the differing byte
or a9, a8, a5 // bit 6 of each byte = bit 6 | bit 5 of that byte
bnall a9, a7, .Lprobeq2 // some byte fails the quick test
addi a2, a2, 8 // advance s1 pointer
addi a3, a3, 8 // advance s2 pointer
.Laligned_done:
j 1b // restart the hardware loop
.Lprobeq2:
/* Adjust pointers to account for the loop unrolling. */
addi a2, a2, 4
addi a3, a3, 4
#else /* !XCHAL_HAVE_LOOPS */
.Laligned:
movi a4, MASK0 // mask for byte 0
movi a7, MASK4 // bit 6 of every byte
j .Lfirstword // do not advance before the first word
.Lnextword:
addi a2, a2, 4 // advance s1 pointer
addi a3, a3, 4 // advance s2 pointer
.Lfirstword:
l32i a8, a2, 0 // get word from s1
l32i a9, a3, 0 // get word from s2
slli a5, a8, 1 // a5 = s1 word shifted left by one
bne a8, a9, .Lwne2 // words differ: find the differing byte
or a9, a8, a5 // bit 6 of each byte = bit 6 | bit 5 of that byte
ball a9, a7, .Lnextword // every byte passes the quick test: next word
#endif /* !XCHAL_HAVE_LOOPS */
/* align (0 mod 4) */
.Lprobeq:
/* Words are probably equal, but check for sure.
If not, loop over the rest of string using normal algorithm.
On arrival a8 == a9 (the bne to .Lwne2 was not taken) and a2/a3 still
point at the word held in a8.  From here on a5, a6 and a7 hold the
masks for bytes 1, 2 and 3; a7 no longer holds MASK4. */
bnone a8, a4, .Leq // if byte 0 is zero
#if XCHAL_HAVE_L32R
l32r a5, .Lmask1 // mask for byte 1
l32r a6, .Lmask2 // mask for byte 2
bnone a8, a5, .Leq // if byte 1 is zero
l32r a7, .Lmask3 // mask for byte 3
bnone a8, a6, .Leq // if byte 2 is zero
bnone a8, a7, .Leq // if byte 3 is zero
/* align (1 mod 4) */
#else
const16 a5, MASK1@h // mask for byte 1
const16 a5, MASK1@l
bnone a8, a5, .Leq // if byte 1 is zero
const16 a6, MASK2@h // mask for byte 2
const16 a6, MASK2@l
bnone a8, a6, .Leq // if byte 2 is zero
const16 a7, MASK3@h // mask for byte 3
const16 a7, MASK3@l
bnone a8, a7, .Leq // if byte 3 is zero
/* align (2 mod 4) */
#endif /* XCHAL_HAVE_L32R */
/* The word just checked is equal in both strings and has no zero byte:
   step past it.  The nops in the non-density variant are padding that
   keeps the following loop instruction at the alignment noted. */
#if XCHAL_HAVE_DENSITY
addi.n a2, a2, 4 // advance s1 pointer
addi.n a3, a3, 4 // advance s2 pointer
/* align (1 mod 4) or (2 mod 4) */
#else
addi a2, a2, 4 // advance s1 pointer
addi a3, a3, 4 // advance s2 pointer
or a1, a1, a1 // nop
#if !XCHAL_HAVE_L32R
or a1, a1, a1 // nop
#endif
/* align (2 mod 4) */
#endif /* XCHAL_HAVE_DENSITY */
/* Exact word loop: compare a word at a time and test each of its four
   bytes for zero.
   NOTE(review): the loop count register is a0, not a4 as an earlier
   comment said.  Every pass through the body ends either in a branch
   out or in the j 1b, which runs the loop instruction again, so the
   count does not appear to decide when the comparison stops -- confirm
   against the Xtensa ISA manual. */
#if XCHAL_HAVE_LOOPS
1:
loop a0, .Leq // loop count comes from a0
l32i a8, a2, 0 // get word from s1
l32i a9, a3, 0 // get word from s2
addi a2, a2, 4 // advance s1 pointer
bne a8, a9, .Lwne // words differ: find the differing byte
bnone a8, a4, .Leq // if byte 0 is zero
bnone a8, a5, .Leq // if byte 1 is zero
bnone a8, a6, .Leq // if byte 2 is zero
bnone a8, a7, .Leq // if byte 3 is zero
addi a3, a3, 4 // advance s2 pointer
j 1b // next word
#else /* !XCHAL_HAVE_LOOPS */
j .Lfirstword2 // s2 was already advanced above
.Lnextword2:
addi a3, a3, 4 // advance s2 pointer
.Lfirstword2:
l32i a8, a2, 0 // get word from s1
l32i a9, a3, 0 // get word from s2
addi a2, a2, 4 // advance s1 pointer
bne a8, a9, .Lwne // words differ: find the differing byte
bnone a8, a4, .Leq // if byte 0 is zero
bnone a8, a5, .Leq // if byte 1 is zero
bnone a8, a6, .Leq // if byte 2 is zero
bany a8, a7, .Lnextword2 // if byte 3 is nonzero, go on to the next word
#endif /* !XCHAL_HAVE_LOOPS */
/* Words are equal; some byte is zero. */
.Leq: movi a2, 0 // return equal
leaf_return
.Lwne2: /* Words are not equal. On big-endian processors, if none of the
bytes are zero, the return value can be determined by a simple
comparison.
Only the fast loops branch here, so a5 still holds the s1 word
shifted left by one and a7 still holds MASK4. */
#ifdef __XTENSA_EB__
or a10, a8, a5 // quick test on the s1 word
bnall a10, a7, .Lsomezero // may contain a zero byte: check byte by byte
bgeu a8, a9, .Lposreturn // unsigned word compare gives the order
movi a2, -1 // s1 sorts before s2
leaf_return
.Lposreturn:
movi a2, 1 // s1 sorts after s2
leaf_return
.Lsomezero: // There is probably some zero byte.
#endif /* __XTENSA_EB__ */
.Lwne: /* Words are not equal. */
/* Walk bytes 0..2 in string order.  For each byte: if it differs,
   return the difference of that byte; if it is equal and zero, the
   strings ended before the difference, so return equal.  Falling out
   of the chain means only byte 3 differs.  a5 and a6 are reloaded here
   because the entry from .Lwne2 does not have the byte masks in them. */
xor a2, a8, a9 // get word with nonzero in byte that differs
bany a2, a4, .Ldiff0 // if byte 0 differs
movi a5, MASK1 // mask for byte 1
bnone a8, a4, .Leq // if byte 0 is zero
bany a2, a5, .Ldiff1 // if byte 1 differs
movi a6, MASK2 // mask for byte 2
bnone a8, a5, .Leq // if byte 1 is zero
bany a2, a6, .Ldiff2 // if byte 2 differs
bnone a8, a6, .Leq // if byte 2 is zero
#ifdef __XTENSA_EB__
.Ldiff3:
.Ldiff2:
.Ldiff1:
/* Byte 0 is equal (at least) and there is a difference before a zero
byte. Just subtract words to get the return value.
The high order equal bytes cancel, leaving room for the sign. */
sub a2, a8, a9
leaf_return
.Ldiff0:
/* Need to make room for the sign, so can't subtract whole words. */
extui a10, a8, 24, 8 // byte 0 of the s1 word (bits 31..24)
extui a11, a9, 24, 8 // byte 0 of the s2 word (bits 31..24)
sub a2, a10, a11
leaf_return
#else /* !__XTENSA_EB__ */
/* Little-endian is a little more difficult because can't subtract
whole words. */
.Ldiff3:
/* Bytes 0-2 are equal; byte 3 is different.
For little-endian need to have a sign bit for the difference. */
extui a10, a8, 24, 8 // byte 3 of the s1 word (bits 31..24)
extui a11, a9, 24, 8 // byte 3 of the s2 word (bits 31..24)
sub a2, a10, a11
leaf_return
.Ldiff0:
/* Byte 0 is different. */
extui a10, a8, 0, 8 // byte 0 of the s1 word (bits 7..0)
extui a11, a9, 0, 8 // byte 0 of the s2 word (bits 7..0)
sub a2, a10, a11
leaf_return
.Ldiff1:
/* Byte 0 is equal; byte 1 is different. */
extui a10, a8, 8, 8 // byte 1 of the s1 word (bits 15..8)
extui a11, a9, 8, 8 // byte 1 of the s2 word (bits 15..8)
sub a2, a10, a11
leaf_return
.Ldiff2:
/* Bytes 0-1 are equal; byte 2 is different. */
extui a10, a8, 16, 8 // byte 2 of the s1 word (bits 23..16)
extui a11, a9, 16, 8 // byte 2 of the s2 word (bits 23..16)
sub a2, a10, a11
leaf_return
#endif /* !__XTENSA_EB__ */
.size strcmp, . - strcmp