mirror of
git://sourceware.org/git/newlib-cygwin.git
synced 2025-02-21 00:07:36 +08:00
aarch64: Sync with ARM-software/optimized-routines
Update AArch64 assembly string routines from: https://github.com/ARM-software/optimized-routines commit 0cf84f26b6b8dcad8287fe30a4dcc1fdabd06560 Author: Sebastian Huber <sebastian.huber@embedded-brains.de> Date: Thu Jul 27 17:14:57 2023 +0200 string: Fix corrupt GNU_PROPERTY_TYPE (5) size For ELF32 the notes alignment is 4 and not 8. Add license and copyright information to COPYING.NEWLIB as entry (56).
This commit is contained in:
parent
a9e8e3d1cb
commit
96ec8f868e
250
COPYING.NEWLIB
250
COPYING.NEWLIB
@ -1291,3 +1291,253 @@ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGE.
|
||||
|
||||
(56) MIT OR Apache-2.0 WITH LLVM-exception (newlib/libc/machine/aarch64)
|
||||
|
||||
SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
|
||||
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 1999-2023, Arm Limited.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
||||
|
||||
Apache-2.0 WITH LLVM-exception
|
||||
------------------------------
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
||||
|
||||
--- LLVM Exceptions to the Apache 2.0 License ----
|
||||
|
||||
As an exception, if, as a result of your compiling your source code, portions
|
||||
of this Software are embedded into an Object form of such source code, you
|
||||
may redistribute such embedded portions in such Object form without complying
|
||||
with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
|
||||
|
||||
In addition, if you combine or link compiled forms of this Software with
|
||||
software that is licensed under the GPLv2 ("Combined Software") and if a
|
||||
court of competent jurisdiction determines that the patent provision (Section
|
||||
3), the indemnity provision (Section 9) or other Section of the License
|
||||
conflicts with the conditions of the GPLv2, you may retroactively and
|
||||
prospectively choose to deem waived or otherwise exclude such Section(s) of
|
||||
the License, but only in their entirety and only with respect to the Combined
|
||||
Software.
|
||||
|
106
newlib/libc/machine/aarch64/asmdefs.h
Normal file
106
newlib/libc/machine/aarch64/asmdefs.h
Normal file
@ -0,0 +1,106 @@
|
||||
/*
|
||||
* Macros for asm code. AArch64 version.
|
||||
*
|
||||
* Copyright (c) 2019-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef _ASMDEFS_H
|
||||
#define _ASMDEFS_H
|
||||
|
||||
/* Branch Target Identitication support. */
|
||||
#define BTI_C hint 34
|
||||
#define BTI_J hint 36
|
||||
/* Return address signing support (pac-ret). */
|
||||
#define PACIASP hint 25; .cfi_window_save
|
||||
#define AUTIASP hint 29; .cfi_window_save
|
||||
|
||||
/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
|
||||
#define FEATURE_1_AND 0xc0000000
|
||||
#define FEATURE_1_BTI 1
|
||||
#define FEATURE_1_PAC 2
|
||||
|
||||
/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
|
||||
#ifdef __ILP32__
|
||||
#define GNU_PROPERTY(type, value) \
|
||||
.section .note.gnu.property, "a"; \
|
||||
.p2align 2; \
|
||||
.word 4; \
|
||||
.word 12; \
|
||||
.word 5; \
|
||||
.asciz "GNU"; \
|
||||
.word type; \
|
||||
.word 4; \
|
||||
.word value; \
|
||||
.text
|
||||
#else
|
||||
#define GNU_PROPERTY(type, value) \
|
||||
.section .note.gnu.property, "a"; \
|
||||
.p2align 3; \
|
||||
.word 4; \
|
||||
.word 16; \
|
||||
.word 5; \
|
||||
.asciz "GNU"; \
|
||||
.word type; \
|
||||
.word 4; \
|
||||
.word value; \
|
||||
.word 0; \
|
||||
.text
|
||||
#endif
|
||||
|
||||
/* If set then the GNU Property Note section will be added to
|
||||
mark objects to support BTI and PAC-RET. */
|
||||
#ifndef WANT_GNU_PROPERTY
|
||||
#define WANT_GNU_PROPERTY 1
|
||||
#endif
|
||||
|
||||
#if WANT_GNU_PROPERTY
|
||||
/* Add property note with supported features to all asm files. */
|
||||
GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
|
||||
#endif
|
||||
|
||||
#define ENTRY_ALIGN(name, alignment) \
|
||||
.global name; \
|
||||
.type name,%function; \
|
||||
.align alignment; \
|
||||
name: \
|
||||
.cfi_startproc; \
|
||||
BTI_C;
|
||||
|
||||
#define ENTRY(name) ENTRY_ALIGN(name, 6)
|
||||
|
||||
#define ENTRY_ALIAS(name) \
|
||||
.global name; \
|
||||
.type name,%function; \
|
||||
name:
|
||||
|
||||
#define END(name) \
|
||||
.cfi_endproc; \
|
||||
.size name, .-name;
|
||||
|
||||
#define L(l) .L ## l
|
||||
|
||||
#ifdef __ILP32__
|
||||
/* Sanitize padding bits of pointer arguments as per aapcs64 */
|
||||
#define PTR_ARG(n) mov w##n, w##n
|
||||
#else
|
||||
#define PTR_ARG(n)
|
||||
#endif
|
||||
|
||||
#ifdef __ILP32__
|
||||
/* Sanitize padding bits of size arguments as per aapcs64 */
|
||||
#define SIZE_ARG(n) mov w##n, w##n
|
||||
#else
|
||||
#define SIZE_ARG(n)
|
||||
#endif
|
||||
|
||||
/* Compiler supports SVE instructions */
|
||||
#ifndef HAVE_SVE
|
||||
# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5)
|
||||
# define HAVE_SVE 1
|
||||
# else
|
||||
# define HAVE_SVE 0
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#endif
|
@ -1,31 +1,8 @@
|
||||
/*
|
||||
* memchr - find a character in a memory zone
|
||||
*
|
||||
* Copyright (c) 2014, ARM Limited
|
||||
* All rights Reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of the company nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this
|
||||
* software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
* Copyright (c) 2014-2022, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
|
||||
@ -37,6 +14,8 @@
|
||||
* Neon Available.
|
||||
*/
|
||||
|
||||
#include "asmdefs.h"
|
||||
|
||||
/* Arguments and results. */
|
||||
#define srcin x0
|
||||
#define chrin w1
|
||||
@ -70,17 +49,11 @@
|
||||
* identify exactly which byte has matched.
|
||||
*/
|
||||
|
||||
.macro def_fn f p2align=0
|
||||
.text
|
||||
.p2align \p2align
|
||||
.global \f
|
||||
.type \f, %function
|
||||
\f:
|
||||
.endm
|
||||
|
||||
def_fn memchr
|
||||
ENTRY (memchr)
|
||||
PTR_ARG (0)
|
||||
SIZE_ARG (2)
|
||||
/* Do not dereference srcin if no bytes to compare. */
|
||||
cbz cntin, .Lzero_length
|
||||
cbz cntin, L(zero_length)
|
||||
/*
|
||||
* Magic constant 0x40100401 allows us to identify which lane matches
|
||||
* the requested byte.
|
||||
@ -93,7 +66,7 @@ def_fn memchr
|
||||
dup vrepmask.4s, wtmp2
|
||||
ands soff, srcin, #31
|
||||
and cntrem, cntin, #31
|
||||
b.eq .Lloop
|
||||
b.eq L(loop)
|
||||
|
||||
/*
|
||||
* Input string is not 32-byte aligned. We calculate the syndrome
|
||||
@ -110,41 +83,41 @@ def_fn memchr
|
||||
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
|
||||
addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
|
||||
addp vend.16b, vend.16b, vend.16b /* 128->64 */
|
||||
mov synd, vend.2d[0]
|
||||
mov synd, vend.d[0]
|
||||
/* Clear the soff*2 lower bits */
|
||||
lsl tmp, soff, #1
|
||||
lsr synd, synd, tmp
|
||||
lsl synd, synd, tmp
|
||||
/* The first block can also be the last */
|
||||
b.ls .Lmasklast
|
||||
b.ls L(masklast)
|
||||
/* Have we found something already? */
|
||||
cbnz synd, .Ltail
|
||||
cbnz synd, L(tail)
|
||||
|
||||
.Lloop:
|
||||
L(loop):
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
subs cntin, cntin, #32
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
/* If we're out of data we finish regardless of the result */
|
||||
b.ls .Lend
|
||||
b.ls L(end)
|
||||
/* Use a fast check for the termination condition */
|
||||
orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
|
||||
addp vend.2d, vend.2d, vend.2d
|
||||
mov synd, vend.2d[0]
|
||||
mov synd, vend.d[0]
|
||||
/* We're not out of data, loop if we haven't found the character */
|
||||
cbz synd, .Lloop
|
||||
cbz synd, L(loop)
|
||||
|
||||
.Lend:
|
||||
L(end):
|
||||
/* Termination condition found, let's calculate the syndrome value */
|
||||
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
|
||||
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
|
||||
addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
|
||||
addp vend.16b, vend.16b, vend.16b /* 128->64 */
|
||||
mov synd, vend.2d[0]
|
||||
mov synd, vend.d[0]
|
||||
/* Only do the clear for the last possible block */
|
||||
b.hi .Ltail
|
||||
b.hs L(tail)
|
||||
|
||||
.Lmasklast:
|
||||
L(masklast):
|
||||
/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
|
||||
add tmp, cntrem, soff
|
||||
and tmp, tmp, #31
|
||||
@ -153,7 +126,7 @@ def_fn memchr
|
||||
lsl synd, synd, tmp
|
||||
lsr synd, synd, tmp
|
||||
|
||||
.Ltail:
|
||||
L(tail):
|
||||
/* Count the trailing zeros using bit reversing */
|
||||
rbit synd, synd
|
||||
/* Compensate the last post-increment */
|
||||
@ -168,9 +141,9 @@ def_fn memchr
|
||||
csel result, xzr, result, eq
|
||||
ret
|
||||
|
||||
.Lzero_length:
|
||||
L(zero_length):
|
||||
mov result, #0
|
||||
ret
|
||||
|
||||
.size memchr, . - memchr
|
||||
END (memchr)
|
||||
#endif
|
||||
|
@ -1,57 +1,7 @@
|
||||
/* memcmp - compare memory
|
||||
|
||||
Copyright (c) 2018 Linaro Limited
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the Linaro nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
|
||||
|
||||
/*
|
||||
* Copyright (c) 2017 ARM Ltd
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. The name of the company may not be used to endorse or promote
|
||||
* products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
* Copyright (c) 2013-2022, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
|
||||
@ -60,103 +10,79 @@
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, unaligned accesses.
|
||||
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
|
||||
*/
|
||||
|
||||
#define L(l) .L ## l
|
||||
#include "asmdefs.h"
|
||||
|
||||
/* Parameters and result. */
|
||||
#define src1 x0
|
||||
#define src2 x1
|
||||
#define limit x2
|
||||
#define result w0
|
||||
#define src1 x0
|
||||
#define src2 x1
|
||||
#define limit x2
|
||||
#define result w0
|
||||
|
||||
/* Internal variables. */
|
||||
#define data1 x3
|
||||
#define data1w w3
|
||||
#define data1h x4
|
||||
#define data2 x5
|
||||
#define data2w w5
|
||||
#define data2h x6
|
||||
#define tmp1 x7
|
||||
#define tmp2 x8
|
||||
#define data1 x3
|
||||
#define data1w w3
|
||||
#define data2 x4
|
||||
#define data2w w4
|
||||
#define data3 x5
|
||||
#define data3w w5
|
||||
#define data4 x6
|
||||
#define data4w w6
|
||||
#define tmp x6
|
||||
#define src1end x7
|
||||
#define src2end x8
|
||||
|
||||
.macro def_fn f p2align=0
|
||||
.text
|
||||
.p2align \p2align
|
||||
.global \f
|
||||
.type \f, %function
|
||||
\f:
|
||||
.endm
|
||||
|
||||
def_fn memcmp p2align=6
|
||||
subs limit, limit, 8
|
||||
b.lo L(less8)
|
||||
ENTRY (memcmp)
|
||||
PTR_ARG (0)
|
||||
PTR_ARG (1)
|
||||
SIZE_ARG (2)
|
||||
|
||||
ldr data1, [src1], 8
|
||||
ldr data2, [src2], 8
|
||||
cmp limit, 16
|
||||
b.lo L(less16)
|
||||
ldp data1, data3, [src1]
|
||||
ldp data2, data4, [src2]
|
||||
ccmp data1, data2, 0, ne
|
||||
ccmp data3, data4, 0, eq
|
||||
b.ne L(return2)
|
||||
|
||||
add src1end, src1, limit
|
||||
add src2end, src2, limit
|
||||
cmp limit, 32
|
||||
b.ls L(last_bytes)
|
||||
cmp limit, 160
|
||||
b.hs L(loop_align)
|
||||
sub limit, limit, 32
|
||||
|
||||
.p2align 4
|
||||
L(loop32):
|
||||
ldp data1, data3, [src1, 16]
|
||||
ldp data2, data4, [src2, 16]
|
||||
cmp data1, data2
|
||||
b.ne L(return)
|
||||
|
||||
subs limit, limit, 8
|
||||
b.gt L(more16)
|
||||
|
||||
ldr data1, [src1, limit]
|
||||
ldr data2, [src2, limit]
|
||||
b L(return)
|
||||
|
||||
L(more16):
|
||||
ldr data1, [src1], 8
|
||||
ldr data2, [src2], 8
|
||||
cmp data1, data2
|
||||
bne L(return)
|
||||
|
||||
/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
|
||||
strings. */
|
||||
subs limit, limit, 16
|
||||
ccmp data3, data4, 0, eq
|
||||
b.ne L(return2)
|
||||
cmp limit, 16
|
||||
b.ls L(last_bytes)
|
||||
|
||||
/* We overlap loads between 0-32 bytes at either side of SRC1 when we
|
||||
try to align, so limit it only to strings larger than 128 bytes. */
|
||||
cmp limit, 96
|
||||
b.ls L(loop16)
|
||||
|
||||
/* Align src1 and adjust src2 with bytes not yet done. */
|
||||
and tmp1, src1, 15
|
||||
add limit, limit, tmp1
|
||||
sub src1, src1, tmp1
|
||||
sub src2, src2, tmp1
|
||||
|
||||
/* Loop performing 16 bytes per iteration using aligned src1.
|
||||
Limit is pre-decremented by 16 and must be larger than zero.
|
||||
Exit if <= 16 bytes left to do or if the data is not equal. */
|
||||
.p2align 4
|
||||
L(loop16):
|
||||
ldp data1, data1h, [src1], 16
|
||||
ldp data2, data2h, [src2], 16
|
||||
subs limit, limit, 16
|
||||
ccmp data1, data2, 0, hi
|
||||
ccmp data1h, data2h, 0, eq
|
||||
b.eq L(loop16)
|
||||
|
||||
ldp data1, data3, [src1, 32]
|
||||
ldp data2, data4, [src2, 32]
|
||||
cmp data1, data2
|
||||
bne L(return)
|
||||
mov data1, data1h
|
||||
mov data2, data2h
|
||||
cmp data1, data2
|
||||
bne L(return)
|
||||
ccmp data3, data4, 0, eq
|
||||
b.ne L(return2)
|
||||
add src1, src1, 32
|
||||
add src2, src2, 32
|
||||
L(last64):
|
||||
subs limit, limit, 32
|
||||
b.hi L(loop32)
|
||||
|
||||
/* Compare last 1-16 bytes using unaligned access. */
|
||||
L(last_bytes):
|
||||
add src1, src1, limit
|
||||
add src2, src2, limit
|
||||
ldp data1, data1h, [src1]
|
||||
ldp data2, data2h, [src2]
|
||||
cmp data1, data2
|
||||
bne L(return)
|
||||
mov data1, data1h
|
||||
mov data2, data2h
|
||||
ldp data1, data3, [src1end, -16]
|
||||
ldp data2, data4, [src2end, -16]
|
||||
L(return2):
|
||||
cmp data1, data2
|
||||
csel data1, data1, data3, ne
|
||||
csel data2, data2, data4, ne
|
||||
|
||||
/* Compare data bytes and set return value to 0, -1 or 1. */
|
||||
L(return):
|
||||
@ -164,33 +90,106 @@ L(return):
|
||||
rev data1, data1
|
||||
rev data2, data2
|
||||
#endif
|
||||
cmp data1, data2
|
||||
L(ret_eq):
|
||||
cmp data1, data2
|
||||
cset result, ne
|
||||
cneg result, result, lo
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Compare up to 8 bytes. Limit is [-8..-1]. */
|
||||
L(less16):
|
||||
add src1end, src1, limit
|
||||
add src2end, src2, limit
|
||||
tbz limit, 3, L(less8)
|
||||
ldr data1, [src1]
|
||||
ldr data2, [src2]
|
||||
ldr data3, [src1end, -8]
|
||||
ldr data4, [src2end, -8]
|
||||
b L(return2)
|
||||
|
||||
.p2align 4
|
||||
L(less8):
|
||||
adds limit, limit, 4
|
||||
b.lo L(less4)
|
||||
ldr data1w, [src1], 4
|
||||
ldr data2w, [src2], 4
|
||||
tbz limit, 2, L(less4)
|
||||
ldr data1w, [src1]
|
||||
ldr data2w, [src2]
|
||||
ldr data3w, [src1end, -4]
|
||||
ldr data4w, [src2end, -4]
|
||||
b L(return2)
|
||||
|
||||
L(less4):
|
||||
tbz limit, 1, L(less2)
|
||||
ldrh data1w, [src1]
|
||||
ldrh data2w, [src2]
|
||||
cmp data1w, data2w
|
||||
b.ne L(return)
|
||||
sub limit, limit, 4
|
||||
L(less4):
|
||||
adds limit, limit, 4
|
||||
beq L(ret_eq)
|
||||
L(byte_loop):
|
||||
ldrb data1w, [src1], 1
|
||||
ldrb data2w, [src2], 1
|
||||
subs limit, limit, 1
|
||||
ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
|
||||
b.eq L(byte_loop)
|
||||
L(less2):
|
||||
mov result, 0
|
||||
tbz limit, 0, L(return_zero)
|
||||
ldrb data1w, [src1end, -1]
|
||||
ldrb data2w, [src2end, -1]
|
||||
sub result, data1w, data2w
|
||||
L(return_zero):
|
||||
ret
|
||||
|
||||
.size memcmp, . - memcmp
|
||||
L(loop_align):
|
||||
ldp data1, data3, [src1, 16]
|
||||
ldp data2, data4, [src2, 16]
|
||||
cmp data1, data2
|
||||
ccmp data3, data4, 0, eq
|
||||
b.ne L(return2)
|
||||
|
||||
/* Align src2 and adjust src1, src2 and limit. */
|
||||
and tmp, src2, 15
|
||||
sub tmp, tmp, 16
|
||||
sub src2, src2, tmp
|
||||
add limit, limit, tmp
|
||||
sub src1, src1, tmp
|
||||
sub limit, limit, 64 + 16
|
||||
|
||||
.p2align 4
|
||||
L(loop64):
|
||||
ldr q0, [src1, 16]
|
||||
ldr q1, [src2, 16]
|
||||
subs limit, limit, 64
|
||||
ldr q2, [src1, 32]
|
||||
ldr q3, [src2, 32]
|
||||
eor v0.16b, v0.16b, v1.16b
|
||||
eor v1.16b, v2.16b, v3.16b
|
||||
ldr q2, [src1, 48]
|
||||
ldr q3, [src2, 48]
|
||||
umaxp v0.16b, v0.16b, v1.16b
|
||||
ldr q4, [src1, 64]!
|
||||
ldr q5, [src2, 64]!
|
||||
eor v1.16b, v2.16b, v3.16b
|
||||
eor v2.16b, v4.16b, v5.16b
|
||||
umaxp v1.16b, v1.16b, v2.16b
|
||||
umaxp v0.16b, v0.16b, v1.16b
|
||||
umaxp v0.16b, v0.16b, v0.16b
|
||||
fmov tmp, d0
|
||||
ccmp tmp, 0, 0, hi
|
||||
b.eq L(loop64)
|
||||
|
||||
/* If equal, process last 1-64 bytes using scalar loop. */
|
||||
add limit, limit, 64 + 16
|
||||
cbz tmp, L(last64)
|
||||
|
||||
/* Determine the 8-byte aligned offset of the first difference. */
|
||||
#ifdef __AARCH64EB__
|
||||
rev16 tmp, tmp
|
||||
#endif
|
||||
rev tmp, tmp
|
||||
clz tmp, tmp
|
||||
bic tmp, tmp, 7
|
||||
sub tmp, tmp, 48
|
||||
ldr data1, [src1, tmp]
|
||||
ldr data2, [src2, tmp]
|
||||
#ifndef __AARCH64EB__
|
||||
rev data1, data1
|
||||
rev data2, data2
|
||||
#endif
|
||||
mov result, 1
|
||||
cmp data1, data2
|
||||
cneg result, result, lo
|
||||
ret
|
||||
|
||||
END (memcmp)
|
||||
#endif
|
||||
|
@ -1,55 +1,8 @@
|
||||
/* Copyright (c) 2012-2013, Linaro Limited
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the Linaro nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
|
||||
|
||||
/*
|
||||
* Copyright (c) 2015 ARM Ltd
|
||||
* All rights reserved.
|
||||
* memcpy - copy memory area
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. The name of the company may not be used to endorse or promote
|
||||
* products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
* Copyright (c) 2012-2022, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
/* Assumptions:
|
||||
@ -61,6 +14,7 @@
|
||||
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
|
||||
/* See memcpy-stub.c */
|
||||
#else
|
||||
#include "asmdefs.h"
|
||||
|
||||
#define dstin x0
|
||||
#define src x1
|
||||
@ -71,122 +25,139 @@
|
||||
#define A_l x6
|
||||
#define A_lw w6
|
||||
#define A_h x7
|
||||
#define A_hw w7
|
||||
#define B_l x8
|
||||
#define B_lw w8
|
||||
#define B_h x9
|
||||
#define C_l x10
|
||||
#define C_lw w10
|
||||
#define C_h x11
|
||||
#define D_l x12
|
||||
#define D_h x13
|
||||
#define E_l src
|
||||
#define E_h count
|
||||
#define F_l srcend
|
||||
#define F_h dst
|
||||
#define tmp1 x9
|
||||
#define E_l x14
|
||||
#define E_h x15
|
||||
#define F_l x16
|
||||
#define F_h x17
|
||||
#define G_l count
|
||||
#define G_h dst
|
||||
#define H_l src
|
||||
#define H_h srcend
|
||||
#define tmp1 x14
|
||||
|
||||
#define L(l) .L ## l
|
||||
/* This implementation handles overlaps and supports both memcpy and memmove
|
||||
from a single entry point. It uses unaligned accesses and branchless
|
||||
sequences to keep the code small, simple and improve performance.
|
||||
|
||||
.macro def_fn f p2align=0
|
||||
.text
|
||||
.p2align \p2align
|
||||
.global \f
|
||||
.type \f, %function
|
||||
\f:
|
||||
.endm
|
||||
Copies are split into 3 main cases: small copies of up to 32 bytes, medium
|
||||
copies of up to 128 bytes, and large copies. The overhead of the overlap
|
||||
check is negligible since it is only required for large copies.
|
||||
|
||||
/* Copies are split into 3 main cases: small copies of up to 16 bytes,
|
||||
medium copies of 17..96 bytes which are fully unrolled. Large copies
|
||||
of more than 96 bytes align the destination and use an unrolled loop
|
||||
processing 64 bytes per iteration.
|
||||
Small and medium copies read all data before writing, allowing any
|
||||
kind of overlap, and memmove tailcalls memcpy for these cases as
|
||||
well as non-overlapping copies.
|
||||
Large copies use a software pipelined loop processing 64 bytes per iteration.
|
||||
The destination pointer is 16-byte aligned to minimize unaligned accesses.
|
||||
The loop tail is handled by always copying 64 bytes from the end.
|
||||
*/
|
||||
|
||||
def_fn memcpy p2align=6
|
||||
prfm PLDL1KEEP, [src]
|
||||
ENTRY_ALIAS (memmove)
|
||||
ENTRY (memcpy)
|
||||
PTR_ARG (0)
|
||||
PTR_ARG (1)
|
||||
SIZE_ARG (2)
|
||||
add srcend, src, count
|
||||
add dstend, dstin, count
|
||||
cmp count, 16
|
||||
b.ls L(copy16)
|
||||
cmp count, 96
|
||||
cmp count, 128
|
||||
b.hi L(copy_long)
|
||||
cmp count, 32
|
||||
b.hi L(copy32_128)
|
||||
|
||||
/* Medium copies: 17..96 bytes. */
|
||||
sub tmp1, count, 1
|
||||
/* Small copies: 0..32 bytes. */
|
||||
cmp count, 16
|
||||
b.lo L(copy16)
|
||||
ldp A_l, A_h, [src]
|
||||
tbnz tmp1, 6, L(copy96)
|
||||
ldp D_l, D_h, [srcend, -16]
|
||||
tbz tmp1, 5, 1f
|
||||
ldp B_l, B_h, [src, 16]
|
||||
ldp C_l, C_h, [srcend, -32]
|
||||
stp B_l, B_h, [dstin, 16]
|
||||
stp C_l, C_h, [dstend, -32]
|
||||
1:
|
||||
stp A_l, A_h, [dstin]
|
||||
stp D_l, D_h, [dstend, -16]
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Small copies: 0..16 bytes. */
|
||||
/* Copy 8-15 bytes. */
|
||||
L(copy16):
|
||||
cmp count, 8
|
||||
b.lo 1f
|
||||
tbz count, 3, L(copy8)
|
||||
ldr A_l, [src]
|
||||
ldr A_h, [srcend, -8]
|
||||
str A_l, [dstin]
|
||||
str A_h, [dstend, -8]
|
||||
ret
|
||||
.p2align 4
|
||||
1:
|
||||
tbz count, 2, 1f
|
||||
|
||||
.p2align 3
|
||||
/* Copy 4-7 bytes. */
|
||||
L(copy8):
|
||||
tbz count, 2, L(copy4)
|
||||
ldr A_lw, [src]
|
||||
ldr A_hw, [srcend, -4]
|
||||
ldr B_lw, [srcend, -4]
|
||||
str A_lw, [dstin]
|
||||
str A_hw, [dstend, -4]
|
||||
str B_lw, [dstend, -4]
|
||||
ret
|
||||
|
||||
/* Copy 0..3 bytes. Use a branchless sequence that copies the same
|
||||
byte 3 times if count==1, or the 2nd byte twice if count==2. */
|
||||
1:
|
||||
cbz count, 2f
|
||||
/* Copy 0..3 bytes using a branchless sequence. */
|
||||
L(copy4):
|
||||
cbz count, L(copy0)
|
||||
lsr tmp1, count, 1
|
||||
ldrb A_lw, [src]
|
||||
ldrb A_hw, [srcend, -1]
|
||||
ldrb C_lw, [srcend, -1]
|
||||
ldrb B_lw, [src, tmp1]
|
||||
strb A_lw, [dstin]
|
||||
strb B_lw, [dstin, tmp1]
|
||||
strb A_hw, [dstend, -1]
|
||||
2: ret
|
||||
|
||||
.p2align 4
|
||||
/* Copy 64..96 bytes. Copy 64 bytes from the start and
|
||||
32 bytes from the end. */
|
||||
L(copy96):
|
||||
ldp B_l, B_h, [src, 16]
|
||||
ldp C_l, C_h, [src, 32]
|
||||
ldp D_l, D_h, [src, 48]
|
||||
ldp E_l, E_h, [srcend, -32]
|
||||
ldp F_l, F_h, [srcend, -16]
|
||||
stp A_l, A_h, [dstin]
|
||||
stp B_l, B_h, [dstin, 16]
|
||||
stp C_l, C_h, [dstin, 32]
|
||||
stp D_l, D_h, [dstin, 48]
|
||||
stp E_l, E_h, [dstend, -32]
|
||||
stp F_l, F_h, [dstend, -16]
|
||||
strb C_lw, [dstend, -1]
|
||||
L(copy0):
|
||||
ret
|
||||
|
||||
/* Align DST to 16 byte alignment so that we don't cross cache line
|
||||
boundaries on both loads and stores. There are at least 96 bytes
|
||||
to copy, so copy 16 bytes unaligned and then align. The loop
|
||||
copies 64 bytes per iteration and prefetches one iteration ahead. */
|
||||
.p2align 4
|
||||
/* Medium copies: 33..128 bytes. */
|
||||
L(copy32_128):
|
||||
ldp A_l, A_h, [src]
|
||||
ldp B_l, B_h, [src, 16]
|
||||
ldp C_l, C_h, [srcend, -32]
|
||||
ldp D_l, D_h, [srcend, -16]
|
||||
cmp count, 64
|
||||
b.hi L(copy128)
|
||||
stp A_l, A_h, [dstin]
|
||||
stp B_l, B_h, [dstin, 16]
|
||||
stp C_l, C_h, [dstend, -32]
|
||||
stp D_l, D_h, [dstend, -16]
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Copy 65..128 bytes. */
|
||||
L(copy128):
|
||||
ldp E_l, E_h, [src, 32]
|
||||
ldp F_l, F_h, [src, 48]
|
||||
cmp count, 96
|
||||
b.ls L(copy96)
|
||||
ldp G_l, G_h, [srcend, -64]
|
||||
ldp H_l, H_h, [srcend, -48]
|
||||
stp G_l, G_h, [dstend, -64]
|
||||
stp H_l, H_h, [dstend, -48]
|
||||
L(copy96):
|
||||
stp A_l, A_h, [dstin]
|
||||
stp B_l, B_h, [dstin, 16]
|
||||
stp E_l, E_h, [dstin, 32]
|
||||
stp F_l, F_h, [dstin, 48]
|
||||
stp C_l, C_h, [dstend, -32]
|
||||
stp D_l, D_h, [dstend, -16]
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Copy more than 128 bytes. */
|
||||
L(copy_long):
|
||||
/* Use backwards copy if there is an overlap. */
|
||||
sub tmp1, dstin, src
|
||||
cbz tmp1, L(copy0)
|
||||
cmp tmp1, count
|
||||
b.lo L(copy_long_backwards)
|
||||
|
||||
/* Copy 16 bytes and then align dst to 16-byte alignment. */
|
||||
|
||||
ldp D_l, D_h, [src]
|
||||
and tmp1, dstin, 15
|
||||
bic dst, dstin, 15
|
||||
ldp D_l, D_h, [src]
|
||||
sub src, src, tmp1
|
||||
add count, count, tmp1 /* Count is now 16 too large. */
|
||||
ldp A_l, A_h, [src, 16]
|
||||
@ -195,8 +166,9 @@ L(copy_long):
|
||||
ldp C_l, C_h, [src, 48]
|
||||
ldp D_l, D_h, [src, 64]!
|
||||
subs count, count, 128 + 16 /* Test and readjust count. */
|
||||
b.ls 2f
|
||||
1:
|
||||
b.ls L(copy64_from_end)
|
||||
|
||||
L(loop64):
|
||||
stp A_l, A_h, [dst, 16]
|
||||
ldp A_l, A_h, [src, 16]
|
||||
stp B_l, B_h, [dst, 32]
|
||||
@ -206,12 +178,10 @@ L(copy_long):
|
||||
stp D_l, D_h, [dst, 64]!
|
||||
ldp D_l, D_h, [src, 64]!
|
||||
subs count, count, 64
|
||||
b.hi 1b
|
||||
b.hi L(loop64)
|
||||
|
||||
/* Write the last full set of 64 bytes. The remainder is at most 64
|
||||
bytes, so it is safe to always copy 64 bytes from the end even if
|
||||
there is just 1 byte left. */
|
||||
2:
|
||||
/* Write the last iteration and copy 64 bytes from the end. */
|
||||
L(copy64_from_end):
|
||||
ldp E_l, E_h, [srcend, -64]
|
||||
stp A_l, A_h, [dst, 16]
|
||||
ldp A_l, A_h, [srcend, -48]
|
||||
@ -226,5 +196,51 @@ L(copy_long):
|
||||
stp C_l, C_h, [dstend, -16]
|
||||
ret
|
||||
|
||||
.size memcpy, . - memcpy
|
||||
.p2align 4
|
||||
|
||||
/* Large backwards copy for overlapping copies.
|
||||
Copy 16 bytes and then align dst to 16-byte alignment. */
|
||||
L(copy_long_backwards):
|
||||
ldp D_l, D_h, [srcend, -16]
|
||||
and tmp1, dstend, 15
|
||||
sub srcend, srcend, tmp1
|
||||
sub count, count, tmp1
|
||||
ldp A_l, A_h, [srcend, -16]
|
||||
stp D_l, D_h, [dstend, -16]
|
||||
ldp B_l, B_h, [srcend, -32]
|
||||
ldp C_l, C_h, [srcend, -48]
|
||||
ldp D_l, D_h, [srcend, -64]!
|
||||
sub dstend, dstend, tmp1
|
||||
subs count, count, 128
|
||||
b.ls L(copy64_from_start)
|
||||
|
||||
L(loop64_backwards):
|
||||
stp A_l, A_h, [dstend, -16]
|
||||
ldp A_l, A_h, [srcend, -16]
|
||||
stp B_l, B_h, [dstend, -32]
|
||||
ldp B_l, B_h, [srcend, -32]
|
||||
stp C_l, C_h, [dstend, -48]
|
||||
ldp C_l, C_h, [srcend, -48]
|
||||
stp D_l, D_h, [dstend, -64]!
|
||||
ldp D_l, D_h, [srcend, -64]!
|
||||
subs count, count, 64
|
||||
b.hi L(loop64_backwards)
|
||||
|
||||
/* Write the last iteration and copy 64 bytes from the start. */
|
||||
L(copy64_from_start):
|
||||
ldp G_l, G_h, [src, 48]
|
||||
stp A_l, A_h, [dstend, -16]
|
||||
ldp A_l, A_h, [src, 32]
|
||||
stp B_l, B_h, [dstend, -32]
|
||||
ldp B_l, B_h, [src, 16]
|
||||
stp C_l, C_h, [dstend, -48]
|
||||
ldp C_l, C_h, [src]
|
||||
stp D_l, D_h, [dstend, -64]
|
||||
stp G_l, G_h, [dstin, 48]
|
||||
stp A_l, A_h, [dstin, 32]
|
||||
stp B_l, B_h, [dstin, 16]
|
||||
stp C_l, C_h, [dstin]
|
||||
ret
|
||||
|
||||
END (memcpy)
|
||||
#endif
|
||||
|
@ -1,66 +1,20 @@
|
||||
/* Copyright (c) 2012-2013, Linaro Limited
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the Linaro nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
|
||||
|
||||
/*
|
||||
* Copyright (c) 2015 ARM Ltd
|
||||
* All rights reserved.
|
||||
* memset - fill memory with a constant byte
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. The name of the company may not be used to endorse or promote
|
||||
* products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
* Copyright (c) 2012-2022, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, unaligned accesses
|
||||
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
|
||||
*
|
||||
*/
|
||||
|
||||
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
|
||||
/* See memset-stub.c */
|
||||
#else
|
||||
#include "asmdefs.h"
|
||||
|
||||
#define dstin x0
|
||||
#define val x1
|
||||
@ -68,24 +22,11 @@
|
||||
#define count x2
|
||||
#define dst x3
|
||||
#define dstend x4
|
||||
#define tmp1 x5
|
||||
#define tmp1w w5
|
||||
#define tmp2 x6
|
||||
#define tmp2w w6
|
||||
#define zva_len x7
|
||||
#define zva_lenw w7
|
||||
#define zva_val x5
|
||||
|
||||
#define L(l) .L ## l
|
||||
|
||||
.macro def_fn f p2align=0
|
||||
.text
|
||||
.p2align \p2align
|
||||
.global \f
|
||||
.type \f, %function
|
||||
\f:
|
||||
.endm
|
||||
|
||||
def_fn memset p2align=6
|
||||
ENTRY (memset)
|
||||
PTR_ARG (0)
|
||||
SIZE_ARG (2)
|
||||
|
||||
dup v0.16B, valw
|
||||
add dstend, dstin, count
|
||||
@ -101,7 +42,7 @@ def_fn memset p2align=6
|
||||
str val, [dstin]
|
||||
str val, [dstend, -8]
|
||||
ret
|
||||
nop
|
||||
.p2align 4
|
||||
1: tbz count, 2, 2f
|
||||
str valw, [dstin]
|
||||
str valw, [dstend, -4]
|
||||
@ -131,110 +72,49 @@ L(set96):
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
|
||||
.p2align 3
|
||||
nop
|
||||
.p2align 4
|
||||
L(set_long):
|
||||
and valw, valw, 255
|
||||
bic dst, dstin, 15
|
||||
str q0, [dstin]
|
||||
cmp count, 256
|
||||
ccmp valw, 0, 0, cs
|
||||
b.eq L(try_zva)
|
||||
cmp count, 160
|
||||
ccmp valw, 0, 0, hs
|
||||
b.ne L(no_zva)
|
||||
|
||||
#ifndef SKIP_ZVA_CHECK
|
||||
mrs zva_val, dczid_el0
|
||||
and zva_val, zva_val, 31
|
||||
cmp zva_val, 4 /* ZVA size is 64 bytes. */
|
||||
b.ne L(no_zva)
|
||||
#endif
|
||||
str q0, [dst, 16]
|
||||
stp q0, q0, [dst, 32]
|
||||
bic dst, dst, 63
|
||||
sub count, dstend, dst /* Count is now 64 too large. */
|
||||
sub count, count, 128 /* Adjust count and bias for loop. */
|
||||
|
||||
.p2align 4
|
||||
L(zva_loop):
|
||||
add dst, dst, 64
|
||||
dc zva, dst
|
||||
subs count, count, 64
|
||||
b.hi L(zva_loop)
|
||||
stp q0, q0, [dstend, -64]
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
|
||||
L(no_zva):
|
||||
sub count, dstend, dst /* Count is 16 too large. */
|
||||
sub dst, dst, 16 /* Dst is biased by -32. */
|
||||
sub count, count, 64 + 16 /* Adjust count and bias for loop. */
|
||||
1: stp q0, q0, [dst, 32]
|
||||
L(no_zva_loop):
|
||||
stp q0, q0, [dst, 32]
|
||||
stp q0, q0, [dst, 64]!
|
||||
L(tail64):
|
||||
subs count, count, 64
|
||||
b.hi 1b
|
||||
2: stp q0, q0, [dstend, -64]
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
|
||||
.p2align 3
|
||||
L(try_zva):
|
||||
mrs tmp1, dczid_el0
|
||||
tbnz tmp1w, 4, L(no_zva)
|
||||
and tmp1w, tmp1w, 15
|
||||
cmp tmp1w, 4 /* ZVA size is 64 bytes. */
|
||||
b.ne L(zva_128)
|
||||
|
||||
/* Write the first and last 64 byte aligned block using stp rather
|
||||
than using DC ZVA. This is faster on some cores.
|
||||
*/
|
||||
L(zva_64):
|
||||
str q0, [dst, 16]
|
||||
stp q0, q0, [dst, 32]
|
||||
bic dst, dst, 63
|
||||
stp q0, q0, [dst, 64]
|
||||
stp q0, q0, [dst, 96]
|
||||
sub count, dstend, dst /* Count is now 128 too large. */
|
||||
sub count, count, 128+64+64 /* Adjust count and bias for loop. */
|
||||
add dst, dst, 128
|
||||
nop
|
||||
1: dc zva, dst
|
||||
add dst, dst, 64
|
||||
subs count, count, 64
|
||||
b.hi 1b
|
||||
stp q0, q0, [dst, 0]
|
||||
stp q0, q0, [dst, 32]
|
||||
b.hi L(no_zva_loop)
|
||||
stp q0, q0, [dstend, -64]
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
|
||||
.p2align 3
|
||||
L(zva_128):
|
||||
cmp tmp1w, 5 /* ZVA size is 128 bytes. */
|
||||
b.ne L(zva_other)
|
||||
|
||||
str q0, [dst, 16]
|
||||
stp q0, q0, [dst, 32]
|
||||
stp q0, q0, [dst, 64]
|
||||
stp q0, q0, [dst, 96]
|
||||
bic dst, dst, 127
|
||||
sub count, dstend, dst /* Count is now 128 too large. */
|
||||
sub count, count, 128+128 /* Adjust count and bias for loop. */
|
||||
add dst, dst, 128
|
||||
1: dc zva, dst
|
||||
add dst, dst, 128
|
||||
subs count, count, 128
|
||||
b.hi 1b
|
||||
stp q0, q0, [dstend, -128]
|
||||
stp q0, q0, [dstend, -96]
|
||||
stp q0, q0, [dstend, -64]
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
|
||||
L(zva_other):
|
||||
mov tmp2w, 4
|
||||
lsl zva_lenw, tmp2w, tmp1w
|
||||
add tmp1, zva_len, 64 /* Max alignment bytes written. */
|
||||
cmp count, tmp1
|
||||
blo L(no_zva)
|
||||
|
||||
sub tmp2, zva_len, 1
|
||||
add tmp1, dst, zva_len
|
||||
add dst, dst, 16
|
||||
subs count, tmp1, dst /* Actual alignment bytes to write. */
|
||||
bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
|
||||
beq 2f
|
||||
1: stp q0, q0, [dst], 64
|
||||
stp q0, q0, [dst, -32]
|
||||
subs count, count, 64
|
||||
b.hi 1b
|
||||
2: mov dst, tmp1
|
||||
sub count, dstend, tmp1 /* Remaining bytes to write. */
|
||||
subs count, count, zva_len
|
||||
b.lo 4f
|
||||
3: dc zva, dst
|
||||
add dst, dst, zva_len
|
||||
subs count, count, zva_len
|
||||
b.hs 3b
|
||||
4: add count, count, zva_len
|
||||
sub dst, dst, 32 /* Bias dst for tail loop. */
|
||||
b L(tail64)
|
||||
|
||||
.size memset, . - memset
|
||||
END (memset)
|
||||
#endif
|
||||
|
@ -1,34 +1,10 @@
|
||||
/*
|
||||
stpcpy - copy a string returning pointer to end.
|
||||
* stpcpy - copy a string returning pointer to end.
|
||||
*
|
||||
* Copyright (c) 2020, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
Copyright (c) 2015 ARM Ltd.
|
||||
All Rights Reserved.
|
||||
#define BUILD_STPCPY 1
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the company nor the names of its contributors
|
||||
may be used to endorse or promote products derived from this
|
||||
software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
|
||||
|
||||
/* This is just a wrapper that uses strcpy code with appropriate
|
||||
pre-defines. */
|
||||
|
||||
#define BUILD_STPCPY
|
||||
#include "strcpy.S"
|
||||
|
@ -1,32 +1,9 @@
|
||||
/*
|
||||
strchr - find a character in a string
|
||||
|
||||
Copyright (c) 2014, ARM Limited
|
||||
All rights Reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the company nor the names of its contributors
|
||||
may be used to endorse or promote products derived from this
|
||||
software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
|
||||
|
||||
* strchr - find a character in a string
|
||||
*
|
||||
* Copyright (c) 2014-2022, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
|
||||
/* See strchr-stub.c */
|
||||
#else
|
||||
@ -37,6 +14,8 @@
|
||||
* Neon Available.
|
||||
*/
|
||||
|
||||
#include "asmdefs.h"
|
||||
|
||||
/* Arguments and results. */
|
||||
#define srcin x0
|
||||
#define chrin w1
|
||||
@ -74,26 +53,19 @@
|
||||
|
||||
/* Locals and temporaries. */
|
||||
|
||||
.macro def_fn f p2align=0
|
||||
.text
|
||||
.p2align \p2align
|
||||
.global \f
|
||||
.type \f, %function
|
||||
\f:
|
||||
.endm
|
||||
|
||||
def_fn strchr
|
||||
/* Magic constant 0x40100401 to allow us to identify which lane
|
||||
matches the requested byte. Magic constant 0x80200802 used
|
||||
similarly for NUL termination. */
|
||||
mov wtmp2, #0x0401
|
||||
movk wtmp2, #0x4010, lsl #16
|
||||
ENTRY (strchr)
|
||||
PTR_ARG (0)
|
||||
/* Magic constant 0xc0300c03 to allow us to identify which lane
|
||||
matches the requested byte. Even bits are set if the character
|
||||
matches, odd bits if either the char is NUL or matches. */
|
||||
mov wtmp2, 0x0c03
|
||||
movk wtmp2, 0xc030, lsl 16
|
||||
dup vrepchr.16b, chrin
|
||||
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
|
||||
dup vrepmask_c.4s, wtmp2
|
||||
ands tmp1, srcin, #31
|
||||
add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
|
||||
b.eq .Lloop
|
||||
b.eq L(loop)
|
||||
|
||||
/* Input string is not 32-byte aligned. Rather than forcing
|
||||
the padding bytes to a safe value, we calculate the syndrome
|
||||
@ -105,49 +77,42 @@ def_fn strchr
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_nul2.16b, vdata2.16b, #0
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
|
||||
and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
|
||||
and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
|
||||
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
|
||||
orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
|
||||
orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
|
||||
bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
|
||||
bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
|
||||
and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
|
||||
and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
|
||||
lsl tmp1, tmp1, #1
|
||||
addp vend1.16b, vend1.16b, vend2.16b // 256->128
|
||||
mov tmp3, #~0
|
||||
addp vend1.16b, vend1.16b, vend2.16b // 128->64
|
||||
lsr tmp1, tmp3, tmp1
|
||||
|
||||
mov tmp3, vend1.2d[0]
|
||||
mov tmp3, vend1.d[0]
|
||||
bic tmp1, tmp3, tmp1 // Mask padding bits.
|
||||
cbnz tmp1, .Ltail
|
||||
cbnz tmp1, L(tail)
|
||||
|
||||
.Lloop:
|
||||
.p2align 4
|
||||
L(loop):
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
cmeq vhas_nul1.16b, vdata1.16b, #0
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_nul2.16b, vdata2.16b, #0
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
/* Use a fast check for the termination condition. */
|
||||
orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
|
||||
orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
|
||||
orr vend1.16b, vend1.16b, vend2.16b
|
||||
addp vend1.2d, vend1.2d, vend1.2d
|
||||
mov tmp1, vend1.2d[0]
|
||||
cbz tmp1, .Lloop
|
||||
cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
|
||||
cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
|
||||
orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
|
||||
umaxp vend1.16b, vend1.16b, vend1.16b
|
||||
mov tmp1, vend1.d[0]
|
||||
cbz tmp1, L(loop)
|
||||
|
||||
/* Termination condition found. Now need to establish exactly why
|
||||
we terminated. */
|
||||
and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
|
||||
and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
|
||||
and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
|
||||
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
|
||||
orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
|
||||
orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
|
||||
bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
|
||||
bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
|
||||
and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
|
||||
and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
|
||||
addp vend1.16b, vend1.16b, vend2.16b // 256->128
|
||||
addp vend1.16b, vend1.16b, vend2.16b // 128->64
|
||||
|
||||
mov tmp1, vend1.2d[0]
|
||||
.Ltail:
|
||||
mov tmp1, vend1.d[0]
|
||||
L(tail):
|
||||
/* Count the trailing zeros, by bit reversing... */
|
||||
rbit tmp1, tmp1
|
||||
/* Re-bias source. */
|
||||
@ -160,5 +125,5 @@ def_fn strchr
|
||||
csel result, result, xzr, eq
|
||||
ret
|
||||
|
||||
.size strchr, . - strchr
|
||||
END (strchr)
|
||||
#endif
|
||||
|
@ -1,32 +1,9 @@
|
||||
/*
|
||||
strchrnul - find a character or nul in a string
|
||||
|
||||
Copyright (c) 2014, ARM Limited
|
||||
All rights Reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the company nor the names of its contributors
|
||||
may be used to endorse or promote products derived from this
|
||||
software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
|
||||
|
||||
* strchrnul - find a character or nul in a string
|
||||
*
|
||||
* Copyright (c) 2014-2022, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
|
||||
/* See strchrnul-stub.c */
|
||||
#else
|
||||
@ -37,6 +14,8 @@
|
||||
* Neon Available.
|
||||
*/
|
||||
|
||||
#include "asmdefs.h"
|
||||
|
||||
/* Arguments and results. */
|
||||
#define srcin x0
|
||||
#define chrin w1
|
||||
@ -70,15 +49,8 @@
|
||||
|
||||
/* Locals and temporaries. */
|
||||
|
||||
.macro def_fn f p2align=0
|
||||
.text
|
||||
.p2align \p2align
|
||||
.global \f
|
||||
.type \f, %function
|
||||
\f:
|
||||
.endm
|
||||
|
||||
def_fn strchrnul
|
||||
ENTRY (strchrnul)
|
||||
PTR_ARG (0)
|
||||
/* Magic constant 0x40100401 to allow us to identify which lane
|
||||
matches the termination condition. */
|
||||
mov wtmp2, #0x0401
|
||||
@ -87,7 +59,7 @@ def_fn strchrnul
|
||||
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
|
||||
dup vrepmask.4s, wtmp2
|
||||
ands tmp1, srcin, #31
|
||||
b.eq .Lloop
|
||||
b.eq L(loop)
|
||||
|
||||
/* Input string is not 32-byte aligned. Rather than forcing
|
||||
the padding bytes to a safe value, we calculate the syndrome
|
||||
@ -95,47 +67,43 @@ def_fn strchrnul
|
||||
syndrome that are related to the padding. */
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
neg tmp1, tmp1
|
||||
cmeq vhas_nul1.16b, vdata1.16b, #0
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_nul2.16b, vdata2.16b, #0
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
orr vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
|
||||
orr vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
|
||||
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
|
||||
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
|
||||
cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
|
||||
cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
|
||||
and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
|
||||
and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
|
||||
lsl tmp1, tmp1, #1
|
||||
addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
|
||||
mov tmp3, #~0
|
||||
addp vend1.16b, vend1.16b, vend1.16b // 128->64
|
||||
lsr tmp1, tmp3, tmp1
|
||||
|
||||
mov tmp3, vend1.2d[0]
|
||||
mov tmp3, vend1.d[0]
|
||||
bic tmp1, tmp3, tmp1 // Mask padding bits.
|
||||
cbnz tmp1, .Ltail
|
||||
cbnz tmp1, L(tail)
|
||||
|
||||
.Lloop:
|
||||
.p2align 4
|
||||
L(loop):
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
cmeq vhas_nul1.16b, vdata1.16b, #0
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_nul2.16b, vdata2.16b, #0
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
/* Use a fast check for the termination condition. */
|
||||
orr vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
|
||||
orr vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
|
||||
orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b
|
||||
addp vend1.2d, vend1.2d, vend1.2d
|
||||
mov tmp1, vend1.2d[0]
|
||||
cbz tmp1, .Lloop
|
||||
cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
|
||||
cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
|
||||
orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
|
||||
umaxp vend1.16b, vend1.16b, vend1.16b
|
||||
mov tmp1, vend1.d[0]
|
||||
cbz tmp1, L(loop)
|
||||
|
||||
/* Termination condition found. Now need to establish exactly why
|
||||
we terminated. */
|
||||
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
|
||||
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
|
||||
and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
|
||||
and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
|
||||
addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
|
||||
addp vend1.16b, vend1.16b, vend1.16b // 128->64
|
||||
|
||||
mov tmp1, vend1.2d[0]
|
||||
.Ltail:
|
||||
mov tmp1, vend1.d[0]
|
||||
L(tail):
|
||||
/* Count the trailing zeros, by bit reversing... */
|
||||
rbit tmp1, tmp1
|
||||
/* Re-bias source. */
|
||||
@ -145,5 +113,5 @@ def_fn strchrnul
|
||||
add result, src, tmp1, lsr #1
|
||||
ret
|
||||
|
||||
.size strchrnul, . - strchrnul
|
||||
END (strchrnul)
|
||||
#endif
|
||||
|
@ -1,202 +1,192 @@
|
||||
/* Copyright (c) 2012-2018, Linaro Limited
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the Linaro nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
|
||||
|
||||
/* Assumptions:
|
||||
/*
|
||||
* strcmp - compare two strings
|
||||
*
|
||||
* ARMv8-a, AArch64
|
||||
* Copyright (c) 2012-2022, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
|
||||
/* See strcmp-stub.c */
|
||||
#else
|
||||
|
||||
.macro def_fn f p2align=0
|
||||
.text
|
||||
.p2align \p2align
|
||||
.global \f
|
||||
.type \f, %function
|
||||
\f:
|
||||
.endm
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64.
|
||||
* MTE compatible.
|
||||
*/
|
||||
|
||||
#define L(label) .L ## label
|
||||
#include "asmdefs.h"
|
||||
|
||||
#define REP8_01 0x0101010101010101
|
||||
#define REP8_7f 0x7f7f7f7f7f7f7f7f
|
||||
#define REP8_80 0x8080808080808080
|
||||
|
||||
/* Parameters and result. */
|
||||
#define src1 x0
|
||||
#define src2 x1
|
||||
#define result x0
|
||||
|
||||
/* Internal variables. */
|
||||
#define data1 x2
|
||||
#define data1w w2
|
||||
#define data2 x3
|
||||
#define data2w w3
|
||||
#define has_nul x4
|
||||
#define diff x5
|
||||
#define off1 x5
|
||||
#define syndrome x6
|
||||
#define tmp1 x7
|
||||
#define tmp2 x8
|
||||
#define tmp3 x9
|
||||
#define zeroones x10
|
||||
#define pos x11
|
||||
#define tmp x6
|
||||
#define data3 x7
|
||||
#define zeroones x8
|
||||
#define shift x9
|
||||
#define off2 x10
|
||||
|
||||
/* Start of performance-critical section -- one 64B cache line. */
|
||||
def_fn strcmp p2align=6
|
||||
eor tmp1, src1, src2
|
||||
mov zeroones, #REP8_01
|
||||
tst tmp1, #7
|
||||
/* On big-endian early bytes are at MSB and on little-endian LSB.
|
||||
LS_FW means shifting towards early bytes. */
|
||||
#ifdef __AARCH64EB__
|
||||
# define LS_FW lsl
|
||||
#else
|
||||
# define LS_FW lsr
|
||||
#endif
|
||||
|
||||
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
|
||||
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
|
||||
can be done in parallel across the entire word.
|
||||
Since carry propagation makes 0x1 bytes before a NUL byte appear
|
||||
NUL too in big-endian, byte-reverse the data before the NUL check. */
|
||||
|
||||
|
||||
ENTRY (strcmp)
|
||||
PTR_ARG (0)
|
||||
PTR_ARG (1)
|
||||
sub off2, src2, src1
|
||||
mov zeroones, REP8_01
|
||||
and tmp, src1, 7
|
||||
tst off2, 7
|
||||
b.ne L(misaligned8)
|
||||
ands tmp1, src1, #7
|
||||
b.ne L(mutual_align)
|
||||
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
|
||||
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
|
||||
can be done in parallel across the entire word. */
|
||||
L(loop_aligned):
|
||||
ldr data1, [src1], #8
|
||||
ldr data2, [src2], #8
|
||||
L(start_realigned):
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, #REP8_7f
|
||||
eor diff, data1, data2 /* Non-zero if differences found. */
|
||||
bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
|
||||
orr syndrome, diff, has_nul
|
||||
cbz syndrome, L(loop_aligned)
|
||||
/* End of performance-critical section -- one 64B cache line. */
|
||||
cbnz tmp, L(mutual_align)
|
||||
|
||||
.p2align 4
|
||||
|
||||
L(loop_aligned):
|
||||
ldr data2, [src1, off2]
|
||||
ldr data1, [src1], 8
|
||||
L(start_realigned):
|
||||
#ifdef __AARCH64EB__
|
||||
rev tmp, data1
|
||||
sub has_nul, tmp, zeroones
|
||||
orr tmp, tmp, REP8_7f
|
||||
#else
|
||||
sub has_nul, data1, zeroones
|
||||
orr tmp, data1, REP8_7f
|
||||
#endif
|
||||
bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
|
||||
ccmp data1, data2, 0, eq
|
||||
b.eq L(loop_aligned)
|
||||
#ifdef __AARCH64EB__
|
||||
rev has_nul, has_nul
|
||||
#endif
|
||||
eor diff, data1, data2
|
||||
orr syndrome, diff, has_nul
|
||||
L(end):
|
||||
#ifndef __AARCH64EB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev syndrome, syndrome
|
||||
rev data1, data1
|
||||
/* The MS-non-zero bit of the syndrome marks either the first bit
|
||||
that is different, or the top bit of the first zero byte.
|
||||
Shifting left now will bring the critical information into the
|
||||
top bits. */
|
||||
clz pos, syndrome
|
||||
rev data2, data2
|
||||
lsl data1, data1, pos
|
||||
lsl data2, data2, pos
|
||||
/* But we need to zero-extend (char is unsigned) the value and then
|
||||
perform a signed 32-bit subtraction. */
|
||||
lsr data1, data1, #56
|
||||
sub result, data1, data2, lsr #56
|
||||
ret
|
||||
#else
|
||||
/* For big-endian we cannot use the trick with the syndrome value
|
||||
as carry-propagation can corrupt the upper bits if the trailing
|
||||
bytes in the string contain 0x01. */
|
||||
/* However, if there is no NUL byte in the dword, we can generate
|
||||
the result directly. We can't just subtract the bytes as the
|
||||
MSB might be significant. */
|
||||
cbnz has_nul, 1f
|
||||
cmp data1, data2
|
||||
cset result, ne
|
||||
cneg result, result, lo
|
||||
ret
|
||||
1:
|
||||
/* Re-compute the NUL-byte detection, using a byte-reversed value. */
|
||||
rev tmp3, data1
|
||||
sub tmp1, tmp3, zeroones
|
||||
orr tmp2, tmp3, #REP8_7f
|
||||
bic has_nul, tmp1, tmp2
|
||||
rev has_nul, has_nul
|
||||
orr syndrome, diff, has_nul
|
||||
clz pos, syndrome
|
||||
/* The MS-non-zero bit of the syndrome marks either the first bit
|
||||
that is different, or the top bit of the first zero byte.
|
||||
#endif
|
||||
clz shift, syndrome
|
||||
/* The most-significant-non-zero bit of the syndrome marks either the
|
||||
first bit that is different, or the top bit of the first zero byte.
|
||||
Shifting left now will bring the critical information into the
|
||||
top bits. */
|
||||
lsl data1, data1, pos
|
||||
lsl data2, data2, pos
|
||||
lsl data1, data1, shift
|
||||
lsl data2, data2, shift
|
||||
/* But we need to zero-extend (char is unsigned) the value and then
|
||||
perform a signed 32-bit subtraction. */
|
||||
lsr data1, data1, #56
|
||||
sub result, data1, data2, lsr #56
|
||||
lsr data1, data1, 56
|
||||
sub result, data1, data2, lsr 56
|
||||
ret
|
||||
#endif
|
||||
|
||||
.p2align 4
|
||||
|
||||
L(mutual_align):
|
||||
/* Sources are mutually aligned, but are not currently at an
|
||||
alignment boundary. Round down the addresses and then mask off
|
||||
the bytes that preceed the start point. */
|
||||
bic src1, src1, #7
|
||||
bic src2, src2, #7
|
||||
lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
|
||||
ldr data1, [src1], #8
|
||||
neg tmp1, tmp1 /* Bits to alignment -64. */
|
||||
ldr data2, [src2], #8
|
||||
mov tmp2, #~0
|
||||
#ifdef __AARCH64EB__
|
||||
/* Big-endian. Early bytes are at MSB. */
|
||||
lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
|
||||
#else
|
||||
/* Little-endian. Early bytes are at LSB. */
|
||||
lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
|
||||
#endif
|
||||
orr data1, data1, tmp2
|
||||
orr data2, data2, tmp2
|
||||
the bytes that precede the start point. */
|
||||
bic src1, src1, 7
|
||||
ldr data2, [src1, off2]
|
||||
ldr data1, [src1], 8
|
||||
neg shift, src2, lsl 3 /* Bits to alignment -64. */
|
||||
mov tmp, -1
|
||||
LS_FW tmp, tmp, shift
|
||||
orr data1, data1, tmp
|
||||
orr data2, data2, tmp
|
||||
b L(start_realigned)
|
||||
|
||||
L(misaligned8):
|
||||
/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
|
||||
checking to make sure that we don't access beyond page boundary in
|
||||
SRC2. */
|
||||
tst src1, #7
|
||||
b.eq L(loop_misaligned)
|
||||
checking to make sure that we don't access beyond the end of SRC2. */
|
||||
cbz tmp, L(src1_aligned)
|
||||
L(do_misaligned):
|
||||
ldrb data1w, [src1], #1
|
||||
ldrb data2w, [src2], #1
|
||||
cmp data1w, #1
|
||||
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
|
||||
ldrb data1w, [src1], 1
|
||||
ldrb data2w, [src2], 1
|
||||
cmp data1w, 0
|
||||
ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
|
||||
b.ne L(done)
|
||||
tst src1, #7
|
||||
tst src1, 7
|
||||
b.ne L(do_misaligned)
|
||||
|
||||
L(loop_misaligned):
|
||||
/* Test if we are within the last dword of the end of a 4K page. If
|
||||
yes then jump back to the misaligned loop to copy a byte at a time. */
|
||||
and tmp1, src2, #0xff8
|
||||
eor tmp1, tmp1, #0xff8
|
||||
cbz tmp1, L(do_misaligned)
|
||||
ldr data1, [src1], #8
|
||||
ldr data2, [src2], #8
|
||||
L(src1_aligned):
|
||||
neg shift, src2, lsl 3
|
||||
bic src2, src2, 7
|
||||
ldr data3, [src2], 8
|
||||
#ifdef __AARCH64EB__
|
||||
rev data3, data3
|
||||
#endif
|
||||
lsr tmp, zeroones, shift
|
||||
orr data3, data3, tmp
|
||||
sub has_nul, data3, zeroones
|
||||
orr tmp, data3, REP8_7f
|
||||
bics has_nul, has_nul, tmp
|
||||
b.ne L(tail)
|
||||
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, #REP8_7f
|
||||
eor diff, data1, data2 /* Non-zero if differences found. */
|
||||
bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
|
||||
sub off1, src2, src1
|
||||
|
||||
.p2align 4
|
||||
|
||||
L(loop_unaligned):
|
||||
ldr data3, [src1, off1]
|
||||
ldr data2, [src1, off2]
|
||||
#ifdef __AARCH64EB__
|
||||
rev data3, data3
|
||||
#endif
|
||||
sub has_nul, data3, zeroones
|
||||
orr tmp, data3, REP8_7f
|
||||
ldr data1, [src1], 8
|
||||
bics has_nul, has_nul, tmp
|
||||
ccmp data1, data2, 0, eq
|
||||
b.eq L(loop_unaligned)
|
||||
|
||||
lsl tmp, has_nul, shift
|
||||
#ifdef __AARCH64EB__
|
||||
rev tmp, tmp
|
||||
#endif
|
||||
eor diff, data1, data2
|
||||
orr syndrome, diff, tmp
|
||||
cbnz syndrome, L(end)
|
||||
L(tail):
|
||||
ldr data1, [src1]
|
||||
neg shift, shift
|
||||
lsr data2, data3, shift
|
||||
lsr has_nul, has_nul, shift
|
||||
#ifdef __AARCH64EB__
|
||||
rev data2, data2
|
||||
rev has_nul, has_nul
|
||||
#endif
|
||||
eor diff, data1, data2
|
||||
orr syndrome, diff, has_nul
|
||||
cbz syndrome, L(loop_misaligned)
|
||||
b L(end)
|
||||
|
||||
L(done):
|
||||
sub result, data1, data2
|
||||
ret
|
||||
.size strcmp, .-strcmp
|
||||
|
||||
END (strcmp)
|
||||
#endif
|
||||
|
@ -1,341 +1,160 @@
|
||||
/*
|
||||
strcpy/stpcpy - copy a string returning pointer to start/end.
|
||||
|
||||
Copyright (c) 2013, 2014, 2015 ARM Ltd.
|
||||
All Rights Reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the company nor the names of its contributors
|
||||
may be used to endorse or promote products derived from this
|
||||
software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
|
||||
|
||||
* strcpy/stpcpy - copy a string returning pointer to start/end.
|
||||
*
|
||||
* Copyright (c) 2020-2023, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
|
||||
/* See strchr-stub.c */
|
||||
#else
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, unaligned accesses, min page size 4k.
|
||||
* ARMv8-a, AArch64, Advanced SIMD.
|
||||
* MTE compatible.
|
||||
*/
|
||||
|
||||
/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
|
||||
#include "asmdefs.h"
|
||||
|
||||
To test the page crossing code path more thoroughly, compile with
|
||||
-DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
|
||||
entry path. This option is not intended for production use. */
|
||||
|
||||
/* Arguments and results. */
|
||||
#define dstin x0
|
||||
#define srcin x1
|
||||
#define result x0
|
||||
|
||||
/* Locals and temporaries. */
|
||||
#define src x2
|
||||
#define dst x3
|
||||
#define data1 x4
|
||||
#define data1w w4
|
||||
#define data2 x5
|
||||
#define data2w w5
|
||||
#define has_nul1 x6
|
||||
#define has_nul2 x7
|
||||
#define tmp1 x8
|
||||
#define tmp2 x9
|
||||
#define tmp3 x10
|
||||
#define tmp4 x11
|
||||
#define zeroones x12
|
||||
#define data1a x13
|
||||
#define data2a x14
|
||||
#define pos x15
|
||||
#define len x16
|
||||
#define to_align x17
|
||||
#define len x4
|
||||
#define synd x4
|
||||
#define tmp x5
|
||||
#define shift x5
|
||||
#define data1 x6
|
||||
#define dataw1 w6
|
||||
#define data2 x7
|
||||
#define dataw2 w7
|
||||
|
||||
#define dataq q0
|
||||
#define vdata v0
|
||||
#define vhas_nul v1
|
||||
#define vend v2
|
||||
#define dend d2
|
||||
#define dataq2 q1
|
||||
|
||||
#ifdef BUILD_STPCPY
|
||||
#define STRCPY stpcpy
|
||||
# define STRCPY stpcpy
|
||||
# define IFSTPCPY(X,...) X,__VA_ARGS__
|
||||
#else
|
||||
#define STRCPY strcpy
|
||||
# define STRCPY strcpy
|
||||
# define IFSTPCPY(X,...)
|
||||
#endif
|
||||
|
||||
.macro def_fn f p2align=0
|
||||
.text
|
||||
.p2align \p2align
|
||||
.global \f
|
||||
.type \f, %function
|
||||
\f:
|
||||
.endm
|
||||
/*
|
||||
Core algorithm:
|
||||
For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
|
||||
per byte. We take 4 bits of every comparison byte with shift right and narrow
|
||||
by 4 instruction. Since the bits in the nibble mask reflect the order in
|
||||
which things occur in the original string, counting leading zeros identifies
|
||||
exactly which byte matched. */
|
||||
|
||||
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
|
||||
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
|
||||
can be done in parallel across the entire word. */
|
||||
ENTRY (STRCPY)
|
||||
PTR_ARG (0)
|
||||
PTR_ARG (1)
|
||||
bic src, srcin, 15
|
||||
ld1 {vdata.16b}, [src]
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
lsl shift, srcin, 2
|
||||
shrn vend.8b, vhas_nul.8h, 4
|
||||
fmov synd, dend
|
||||
lsr synd, synd, shift
|
||||
cbnz synd, L(tail)
|
||||
|
||||
#define REP8_01 0x0101010101010101
|
||||
#define REP8_7f 0x7f7f7f7f7f7f7f7f
|
||||
#define REP8_80 0x8080808080808080
|
||||
ldr dataq, [src, 16]!
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
shrn vend.8b, vhas_nul.8h, 4
|
||||
fmov synd, dend
|
||||
cbz synd, L(start_loop)
|
||||
|
||||
/* AArch64 systems have a minimum page size of 4k. We can do a quick
|
||||
page size check for crossing this boundary on entry and if we
|
||||
do not, then we can short-circuit much of the entry code. We
|
||||
expect early page-crossing strings to be rare (probability of
|
||||
16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
|
||||
predictable, even with random strings.
|
||||
|
||||
We don't bother checking for larger page sizes, the cost of setting
|
||||
up the correct page size is just not worth the extra gain from
|
||||
a small reduction in the cases taking the slow path. Note that
|
||||
we only care about whether the first fetch, which may be
|
||||
misaligned, crosses a page boundary - after that we move to aligned
|
||||
fetches for the remainder of the string. */
|
||||
|
||||
#ifdef STRCPY_TEST_PAGE_CROSS
|
||||
/* Make everything that isn't Qword aligned look like a page cross. */
|
||||
#define MIN_PAGE_P2 4
|
||||
#else
|
||||
#define MIN_PAGE_P2 12
|
||||
#ifndef __AARCH64EB__
|
||||
rbit synd, synd
|
||||
#endif
|
||||
sub tmp, src, srcin
|
||||
clz len, synd
|
||||
add len, tmp, len, lsr 2
|
||||
tbz len, 4, L(less16)
|
||||
sub tmp, len, 15
|
||||
ldr dataq, [srcin]
|
||||
ldr dataq2, [srcin, tmp]
|
||||
str dataq, [dstin]
|
||||
str dataq2, [dstin, tmp]
|
||||
IFSTPCPY (add result, dstin, len)
|
||||
ret
|
||||
|
||||
#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
|
||||
|
||||
def_fn STRCPY p2align=6
|
||||
/* For moderately short strings, the fastest way to do the copy is to
|
||||
calculate the length of the string in the same way as strlen, then
|
||||
essentially do a memcpy of the result. This avoids the need for
|
||||
multiple byte copies and further means that by the time we
|
||||
reach the bulk copy loop we know we can always use DWord
|
||||
accesses. We expect strcpy to rarely be called repeatedly
|
||||
with the same source string, so branch prediction is likely to
|
||||
always be difficult - we mitigate against this by preferring
|
||||
conditional select operations over branches whenever this is
|
||||
feasible. */
|
||||
and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
|
||||
mov zeroones, #REP8_01
|
||||
and to_align, srcin, #15
|
||||
cmp tmp2, #(MIN_PAGE_SIZE - 16)
|
||||
neg tmp1, to_align
|
||||
/* The first fetch will straddle a (possible) page boundary iff
|
||||
srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
|
||||
aligned string will never fail the page align check, so will
|
||||
always take the fast path. */
|
||||
b.gt .Lpage_cross
|
||||
|
||||
.Lpage_cross_ok:
|
||||
ldp data1, data2, [srcin]
|
||||
#ifdef __AARCH64EB__
|
||||
/* Because we expect the end to be found within 16 characters
|
||||
(profiling shows this is the most common case), it's worth
|
||||
swapping the bytes now to save having to recalculate the
|
||||
termination syndrome later. We preserve data1 and data2
|
||||
so that we can re-use the values later on. */
|
||||
rev tmp2, data1
|
||||
sub tmp1, tmp2, zeroones
|
||||
orr tmp2, tmp2, #REP8_7f
|
||||
bics has_nul1, tmp1, tmp2
|
||||
b.ne .Lfp_le8
|
||||
rev tmp4, data2
|
||||
sub tmp3, tmp4, zeroones
|
||||
orr tmp4, tmp4, #REP8_7f
|
||||
#else
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, #REP8_7f
|
||||
bics has_nul1, tmp1, tmp2
|
||||
b.ne .Lfp_le8
|
||||
sub tmp3, data2, zeroones
|
||||
orr tmp4, data2, #REP8_7f
|
||||
#endif
|
||||
bics has_nul2, tmp3, tmp4
|
||||
b.eq .Lbulk_entry
|
||||
|
||||
/* The string is short (<=16 bytes). We don't know exactly how
|
||||
short though, yet. Work out the exact length so that we can
|
||||
quickly select the optimal copy strategy. */
|
||||
.Lfp_gt8:
|
||||
rev has_nul2, has_nul2
|
||||
clz pos, has_nul2
|
||||
mov tmp2, #56
|
||||
add dst, dstin, pos, lsr #3 /* Bits to bytes. */
|
||||
sub pos, tmp2, pos
|
||||
#ifdef __AARCH64EB__
|
||||
lsr data2, data2, pos
|
||||
#else
|
||||
lsl data2, data2, pos
|
||||
#endif
|
||||
str data2, [dst, #1]
|
||||
L(tail):
|
||||
rbit synd, synd
|
||||
clz len, synd
|
||||
lsr len, len, 2
|
||||
L(less16):
|
||||
tbz len, 3, L(less8)
|
||||
sub tmp, len, 7
|
||||
ldr data1, [srcin]
|
||||
ldr data2, [srcin, tmp]
|
||||
str data1, [dstin]
|
||||
#ifdef BUILD_STPCPY
|
||||
add dstin, dst, #8
|
||||
#endif
|
||||
str data2, [dstin, tmp]
|
||||
IFSTPCPY (add result, dstin, len)
|
||||
ret
|
||||
|
||||
.Lfp_le8:
|
||||
rev has_nul1, has_nul1
|
||||
clz pos, has_nul1
|
||||
add dst, dstin, pos, lsr #3 /* Bits to bytes. */
|
||||
subs tmp2, pos, #24 /* Pos in bits. */
|
||||
b.lt .Lfp_lt4
|
||||
#ifdef __AARCH64EB__
|
||||
mov tmp2, #56
|
||||
sub pos, tmp2, pos
|
||||
lsr data2, data1, pos
|
||||
lsr data1, data1, #32
|
||||
#else
|
||||
lsr data2, data1, tmp2
|
||||
#endif
|
||||
/* 4->7 bytes to copy. */
|
||||
str data2w, [dst, #-3]
|
||||
str data1w, [dstin]
|
||||
#ifdef BUILD_STPCPY
|
||||
mov dstin, dst
|
||||
#endif
|
||||
ret
|
||||
.Lfp_lt4:
|
||||
cbz pos, .Lfp_lt2
|
||||
/* 2->3 bytes to copy. */
|
||||
#ifdef __AARCH64EB__
|
||||
lsr data1, data1, #48
|
||||
#endif
|
||||
strh data1w, [dstin]
|
||||
/* Fall-through, one byte (max) to go. */
|
||||
.Lfp_lt2:
|
||||
/* Null-terminated string. Last character must be zero! */
|
||||
strb wzr, [dst]
|
||||
#ifdef BUILD_STPCPY
|
||||
mov dstin, dst
|
||||
#endif
|
||||
.p2align 4
|
||||
L(less8):
|
||||
subs tmp, len, 3
|
||||
b.lo L(less4)
|
||||
ldr dataw1, [srcin]
|
||||
ldr dataw2, [srcin, tmp]
|
||||
str dataw1, [dstin]
|
||||
str dataw2, [dstin, tmp]
|
||||
IFSTPCPY (add result, dstin, len)
|
||||
ret
|
||||
|
||||
.p2align 6
|
||||
/* Aligning here ensures that the entry code and main loop all lies
|
||||
within one 64-byte cache line. */
|
||||
.Lbulk_entry:
|
||||
sub to_align, to_align, #16
|
||||
stp data1, data2, [dstin]
|
||||
sub src, srcin, to_align
|
||||
sub dst, dstin, to_align
|
||||
b .Lentry_no_page_cross
|
||||
|
||||
/* The inner loop deals with two Dwords at a time. This has a
|
||||
slightly higher start-up cost, but we should win quite quickly,
|
||||
especially on cores with a high number of issue slots per
|
||||
cycle, as we get much better parallelism out of the operations. */
|
||||
.Lmain_loop:
|
||||
stp data1, data2, [dst], #16
|
||||
.Lentry_no_page_cross:
|
||||
ldp data1, data2, [src], #16
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, #REP8_7f
|
||||
sub tmp3, data2, zeroones
|
||||
orr tmp4, data2, #REP8_7f
|
||||
bic has_nul1, tmp1, tmp2
|
||||
bics has_nul2, tmp3, tmp4
|
||||
ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
|
||||
b.eq .Lmain_loop
|
||||
|
||||
/* Since we know we are copying at least 16 bytes, the fastest way
|
||||
to deal with the tail is to determine the location of the
|
||||
trailing NUL, then (re)copy the 16 bytes leading up to that. */
|
||||
cmp has_nul1, #0
|
||||
#ifdef __AARCH64EB__
|
||||
/* For big-endian, carry propagation (if the final byte in the
|
||||
string is 0x01) means we cannot use has_nul directly. The
|
||||
easiest way to get the correct byte is to byte-swap the data
|
||||
and calculate the syndrome a second time. */
|
||||
csel data1, data1, data2, ne
|
||||
rev data1, data1
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, #REP8_7f
|
||||
bic has_nul1, tmp1, tmp2
|
||||
#else
|
||||
csel has_nul1, has_nul1, has_nul2, ne
|
||||
#endif
|
||||
rev has_nul1, has_nul1
|
||||
clz pos, has_nul1
|
||||
add tmp1, pos, #72
|
||||
add pos, pos, #8
|
||||
csel pos, pos, tmp1, ne
|
||||
add src, src, pos, lsr #3
|
||||
add dst, dst, pos, lsr #3
|
||||
ldp data1, data2, [src, #-32]
|
||||
stp data1, data2, [dst, #-16]
|
||||
#ifdef BUILD_STPCPY
|
||||
sub dstin, dst, #1
|
||||
#endif
|
||||
L(less4):
|
||||
cbz len, L(zerobyte)
|
||||
ldrh dataw1, [srcin]
|
||||
strh dataw1, [dstin]
|
||||
L(zerobyte):
|
||||
strb wzr, [dstin, len]
|
||||
IFSTPCPY (add result, dstin, len)
|
||||
ret
|
||||
|
||||
.Lpage_cross:
|
||||
bic src, srcin, #15
|
||||
/* Start by loading two words at [srcin & ~15], then forcing the
|
||||
bytes that precede srcin to 0xff. This means they never look
|
||||
like termination bytes. */
|
||||
ldp data1, data2, [src]
|
||||
lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
|
||||
tst to_align, #7
|
||||
csetm tmp2, ne
|
||||
#ifdef __AARCH64EB__
|
||||
lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
|
||||
#else
|
||||
lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
|
||||
.p2align 4
|
||||
L(start_loop):
|
||||
sub tmp, srcin, dstin
|
||||
ldr dataq2, [srcin]
|
||||
sub dst, src, tmp
|
||||
str dataq2, [dstin]
|
||||
L(loop):
|
||||
str dataq, [dst], 32
|
||||
ldr dataq, [src, 16]
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
||||
fmov synd, dend
|
||||
cbnz synd, L(loopend)
|
||||
str dataq, [dst, -16]
|
||||
ldr dataq, [src, 32]!
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
||||
fmov synd, dend
|
||||
cbz synd, L(loop)
|
||||
add dst, dst, 16
|
||||
L(loopend):
|
||||
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
||||
fmov synd, dend
|
||||
sub dst, dst, 31
|
||||
#ifndef __AARCH64EB__
|
||||
rbit synd, synd
|
||||
#endif
|
||||
orr data1, data1, tmp2
|
||||
orr data2a, data2, tmp2
|
||||
cmp to_align, #8
|
||||
csinv data1, data1, xzr, lt
|
||||
csel data2, data2, data2a, lt
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, #REP8_7f
|
||||
sub tmp3, data2, zeroones
|
||||
orr tmp4, data2, #REP8_7f
|
||||
bic has_nul1, tmp1, tmp2
|
||||
bics has_nul2, tmp3, tmp4
|
||||
ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
|
||||
b.eq .Lpage_cross_ok
|
||||
/* We now need to make data1 and data2 look like they've been
|
||||
loaded directly from srcin. Do a rotate on the 128-bit value. */
|
||||
lsl tmp1, to_align, #3 /* Bytes->bits. */
|
||||
neg tmp2, to_align, lsl #3
|
||||
#ifdef __AARCH64EB__
|
||||
lsl data1a, data1, tmp1
|
||||
lsr tmp4, data2, tmp2
|
||||
lsl data2, data2, tmp1
|
||||
orr tmp4, tmp4, data1a
|
||||
cmp to_align, #8
|
||||
csel data1, tmp4, data2, lt
|
||||
rev tmp2, data1
|
||||
rev tmp4, data2
|
||||
sub tmp1, tmp2, zeroones
|
||||
orr tmp2, tmp2, #REP8_7f
|
||||
sub tmp3, tmp4, zeroones
|
||||
orr tmp4, tmp4, #REP8_7f
|
||||
#else
|
||||
lsr data1a, data1, tmp1
|
||||
lsl tmp4, data2, tmp2
|
||||
lsr data2, data2, tmp1
|
||||
orr tmp4, tmp4, data1a
|
||||
cmp to_align, #8
|
||||
csel data1, tmp4, data2, lt
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, #REP8_7f
|
||||
sub tmp3, data2, zeroones
|
||||
orr tmp4, data2, #REP8_7f
|
||||
#endif
|
||||
bic has_nul1, tmp1, tmp2
|
||||
cbnz has_nul1, .Lfp_le8
|
||||
bic has_nul2, tmp3, tmp4
|
||||
b .Lfp_gt8
|
||||
clz len, synd
|
||||
lsr len, len, 2
|
||||
add dst, dst, len
|
||||
ldr dataq, [dst, tmp]
|
||||
str dataq, [dst]
|
||||
IFSTPCPY (add result, dst, 15)
|
||||
ret
|
||||
|
||||
.size STRCPY, . - STRCPY
|
||||
END (STRCPY)
|
||||
#endif
|
||||
|
@ -1,115 +1,92 @@
|
||||
/* Copyright (c) 2013-2015, Linaro Limited
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the Linaro nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
|
||||
|
||||
/*
|
||||
* strlen - calculate the length of a string.
|
||||
*
|
||||
* Copyright (c) 2020-2022, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
|
||||
/* See strlen-stub.c */
|
||||
#else
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, unaligned accesses, min page size 4k.
|
||||
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
|
||||
* Not MTE compatible.
|
||||
*/
|
||||
|
||||
/* To test the page crossing code path more thoroughly, compile with
|
||||
-DTEST_PAGE_CROSS - this will force all calls through the slower
|
||||
entry path. This option is not intended for production use. */
|
||||
#include "asmdefs.h"
|
||||
|
||||
/* Arguments and results. */
|
||||
#define srcin x0
|
||||
#define len x0
|
||||
#define srcin x0
|
||||
#define len x0
|
||||
|
||||
/* Locals and temporaries. */
|
||||
#define src x1
|
||||
#define data1 x2
|
||||
#define data2 x3
|
||||
#define has_nul1 x4
|
||||
#define has_nul2 x5
|
||||
#define tmp1 x4
|
||||
#define tmp2 x5
|
||||
#define tmp3 x6
|
||||
#define tmp4 x7
|
||||
#define zeroones x8
|
||||
#define src x1
|
||||
#define data1 x2
|
||||
#define data2 x3
|
||||
#define has_nul1 x4
|
||||
#define has_nul2 x5
|
||||
#define tmp1 x4
|
||||
#define tmp2 x5
|
||||
#define tmp3 x6
|
||||
#define tmp4 x7
|
||||
#define zeroones x8
|
||||
|
||||
#define L(l) .L ## l
|
||||
#define maskv v0
|
||||
#define maskd d0
|
||||
#define dataq1 q1
|
||||
#define dataq2 q2
|
||||
#define datav1 v1
|
||||
#define datav2 v2
|
||||
#define tmp x2
|
||||
#define tmpw w2
|
||||
#define synd x3
|
||||
#define syndw w3
|
||||
#define shift x4
|
||||
|
||||
.macro def_fn f p2align=0
|
||||
.text
|
||||
.p2align \p2align
|
||||
.global \f
|
||||
.type \f, %function
|
||||
\f:
|
||||
.endm
|
||||
|
||||
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
|
||||
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
|
||||
can be done in parallel across the entire word. A faster check
|
||||
(X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
|
||||
false hits for characters 129..255. */
|
||||
/* For the first 32 bytes, NUL detection works on the principle that
|
||||
(X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
|
||||
byte is zero, and can be done in parallel across the entire word. */
|
||||
|
||||
#define REP8_01 0x0101010101010101
|
||||
#define REP8_7f 0x7f7f7f7f7f7f7f7f
|
||||
#define REP8_80 0x8080808080808080
|
||||
|
||||
/* To test the page crossing code path more thoroughly, compile with
|
||||
-DTEST_PAGE_CROSS - this will force all calls through the slower
|
||||
entry path. This option is not intended for production use. */
|
||||
|
||||
#ifdef TEST_PAGE_CROSS
|
||||
# define MIN_PAGE_SIZE 15
|
||||
# define MIN_PAGE_SIZE 32
|
||||
#else
|
||||
# define MIN_PAGE_SIZE 4096
|
||||
#endif
|
||||
|
||||
/* Since strings are short on average, we check the first 16 bytes
|
||||
of the string for a NUL character. In order to do an unaligned ldp
|
||||
safely we have to do a page cross check first. If there is a NUL
|
||||
byte we calculate the length from the 2 8-byte words using
|
||||
conditional select to reduce branch mispredictions (it is unlikely
|
||||
strlen will be repeatedly called on strings with the same length).
|
||||
/* Core algorithm:
|
||||
|
||||
If the string is longer than 16 bytes, we align src so don't need
|
||||
further page cross checks, and process 32 bytes per iteration
|
||||
using the fast NUL check. If we encounter non-ASCII characters,
|
||||
fallback to a second loop using the full NUL check.
|
||||
Since strings are short on average, we check the first 32 bytes of the
|
||||
string for a NUL character without aligning the string. In order to use
|
||||
unaligned loads safely we must do a page cross check first.
|
||||
|
||||
If the page cross check fails, we read 16 bytes from an aligned
|
||||
address, remove any characters before the string, and continue
|
||||
in the main loop using aligned loads. Since strings crossing a
|
||||
page in the first 16 bytes are rare (probability of
|
||||
16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
|
||||
If there is a NUL byte we calculate the length from the 2 8-byte words
|
||||
using conditional select to reduce branch mispredictions (it is unlikely
|
||||
strlen will be repeatedly called on strings with the same length).
|
||||
|
||||
AArch64 systems have a minimum page size of 4k. We don't bother
|
||||
checking for larger page sizes - the cost of setting up the correct
|
||||
page size is just not worth the extra gain from a small reduction in
|
||||
the cases taking the slow path. Note that we only care about
|
||||
whether the first fetch, which may be misaligned, crosses a page
|
||||
boundary. */
|
||||
If the string is longer than 32 bytes, align src so we don't need further
|
||||
page cross checks, and process 32 bytes per iteration using a fast SIMD
|
||||
loop.
|
||||
|
||||
def_fn strlen p2align=6
|
||||
If the page cross check fails, we read 32 bytes from an aligned address,
|
||||
and ignore any characters before the string. If it contains a NUL
|
||||
character, return the length, if not, continue in the main loop. */
|
||||
|
||||
ENTRY (strlen)
|
||||
PTR_ARG (0)
|
||||
and tmp1, srcin, MIN_PAGE_SIZE - 1
|
||||
mov zeroones, REP8_01
|
||||
cmp tmp1, MIN_PAGE_SIZE - 16
|
||||
b.gt L(page_cross)
|
||||
cmp tmp1, MIN_PAGE_SIZE - 32
|
||||
b.hi L(page_cross)
|
||||
|
||||
/* Look for a NUL byte in the first 16 bytes. */
|
||||
ldp data1, data2, [srcin]
|
||||
mov zeroones, REP8_01
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
/* For big-endian, carry propagation (if the final byte in the
|
||||
string is 0x01) means we cannot use has_nul1/2 directly.
|
||||
@ -125,114 +102,96 @@ def_fn strlen p2align=6
|
||||
bics has_nul1, tmp1, tmp2
|
||||
bic has_nul2, tmp3, tmp4
|
||||
ccmp has_nul2, 0, 0, eq
|
||||
beq L(main_loop_entry)
|
||||
b.eq L(bytes16_31)
|
||||
|
||||
/* Enter with C = has_nul1 == 0. */
|
||||
/* Find the exact offset of the first NUL byte in the first 16 bytes
|
||||
from the string start. Enter with C = has_nul1 == 0. */
|
||||
csel has_nul1, has_nul1, has_nul2, cc
|
||||
mov len, 8
|
||||
rev has_nul1, has_nul1
|
||||
clz tmp1, has_nul1
|
||||
csel len, xzr, len, cc
|
||||
add len, len, tmp1, lsr 3
|
||||
ret
|
||||
|
||||
/* The inner loop processes 32 bytes per iteration and uses the fast
|
||||
NUL check. If we encounter non-ASCII characters, use a second
|
||||
loop with the accurate NUL check. */
|
||||
.p2align 4
|
||||
L(main_loop_entry):
|
||||
bic src, srcin, 15
|
||||
sub src, src, 16
|
||||
L(main_loop):
|
||||
ldp data1, data2, [src, 32]!
|
||||
.Lpage_cross_entry:
|
||||
sub tmp1, data1, zeroones
|
||||
sub tmp3, data2, zeroones
|
||||
orr tmp2, tmp1, tmp3
|
||||
tst tmp2, zeroones, lsl 7
|
||||
bne 1f
|
||||
ldp data1, data2, [src, 16]
|
||||
sub tmp1, data1, zeroones
|
||||
sub tmp3, data2, zeroones
|
||||
orr tmp2, tmp1, tmp3
|
||||
tst tmp2, zeroones, lsl 7
|
||||
beq L(main_loop)
|
||||
add src, src, 16
|
||||
1:
|
||||
/* The fast check failed, so do the slower, accurate NUL check. */
|
||||
orr tmp2, data1, REP8_7f
|
||||
orr tmp4, data2, REP8_7f
|
||||
bics has_nul1, tmp1, tmp2
|
||||
bic has_nul2, tmp3, tmp4
|
||||
ccmp has_nul2, 0, 0, eq
|
||||
beq L(nonascii_loop)
|
||||
|
||||
/* Enter with C = has_nul1 == 0. */
|
||||
L(tail):
|
||||
#ifdef __AARCH64EB__
|
||||
/* For big-endian, carry propagation (if the final byte in the
|
||||
string is 0x01) means we cannot use has_nul1/2 directly. The
|
||||
easiest way to get the correct byte is to byte-swap the data
|
||||
and calculate the syndrome a second time. */
|
||||
csel data1, data1, data2, cc
|
||||
rev data1, data1
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, REP8_7f
|
||||
bic has_nul1, tmp1, tmp2
|
||||
#else
|
||||
csel has_nul1, has_nul1, has_nul2, cc
|
||||
#endif
|
||||
sub len, src, srcin
|
||||
rev has_nul1, has_nul1
|
||||
add tmp2, len, 8
|
||||
clz tmp1, has_nul1
|
||||
csel len, len, tmp2, cc
|
||||
add len, len, tmp1, lsr 3
|
||||
ret
|
||||
|
||||
L(nonascii_loop):
|
||||
ldp data1, data2, [src, 16]!
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, REP8_7f
|
||||
sub tmp3, data2, zeroones
|
||||
orr tmp4, data2, REP8_7f
|
||||
bics has_nul1, tmp1, tmp2
|
||||
bic has_nul2, tmp3, tmp4
|
||||
ccmp has_nul2, 0, 0, eq
|
||||
bne L(tail)
|
||||
ldp data1, data2, [src, 16]!
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, REP8_7f
|
||||
sub tmp3, data2, zeroones
|
||||
orr tmp4, data2, REP8_7f
|
||||
bics has_nul1, tmp1, tmp2
|
||||
bic has_nul2, tmp3, tmp4
|
||||
ccmp has_nul2, 0, 0, eq
|
||||
beq L(nonascii_loop)
|
||||
b L(tail)
|
||||
|
||||
/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
|
||||
srcin to 0x7f, so we ignore any NUL bytes before the string.
|
||||
Then continue in the aligned loop. */
|
||||
L(page_cross):
|
||||
bic src, srcin, 15
|
||||
ldp data1, data2, [src]
|
||||
lsl tmp1, srcin, 3
|
||||
mov tmp4, -1
|
||||
/* Look for a NUL byte at offset 16..31 in the string. */
|
||||
L(bytes16_31):
|
||||
ldp data1, data2, [srcin, 16]
|
||||
#ifdef __AARCH64EB__
|
||||
/* Big-endian. Early bytes are at MSB. */
|
||||
lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
|
||||
#else
|
||||
/* Little-endian. Early bytes are at LSB. */
|
||||
lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
|
||||
rev data1, data1
|
||||
rev data2, data2
|
||||
#endif
|
||||
orr tmp1, tmp1, REP8_80
|
||||
orn data1, data1, tmp1
|
||||
orn tmp2, data2, tmp1
|
||||
tst srcin, 8
|
||||
csel data1, data1, tmp4, eq
|
||||
csel data2, data2, tmp2, eq
|
||||
b L(page_cross_entry)
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, REP8_7f
|
||||
sub tmp3, data2, zeroones
|
||||
orr tmp4, data2, REP8_7f
|
||||
bics has_nul1, tmp1, tmp2
|
||||
bic has_nul2, tmp3, tmp4
|
||||
ccmp has_nul2, 0, 0, eq
|
||||
b.eq L(loop_entry)
|
||||
|
||||
.size strlen, . - strlen
|
||||
/* Find the exact offset of the first NUL byte at offset 16..31 from
|
||||
the string start. Enter with C = has_nul1 == 0. */
|
||||
csel has_nul1, has_nul1, has_nul2, cc
|
||||
mov len, 24
|
||||
rev has_nul1, has_nul1
|
||||
mov tmp3, 16
|
||||
clz tmp1, has_nul1
|
||||
csel len, tmp3, len, cc
|
||||
add len, len, tmp1, lsr 3
|
||||
ret
|
||||
|
||||
nop
|
||||
L(loop_entry):
|
||||
bic src, srcin, 31
|
||||
|
||||
.p2align 5
|
||||
L(loop):
|
||||
ldp dataq1, dataq2, [src, 32]!
|
||||
uminp maskv.16b, datav1.16b, datav2.16b
|
||||
uminp maskv.16b, maskv.16b, maskv.16b
|
||||
cmeq maskv.8b, maskv.8b, 0
|
||||
fmov synd, maskd
|
||||
cbz synd, L(loop)
|
||||
|
||||
/* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
|
||||
cmeq maskv.16b, datav1.16b, 0
|
||||
sub len, src, srcin
|
||||
cbnz syndw, 1f
|
||||
cmeq maskv.16b, datav2.16b, 0
|
||||
add len, len, 16
|
||||
1:
|
||||
/* Generate a bitmask and compute correct byte offset. */
|
||||
shrn maskv.8b, maskv.8h, 4
|
||||
fmov synd, maskd
|
||||
#ifndef __AARCH64EB__
|
||||
rbit synd, synd
|
||||
#endif
|
||||
clz tmp, synd
|
||||
add len, len, tmp, lsr 2
|
||||
ret
|
||||
|
||||
L(page_cross):
|
||||
bic src, srcin, 31
|
||||
mov tmpw, 0x0c03
|
||||
movk tmpw, 0xc030, lsl 16
|
||||
ld1 {datav1.16b, datav2.16b}, [src]
|
||||
dup maskv.4s, tmpw
|
||||
cmeq datav1.16b, datav1.16b, 0
|
||||
cmeq datav2.16b, datav2.16b, 0
|
||||
and datav1.16b, datav1.16b, maskv.16b
|
||||
and datav2.16b, datav2.16b, maskv.16b
|
||||
addp maskv.16b, datav1.16b, datav2.16b
|
||||
addp maskv.16b, maskv.16b, maskv.16b
|
||||
fmov synd, maskd
|
||||
lsl shift, srcin, 1
|
||||
lsr synd, synd, shift
|
||||
cbz synd, L(loop)
|
||||
|
||||
rbit synd, synd
|
||||
clz len, synd
|
||||
lsr len, len, 1
|
||||
ret
|
||||
|
||||
END (strlen)
|
||||
#endif
|
||||
|
@ -1,49 +1,23 @@
|
||||
/* Copyright (c) 2013, 2018, Linaro Limited
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the Linaro nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
|
||||
|
||||
/*
|
||||
* strncmp - compare two strings
|
||||
*
|
||||
* Copyright (c) 2013-2022, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
|
||||
/* See strcmp-stub.c */
|
||||
#else
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64
|
||||
* ARMv8-a, AArch64.
|
||||
* MTE compatible.
|
||||
*/
|
||||
|
||||
.macro def_fn f p2align=0
|
||||
.text
|
||||
.p2align \p2align
|
||||
.global \f
|
||||
.type \f, %function
|
||||
\f:
|
||||
.endm
|
||||
#include "asmdefs.h"
|
||||
|
||||
#define REP8_01 0x0101010101010101
|
||||
#define REP8_7f 0x7f7f7f7f7f7f7f7f
|
||||
#define REP8_80 0x8080808080808080
|
||||
|
||||
/* Parameters and result. */
|
||||
#define src1 x0
|
||||
@ -64,86 +38,91 @@
|
||||
#define tmp3 x10
|
||||
#define zeroones x11
|
||||
#define pos x12
|
||||
#define limit_wd x13
|
||||
#define mask x14
|
||||
#define endloop x15
|
||||
#define mask x13
|
||||
#define endloop x14
|
||||
#define count mask
|
||||
#define offset pos
|
||||
#define neg_offset x15
|
||||
|
||||
.text
|
||||
.p2align 6
|
||||
.rep 7
|
||||
nop /* Pad so that the loop below fits a cache line. */
|
||||
.endr
|
||||
def_fn strncmp
|
||||
cbz limit, .Lret0
|
||||
/* Define endian dependent shift operations.
|
||||
On big-endian early bytes are at MSB and on little-endian LSB.
|
||||
LS_FW means shifting towards early bytes.
|
||||
LS_BK means shifting towards later bytes.
|
||||
*/
|
||||
#ifdef __AARCH64EB__
|
||||
#define LS_FW lsl
|
||||
#define LS_BK lsr
|
||||
#else
|
||||
#define LS_FW lsr
|
||||
#define LS_BK lsl
|
||||
#endif
|
||||
|
||||
ENTRY (strncmp)
|
||||
PTR_ARG (0)
|
||||
PTR_ARG (1)
|
||||
SIZE_ARG (2)
|
||||
cbz limit, L(ret0)
|
||||
eor tmp1, src1, src2
|
||||
mov zeroones, #REP8_01
|
||||
tst tmp1, #7
|
||||
and count, src1, #7
|
||||
b.ne .Lmisaligned8
|
||||
cbnz count, .Lmutual_align
|
||||
/* Calculate the number of full and partial words -1. */
|
||||
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
|
||||
lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
|
||||
b.ne L(misaligned8)
|
||||
cbnz count, L(mutual_align)
|
||||
|
||||
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
|
||||
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
|
||||
can be done in parallel across the entire word. */
|
||||
/* Start of performance-critical section -- one 64B cache line. */
|
||||
.Lloop_aligned:
|
||||
.p2align 4
|
||||
L(loop_aligned):
|
||||
ldr data1, [src1], #8
|
||||
ldr data2, [src2], #8
|
||||
.Lstart_realigned:
|
||||
subs limit_wd, limit_wd, #1
|
||||
L(start_realigned):
|
||||
subs limit, limit, #8
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, #REP8_7f
|
||||
eor diff, data1, data2 /* Non-zero if differences found. */
|
||||
csinv endloop, diff, xzr, pl /* Last Dword or differences. */
|
||||
csinv endloop, diff, xzr, hi /* Last Dword or differences. */
|
||||
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
|
||||
ccmp endloop, #0, #0, eq
|
||||
b.eq .Lloop_aligned
|
||||
/* End of performance-critical section -- one 64B cache line. */
|
||||
b.eq L(loop_aligned)
|
||||
/* End of main loop */
|
||||
|
||||
/* Not reached the limit, must have found the end or a diff. */
|
||||
tbz limit_wd, #63, .Lnot_limit
|
||||
|
||||
/* Limit % 8 == 0 => all bytes significant. */
|
||||
ands limit, limit, #7
|
||||
b.eq .Lnot_limit
|
||||
|
||||
lsl limit, limit, #3 /* Bits -> bytes. */
|
||||
mov mask, #~0
|
||||
#ifdef __AARCH64EB__
|
||||
lsr mask, mask, limit
|
||||
L(full_check):
|
||||
#ifndef __AARCH64EB__
|
||||
orr syndrome, diff, has_nul
|
||||
add limit, limit, 8 /* Rewind limit to before last subs. */
|
||||
L(syndrome_check):
|
||||
/* Limit was reached. Check if the NUL byte or the difference
|
||||
is before the limit. */
|
||||
rev syndrome, syndrome
|
||||
rev data1, data1
|
||||
clz pos, syndrome
|
||||
rev data2, data2
|
||||
lsl data1, data1, pos
|
||||
cmp limit, pos, lsr #3
|
||||
lsl data2, data2, pos
|
||||
/* But we need to zero-extend (char is unsigned) the value and then
|
||||
perform a signed 32-bit subtraction. */
|
||||
lsr data1, data1, #56
|
||||
sub result, data1, data2, lsr #56
|
||||
csel result, result, xzr, hi
|
||||
ret
|
||||
#else
|
||||
lsl mask, mask, limit
|
||||
#endif
|
||||
/* Not reached the limit, must have found the end or a diff. */
|
||||
tbz limit, #63, L(not_limit)
|
||||
add tmp1, limit, 8
|
||||
cbz limit, L(not_limit)
|
||||
|
||||
lsl limit, tmp1, #3 /* Bits -> bytes. */
|
||||
mov mask, #~0
|
||||
lsr mask, mask, limit
|
||||
bic data1, data1, mask
|
||||
bic data2, data2, mask
|
||||
|
||||
/* Make sure that the NUL byte is marked in the syndrome. */
|
||||
orr has_nul, has_nul, mask
|
||||
|
||||
.Lnot_limit:
|
||||
orr syndrome, diff, has_nul
|
||||
|
||||
#ifndef __AARCH64EB__
|
||||
rev syndrome, syndrome
|
||||
rev data1, data1
|
||||
/* The MS-non-zero bit of the syndrome marks either the first bit
|
||||
that is different, or the top bit of the first zero byte.
|
||||
Shifting left now will bring the critical information into the
|
||||
top bits. */
|
||||
clz pos, syndrome
|
||||
rev data2, data2
|
||||
lsl data1, data1, pos
|
||||
lsl data2, data2, pos
|
||||
/* But we need to zero-extend (char is unsigned) the value and then
|
||||
perform a signed 32-bit subtraction. */
|
||||
lsr data1, data1, #56
|
||||
sub result, data1, data2, lsr #56
|
||||
ret
|
||||
#else
|
||||
L(not_limit):
|
||||
/* For big-endian we cannot use the trick with the syndrome value
|
||||
as carry-propagation can corrupt the upper bits if the trailing
|
||||
bytes in the string contain 0x01. */
|
||||
@ -164,10 +143,11 @@ def_fn strncmp
|
||||
rev has_nul, has_nul
|
||||
orr syndrome, diff, has_nul
|
||||
clz pos, syndrome
|
||||
/* The MS-non-zero bit of the syndrome marks either the first bit
|
||||
that is different, or the top bit of the first zero byte.
|
||||
/* The most-significant-non-zero bit of the syndrome marks either the
|
||||
first bit that is different, or the top bit of the first zero byte.
|
||||
Shifting left now will bring the critical information into the
|
||||
top bits. */
|
||||
L(end_quick):
|
||||
lsl data1, data1, pos
|
||||
lsl data2, data2, pos
|
||||
/* But we need to zero-extend (char is unsigned) the value and then
|
||||
@ -177,7 +157,7 @@ def_fn strncmp
|
||||
ret
|
||||
#endif
|
||||
|
||||
.Lmutual_align:
|
||||
L(mutual_align):
|
||||
/* Sources are mutually aligned, but are not currently at an
|
||||
alignment boundary. Round down the addresses and then mask off
|
||||
the bytes that precede the start point.
|
||||
@ -189,102 +169,143 @@ def_fn strncmp
|
||||
neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
|
||||
ldr data2, [src2], #8
|
||||
mov tmp2, #~0
|
||||
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
|
||||
#ifdef __AARCH64EB__
|
||||
/* Big-endian. Early bytes are at MSB. */
|
||||
lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
|
||||
#else
|
||||
/* Little-endian. Early bytes are at LSB. */
|
||||
lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
|
||||
#endif
|
||||
and tmp3, limit_wd, #7
|
||||
lsr limit_wd, limit_wd, #3
|
||||
/* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
|
||||
add limit, limit, count
|
||||
add tmp3, tmp3, count
|
||||
LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
|
||||
/* Adjust the limit and ensure it doesn't overflow. */
|
||||
adds limit, limit, count
|
||||
csinv limit, limit, xzr, lo
|
||||
orr data1, data1, tmp2
|
||||
orr data2, data2, tmp2
|
||||
add limit_wd, limit_wd, tmp3, lsr #3
|
||||
b .Lstart_realigned
|
||||
b L(start_realigned)
|
||||
|
||||
.p2align 6
|
||||
.p2align 4
|
||||
/* Don't bother with dwords for up to 16 bytes. */
|
||||
.Lmisaligned8:
|
||||
L(misaligned8):
|
||||
cmp limit, #16
|
||||
b.hs .Ltry_misaligned_words
|
||||
b.hs L(try_misaligned_words)
|
||||
|
||||
.Lbyte_loop:
|
||||
L(byte_loop):
|
||||
/* Perhaps we can do better than this. */
|
||||
ldrb data1w, [src1], #1
|
||||
ldrb data2w, [src2], #1
|
||||
subs limit, limit, #1
|
||||
ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
|
||||
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
|
||||
b.eq .Lbyte_loop
|
||||
.Ldone:
|
||||
b.eq L(byte_loop)
|
||||
L(done):
|
||||
sub result, data1, data2
|
||||
ret
|
||||
/* Align the SRC1 to a dword by doing a bytewise compare and then do
|
||||
the dword loop. */
|
||||
.Ltry_misaligned_words:
|
||||
lsr limit_wd, limit, #3
|
||||
cbz count, .Ldo_misaligned
|
||||
L(try_misaligned_words):
|
||||
cbz count, L(src1_aligned)
|
||||
|
||||
neg count, count
|
||||
and count, count, #7
|
||||
sub limit, limit, count
|
||||
lsr limit_wd, limit, #3
|
||||
|
||||
.Lpage_end_loop:
|
||||
L(page_end_loop):
|
||||
ldrb data1w, [src1], #1
|
||||
ldrb data2w, [src2], #1
|
||||
cmp data1w, #1
|
||||
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
|
||||
b.ne .Ldone
|
||||
b.ne L(done)
|
||||
subs count, count, #1
|
||||
b.hi .Lpage_end_loop
|
||||
b.hi L(page_end_loop)
|
||||
|
||||
.Ldo_misaligned:
|
||||
/* Prepare ourselves for the next page crossing. Unlike the aligned
|
||||
loop, we fetch 1 less dword because we risk crossing bounds on
|
||||
SRC2. */
|
||||
mov count, #8
|
||||
subs limit_wd, limit_wd, #1
|
||||
b.lo .Ldone_loop
|
||||
.Lloop_misaligned:
|
||||
and tmp2, src2, #0xff8
|
||||
eor tmp2, tmp2, #0xff8
|
||||
cbz tmp2, .Lpage_end_loop
|
||||
/* The following diagram explains the comparison of misaligned strings.
|
||||
The bytes are shown in natural order. For little-endian, it is
|
||||
reversed in the registers. The "x" bytes are before the string.
|
||||
The "|" separates data that is loaded at one time.
|
||||
src1 | a a a a a a a a | b b b c c c c c | . . .
|
||||
src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
|
||||
|
||||
After shifting in each step, the data looks like this:
|
||||
STEP_A STEP_B STEP_C
|
||||
data1 a a a a a a a a b b b c c c c c b b b c c c c c
|
||||
data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
|
||||
|
||||
The bytes with "0" are eliminated from the syndrome via mask.
|
||||
|
||||
Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
|
||||
time from SRC2. The comparison happens in 3 steps. After each step
|
||||
the loop can exit, or read from SRC1 or SRC2. */
|
||||
L(src1_aligned):
|
||||
/* Calculate offset from 8 byte alignment to string start in bits. No
|
||||
need to mask offset since shifts are ignoring upper bits. */
|
||||
lsl offset, src2, #3
|
||||
bic src2, src2, #0xf
|
||||
mov mask, -1
|
||||
neg neg_offset, offset
|
||||
ldr data1, [src1], #8
|
||||
ldp tmp1, tmp2, [src2], #16
|
||||
LS_BK mask, mask, neg_offset
|
||||
and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
|
||||
/* Skip the first compare if data in tmp1 is irrelevant. */
|
||||
tbnz offset, 6, L(misaligned_mid_loop)
|
||||
|
||||
L(loop_misaligned):
|
||||
/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
|
||||
LS_FW data2, tmp1, offset
|
||||
LS_BK tmp1, tmp2, neg_offset
|
||||
subs limit, limit, #8
|
||||
orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
|
||||
sub has_nul, data1, zeroones
|
||||
eor diff, data1, data2 /* Non-zero if differences found. */
|
||||
orr tmp3, data1, #REP8_7f
|
||||
csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
|
||||
bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
|
||||
orr tmp3, endloop, has_nul
|
||||
cbnz tmp3, L(full_check)
|
||||
|
||||
ldr data1, [src1], #8
|
||||
ldr data2, [src2], #8
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, #REP8_7f
|
||||
eor diff, data1, data2 /* Non-zero if differences found. */
|
||||
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
|
||||
ccmp diff, #0, #0, eq
|
||||
b.ne .Lnot_limit
|
||||
subs limit_wd, limit_wd, #1
|
||||
b.pl .Lloop_misaligned
|
||||
L(misaligned_mid_loop):
|
||||
/* STEP_B: Compare first part of data1 to second part of tmp2. */
|
||||
LS_FW data2, tmp2, offset
|
||||
#ifdef __AARCH64EB__
|
||||
/* For big-endian we do a byte reverse to avoid carry-propagation
|
||||
problem described above. This way we can reuse the has_nul in the
|
||||
next step and also use syndrome value trick at the end. */
|
||||
rev tmp3, data1
|
||||
#define data1_fixed tmp3
|
||||
#else
|
||||
#define data1_fixed data1
|
||||
#endif
|
||||
sub has_nul, data1_fixed, zeroones
|
||||
orr tmp3, data1_fixed, #REP8_7f
|
||||
eor diff, data2, data1 /* Non-zero if differences found. */
|
||||
bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
|
||||
#ifdef __AARCH64EB__
|
||||
rev has_nul, has_nul
|
||||
#endif
|
||||
cmp limit, neg_offset, lsr #3
|
||||
orr syndrome, diff, has_nul
|
||||
bic syndrome, syndrome, mask /* Ignore later bytes. */
|
||||
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
|
||||
cbnz tmp3, L(syndrome_check)
|
||||
|
||||
.Ldone_loop:
|
||||
/* We found a difference or a NULL before the limit was reached. */
|
||||
and limit, limit, #7
|
||||
cbz limit, .Lnot_limit
|
||||
/* Read the last word. */
|
||||
sub src1, src1, 8
|
||||
sub src2, src2, 8
|
||||
ldr data1, [src1, limit]
|
||||
ldr data2, [src2, limit]
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, #REP8_7f
|
||||
eor diff, data1, data2 /* Non-zero if differences found. */
|
||||
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
|
||||
ccmp diff, #0, #0, eq
|
||||
b.ne .Lnot_limit
|
||||
/* STEP_C: Compare second part of data1 to first part of tmp1. */
|
||||
ldp tmp1, tmp2, [src2], #16
|
||||
cmp limit, #8
|
||||
LS_BK data2, tmp1, neg_offset
|
||||
eor diff, data2, data1 /* Non-zero if differences found. */
|
||||
orr syndrome, diff, has_nul
|
||||
and syndrome, syndrome, mask /* Ignore earlier bytes. */
|
||||
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
|
||||
cbnz tmp3, L(syndrome_check)
|
||||
|
||||
.Lret0:
|
||||
ldr data1, [src1], #8
|
||||
sub limit, limit, #8
|
||||
b L(loop_misaligned)
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
L(syndrome_check):
|
||||
clz pos, syndrome
|
||||
cmp pos, limit, lsl #3
|
||||
b.lo L(end_quick)
|
||||
#endif
|
||||
|
||||
L(ret0):
|
||||
mov result, #0
|
||||
ret
|
||||
.size strncmp, . - strncmp
|
||||
END(strncmp)
|
||||
#endif
|
||||
|
@ -1,187 +1,105 @@
|
||||
/* strnlen - calculate the length of a string with limit.
|
||||
|
||||
Copyright (c) 2013, Linaro Limited
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the Linaro nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
|
||||
|
||||
/*
|
||||
* strnlen - calculate the length of a string with limit.
|
||||
*
|
||||
* Copyright (c) 2020-2022, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
|
||||
/* See strlen-stub.c */
|
||||
#else
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64
|
||||
* ARMv8-a, AArch64, Advanced SIMD.
|
||||
* MTE compatible.
|
||||
*/
|
||||
|
||||
/* Arguments and results. */
|
||||
#include "asmdefs.h"
|
||||
|
||||
#define srcin x0
|
||||
#define len x0
|
||||
#define limit x1
|
||||
#define cntin x1
|
||||
#define result x0
|
||||
|
||||
/* Locals and temporaries. */
|
||||
#define src x2
|
||||
#define data1 x3
|
||||
#define data2 x4
|
||||
#define data2a x5
|
||||
#define has_nul1 x6
|
||||
#define has_nul2 x7
|
||||
#define tmp1 x8
|
||||
#define tmp2 x9
|
||||
#define tmp3 x10
|
||||
#define tmp4 x11
|
||||
#define zeroones x12
|
||||
#define pos x13
|
||||
#define limit_wd x14
|
||||
#define synd x3
|
||||
#define shift x4
|
||||
#define tmp x4
|
||||
#define cntrem x5
|
||||
|
||||
.macro def_fn f p2align=0
|
||||
.text
|
||||
.p2align \p2align
|
||||
.global \f
|
||||
.type \f, %function
|
||||
\f:
|
||||
.endm
|
||||
#define qdata q0
|
||||
#define vdata v0
|
||||
#define vhas_chr v1
|
||||
#define vend v2
|
||||
#define dend d2
|
||||
|
||||
#define REP8_01 0x0101010101010101
|
||||
#define REP8_7f 0x7f7f7f7f7f7f7f7f
|
||||
#define REP8_80 0x8080808080808080
|
||||
/*
|
||||
Core algorithm:
|
||||
Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
|
||||
four bits per byte using the shrn instruction. A count trailing zeros then
|
||||
identifies the first zero byte. */
|
||||
|
||||
.text
|
||||
.p2align 6
|
||||
.Lstart:
|
||||
/* Pre-pad to ensure critical loop begins an icache line. */
|
||||
.rep 7
|
||||
nop
|
||||
.endr
|
||||
/* Put this code here to avoid wasting more space with pre-padding. */
|
||||
.Lhit_limit:
|
||||
mov len, limit
|
||||
ENTRY (strnlen)
|
||||
PTR_ARG (0)
|
||||
SIZE_ARG (1)
|
||||
bic src, srcin, 15
|
||||
cbz cntin, L(nomatch)
|
||||
ld1 {vdata.16b}, [src]
|
||||
cmeq vhas_chr.16b, vdata.16b, 0
|
||||
lsl shift, srcin, 2
|
||||
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
|
||||
fmov synd, dend
|
||||
lsr synd, synd, shift
|
||||
cbz synd, L(start_loop)
|
||||
L(finish):
|
||||
rbit synd, synd
|
||||
clz synd, synd
|
||||
lsr result, synd, 2
|
||||
cmp cntin, result
|
||||
csel result, cntin, result, ls
|
||||
ret
|
||||
|
||||
def_fn strnlen
|
||||
cbz limit, .Lhit_limit
|
||||
mov zeroones, #REP8_01
|
||||
bic src, srcin, #15
|
||||
ands tmp1, srcin, #15
|
||||
b.ne .Lmisaligned
|
||||
/* Calculate the number of full and partial words -1. */
|
||||
sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
|
||||
lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
|
||||
|
||||
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
|
||||
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
|
||||
can be done in parallel across the entire word. */
|
||||
/* The inner loop deals with two Dwords at a time. This has a
|
||||
slightly higher start-up cost, but we should win quite quickly,
|
||||
especially on cores with a high number of issue slots per
|
||||
cycle, as we get much better parallelism out of the operations. */
|
||||
|
||||
/* Start of critial section -- keep to one 64Byte cache line. */
|
||||
.Lloop:
|
||||
ldp data1, data2, [src], #16
|
||||
.Lrealigned:
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, #REP8_7f
|
||||
sub tmp3, data2, zeroones
|
||||
orr tmp4, data2, #REP8_7f
|
||||
bic has_nul1, tmp1, tmp2
|
||||
bic has_nul2, tmp3, tmp4
|
||||
subs limit_wd, limit_wd, #1
|
||||
orr tmp1, has_nul1, has_nul2
|
||||
ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
|
||||
b.eq .Lloop
|
||||
/* End of critical section -- keep to one 64Byte cache line. */
|
||||
|
||||
orr tmp1, has_nul1, has_nul2
|
||||
cbz tmp1, .Lhit_limit /* No null in final Qword. */
|
||||
|
||||
/* We know there's a null in the final Qword. The easiest thing
|
||||
to do now is work out the length of the string and return
|
||||
MIN (len, limit). */
|
||||
|
||||
sub len, src, srcin
|
||||
cbz has_nul1, .Lnul_in_data2
|
||||
#ifdef __AARCH64EB__
|
||||
mov data2, data1
|
||||
#endif
|
||||
sub len, len, #8
|
||||
mov has_nul2, has_nul1
|
||||
.Lnul_in_data2:
|
||||
#ifdef __AARCH64EB__
|
||||
/* For big-endian, carry propagation (if the final byte in the
|
||||
string is 0x01) means we cannot use has_nul directly. The
|
||||
easiest way to get the correct byte is to byte-swap the data
|
||||
and calculate the syndrome a second time. */
|
||||
rev data2, data2
|
||||
sub tmp1, data2, zeroones
|
||||
orr tmp2, data2, #REP8_7f
|
||||
bic has_nul2, tmp1, tmp2
|
||||
#endif
|
||||
sub len, len, #8
|
||||
rev has_nul2, has_nul2
|
||||
clz pos, has_nul2
|
||||
add len, len, pos, lsr #3 /* Bits to bytes. */
|
||||
cmp len, limit
|
||||
csel len, len, limit, ls /* Return the lower value. */
|
||||
L(nomatch):
|
||||
mov result, cntin
|
||||
ret
|
||||
|
||||
.Lmisaligned:
|
||||
/* Deal with a partial first word.
|
||||
We're doing two things in parallel here;
|
||||
1) Calculate the number of words (but avoiding overflow if
|
||||
limit is near ULONG_MAX) - to do this we need to work out
|
||||
limit + tmp1 - 1 as a 65-bit value before shifting it;
|
||||
2) Load and mask the initial data words - we force the bytes
|
||||
before the ones we are interested in to 0xff - this ensures
|
||||
early bytes will not hit any zero detection. */
|
||||
sub limit_wd, limit, #1
|
||||
neg tmp4, tmp1
|
||||
cmp tmp1, #8
|
||||
L(start_loop):
|
||||
sub tmp, src, srcin
|
||||
add tmp, tmp, 17
|
||||
subs cntrem, cntin, tmp
|
||||
b.lo L(nomatch)
|
||||
|
||||
and tmp3, limit_wd, #15
|
||||
lsr limit_wd, limit_wd, #4
|
||||
mov tmp2, #~0
|
||||
|
||||
ldp data1, data2, [src], #16
|
||||
lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */
|
||||
add tmp3, tmp3, tmp1
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
/* Big-endian. Early bytes are at MSB. */
|
||||
lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
|
||||
#else
|
||||
/* Little-endian. Early bytes are at LSB. */
|
||||
lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
|
||||
/* Make sure that it won't overread by a 16-byte chunk */
|
||||
tbz cntrem, 4, L(loop32_2)
|
||||
sub src, src, 16
|
||||
.p2align 5
|
||||
L(loop32):
|
||||
ldr qdata, [src, 32]!
|
||||
cmeq vhas_chr.16b, vdata.16b, 0
|
||||
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
|
||||
fmov synd, dend
|
||||
cbnz synd, L(end)
|
||||
L(loop32_2):
|
||||
ldr qdata, [src, 16]
|
||||
subs cntrem, cntrem, 32
|
||||
cmeq vhas_chr.16b, vdata.16b, 0
|
||||
b.lo L(end_2)
|
||||
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
|
||||
fmov synd, dend
|
||||
cbz synd, L(loop32)
|
||||
L(end_2):
|
||||
add src, src, 16
|
||||
L(end):
|
||||
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
|
||||
sub result, src, srcin
|
||||
fmov synd, dend
|
||||
#ifndef __AARCH64EB__
|
||||
rbit synd, synd
|
||||
#endif
|
||||
add limit_wd, limit_wd, tmp3, lsr #4
|
||||
|
||||
orr data1, data1, tmp2
|
||||
orr data2a, data2, tmp2
|
||||
|
||||
csinv data1, data1, xzr, le
|
||||
csel data2, data2, data2a, le
|
||||
b .Lrealigned
|
||||
.size strnlen, . - .Lstart /* Include pre-padding in size. */
|
||||
clz synd, synd
|
||||
add result, result, synd, lsr 2
|
||||
cmp cntin, result
|
||||
csel result, cntin, result, ls
|
||||
ret
|
||||
|
||||
END (strnlen)
|
||||
#endif
|
||||
|
@ -1,32 +1,9 @@
|
||||
/*
|
||||
strrchr - find last instance of a character in a string
|
||||
|
||||
Copyright (c) 2014, ARM Limited
|
||||
All rights Reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the company nor the names of its contributors
|
||||
may be used to endorse or promote products derived from this
|
||||
software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
|
||||
|
||||
* strrchr - find last position of a character in a string.
|
||||
*
|
||||
* Copyright (c) 2014-2022, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
|
||||
/* See strchr-stub.c */
|
||||
#else
|
||||
@ -37,6 +14,8 @@
|
||||
* Neon Available.
|
||||
*/
|
||||
|
||||
#include "asmdefs.h"
|
||||
|
||||
/* Arguments and results. */
|
||||
#define srcin x0
|
||||
#define chrin w1
|
||||
@ -78,17 +57,8 @@
|
||||
in the original string a count_trailing_zeros() operation will
|
||||
identify exactly which byte is causing the termination, and why. */
|
||||
|
||||
/* Locals and temporaries. */
|
||||
|
||||
.macro def_fn f p2align=0
|
||||
.text
|
||||
.p2align \p2align
|
||||
.global \f
|
||||
.type \f, %function
|
||||
\f:
|
||||
.endm
|
||||
|
||||
def_fn strrchr
|
||||
ENTRY (strrchr)
|
||||
PTR_ARG (0)
|
||||
/* Magic constant 0x40100401 to allow us to identify which lane
|
||||
matches the requested byte. Magic constant 0x80200802 used
|
||||
similarly for NUL termination. */
|
||||
@ -100,7 +70,7 @@ def_fn strrchr
|
||||
mov src_offset, #0
|
||||
ands tmp1, srcin, #31
|
||||
add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
|
||||
b.eq .Laligned
|
||||
b.eq L(aligned)
|
||||
|
||||
/* Input string is not 32-byte aligned. Rather than forcing
|
||||
the padding bytes to a safe value, we calculate the syndrome
|
||||
@ -118,45 +88,45 @@ def_fn strrchr
|
||||
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
|
||||
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
|
||||
addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
|
||||
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b // 128->64
|
||||
addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
|
||||
mov nul_match, vhas_nul1.2d[0]
|
||||
addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64
|
||||
mov nul_match, vend1.d[0]
|
||||
lsl tmp1, tmp1, #1
|
||||
mov const_m1, #~0
|
||||
mov chr_match, vhas_chr1.2d[0]
|
||||
lsr tmp3, const_m1, tmp1
|
||||
mov chr_match, vend1.d[1]
|
||||
|
||||
bic nul_match, nul_match, tmp3 // Mask padding bits.
|
||||
bic chr_match, chr_match, tmp3 // Mask padding bits.
|
||||
cbnz nul_match, .Ltail
|
||||
cbnz nul_match, L(tail)
|
||||
|
||||
.Lloop:
|
||||
.p2align 4
|
||||
L(loop):
|
||||
cmp chr_match, #0
|
||||
csel src_match, src, src_match, ne
|
||||
csel src_offset, chr_match, src_offset, ne
|
||||
.Laligned:
|
||||
L(aligned):
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
cmeq vhas_nul1.16b, vdata1.16b, #0
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_nul2.16b, vdata2.16b, #0
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
addp vend1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
|
||||
uminp vend1.16b, vdata1.16b, vdata2.16b
|
||||
and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
|
||||
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
|
||||
cmeq vend1.16b, vend1.16b, 0
|
||||
addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
|
||||
addp vend1.16b, vend1.16b, vend1.16b // 128->64
|
||||
addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
|
||||
mov nul_match, vend1.2d[0]
|
||||
mov chr_match, vhas_chr1.2d[0]
|
||||
cbz nul_match, .Lloop
|
||||
addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64
|
||||
mov nul_match, vend1.d[0]
|
||||
mov chr_match, vend1.d[1]
|
||||
cbz nul_match, L(loop)
|
||||
|
||||
cmeq vhas_nul1.16b, vdata1.16b, #0
|
||||
cmeq vhas_nul2.16b, vdata2.16b, #0
|
||||
and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
|
||||
and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
|
||||
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
|
||||
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
|
||||
mov nul_match, vhas_nul1.2d[0]
|
||||
mov nul_match, vhas_nul1.d[0]
|
||||
|
||||
.Ltail:
|
||||
L(tail):
|
||||
/* Work out exactly where the string ends. */
|
||||
sub tmp4, nul_match, #1
|
||||
eor tmp4, tmp4, nul_match
|
||||
@ -178,5 +148,5 @@ def_fn strrchr
|
||||
|
||||
ret
|
||||
|
||||
.size strrchr, . - strrchr
|
||||
END (strrchr)
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user