4
0
mirror of git://sourceware.org/git/newlib-cygwin.git synced 2025-03-03 13:35:46 +08:00

Optimized memcmp

This is an optimized memcmp for AArch64.  This is a complete rewrite
using a different algorithm.  The previous version split into cases
where both inputs were aligned, the inputs were mutually aligned and
unaligned using a byte loop.  The new version combines all these cases,
while small inputs of less than 8 bytes are handled separately.

This allows the main code to be sped up using unaligned loads since
there are now at least 8 bytes to be compared.  After the first 8 bytes,
align the first input.  This ensures each iteration does at most one
unaligned access and mutually aligned inputs behave as aligned.
After the main loop, process the last 8 bytes using unaligned accesses.

This improves performance of (mutually) aligned cases by 25% and
unaligned by >500% (yes >6 times faster) on large inputs.

ChangeLog:
2017-06-28  Wilco Dijkstra  <wdijkstr@arm.com>

        * newlib/libc/machine/aarch64/memcmp.S (memcmp):
        Rewrite of optimized memcmp.

GLIBC benchtests/bench-memcmp.c performance comparison for Cortex-A53:

Length    1, alignment  1/ 1:		153%
Length    1, alignment  1/ 1:		119%
Length    1, alignment  1/ 1:		154%
Length    2, alignment  2/ 2:		121%
Length    2, alignment  2/ 2:		140%
Length    2, alignment  2/ 2:		121%
Length    3, alignment  3/ 3:		105%
Length    3, alignment  3/ 3:		105%
Length    3, alignment  3/ 3:		105%
Length    4, alignment  4/ 4:		155%
Length    4, alignment  4/ 4:		154%
Length    4, alignment  4/ 4:		161%
Length    5, alignment  5/ 5:		173%
Length    5, alignment  5/ 5:		173%
Length    5, alignment  5/ 5:		173%
Length    6, alignment  6/ 6:		145%
Length    6, alignment  6/ 6:		145%
Length    6, alignment  6/ 6:		145%
Length    7, alignment  7/ 7:		125%
Length    7, alignment  7/ 7:		125%
Length    7, alignment  7/ 7:		125%
Length    8, alignment  8/ 8:		111%
Length    8, alignment  8/ 8:		130%
Length    8, alignment  8/ 8:		124%
Length    9, alignment  9/ 9:		160%
Length    9, alignment  9/ 9:		160%
Length    9, alignment  9/ 9:		150%
Length   10, alignment 10/10:		170%
Length   10, alignment 10/10:		137%
Length   10, alignment 10/10:		150%
Length   11, alignment 11/11:		160%
Length   11, alignment 11/11:		160%
Length   11, alignment 11/11:		160%
Length   12, alignment 12/12:		146%
Length   12, alignment 12/12:		168%
Length   12, alignment 12/12:		156%
Length   13, alignment 13/13:		167%
Length   13, alignment 13/13:		167%
Length   13, alignment 13/13:		173%
Length   14, alignment 14/14:		167%
Length   14, alignment 14/14:		168%
Length   14, alignment 14/14:		168%
Length   15, alignment 15/15:		168%
Length   15, alignment 15/15:		173%
Length   15, alignment 15/15:		173%
Length    1, alignment  0/ 0:		134%
Length    1, alignment  0/ 0:		127%
Length    1, alignment  0/ 0:		119%
Length    2, alignment  0/ 0:		94%
Length    2, alignment  0/ 0:		94%
Length    2, alignment  0/ 0:		106%
Length    3, alignment  0/ 0:		82%
Length    3, alignment  0/ 0:		87%
Length    3, alignment  0/ 0:		82%
Length    4, alignment  0/ 0:		115%
Length    4, alignment  0/ 0:		115%
Length    4, alignment  0/ 0:		122%
Length    5, alignment  0/ 0:		127%
Length    5, alignment  0/ 0:		119%
Length    5, alignment  0/ 0:		127%
Length    6, alignment  0/ 0:		103%
Length    6, alignment  0/ 0:		100%
Length    6, alignment  0/ 0:		100%
Length    7, alignment  0/ 0:		82%
Length    7, alignment  0/ 0:		91%
Length    7, alignment  0/ 0:		87%
Length    8, alignment  0/ 0:		111%
Length    8, alignment  0/ 0:		124%
Length    8, alignment  0/ 0:		124%
Length    9, alignment  0/ 0:		136%
Length    9, alignment  0/ 0:		136%
Length    9, alignment  0/ 0:		136%
Length   10, alignment  0/ 0:		136%
Length   10, alignment  0/ 0:		135%
Length   10, alignment  0/ 0:		136%
Length   11, alignment  0/ 0:		136%
Length   11, alignment  0/ 0:		136%
Length   11, alignment  0/ 0:		135%
Length   12, alignment  0/ 0:		136%
Length   12, alignment  0/ 0:		136%
Length   12, alignment  0/ 0:		136%
Length   13, alignment  0/ 0:		135%
Length   13, alignment  0/ 0:		136%
Length   13, alignment  0/ 0:		136%
Length   14, alignment  0/ 0:		136%
Length   14, alignment  0/ 0:		136%
Length   14, alignment  0/ 0:		136%
Length   15, alignment  0/ 0:		136%
Length   15, alignment  0/ 0:		136%
Length   15, alignment  0/ 0:		136%
Length    4, alignment  0/ 0:		115%
Length    4, alignment  0/ 0:		115%
Length    4, alignment  0/ 0:		115%
Length   32, alignment  0/ 0:		127%
Length   32, alignment  7/ 2:		395%
Length   32, alignment  0/ 0:		127%
Length   32, alignment  0/ 0:		127%
Length    8, alignment  0/ 0:		111%
Length    8, alignment  0/ 0:		124%
Length    8, alignment  0/ 0:		124%
Length   64, alignment  0/ 0:		128%
Length   64, alignment  6/ 4:		475%
Length   64, alignment  0/ 0:		131%
Length   64, alignment  0/ 0:		134%
Length   16, alignment  0/ 0:		128%
Length   16, alignment  0/ 0:		119%
Length   16, alignment  0/ 0:		128%
Length  128, alignment  0/ 0:		129%
Length  128, alignment  5/ 6:		475%
Length  128, alignment  0/ 0:		130%
Length  128, alignment  0/ 0:		129%
Length   32, alignment  0/ 0:		126%
Length   32, alignment  0/ 0:		126%
Length   32, alignment  0/ 0:		126%
Length  256, alignment  0/ 0:		127%
Length  256, alignment  4/ 8:		545%
Length  256, alignment  0/ 0:		126%
Length  256, alignment  0/ 0:		128%
Length   64, alignment  0/ 0:		171%
Length   64, alignment  0/ 0:		171%
Length   64, alignment  0/ 0:		174%
Length  512, alignment  0/ 0:		126%
Length  512, alignment  3/10:		585%
Length  512, alignment  0/ 0:		126%
Length  512, alignment  0/ 0:		127%
Length  128, alignment  0/ 0:		129%
Length  128, alignment  0/ 0:		128%
Length  128, alignment  0/ 0:		129%
Length 1024, alignment  0/ 0:		125%
Length 1024, alignment  2/12:		611%
Length 1024, alignment  0/ 0:		126%
Length 1024, alignment  0/ 0:		126%
Length  256, alignment  0/ 0:		128%
Length  256, alignment  0/ 0:		127%
Length  256, alignment  0/ 0:		128%
Length 2048, alignment  0/ 0:		125%
Length 2048, alignment  1/14:		625%
Length 2048, alignment  0/ 0:		125%
Length 2048, alignment  0/ 0:		125%
Length  512, alignment  0/ 0:		126%
Length  512, alignment  0/ 0:		127%
Length  512, alignment  0/ 0:		127%
Length 4096, alignment  0/ 0:		125%
Length 4096, alignment  0/16:		125%
Length 4096, alignment  0/ 0:		125%
Length 4096, alignment  0/ 0:		125%
Length 1024, alignment  0/ 0:		126%
Length 1024, alignment  0/ 0:		126%
Length 1024, alignment  0/ 0:		126%
Length 8192, alignment  0/ 0:		125%
Length 8192, alignment 63/18:		636%
Length 8192, alignment  0/ 0:		125%
Length 8192, alignment  0/ 0:		125%
Length   16, alignment  1/ 2:		317%
Length   16, alignment  1/ 2:		317%
Length   16, alignment  1/ 2:		317%
Length   32, alignment  2/ 4:		395%
Length   32, alignment  2/ 4:		395%
Length   32, alignment  2/ 4:		398%
Length   64, alignment  3/ 6:		475%
Length   64, alignment  3/ 6:		475%
Length   64, alignment  3/ 6:		477%
Length  128, alignment  4/ 8:		479%
Length  128, alignment  4/ 8:		479%
Length  128, alignment  4/ 8:		479%
Length  256, alignment  5/10:		543%
Length  256, alignment  5/10:		539%
Length  256, alignment  5/10:		543%
Length  512, alignment  6/12:		585%
Length  512, alignment  6/12:		585%
Length  512, alignment  6/12:		585%
Length 1024, alignment  7/14:		611%
Length 1024, alignment  7/14:		611%
Length 1024, alignment  7/14:		611%
This commit is contained in:
Wilco Dijkstra 2017-06-29 14:32:09 +00:00 committed by Corinna Vinschen
parent 181d8393ae
commit c86063bdc0

View File

@ -1,40 +1,53 @@
/* memcmp - compare memory /*
* Copyright (c) 2017 ARM Ltd
Copyright (c) 2013, Linaro Limited * All rights reserved.
Copyright (c) 2017, Samsung Austin R&D Center *
All rights reserved. * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
Redistribution and use in source and binary forms, with or without * are met:
modification, are permitted provided that the following conditions are met: * 1. Redistributions of source code must retain the above copyright
* Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer.
notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright
* Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the
notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
documentation and/or other materials provided with the distribution. * 3. The name of the company may not be used to endorse or promote
* Neither the name of the Linaro nor the * products derived from this software without specific prior written
names of its contributors may be used to endorse or promote products * permission.
derived from this software without specific prior written permission. *
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE */
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
/* See memcmp-stub.c */ /* See memcmp-stub.c */
#else #else
/* Assumptions: /* Assumptions:
* *
* ARMv8-a, AArch64 * ARMv8-a, AArch64, unaligned accesses.
*/ */
/* Parameters and result. */
#define src1 x0
#define src2 x1
#define limit x2
#define result w0
/* Internal variables. */
#define data1 x3
#define data1w w3
#define data2 x4
#define data2w w4
#define tmp1 x5
.macro def_fn f p2align=0 .macro def_fn f p2align=0
.text .text
.p2align \p2align .p2align \p2align
@ -43,178 +56,85 @@
\f: \f:
.endm .endm
/* Parameters and result. */ /* Small inputs of less than 8 bytes are handled separately. This allows the
#define src1 x0 main code to be sped up using unaligned loads since there are now at least
#define src2 x1 8 bytes to be compared. If the first 8 bytes are equal, align src1.
#define limit x2 This ensures each iteration does at most one unaligned access even if both
#define result x0 src1 and src2 are unaligned, and mutually aligned inputs behave as if
aligned. After the main loop, process the last 8 bytes using unaligned
/* Internal variables. */ accesses. */
#define data1 x3
#define data1w w3
#define data2 x4
#define data2w w4
#define has_nul x5
#define diff x6
#define endloop x7
#define tmp1 x8
#define tmp2 x9
#define tmp3 x10
#define pos x11
#define limit_wd x12
#define mask x13
def_fn memcmp p2align=6 def_fn memcmp p2align=6
cbz limit, .Lret0 subs limit, limit, 8
eor tmp1, src1, src2 b.lo .Lless8
tst tmp1, #7
b.ne .Lmisaligned8
ands tmp1, src1, #7
b.ne .Lmutual_align
add limit_wd, limit, #7
lsr limit_wd, limit_wd, #3
/* Start of performance-critical section -- one 64B cache line. */
.Lloop_aligned:
ldr data1, [src1], #8
ldr data2, [src2], #8
.Lstart_realigned:
subs limit_wd, limit_wd, #1
eor diff, data1, data2 /* Non-zero if differences found. */
csinv endloop, diff, xzr, ne /* Last Dword or differences. */
cbz endloop, .Lloop_aligned
/* End of performance-critical section -- one 64B cache line. */
/* Not reached the limit, must have found a diff. */ /* Limit >= 8, so check first 8 bytes using unaligned loads. */
cbnz limit_wd, .Lnot_limit ldr data1, [src1], 8
ldr data2, [src2], 8
and tmp1, src1, 7
add limit, limit, tmp1
cmp data1, data2
bne .Lreturn
/* Limit % 8 == 0 => all bytes significant. */ /* Align src1 and adjust src2 with bytes not yet done. */
ands limit, limit, #7 sub src1, src1, tmp1
b.eq .Lnot_limit sub src2, src2, tmp1
lsl limit, limit, #3 /* Bits -> bytes. */ subs limit, limit, 8
mov mask, #~0 b.ls .Llast_bytes
#ifdef __AARCH64EB__
lsr mask, mask, limit
#else
lsl mask, mask, limit
#endif
bic data1, data1, mask
bic data2, data2, mask
orr diff, diff, mask /* Loop performing 8 bytes per iteration using aligned src1.
.Lnot_limit: Limit is pre-decremented by 8 and must be larger than zero.
Exit if <= 8 bytes left to do or if the data is not equal. */
.p2align 4
.Lloop8:
ldr data1, [src1], 8
ldr data2, [src2], 8
subs limit, limit, 8
ccmp data1, data2, 0, hi /* NZCV = 0b0000. */
b.eq .Lloop8
cmp data1, data2
bne .Lreturn
/* Compare last 1-8 bytes using unaligned access. */
.Llast_bytes:
ldr data1, [src1, limit]
ldr data2, [src2, limit]
/* Compare data bytes and set return value to 0, -1 or 1. */
.Lreturn:
#ifndef __AARCH64EB__ #ifndef __AARCH64EB__
rev diff, diff
rev data1, data1 rev data1, data1
rev data2, data2 rev data2, data2
#endif #endif
/* The MS-non-zero bit of DIFF marks either the first bit cmp data1, data2
that is different, or the end of the significant data. .Lret_eq:
Shifting left now will bring the critical information into the cset result, ne
top bits. */ cneg result, result, lo
clz pos, diff
lsl data1, data1, pos
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
ret ret
.Lmutual_align: .p2align 4
/* Sources are mutually aligned, but are not currently at an /* Compare up to 8 bytes. Limit is [-8..-1]. */
alignment boundary. Round down the addresses and then mask off .Lless8:
the bytes that precede the start point. */ adds limit, limit, 4
bic src1, src1, #7 b.lo .Lless4
bic src2, src2, #7 ldr data1w, [src1], 4
add limit, limit, tmp1 /* Adjust the limit for the extra. */ ldr data2w, [src2], 4
lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ cmp data1w, data2w
ldr data1, [src1], #8 b.ne .Lreturn
neg tmp1, tmp1 /* Bits to alignment -64. */ sub limit, limit, 4
ldr data2, [src2], #8 .Lless4:
mov tmp2, #~0 adds limit, limit, 4
#ifdef __AARCH64EB__ beq .Lret_eq
/* Big-endian. Early bytes are at MSB. */ .Lbyte_loop:
lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ ldrb data1w, [src1], 1
#else ldrb data2w, [src2], 1
/* Little-endian. Early bytes are at LSB. */ subs limit, limit, 1
lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
#endif b.eq .Lbyte_loop
add limit_wd, limit, #7 sub result, data1w, data2w
orr data1, data1, tmp2
orr data2, data2, tmp2
lsr limit_wd, limit_wd, #3
b .Lstart_realigned
.Lret0:
mov result, #0
ret ret
.p2align 6
.Lmisaligned8:
cmp limit, #8
b.lo .LmisalignedLt8
.LunalignedGe8 :
/* Load the first dword with both src potentially unaligned. */
ldr data1, [src1]
ldr data2, [src2]
eor diff, data1, data2 /* Non-zero if differences found. */
cbnz diff, .Lnot_limit
/* Sources are not aligned: align one of the sources. */
and tmp1, src1, #0x7
orr tmp3, xzr, #0x8
sub pos, tmp3, tmp1
/* Increment SRC pointers by POS so SRC1 is word-aligned. */
add src1, src1, pos
add src2, src2, pos
sub limit, limit, pos
lsr limit_wd, limit, #3
cmp limit_wd, #0
/* save #bytes to go back to be able to read 8byte at end
pos=negative offset position to read 8 bytes when len%8 != 0 */
and limit, limit, #7
sub pos, limit, #8
b .Lstart_part_realigned
.p2align 5
.Lloop_part_aligned:
ldr data1, [src1], #8
ldr data2, [src2], #8
subs limit_wd, limit_wd, #1
.Lstart_part_realigned:
eor diff, data1, data2 /* Non-zero if differences found. */
cbnz diff, .Lnot_limit
b.ne .Lloop_part_aligned
/* process leftover bytes: read the leftover bytes, starting with
negative offset - so we can load 8 bytes. */
ldr data1, [src1, pos]
ldr data2, [src2, pos]
eor diff, data1, data2 /* Non-zero if differences found. */
b .Lnot_limit
.LmisalignedLt8:
sub limit, limit, #1
1:
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
b.eq 1b
sub result, data1, data2
ret
.size memcmp, . - memcmp .size memcmp, . - memcmp
#endif #endif