memcmp.S: optimize for medium to large sizes

This improved memcmp provides a fast path for compares up to 16 bytes and then compares 16 bytes at a time, thus optimizing loads from both sources. The glibc memcmp microbenchmark retains performance (with an error of ~1ns) for smaller compare sizes and reduces up to 31% of execution time for compares up to 4K on the APM Mustang. On Qualcomm Falkor this improves to almost 48%, i.e. it is almost 2x improvement for sizes of 2K and above.
2025-01-19 04:49:25 +08:00 · 2018-06-29 18:08:21 +05:30 · 2018-06-29 18:08:21 +05:30 · 2d9f35c2cc
commit 2d9f35c2cc
parent f44eee8f1b
1 changed files with 99 additions and 43 deletions
--- a/newlib/libc/machine/aarch64/memcmp.S
+++ b/newlib/libc/machine/aarch64/memcmp.S
@ -1,3 +1,31 @@
+/* memcmp - compare memory
+
+   Copyright (c) 2018 Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the Linaro nor the
+         names of its contributors may be used to endorse or promote products
+         derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
 /*
 * Copyright (c) 2017 ARM Ltd
 * All rights reserved.
@ -35,6 +63,8 @@
 * ARMv8-a, AArch64, unaligned accesses.
 */

+#define L(l) .L ## l
+
 /* Parameters and result.  */
 #define src1		x0
 #define src2		x1
@ -44,9 +74,12 @@
 /* Internal variables.  */
 #define data1		x3
 #define data1w		w3
-#define data2		x4
-#define data2w		w4
-#define tmp1		x5
+#define data1h		x4
+#define data2		x5
+#define data2w		w5
+#define data2h		x6
+#define tmp1		x7
+#define tmp2		x8

        .macro def_fn f p2align=0
        .text
@ -56,83 +89,106 @@
 \f:
        .endm

-/* Small inputs of less than 8 bytes are handled separately.  This allows the
-   main code to be sped up using unaligned loads since there are now at least
-   8 bytes to be compared.  If the first 8 bytes are equal, align src1.
-   This ensures each iteration does at most one unaligned access even if both
-   src1 and src2 are unaligned, and mutually aligned inputs behave as if
-   aligned.  After the main loop, process the last 8 bytes using unaligned
-   accesses.  */
-
 def_fn memcmp p2align=6
 	subs	limit, limit, 8
-	b.lo	.Lless8
+	b.lo	L(less8)

-	/* Limit >= 8, so check first 8 bytes using unaligned loads.  */
 	ldr	data1, [src1], 8
 	ldr	data2, [src2], 8
-	and	tmp1, src1, 7
-	add	limit, limit, tmp1
 	cmp	data1, data2
-	bne	.Lreturn
+	b.ne	L(return)
+
+	subs	limit, limit, 8
+	b.gt	L(more16)
+
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+	b	L(return)
+
+L(more16):
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	cmp	data1, data2
+	bne	L(return)
+
+	/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+	   strings.  */
+	subs	limit, limit, 16
+	b.ls	L(last_bytes)
+
+	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
+	   try to align, so limit it only to strings larger than 128 bytes.  */
+	cmp	limit, 96
+	b.ls	L(loop16)

 	/* Align src1 and adjust src2 with bytes not yet done.  */
+	and	tmp1, src1, 15
+	add	limit, limit, tmp1
 	sub	src1, src1, tmp1
 	sub	src2, src2, tmp1

-	subs	limit, limit, 8
-	b.ls	.Llast_bytes
-
-	/* Loop performing 8 bytes per iteration using aligned src1.
-	   Limit is pre-decremented by 8 and must be larger than zero.
-	   Exit if <= 8 bytes left to do or if the data is not equal.  */
+	/* Loop performing 16 bytes per iteration using aligned src1.
+	   Limit is pre-decremented by 16 and must be larger than zero.
+	   Exit if <= 16 bytes left to do or if the data is not equal.  */
 	.p2align 4
-.Lloop8:
-	ldr	data1, [src1], 8
-	ldr	data2, [src2], 8
-	subs	limit, limit, 8
-	ccmp	data1, data2, 0, hi  /* NZCV = 0b0000.  */
-	b.eq	.Lloop8
+L(loop16):
+	ldp	data1, data1h, [src1], 16
+	ldp	data2, data2h, [src2], 16
+	subs	limit, limit, 16
+	ccmp	data1, data2, 0, hi
+	ccmp	data1h, data2h, 0, eq
+	b.eq	L(loop16)

 	cmp	data1, data2
-	bne	.Lreturn
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
+	cmp	data1, data2
+	bne	L(return)

-	/* Compare last 1-8 bytes using unaligned access.  */
-.Llast_bytes:
-	ldr	data1, [src1, limit]
-	ldr	data2, [src2, limit]
+	/* Compare last 1-16 bytes using unaligned access.  */
+L(last_bytes):
+	add	src1, src1, limit
+	add	src2, src2, limit
+	ldp	data1, data1h, [src1]
+	ldp	data2, data2h, [src2]
+	cmp     data1, data2
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
+	cmp	data1, data2

 	/* Compare data bytes and set return value to 0, -1 or 1.  */
-.Lreturn:
+L(return):
 #ifndef __AARCH64EB__
 	rev	data1, data1
 	rev	data2, data2
 #endif
 	cmp     data1, data2
-.Lret_eq:
+L(ret_eq):
 	cset	result, ne
 	cneg	result, result, lo
-        ret
+	ret

 	.p2align 4
 	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
-.Lless8:
+L(less8):
 	adds	limit, limit, 4
-	b.lo	.Lless4
+	b.lo	L(less4)
 	ldr	data1w, [src1], 4
 	ldr	data2w, [src2], 4
 	cmp	data1w, data2w
-	b.ne	.Lreturn
+	b.ne	L(return)
 	sub	limit, limit, 4
-.Lless4:
+L(less4):
 	adds	limit, limit, 4
-	beq	.Lret_eq
-.Lbyte_loop:
+	beq	L(ret_eq)
+L(byte_loop):
 	ldrb	data1w, [src1], 1
 	ldrb	data2w, [src2], 1
 	subs	limit, limit, 1
 	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
-	b.eq	.Lbyte_loop
+	b.eq	L(byte_loop)
 	sub	result, data1w, data2w
 	ret