[AArch64] Reverting recent optimized memset().
commit c7806ef76a
parent c028685518
ChangeLog:

@@ -1,3 +1,8 @@
+2015-07-15  Wilco Dijkstra  <wdijkstr@arm.com>
+
+	* libc/machine/aarch64/memset.S (memset):
+	Revert: Rewrite of optimized memset.
+
 2015-07-13  Wilco Dijkstra  <wdijkstr@arm.com>
 
 	* libc/machine/aarch64/memset.S (memset):
libc/machine/aarch64/memset.S:

@@ -24,37 +24,10 @@
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
 
-/*
- * Copyright (c) 2015 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- *    products derived from this software without specific prior written
- *    permission.
- *
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses
+ * ARMv8-a, AArch64
+ * Unaligned accesses
  *
  */
 
@@ -62,20 +35,32 @@
 /* See memset-stub.c  */
 #else
 
-#define dstin    x0
-#define val      x1
-#define valw     w1
-#define count    x2
-#define dst      x3
-#define dstend   x4
-#define tmp1     x5
-#define tmp1w    w5
-#define tmp2     x6
-#define tmp2w    w6
-#define zva_len  x7
-#define zva_lenw w7
-
-#define L(l) .L ## l
+/* By default we assume that the DC instruction can be used to zero
+   data blocks more efficiently.  In some circumstances this might be
+   unsafe, for example in an asymmetric multiprocessor environment with
+   different DC clear lengths (neither the upper nor lower lengths are
+   safe to use).  The feature can be disabled by defining DONT_USE_DC.
+
+   If code may be run in a virtualized environment, then define
+   MAYBE_VIRT.  This will cause the code to cache the system register
+   values rather than re-reading them each call.  */
+
+#define dstin      x0
+#define val        w1
+#define count      x2
+#define tmp1       x3
+#define tmp1w      w3
+#define tmp2       x4
+#define tmp2w      w4
+#define zva_len_x  x5
+#define zva_len    w5
+#define zva_bits_x x6
+
+#define A_l        x7
+#define A_lw       w7
+#define dst        x8
+#define tmp3w      w9
+
 
 .macro def_fn f p2align=0
 .text
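A note on the register decode that both versions perform below: bits [3:0] of DCZID_EL0 hold the log2 of the DC ZVA block size in words, so the byte length is 4 << (DCZID_EL0 & 15), and bit 4 (DZP) reports that DC ZVA is prohibited. A minimal C model of that decoding, with an illustrative function name that is not part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrors the assembly sequence in the restored code:
         and zva_len, tmp1w, #15      (block-size field, log2 of words)
         lsl zva_len, tmp3w, zva_len  (4 << field = length in bytes)
       Bit 4 (DZP) set means DC ZVA must not be used at all.  */
    static long zva_block_bytes(uint64_t dczid)
    {
        if (dczid & (1u << 4))          /* DZP: zeroing prohibited */
            return -1;
        return 4L << (dczid & 15);      /* field 4 -> 64-byte blocks */
    }

    int main(void)
    {
        printf("%ld\n", zva_block_bytes(0x4));  /* common case: 64   */
        printf("%ld\n", zva_block_bytes(0x5));  /* 128-byte blocks   */
        printf("%ld\n", zva_block_bytes(0x10)); /* DZP set: disabled */
        return 0;
    }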
@@ -87,153 +72,175 @@
 
 def_fn memset p2align=6
 
-        dup     v0.16B, valw
-        add     dstend, dstin, count
-
-        cmp     count, 96
-        b.hi    L(set_long)
-        cmp     count, 16
-        b.hs    L(set_medium)
-        mov     val, v0.D[0]
-
-        /* Set 0..15 bytes.  */
-        tbz     count, 3, 1f
-        str     val, [dstin]
-        str     val, [dstend, -8]
-        ret
-        nop
-1:      tbz     count, 2, 2f
-        str     valw, [dstin]
-        str     valw, [dstend, -4]
-        ret
-2:      cbz     count, 3f
-        strb    valw, [dstin]
-        tbz     count, 1, 3f
-        strh    valw, [dstend, -2]
-3:      ret
-
-        /* Set 17..96 bytes.  */
-L(set_medium):
-        str     q0, [dstin]
-        tbnz    count, 6, L(set96)
-        str     q0, [dstend, -16]
-        tbz     count, 5, 1f
-        str     q0, [dstin, 16]
-        str     q0, [dstend, -32]
-1:      ret
-
-        .p2align 4
-        /* Set 64..96 bytes.  Write 64 bytes from the start and
-           32 bytes from the end.  */
-L(set96):
-        str     q0, [dstin, 16]
-        stp     q0, q0, [dstin, 32]
-        stp     q0, q0, [dstend, -32]
-        ret
-
-        .p2align 3
-        nop
-L(set_long):
-        and     valw, valw, 255
-        bic     dst, dstin, 15
-        str     q0, [dstin]
-        cmp     count, 256
-        ccmp    valw, 0, 0, cs
-        b.eq    L(try_zva)
-L(no_zva):
-        sub     count, dstend, dst      /* Count is 16 too large.  */
-        add     dst, dst, 16
-        sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
-1:      stp     q0, q0, [dst], 64
-        stp     q0, q0, [dst, -32]
-L(tail64):
-        subs    count, count, 64
-        b.hi    1b
-2:      stp     q0, q0, [dstend, -64]
-        stp     q0, q0, [dstend, -32]
-        ret
-
-        .p2align 3
-L(try_zva):
-        mrs     tmp1, dczid_el0
-        tbnz    tmp1w, 4, L(no_zva)
-        and     tmp1w, tmp1w, 4
-        cmp     tmp1w, 4        /* ZVA size is 64 bytes.  */
-        b.ne    L(zva_128)
-
-        /* Write the first and last 64 byte aligned block using stp rather
-           than using DC ZVA.  This is faster on some cores.
-         */
-L(zva_64):
-        str     q0, [dst, 16]
-        stp     q0, q0, [dst, 32]
-        bic     dst, dst, 63
-        stp     q0, q0, [dst, 64]
-        stp     q0, q0, [dst, 96]
-        sub     count, dstend, dst      /* Count is now 128 too large.  */
-        sub     count, count, 128+64+64 /* Adjust count and bias for loop.  */
-        add     dst, dst, 128
-        nop
-1:      dc      zva, dst
-        add     dst, dst, 64
-        subs    count, count, 64
-        b.hi    1b
-        stp     q0, q0, [dst, 0]
-        stp     q0, q0, [dst, 32]
-        stp     q0, q0, [dstend, -64]
-        stp     q0, q0, [dstend, -32]
-        ret
-
-        .p2align 3
-L(zva_128):
-        cmp     tmp1w, 5        /* ZVA size is 128 bytes.  */
-        b.ne    L(zva_other)
-
-        str     q0, [dst, 16]
-        stp     q0, q0, [dst, 32]
-        stp     q0, q0, [dst, 64]
-        stp     q0, q0, [dst, 96]
-        bic     dst, dst, 127
-        sub     count, dstend, dst      /* Count is now 128 too large.  */
-        sub     count, count, 128+128   /* Adjust count and bias for loop.  */
-        add     dst, dst, 128
-1:      dc      zva, dst
-        add     dst, dst, 128
-        subs    count, count, 128
-        b.hi    1b
-        stp     q0, q0, [dstend, -128]
-        stp     q0, q0, [dstend, -96]
-        stp     q0, q0, [dstend, -64]
-        stp     q0, q0, [dstend, -32]
-        ret
-
-L(zva_other):
-        mov     tmp2w, 4
-        lsl     zva_lenw, tmp2w, tmp1w
-        add     tmp1, zva_len, 64       /* Max alignment bytes written.  */
-        cmp     count, tmp1
-        blo     L(no_zva)
-
-        sub     tmp2, zva_len, 1
-        add     tmp1, dst, zva_len
-        add     dst, dst, 16
-        subs    count, tmp1, dst        /* Actual alignment bytes to write.  */
-        bic     tmp1, tmp1, tmp2        /* Aligned dc zva start address.  */
-        beq     2f
-1:      stp     q0, q0, [dst], 64
-        stp     q0, q0, [dst, -32]
-        subs    count, count, 64
-        b.hi    1b
-2:      mov     dst, tmp1
-        sub     count, dstend, tmp1     /* Remaining bytes to write.  */
-        subs    count, count, zva_len
-        b.lo    4f
-3:      dc      zva, dst
-        add     dst, dst, zva_len
-        subs    count, count, zva_len
-        b.hs    3b
-4:      add     count, count, zva_len
-        b       L(tail64)
-
-        .size   memset, . - memset
+        mov     dst, dstin              /* Preserve return value.  */
+        ands    A_lw, val, #255
+#ifndef DONT_USE_DC
+        b.eq    .Lzero_mem
+#endif
+        orr     A_lw, A_lw, A_lw, lsl #8
+        orr     A_lw, A_lw, A_lw, lsl #16
+        orr     A_l, A_l, A_l, lsl #32
+.Ltail_maybe_long:
+        cmp     count, #64
+        b.ge    .Lnot_short
+.Ltail_maybe_tiny:
+        cmp     count, #15
+        b.le    .Ltail15tiny
+.Ltail63:
+        ands    tmp1, count, #0x30
+        b.eq    .Ltail15
+        add     dst, dst, tmp1
+        cmp     tmp1w, #0x20
+        b.eq    1f
+        b.lt    2f
+        stp     A_l, A_l, [dst, #-48]
+1:
+        stp     A_l, A_l, [dst, #-32]
+2:
+        stp     A_l, A_l, [dst, #-16]
+
+.Ltail15:
+        and     count, count, #15
+        add     dst, dst, count
+        stp     A_l, A_l, [dst, #-16]   /* Repeat some/all of last store.  */
+        ret
+
+.Ltail15tiny:
+        /* Set up to 15 bytes.  Does not assume earlier memory
+           being set.  */
+        tbz     count, #3, 1f
+        str     A_l, [dst], #8
+1:
+        tbz     count, #2, 1f
+        str     A_lw, [dst], #4
+1:
+        tbz     count, #1, 1f
+        strh    A_lw, [dst], #2
+1:
+        tbz     count, #0, 1f
+        strb    A_lw, [dst]
+1:
+        ret
+
+        /* Critical loop.  Start at a new cache line boundary.  Assuming
+         * 64 bytes per line, this ensures the entire loop is in one line.  */
+        .p2align 6
+.Lnot_short:
+        neg     tmp2, dst
+        ands    tmp2, tmp2, #15
+        b.eq    2f
+        /* Bring DST to 128-bit (16-byte) alignment.  We know that there's
+         * more than that to set, so we simply store 16 bytes and advance by
+         * the amount required to reach alignment.  */
+        sub     count, count, tmp2
+        stp     A_l, A_l, [dst]
+        add     dst, dst, tmp2
+        /* There may be less than 63 bytes to go now.  */
+        cmp     count, #63
+        b.le    .Ltail63
+2:
+        sub     dst, dst, #16           /* Pre-bias.  */
+        sub     count, count, #64
+1:
+        stp     A_l, A_l, [dst, #16]
+        stp     A_l, A_l, [dst, #32]
+        stp     A_l, A_l, [dst, #48]
+        stp     A_l, A_l, [dst, #64]!
+        subs    count, count, #64
+        b.ge    1b
+        tst     count, #0x3f
+        add     dst, dst, #16
+        b.ne    .Ltail63
+        ret
+
+#ifndef DONT_USE_DC
+        /* For zeroing memory, check to see if we can use the ZVA feature to
+         * zero entire 'cache' lines.  */
+.Lzero_mem:
+        mov     A_l, #0
+        cmp     count, #63
+        b.le    .Ltail_maybe_tiny
+        neg     tmp2, dst
+        ands    tmp2, tmp2, #15
+        b.eq    1f
+        sub     count, count, tmp2
+        stp     A_l, A_l, [dst]
+        add     dst, dst, tmp2
+        cmp     count, #63
+        b.le    .Ltail63
+1:
+        /* For zeroing small amounts of memory, it's not worth setting up
+         * the line-clear code.  */
+        cmp     count, #128
+        b.lt    .Lnot_short
+#ifdef MAYBE_VIRT
+        /* For efficiency when virtualized, we cache the ZVA capability.  */
+        adrp    tmp2, .Lcache_clear
+        ldr     zva_len, [tmp2, #:lo12:.Lcache_clear]
+        tbnz    zva_len, #31, .Lnot_short
+        cbnz    zva_len, .Lzero_by_line
+        mrs     tmp1, dczid_el0
+        tbz     tmp1, #4, 1f
+        /* ZVA not available.  Remember this for next time.  */
+        mov     zva_len, #~0
+        str     zva_len, [tmp2, #:lo12:.Lcache_clear]
+        b       .Lnot_short
+1:
+        mov     tmp3w, #4
+        and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
+        lsl     zva_len, tmp3w, zva_len
+        str     zva_len, [tmp2, #:lo12:.Lcache_clear]
+#else
+        mrs     tmp1, dczid_el0
+        tbnz    tmp1, #4, .Lnot_short
+        mov     tmp3w, #4
+        and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
+        lsl     zva_len, tmp3w, zva_len
+#endif
+
+.Lzero_by_line:
+        /* Compute how far we need to go to become suitably aligned.  We're
+         * already at quad-word alignment.  */
+        cmp     count, zva_len_x
+        b.lt    .Lnot_short             /* Not enough to reach alignment.  */
+        sub     zva_bits_x, zva_len_x, #1
+        neg     tmp2, dst
+        ands    tmp2, tmp2, zva_bits_x
+        b.eq    1f                      /* Already aligned.  */
+        /* Not aligned, check that there's enough to copy after alignment.  */
+        sub     tmp1, count, tmp2
+        cmp     tmp1, #64
+        ccmp    tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
+        b.lt    .Lnot_short
+        /* We know that there's at least 64 bytes to zero and that it's safe
+         * to overrun by 64 bytes.  */
+        mov     count, tmp1
+2:
+        stp     A_l, A_l, [dst]
+        stp     A_l, A_l, [dst, #16]
+        stp     A_l, A_l, [dst, #32]
+        subs    tmp2, tmp2, #64
+        stp     A_l, A_l, [dst, #48]
+        add     dst, dst, #64
+        b.ge    2b
+        /* We've overrun a bit, so adjust dst downwards.  */
+        add     dst, dst, tmp2
+1:
+        sub     count, count, zva_len_x
+3:
+        dc      zva, dst
+        add     dst, dst, zva_len_x
+        subs    count, count, zva_len_x
+        b.ge    3b
+        ands    count, count, zva_bits_x
+        b.ne    .Ltail_maybe_long
+        ret
+        .size   memset, .-memset
+#ifdef MAYBE_VIRT
+        .bss
+        .p2align 2
+.Lcache_clear:
+        .space 4
+#endif
+#endif /* DONT_USE_DC */
 #endif
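Both versions lean on overlapping stores for tails rather than byte-at-a-time loops: the removed code pairs str val, [dstin] with str val, [dstend, -8] for 8..16-byte sizes, and the restored .Ltail15 deliberately repeats some or all of the last store. A C sketch of the idea, assuming 8 <= n <= 16 (memcpy stands in for the unaligned 64-bit store; the helper name is made up for illustration):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Fill n bytes (8 <= n <= 16) with two possibly overlapping 8-byte
       stores.  Overlap is harmless because both stores write the same
       replicated pattern.  */
    static void set_8_to_16(unsigned char *dst, unsigned char c, size_t n)
    {
        uint64_t v = 0x0101010101010101ull * c;  /* replicate byte, like the
                                                    orr/orr/orr sequence */
        memcpy(dst, &v, 8);          /* str val, [dstin]      */
        memcpy(dst + n - 8, &v, 8);  /* str val, [dstend, -8] */
    }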
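The ccmp tmp1, zva_len_x, #8, ge in .Lzero_by_line deserves a gloss: when the preceding cmp tmp1, #64 gave ge, the ccmp really compares tmp1 against zva_len_x; otherwise it loads NZCV with the #8 immediate (0b1000), setting N so that the following b.lt is taken. A small C model of that branch decision:

    #include <stdint.h>

    /* Models:
           cmp  tmp1, #64
           ccmp tmp1, zva_len_x, #8, ge   (NZCV = 0b1000 when tmp1 < 64)
           b.lt .Lnot_short
       The DC ZVA path is only entered when both conditions hold.  */
    static int use_zva_path(int64_t tmp1, int64_t zva_len)
    {
        return tmp1 >= 64 && tmp1 >= zva_len;   /* b.lt not taken */
    }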
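Putting .Lzero_by_line together in one place: align to the ZVA block with ordinary stores, clear whole blocks with dc zva, then hand any remainder back to the tail code. The C below is an illustrative model under those assumptions, not a drop-in equivalent (the real code overruns by up to 64 bytes while aligning and keeps everything in registers); dc_zva is a hypothetical stand-in for the instruction:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Stand-in for "dc zva, dst": zeroes one zva_len-aligned block.  */
    static void dc_zva(unsigned char *p, size_t zva_len)
    {
        memset(p, 0, zva_len);
    }

    static void zero_by_line(unsigned char *dst, size_t count, size_t zva_len)
    {
        size_t mask = zva_len - 1;                       /* sub zva_bits_x, zva_len_x, #1 */
        size_t head = (size_t)(-(uintptr_t)dst) & mask;  /* neg tmp2, dst; ands tmp2, ... */

        if (count < head + zva_len) {    /* cannot reach a full aligned block */
            memset(dst, 0, count);       /* corresponds to b.lt .Lnot_short   */
            return;
        }
        memset(dst, 0, head);            /* the stp loop up to alignment      */
        dst += head;
        count -= head;

        while (count >= zva_len) {       /* 3: dc zva, dst ... b.ge 3b        */
            dc_zva(dst, zva_len);
            dst += zva_len;
            count -= zva_len;
        }
        memset(dst, 0, count);           /* ands count, ...; b.ne tail code   */
    }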