/*-
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from locore.s.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/asm.h>

#if defined(LIBC_SCCS)
	RCSID("$NetBSD: bcopy.S,v 1.5 2014/03/22 19:16:34 jakllsch Exp $")
#endif

/*
 * (ov)bcopy (src,dst,cnt)
 *  ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
 *
 * Hacked about by dsl@netbsd.org
 */

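/*
 * One source file builds several entry points, selected at compile time:
 * MEMCOPY gives memcpy (or mempcpy with POST), MEMMOVE gives memmove,
 * and neither gives the old bcopy; WIDE builds the wchar_t variants
 * (wmemcpy/wmempcpy/wmemmove).  The memcpy family defines NO_OVERLAP,
 * so the backwards-copy paths below are compiled out for it.
 */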
#ifdef MEMCOPY
#ifdef WIDE
#ifdef POST
ENTRY3(wmempcpy)
#else
ENTRY3(wmemcpy)
#endif
#else
#ifdef POST
ENTRY3(mempcpy)
#else
ENTRY3(memcpy)
#endif
#endif
#define NO_OVERLAP
#else
#ifdef MEMMOVE
#ifdef WIDE
ENTRY3(wmemmove)
#else
ENTRY3(memmove)
#endif
#else
ENTRY3(bcopy)
#endif
#endif
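/*
 * On entry (SysV AMD64 ABI): %rdi = dst, %rsi = src, %rdx = count
 * (scaled to bytes below for the WIDE variants; bcopy is called as
 * (src, dst, len), hence the xchgq below).  Only caller-saved registers
 * (%rax, %rcx, %rdx, %r8-%r11) are used, so nothing needs to be spilled.
 */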
#ifdef WIDE
	shlq	$1,%rdx			/* cnt * sizeof (wchar_t) */
#endif
	movq	%rdx,%rcx
#if defined(MEMCOPY) || defined(MEMMOVE)
	movq	%rdi,%rax		/* must return destination address */
#ifdef POST
	addq	%rdx,%rax		/* + n */
#endif
	mov	%rdi,%r11		/* for misaligned check */
#else
	mov	%rsi,%r11		/* for misaligned check */
	xchgq	%rdi,%rsi		/* bcopy() has arg order reversed */
#endif

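/*
 * For the overlap-capable entry points, %r8 = dst - src.  Treated as an
 * unsigned value it is smaller than the byte count exactly when the
 * destination starts inside the source buffer, i.e. when a forward copy
 * would overwrite source bytes that have not yet been read; those cases
 * take the backwards-copy paths (10: and 81:).
 */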
#if !defined(NO_OVERLAP)
	movq	%rdi,%r8
	subq	%rsi,%r8
#endif

	shrq	$3,%rcx			/* count for copy by words */
	jz	8f			/* j if less than 8 bytes */

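/*
 * The last 8 bytes of the source are read into %r10 now, before any
 * stores, and written to (%r9) as the final step of every word-copy path.
 * That store covers the 1-7 trailing bytes when the count is not a
 * multiple of 8, and is safe even where it overlaps data already copied.
 */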
	lea	-8(%rdi,%rdx),%r9	/* target address of last 8 */
	mov	-8(%rsi,%rdx),%r10	/* get last word */
#if !defined(NO_OVERLAP)
	cmpq	%rdx,%r8		/* overlapping? */
	jb	10f
#endif

/*
 * Non-overlapping, copy forwards.
 * Newer Intel cpus (Nehalem) will do 16-byte read/write transfers
 * if %ecx is more than 76.
 * AMD might do something similar some day.
 */
	and	$7,%r11			/* destination misaligned ? */
	jnz	2f
	rep
	movsq
	mov	%r10,(%r9)		/* write last word */
	ret

/*
 * Destination misaligned
 * AMD say it is better to align the destination (not the source).
 * (This will also re-align copies if the source and dest are both
 * misaligned by the same amount.)
 * (I think Nehalem will use its accelerated copy if the source
 * and destination have the same alignment.)
 */
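/*
 * Here %r11 holds dst & 7 (1..7).  The first word is copied unaligned
 * (read into %rdx, stored at the original %rdi afterwards) and %rsi/%rdi
 * are advanced by 8 - (dst & 7), so the destination becomes 8-byte
 * aligned.  %rcx = (misalignment + count - 9) >> 3 whole words are then
 * moved with 'rep movsq'; the separate first- and last-word stores cover
 * the ragged ends.
 */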
2:
	lea	-9(%r11,%rdx),%rcx	/* post re-alignment count */
	neg	%r11			/* now -1 .. -7 */
	mov	(%rsi),%rdx		/* get first word */
	mov	%rdi,%r8		/* target for first word */
	lea	8(%rsi,%r11),%rsi
	lea	8(%rdi,%r11),%rdi
	shr	$3,%rcx
	rep
	movsq
	mov	%rdx,(%r8)		/* write first word */
	mov	%r10,(%r9)		/* write last word */
	ret

#if !defined(NO_OVERLAP)
/* Must copy backwards.
 * Reverse copy is probably easy to code faster than 'rep movsq'
 * since that requires (IIRC) an extra clock every 3 iterations (AMD).
 * However I don't suppose anything cares that much!
 * The big cost is the std/cld pair - reputedly 50+ cycles on Netburst P4.
 * The copy is aligned with the buffer start (more likely to
 * be a multiple of 8 than the end).
 */
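/*
 * %rcx already holds the word count (count >> 3).  Point %rsi/%rdi at the
 * highest of those words and let 'rep movsq' walk downwards with DF set;
 * the 1-7 trailing bytes beyond those words are covered by the preloaded
 * last word written to (%r9) once the copy is done.
 */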
10:
	lea	-8(%rsi,%rcx,8),%rsi
	lea	-8(%rdi,%rcx,8),%rdi
	std
	rep
	movsq
	cld
	mov	%r10,(%r9)	/* write last bytes */
	ret
#endif

/* Less than 8 bytes to copy, copy by bytes */
/* Intel Nehalem optimise 'rep movsb' for <= 7 bytes (9-15 clocks).
 * For longer transfers it is 50+ !
 */
8:	mov	%rdx,%rcx

#if !defined(NO_OVERLAP)
	cmpq	%rdx,%r8	/* overlapping? */
	jb	81f
#endif

	/* nope, copy forwards. */
	rep
	movsb
	ret

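/*
 * Overlapping and fewer than 8 bytes: point at the last byte of source
 * and destination and let 'rep movsb' walk downwards with DF set.
 */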
#if !defined(NO_OVERLAP)
/* Must copy backwards */
81:
	lea	-1(%rsi,%rcx),%rsi
	lea	-1(%rdi,%rcx),%rdi
	std
	rep
	movsb
	cld
	ret
#endif

#ifdef MEMCOPY
END(memcpy)
#else
#ifdef MEMMOVE
END(memmove)
#else
END(bcopy)
#endif
#endif