mirror of
git://sourceware.org/git/newlib-cygwin.git
synced 2025-01-18 12:29:32 +08:00
2008-05-26 Eric Blake <ebb9@byu.net>
Optimize the generic and x86 memset. * libc/string/memset.c (memset) [!__OPTIMIZE_SIZE__]: Pre-align pointer so unaligned stores aren't penalized. * libc/machine/i386/memset.S (memset) [!__OPTIMIZE_SIZE__]: Pre-align pointer so unaligned stores aren't penalized. Prefer 8-byte over 4-byte alignment. Reduce register pressure.
This commit is contained in:
parent
cae28869c1
commit
a6bd72a278
@ -1,3 +1,12 @@
|
||||
2008-05-26 Eric Blake <ebb9@byu.net>
|
||||
|
||||
Optimize the generic and x86 memset.
|
||||
* libc/string/memset.c (memset) [!__OPTIMIZE_SIZE__]:
|
||||
Pre-align pointer so unaligned stores aren't penalized.
|
||||
* libc/machine/i386/memset.S (memset) [!__OPTIMIZE_SIZE__]:
|
||||
Pre-align pointer so unaligned stores aren't penalized. Prefer
|
||||
8-byte over 4-byte alignment. Reduce register pressure.
|
||||
|
||||
2008-05-26 Eric Blake <ebb9@byu.net>
|
||||
|
||||
Optimize the generic and x86 strlen.
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* ====================================================
|
||||
* Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
|
||||
* Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
|
||||
*
|
||||
* Permission to use, copy, modify, and distribute this
|
||||
* software is freely granted, provided that this notice
|
||||
@ -18,43 +18,83 @@ SYM (memset):
|
||||
pushl ebp
|
||||
movl esp,ebp
|
||||
pushl edi
|
||||
pushl ebx
|
||||
movl 8(ebp),edi
|
||||
movl 12(ebp),eax
|
||||
movl 16(ebp),ecx
|
||||
cld
|
||||
|
||||
#ifndef __OPTIMIZE_SIZE__
|
||||
andl $255,eax
|
||||
movl ecx,ebx
|
||||
testl $3,edi
|
||||
jne .L19
|
||||
/* 16 bytes or fewer won't benefit from the 'rep stosl' loop. */
|
||||
cmpl $16,ecx
|
||||
jbe .L19
|
||||
cbw
|
||||
testl $7,edi
|
||||
je .L10
|
||||
|
||||
movl eax,edx
|
||||
sall $8,eax
|
||||
orl edx,eax
|
||||
/* It turns out that 8-byte aligned 'rep stosl' outperforms
|
||||
4-byte aligned on some x86 platforms. */
|
||||
movb al,(edi)
|
||||
incl edi
|
||||
decl ecx
|
||||
testl $7,edi
|
||||
je .L10
|
||||
|
||||
movb al,(edi)
|
||||
incl edi
|
||||
decl ecx
|
||||
testl $7,edi
|
||||
je .L10
|
||||
|
||||
movb al,(edi)
|
||||
incl edi
|
||||
decl ecx
|
||||
testl $7,edi
|
||||
je .L10
|
||||
|
||||
movb al,(edi)
|
||||
incl edi
|
||||
decl ecx
|
||||
testl $7,edi
|
||||
je .L10
|
||||
|
||||
movb al,(edi)
|
||||
incl edi
|
||||
decl ecx
|
||||
testl $7,edi
|
||||
je .L10
|
||||
|
||||
movb al,(edi)
|
||||
incl edi
|
||||
decl ecx
|
||||
testl $7,edi
|
||||
je .L10
|
||||
|
||||
movb al,(edi)
|
||||
incl edi
|
||||
decl ecx
|
||||
|
||||
/* At this point, ecx>8 and edi%8==0. */
|
||||
.L10:
|
||||
movb al,ah
|
||||
movl eax,edx
|
||||
sall $16,edx
|
||||
orl edx,eax
|
||||
|
||||
movl ecx,edx
|
||||
shrl $2,ecx
|
||||
andl $3,ebx
|
||||
andl $3,edx
|
||||
rep
|
||||
stosl
|
||||
movl ebx,ecx
|
||||
movl edx,ecx
|
||||
#endif /* not __OPTIMIZE_SIZE__ */
|
||||
|
||||
|
||||
.L19:
|
||||
rep
|
||||
stosb
|
||||
|
||||
movl 8(ebp),eax
|
||||
|
||||
leal -8(ebp),esp
|
||||
popl ebx
|
||||
leal -4(ebp),esp
|
||||
popl edi
|
||||
leave
|
||||
ret
|
||||
|
@ -22,7 +22,7 @@ DESCRIPTION
|
||||
pointed to by <[dst]> to the value.
|
||||
|
||||
RETURNS
|
||||
<<memset>> returns the value of <[m]>.
|
||||
<<memset>> returns the value of <[dst]>.
|
||||
|
||||
PORTABILITY
|
||||
<<memset>> is ANSI C.
|
||||
@ -39,48 +39,42 @@ QUICKREF
|
||||
#define UNALIGNED(X) ((long)X & (LBLOCKSIZE - 1))
|
||||
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
|
||||
|
||||
_PTR
|
||||
_PTR
|
||||
_DEFUN (memset, (m, c, n),
|
||||
_PTR m _AND
|
||||
int c _AND
|
||||
size_t n)
|
||||
{
|
||||
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
|
||||
char *s = (char *) m;
|
||||
|
||||
while (n-- != 0)
|
||||
{
|
||||
*s++ = (char) c;
|
||||
}
|
||||
|
||||
return m;
|
||||
#else
|
||||
char *s = (char *) m;
|
||||
#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
|
||||
int i;
|
||||
unsigned long buffer;
|
||||
unsigned long *aligned_addr;
|
||||
unsigned int d = c & 0xff; /* To avoid sign extension, copy C to an
|
||||
unsigned variable. */
|
||||
|
||||
if (!TOO_SMALL (n) && !UNALIGNED (m))
|
||||
while (UNALIGNED (s))
|
||||
{
|
||||
/* If we get this far, we know that n is large and m is word-aligned. */
|
||||
aligned_addr = (unsigned long*)m;
|
||||
if (n--)
|
||||
*s++ = (char) c;
|
||||
else
|
||||
return m;
|
||||
}
|
||||
|
||||
if (!TOO_SMALL (n))
|
||||
{
|
||||
/* If we get this far, we know that n is large and s is word-aligned. */
|
||||
aligned_addr = (unsigned long *) s;
|
||||
|
||||
/* Store D into each char sized location in BUFFER so that
|
||||
we can set large blocks quickly. */
|
||||
if (LBLOCKSIZE == 4)
|
||||
{
|
||||
buffer = (d << 8) | d;
|
||||
buffer |= (buffer << 16);
|
||||
}
|
||||
else
|
||||
{
|
||||
buffer = 0;
|
||||
for (i = 0; i < LBLOCKSIZE; i++)
|
||||
buffer = (buffer << 8) | d;
|
||||
}
|
||||
buffer = (d << 8) | d;
|
||||
buffer |= (buffer << 16);
|
||||
for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
|
||||
buffer = (buffer << i) | buffer;
|
||||
|
||||
/* Unroll the loop. */
|
||||
while (n >= LBLOCKSIZE*4)
|
||||
{
|
||||
*aligned_addr++ = buffer;
|
||||
@ -99,11 +93,10 @@ _DEFUN (memset, (m, c, n),
|
||||
s = (char*)aligned_addr;
|
||||
}
|
||||
|
||||
#endif /* not PREFER_SIZE_OVER_SPEED */
|
||||
|
||||
while (n--)
|
||||
{
|
||||
*s++ = (char)d;
|
||||
}
|
||||
*s++ = (char) c;
|
||||
|
||||
return m;
|
||||
#endif /* not PREFER_SIZE_OVER_SPEED */
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user