2008-02-11 Patrick Mansfield <patmans@us.ibm.com>
* libc/machine/spu/straddr.h: New file, supplies _straddr. * libc/machine/spu/strcat.c: Use _straddr and _strcpy. * libc/machine/spu/strcpy.c: Use _strcpy. * libc/machine/spu/strcpy.h: Supply _strcpy for optimized SPU str[n]cpy and str[n]cat. * libc/machine/spu/strncat.c: Use _straddr and _strcpy. * libc/machine/spu/strncpy.c: Use _strcpy.
This commit is contained in:
parent
de20111eba
commit
9cb5dea0e2
|
@ -0,0 +1,180 @@
|
|||
/*
|
||||
(C) Copyright 2008
|
||||
International Business Machines Corporation,
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the names of the copyright holders nor the names of their
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <spu_intrinsics.h>
|
||||
#include "vec_literal.h"
|
||||
|
||||
/*
|
||||
* Supply an inline _strncpy for strcpy/cat and strncpy/cat. Relies on
|
||||
* checklen and lastzero code being optimized out when they are constant
|
||||
* zero values.
|
||||
*/
|
||||
static inline void * _strncpy(char * __restrict__ dest, const char *
|
||||
__restrict__ src, size_t maxlen, int
|
||||
checklen, int lastzero)
|
||||
{
|
||||
int adjust, offset, soffset, doffset, shift;
|
||||
vec_uchar16 *vsrc, *vdest;
|
||||
vec_uchar16 sdata1, sdata2, sdata, shuffle;
|
||||
vec_uchar16 mask1, maskzero, cmp0;
|
||||
vec_uint4 nonzeroes, gathered_cmp, vtmp, vtmp2;
|
||||
vec_uint4 curlen; /* assumes size_t is 4 bytes */
|
||||
const vec_uint4 val31 = { 31, 31, 31, 31 };
|
||||
const vec_uint4 val_0123 = { 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F };
|
||||
const vec_uchar16 all_ones = { 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff,
|
||||
0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff };
|
||||
|
||||
vsrc = (vec_uchar16 *) src;
|
||||
vdest = (vec_uchar16 *) dest;
|
||||
soffset = (int) src & 15;
|
||||
doffset = (int) dest & 15;
|
||||
|
||||
if (checklen) {
|
||||
/*
|
||||
* Set curlen so it is the number of bytes we would copy if starting
|
||||
* from vdest & ~0xf.
|
||||
*
|
||||
* curlen could probably be replaced by comparing vdest plus some
|
||||
* offset to dest + maxlen, that would help mainly in the while loop
|
||||
* but would lose only one instruction (the curlen -= 16).
|
||||
*/
|
||||
curlen = spu_splats((unsigned int) (maxlen + doffset));
|
||||
}
|
||||
|
||||
/*
|
||||
* Setup a shuffle pattern to align the source string with the
|
||||
* alignment of the destination string.
|
||||
*/
|
||||
vtmp = spu_cmpgt(spu_promote(doffset, 0), spu_promote(soffset, 0));
|
||||
adjust = spu_extract(vtmp, 0);
|
||||
offset = soffset - doffset;
|
||||
offset += adjust & 16;
|
||||
shuffle = spu_splats((unsigned char) offset);
|
||||
shuffle = (vec_uchar16) spu_add((vec_uint4) shuffle, val_0123);
|
||||
|
||||
vsrc += adjust;
|
||||
sdata1 = *vsrc++;
|
||||
sdata2 = *vsrc++;
|
||||
sdata = spu_shuffle(sdata1, sdata2, shuffle);
|
||||
|
||||
/*
|
||||
* mask out leading bytes
|
||||
*/
|
||||
mask1 = spu_rlmaskqwbyte(all_ones, -doffset);
|
||||
|
||||
cmp0 = spu_and(mask1, spu_cmpeq(sdata, 0));
|
||||
nonzeroes = spu_cntlz(spu_gather(cmp0));
|
||||
/*
|
||||
* First element of nonzeroes - 15 is the number of leading non-zero
|
||||
* bytes plus 1 for the zero byte.
|
||||
*/
|
||||
if (checklen) {
|
||||
vtmp = spu_add(curlen, 15);
|
||||
vtmp2 = spu_cmpgt(nonzeroes, vtmp);
|
||||
nonzeroes = spu_sel(nonzeroes, vtmp, vtmp2);
|
||||
}
|
||||
|
||||
vtmp = spu_cmpgt(nonzeroes, val31);
|
||||
/*
|
||||
* Note: using immediate (constant 31) vs a vector value (val31) does
|
||||
* not give different results, and we have to have a vector val31 for
|
||||
* the spu_sel below, so use val31 everywhere.
|
||||
*/
|
||||
vtmp = spu_sel(nonzeroes, val31, vtmp);
|
||||
/*
|
||||
* So vtmp is now min(nonzeroes, 31), the number of bytes + 16 that we
|
||||
* want to copy from the first 16 bytes of the source.
|
||||
*/
|
||||
if (checklen) {
|
||||
curlen = spu_sub(vtmp, curlen);
|
||||
curlen = spu_sub(15, curlen);
|
||||
}
|
||||
|
||||
/*
|
||||
* We want a right shift 0xff with fill by ones of (vtmp - 15) bytes, but
|
||||
* that doesn't exist so use spu_slqwbyte and vtmp all ones left by
|
||||
* (31 - vtmp). Note: this can also use spu_rlqwbytebc with spu_rlqw.
|
||||
*/
|
||||
shift = spu_extract(spu_sub(val31, vtmp), 0);
|
||||
maskzero = spu_slqwbyte(all_ones, shift);
|
||||
maskzero = spu_and(mask1, maskzero);
|
||||
*vdest = spu_sel(*vdest, sdata, maskzero);
|
||||
|
||||
vtmp = spu_cmpgt(nonzeroes, val31);
|
||||
if (checklen) {
|
||||
vtmp2 = spu_cmpgt(curlen, 0);
|
||||
vtmp = spu_and(vtmp, vtmp2);
|
||||
}
|
||||
if (spu_extract(vtmp, 0)) {
|
||||
sdata1 = sdata2;
|
||||
sdata2 = *vsrc++;
|
||||
sdata = spu_shuffle(sdata1, sdata2, shuffle);
|
||||
cmp0 = spu_cmpeq(sdata, 0);
|
||||
gathered_cmp = spu_gather(cmp0);
|
||||
/*
|
||||
* Copy 16 bytes at a time.
|
||||
*/
|
||||
while ((spu_extract(gathered_cmp, 0) == 0) &&
|
||||
(!checklen || (spu_extract(curlen, 0) > 15))) {
|
||||
if (checklen)
|
||||
curlen = spu_add(curlen, -16);
|
||||
*++vdest = sdata;
|
||||
sdata1 = sdata2;
|
||||
sdata2 = *vsrc++;
|
||||
sdata = spu_shuffle(sdata1, sdata2, shuffle);
|
||||
cmp0 = spu_cmpeq(sdata, 0);
|
||||
gathered_cmp = spu_gather(cmp0);
|
||||
}
|
||||
/*
|
||||
* Copy 0 to 15 trailing bytes, either up to the smaller of curlen or
|
||||
* the number of non-zero bytes.
|
||||
*/
|
||||
nonzeroes = spu_cntlz(gathered_cmp);
|
||||
if (checklen) {
|
||||
vtmp = spu_add(curlen, 15);
|
||||
vtmp2 = spu_cmpgt(nonzeroes, vtmp);
|
||||
nonzeroes = spu_sel(nonzeroes, vtmp, vtmp2);
|
||||
curlen = spu_sub(nonzeroes, curlen);
|
||||
curlen = spu_sub(15, curlen);
|
||||
}
|
||||
shift = spu_extract(spu_sub(val31, nonzeroes), 0);
|
||||
maskzero = spu_slqwbyte(all_ones, shift);
|
||||
++vdest;
|
||||
*vdest = spu_sel(*vdest, sdata, maskzero);
|
||||
}
|
||||
|
||||
if (checklen && lastzero) {
|
||||
/*
|
||||
* For strncat.
|
||||
*/
|
||||
dest[maxlen - spu_extract(curlen, 0)] = '\0';
|
||||
}
|
||||
return (dest);
|
||||
}
|
Loading…
Reference in New Issue