Improve performance of strstr
v3: Add support for read-ahead using strnlen, giving an additional 25% speedup on large inputs (for both short and long needles). This patch significantly improves the performance of strstr by using Sunday's Quick-Search algorithm. Due to its simplicity, it has the best average performance of the string matching algorithms on almost all inputs. It uses a bad-character shift table to skip past mismatches. The needle length is limited to 254; this allows 8-bit shift table entries, reducing the shift table memory by a factor of 4 to 8, which lowers preprocessing overhead and minimizes cache effects. Because the needle length is bounded by a constant, the limit also implies that the worst-case performance is linear in the haystack length. Larger needles are processed by the Two-Way algorithm. The AVAILABLE macro has been improved to use strnlen to read the input in chunks. This results in a 2.5x speedup for large needles, reducing the performance drop when the Quick-Search algorithm can't be used. The code for 1-4 byte needles has been simplified and now uses unsigned char. Since the optimized code relies on 8-bit chars, we defer to the size-optimized implementation if CHAR_BIT > 8. Finding a set of randomly chosen 8-character words in 256 bytes of English text is 14 times faster on AArch64. For longer haystacks the gain is well over 20 times. The size-optimized strstr has also been rewritten from scratch to improve performance; on the same test its performance gain is 69%. Tested against the GLIBC testsuite, randomized tests, and the GNULIB strstr test (https://git.savannah.gnu.org/cgit/gnulib.git/tree/tests/test-strstr.c).
This commit is contained in:
parent
4f7a6c326a
commit
473f1a3a5d
|
@ -1,3 +1,31 @@
|
||||||
|
/* Optimized strstr function.
|
||||||
|
Copyright (c) 2018 Arm Ltd. All rights reserved.
|
||||||
|
|
||||||
|
SPDX-License-Identifier: BSD-3-Clause
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions
|
||||||
|
are met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
3. The name of the company may not be used to endorse or promote
|
||||||
|
products derived from this software without specific prior written
|
||||||
|
permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
|
||||||
|
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||||
|
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||||
|
IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||||
|
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
FUNCTION
|
FUNCTION
|
||||||
<<strstr>>---find string segment
|
<<strstr>>---find string segment
|
||||||
|
@ -29,141 +57,144 @@ QUICKREF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <limits.h>
|
||||||
|
|
||||||
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
|
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) \
|
||||||
|
|| CHAR_BIT > 8
|
||||||
|
|
||||||
|
/* Small and efficient strstr implementation. */
|
||||||
char *
|
char *
|
||||||
strstr (const char *searchee,
|
strstr (const char *hs, const char *ne)
|
||||||
const char *lookfor)
|
|
||||||
{
|
{
|
||||||
/* Less code size, but quadratic performance in the worst case. */
|
size_t i;
|
||||||
if (*searchee == 0)
|
int c = ne[0];
|
||||||
|
|
||||||
|
if (c == 0)
|
||||||
|
return (char*)hs;
|
||||||
|
|
||||||
|
for ( ; hs[0] != '\0'; hs++)
|
||||||
{
|
{
|
||||||
if (*lookfor)
|
if (hs[0] != c)
|
||||||
return (char *) NULL;
|
continue;
|
||||||
return (char *) searchee;
|
for (i = 1; ne[i] != 0; i++)
|
||||||
|
if (hs[i] != ne[i])
|
||||||
|
break;
|
||||||
|
if (ne[i] == '\0')
|
||||||
|
return (char*)hs;
|
||||||
}
|
}
|
||||||
|
|
||||||
while (*searchee)
|
return NULL;
|
||||||
{
|
}
|
||||||
size_t i;
|
|
||||||
i = 0;
|
|
||||||
|
|
||||||
while (1)
|
|
||||||
{
|
|
||||||
if (lookfor[i] == 0)
|
|
||||||
{
|
|
||||||
return (char *) searchee;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (lookfor[i] != searchee[i])
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
searchee++;
|
|
||||||
}
|
|
||||||
|
|
||||||
return (char *) NULL;
|
|
||||||
|
|
||||||
#else /* compilation for speed */
|
#else /* compilation for speed */
|
||||||
|
|
||||||
# define RETURN_TYPE char *
|
# define RETURN_TYPE char *
|
||||||
# define AVAILABLE(h, h_l, j, n_l) \
|
# define AVAILABLE(h, h_l, j, n_l) (((j) <= (h_l) - (n_l)) \
|
||||||
(!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l)) \
|
|| ((h_l) += strnlen ((h) + (h_l), (n_l) | 2048), ((j) <= (h_l) - (n_l))))
|
||||||
&& ((h_l) = (j) + (n_l)))
|
|
||||||
# include "str-two-way.h"
|
# include "str-two-way.h"
|
||||||
|
|
||||||
|
/* Number of bits used to index shift table. */
|
||||||
|
#define SHIFT_TABLE_BITS 6
|
||||||
|
|
||||||
static inline char *
|
static inline char *
|
||||||
strstr2 (const char *hs, const char *ne)
|
strstr2 (const unsigned char *hs, const unsigned char *ne)
|
||||||
{
|
{
|
||||||
uint32_t h1 = (ne[0] << 16) | ne[1];
|
uint32_t h1 = (ne[0] << 16) | ne[1];
|
||||||
uint32_t h2 = 0;
|
uint32_t h2 = 0;
|
||||||
int c = hs[0];
|
for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs)
|
||||||
while (h1 != h2 && c != 0)
|
|
||||||
{
|
|
||||||
h2 = (h2 << 16) | c;
|
h2 = (h2 << 16) | c;
|
||||||
c = *++hs;
|
|
||||||
}
|
|
||||||
return h1 == h2 ? (char *)hs - 2 : NULL;
|
return h1 == h2 ? (char *)hs - 2 : NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline char *
|
static inline char *
|
||||||
strstr3 (const char *hs, const char *ne)
|
strstr3 (const unsigned char *hs, const unsigned char *ne)
|
||||||
{
|
{
|
||||||
uint32_t h1 = (ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8);
|
uint32_t h1 = (ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8);
|
||||||
uint32_t h2 = 0;
|
uint32_t h2 = 0;
|
||||||
int c = hs[0];
|
for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs)
|
||||||
while (h1 != h2 && c != 0)
|
|
||||||
{
|
|
||||||
h2 = (h2 | c) << 8;
|
h2 = (h2 | c) << 8;
|
||||||
c = *++hs;
|
|
||||||
}
|
|
||||||
return h1 == h2 ? (char *)hs - 3 : NULL;
|
return h1 == h2 ? (char *)hs - 3 : NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline char *
|
static inline char *
|
||||||
strstr4 (const char *hs, const char *ne)
|
strstr4 (const unsigned char *hs, const unsigned char *ne)
|
||||||
{
|
{
|
||||||
uint32_t h1 = (ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8) | ne[3];
|
uint32_t h1 = (ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8) | ne[3];
|
||||||
uint32_t h2 = 0;
|
uint32_t h2 = 0;
|
||||||
int c = hs[0];
|
for (int c = hs[0]; c != 0 && h1 != h2; c = *++hs)
|
||||||
while (h1 != h2 && c != 0)
|
h2 = (h2 << 8) | c;
|
||||||
{
|
|
||||||
h2 = (h2 << 8) | c;
|
|
||||||
c = *++hs;
|
|
||||||
}
|
|
||||||
return h1 == h2 ? (char *)hs - 4 : NULL;
|
return h1 == h2 ? (char *)hs - 4 : NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Extremely fast strstr algorithm with guaranteed linear-time performance.
|
||||||
|
Small needles up to size 4 use a dedicated linear search. Longer needles
|
||||||
|
up to size 254 use Sunday's Quick-Search algorithm. Due to its simplicity
|
||||||
|
it has the best average performance of string matching algorithms on almost
|
||||||
|
all inputs. It uses a bad-character shift table to skip past mismatches.
|
||||||
|
By limiting the needle length to 254, the shift table can be reduced to 8
|
||||||
|
bits per entry, lowering preprocessing overhead and minimizing cache effects.
|
||||||
|
The limit also implies the worst-case performance is linear.
|
||||||
|
Even larger needles are processed by the linear-time Two-Way algorithm.
|
||||||
|
*/
|
||||||
char *
|
char *
|
||||||
strstr (const char *searchee,
|
strstr (const char *haystack, const char *needle)
|
||||||
const char *lookfor)
|
|
||||||
{
|
{
|
||||||
/* Larger code size, but guaranteed linear performance. */
|
const unsigned char *hs = (const unsigned char *) haystack;
|
||||||
const char *haystack = searchee;
|
const unsigned char *ne = (const unsigned char *) needle;
|
||||||
const char *needle = lookfor;
|
|
||||||
size_t needle_len; /* Length of NEEDLE. */
|
|
||||||
size_t haystack_len; /* Known minimum length of HAYSTACK. */
|
|
||||||
int ok = 1; /* True if NEEDLE is prefix of HAYSTACK. */
|
|
||||||
|
|
||||||
/* Handle short needle special cases first. */
|
/* Handle short needle special cases first. */
|
||||||
if (needle[0] == '\0')
|
if (ne[0] == '\0')
|
||||||
return (char *) haystack;
|
return (char *) hs;
|
||||||
if (needle[1] == '\0')
|
if (ne[1] == '\0')
|
||||||
return strchr (haystack, needle[0]);
|
return (char*)strchr (hs, ne[0]);
|
||||||
if (needle[2] == '\0')
|
if (ne[2] == '\0')
|
||||||
return strstr2 (haystack, needle);
|
return strstr2 (hs, ne);
|
||||||
if (needle[3] == '\0')
|
if (ne[3] == '\0')
|
||||||
return strstr3 (haystack, needle);
|
return strstr3 (hs, ne);
|
||||||
if (needle[4] == '\0')
|
if (ne[4] == '\0')
|
||||||
return strstr4 (haystack, needle);
|
return strstr4 (hs, ne);
|
||||||
|
|
||||||
/* Determine length of NEEDLE, and in the process, make sure
|
size_t ne_len = strlen (ne);
|
||||||
HAYSTACK is at least as long (no point processing all of a long
|
size_t hs_len = strnlen (hs, ne_len | 512);
|
||||||
NEEDLE if HAYSTACK is too short). */
|
|
||||||
while (*haystack && *needle)
|
/* Ensure haystack length is >= needle length. */
|
||||||
ok &= *haystack++ == *needle++;
|
if (hs_len < ne_len)
|
||||||
if (*needle)
|
|
||||||
return NULL;
|
return NULL;
|
||||||
if (ok)
|
|
||||||
return (char *) searchee;
|
|
||||||
|
|
||||||
/* Reduce the size of haystack using strchr, since it has a smaller
|
/* Use the Quick-Search algorithm for needle lengths less than 255. */
|
||||||
linear coefficient than the Two-Way algorithm. */
|
if (__builtin_expect (ne_len < 255, 1))
|
||||||
needle_len = needle - lookfor;
|
{
|
||||||
haystack = strchr (searchee + 1, *lookfor);
|
uint8_t shift[1 << SHIFT_TABLE_BITS];
|
||||||
if (!haystack || needle_len == 1)
|
const unsigned char *end = hs + hs_len - ne_len;
|
||||||
return (char *) haystack;
|
|
||||||
haystack_len = (haystack > searchee + needle_len ? 1
|
|
||||||
: needle_len + searchee - haystack);
|
|
||||||
|
|
||||||
/* Perform the search. */
|
/* Initialize bad character shift hash table. */
|
||||||
if (needle_len < LONG_NEEDLE_THRESHOLD)
|
memset (shift, ne_len + 1, sizeof (shift));
|
||||||
return two_way_short_needle ((const unsigned char *) haystack,
|
for (int i = 0; i < ne_len; i++)
|
||||||
haystack_len,
|
shift[ne[i] % sizeof (shift)] = ne_len - i;
|
||||||
(const unsigned char *) lookfor, needle_len);
|
|
||||||
return two_way_long_needle ((const unsigned char *) haystack, haystack_len,
|
do
|
||||||
(const unsigned char *) lookfor, needle_len);
|
{
|
||||||
#endif /* compilation for speed */
|
hs--;
|
||||||
|
|
||||||
|
/* Search by skipping past bad characters. */
|
||||||
|
size_t tmp = shift[hs[ne_len] % sizeof (shift)];
|
||||||
|
for (hs += tmp; hs <= end; hs += tmp)
|
||||||
|
{
|
||||||
|
tmp = shift[hs[ne_len] % sizeof (shift)];
|
||||||
|
if (memcmp (hs, ne, ne_len) == 0)
|
||||||
|
return (char*) hs;
|
||||||
|
}
|
||||||
|
if (end[ne_len] == 0)
|
||||||
|
return NULL;
|
||||||
|
end += strnlen (end + ne_len, 2048);
|
||||||
|
}
|
||||||
|
while (hs <= end);
|
||||||
|
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Use Two-Way algorithm for very long needles. */
|
||||||
|
return two_way_long_needle (hs, hs_len, ne, ne_len);
|
||||||
}
|
}
|
||||||
|
#endif /* compilation for speed */
|
||||||
|
|
Loading…
Reference in New Issue