Improve performance of memmem

This patch significantly improves performance of memmem using a novel modified Horspool algorithm. Needles up to size 256 use a bad-character table indexed by hashed pairs of characters to quickly skip past mismatches. Long needles use a self-adapting filtering step to avoid comparing the whole needle repeatedly. By limiting the needle length to 256, the shift table only requires 8 bits per entry, lowering preprocessing overhead and minimizing cache effects. This limit also implies worst-case performance is linear. Small needles up to size 2 use a dedicated linear search. Very long needles use the Two-Way algorithm (to avoid increasing stack size inlining is now disabled). The performance gain is 6.6 times on English text on AArch64 using random needles with average size 8 (this is even faster than the recently improved strstr algorithm, so I'll update that in the near future). The size-optimized memmem has also been rewritten from scratch to get a 2.7x performance gain. Tested against GLIBC testsuite and randomized tests. Message-Id: <DB5PR08MB1030649D051FA8532A4512C883B20@DB5PR08MB1030.eurprd08.prod.outlook.com>
2018-12-31 18:01:52 +00:00 · 2018-12-31 18:01:52 +00:00 · 353ebae304
parent 5726873100
commit 353ebae304
2 changed files with 144 additions and 58 deletions
--- a/newlib/libc/string/memmem.c
+++ b/newlib/libc/string/memmem.c
@ -1,8 +1,30 @@
-/* Byte-wise substring search, using the Two-Way algorithm.
- * Copyright (C) 2008 Eric Blake
- * Permission to use, copy, modify, and distribute this software
- * is freely granted, provided that this notice is preserved.
- */
+/* Optimized memmem function.
+   Copyright (c) 2018 Arm Ltd.  All rights reserved.
+
+   SPDX-License-Identifier: BSD-3-Clause
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+   3. The name of the company may not be used to endorse or promote
+      products derived from this software without specific prior written
+      permission.
+
+   THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+   WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+   MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+   IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

 /*
 FUNCTION
@ -13,7 +35,7 @@ INDEX

 SYNOPSIS
 	#include <string.h>
-	char *memmem(const void *<[s1]>, size_t <[l1]>, const void *<[s2]>,
+	void *memmem(const void *<[s1]>, size_t <[l1]>, const void *<[s2]>,
 		     size_t <[l2]>);

 DESCRIPTION
@ -21,8 +43,8 @@ DESCRIPTION
 	Locates the first occurrence in the memory region pointed to
 	by <[s1]> with length <[l1]> of the sequence of bytes pointed
 	to by <[s2]> of length <[l2]>.  If you already know the
-	lengths of your haystack and needle, <<memmem>> can be much
-	faster than <<strstr>>.
+	lengths of your haystack and needle, <<memmem>> is much faster
+	than <<strstr>>.

 RETURNS
 	Returns a pointer to the located segment, or a null pointer if
@ -38,64 +60,127 @@ QUICKREF
 */

 #include <string.h>
-
-#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
-# define RETURN_TYPE void *
-# define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l))
-# include "str-two-way.h"
-#endif
-
-void *
-memmem (const void *haystack_start,
-	size_t haystack_len,
-	const void *needle_start,
-	size_t needle_len)
-{
-  /* Abstract memory is considered to be an array of 'unsigned char' values,
-     not an array of 'char' values.  See ISO C 99 section 6.2.6.1.  */
-  const unsigned char *haystack = (const unsigned char *) haystack_start;
-  const unsigned char *needle = (const unsigned char *) needle_start;
-
-  if (needle_len == 0)
-    /* The first occurrence of the empty string is deemed to occur at
-       the beginning of the string.  */
-    return (void *) haystack;
+#include <stdint.h>

 #if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)

-  /* Less code size, but quadratic performance in the worst case.  */
-  while (needle_len <= haystack_len)
+/* Small and efficient memmem implementation (quadratic worst-case).  */
+void *
+memmem (const void *haystack, size_t hs_len, const void *needle, size_t ne_len)
 {
-      if (!memcmp (haystack, needle, needle_len))
-        return (void *) haystack;
-      haystack++;
-      haystack_len--;
-    }
-  return NULL;
+  const char *hs = haystack;
+  const char *ne = needle;

-#else /* compilation for speed */
+  if (ne_len == 0)
+    return (void *)hs;
+  int i;
+  int c = ne[0];
+  const char *end = hs + hs_len - ne_len;

-  /* Larger code size, but guaranteed linear performance.  */
-
-  /* Sanity check, otherwise the loop might search through the whole
-     memory.  */
-  if (haystack_len < needle_len)
-    return NULL;
-
-  /* Use optimizations in memchr when possible, to reduce the search
-     size of haystack using a linear algorithm with a smaller
-     coefficient.  However, avoid memchr for long needles, since we
-     can often achieve sublinear performance.  */
-  if (needle_len < LONG_NEEDLE_THRESHOLD)
+  for ( ; hs <= end; hs++)
  {
-      haystack = memchr (haystack, *needle, haystack_len);
-      if (!haystack || needle_len == 1)
-	return (void *) haystack;
-      haystack_len -= haystack - (const unsigned char *) haystack_start;
-      if (haystack_len < needle_len)
+    if (hs[0] != c)
+      continue;
+    for (i = ne_len - 1; i != 0; i--)
+      if (hs[i] != ne[i])
+	break;
+    if (i == 0)
+      return (void *)hs;
+  }
+
  return NULL;
-      return two_way_short_needle (haystack, haystack_len, needle, needle_len);
 }
-  return two_way_long_needle (haystack, haystack_len, needle, needle_len);
-#endif /* compilation for speed */
+
+#else
+
+# define RETURN_TYPE void *
+# define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l))
+# include "str-two-way.h"
+
+#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift))
+
+/* Fast memmem algorithm with guaranteed linear-time performance.
+   Small needles up to size 2 use a dedicated linear search.  Longer needles
+   up to size 256 use a novel modified Horspool algorithm.  It hashes pairs
+   of characters to quickly skip past mismatches.  The main search loop only
+   exits if the last 2 characters match, avoiding unnecessary calls to memcmp
+   and allowing for a larger skip if there is no match.  A self-adapting
+   filtering check is used to quickly detect mismatches in long needles.
+   By limiting the needle length to 256, the shift table can be reduced to 8
+   bits per entry, lowering preprocessing overhead and minimizing cache effects.
+   The limit also implies worst-case performance is linear.
+   Needles larger than 256 characters use the linear-time Two-Way algorithm.  */
+void *
+memmem (const void *haystack, size_t hs_len, const void *needle, size_t ne_len)
+{
+  const unsigned char *hs = haystack;
+  const unsigned char *ne = needle;
+
+  if (ne_len == 0)
+    return (void *) hs;
+  if (ne_len == 1)
+    return (void *) memchr (hs, ne[0], hs_len);
+
+  /* Ensure haystack length is >= needle length.  */
+  if (hs_len < ne_len)
+    return NULL;
+
+  const unsigned char *end = hs + hs_len - ne_len;
+
+  if (ne_len == 2)
+    {
+      uint32_t nw = ne[0] << 16 | ne[1], hw = hs[0] << 16 | hs[1];
+      for (hs++; hs <= end && hw != nw; )
+	hw = hw << 16 | *++hs;
+      return hw == nw ? (void *)(hs - 1) : NULL;
    }
+
+  /* Use Two-Way algorithm for very long needles.  */
+  if (__builtin_expect (ne_len > 256, 0))
+    return two_way_long_needle (hs, hs_len, ne, ne_len);
+
+  uint8_t shift[256];
+  size_t tmp, shift1;
+  size_t m1 = ne_len - 1;
+  size_t offset = 0;
+
+  /* Initialize bad character shift hash table.  */
+  memset (shift, 0, sizeof (shift));
+  for (int i = 1; i < m1; i++)
+    shift[hash2 (ne + i)] = i;
+  shift1 = m1 - shift[hash2 (ne + m1)];
+  shift[hash2 (ne + m1)] = m1;
+
+  for ( ; hs <= end; )
+    {
+      /* Skip past character pairs not in the needle.  */
+      do
+	{
+	  hs += m1;
+	  tmp = shift[hash2 (hs)];
+	}
+      while (hs <= end && tmp == 0);
+
+      /* If the match is not at the end of the needle, shift to the end
+	 and continue until we match the last 2 characters.  */
+      hs -= tmp;
+      if (tmp < m1)
+	continue;
+
+      /* The last 2 characters match.  If the needle is long, check a
+	 fixed number of characters first to quickly filter out mismatches.  */
+      if (m1 <= 15 || memcmp (hs + offset, ne + offset, sizeof (long)) == 0)
+	{
+	  if (memcmp (hs, ne, m1) == 0)
+	    return (void *) hs;
+
+	  /* Adjust filter offset when it doesn't find the mismatch.  */
+	  offset = (offset >= sizeof (long) ? offset : m1) - sizeof (long);
+	}
+
+      /* Skip based on matching the last 2 characters.  */
+      hs += shift1;
+    }
+  return NULL;
+}
+#endif /* Compilation for speed.  */
--- a/newlib/libc/string/str-two-way.h
+++ b/newlib/libc/string/str-two-way.h
@ -31,6 +31,7 @@

 #include <limits.h>
 #include <stdint.h>
+#include <_ansi.h>

 /* We use the Two-Way string matching algorithm, which guarantees
   linear complexity with constant space.  Additionally, for long
@ -288,7 +289,7 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
   sublinear performance is not possible.  */
-static RETURN_TYPE
+_NOINLINE_STATIC RETURN_TYPE
 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
 		     const unsigned char *needle, size_t needle_len)
 {