(root)/
glibc-2.38/
string/
str-two-way.h
       1  /* Byte-wise substring search, using the Two-Way algorithm.
       2     Copyright (C) 2008-2023 Free Software Foundation, Inc.
       3     This file is part of the GNU C Library.
       4     Written by Eric Blake <ebb9@byu.net>, 2008.
       5  
       6     The GNU C Library is free software; you can redistribute it and/or
       7     modify it under the terms of the GNU Lesser General Public
       8     License as published by the Free Software Foundation; either
       9     version 2.1 of the License, or (at your option) any later version.
      10  
      11     The GNU C Library is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      14     Lesser General Public License for more details.
      15  
      16     You should have received a copy of the GNU Lesser General Public
      17     License along with the GNU C Library; if not, see
      18     <https://www.gnu.org/licenses/>.  */
      19  
      20  /* Before including this file, you need to include <string.h> (and
      21     <config.h> before that, if not part of libc), and define:
      22       RETURN_TYPE             A macro that expands to the return type.
      23       AVAILABLE(h, h_l, j, n_l)
      24  			     A macro that returns nonzero if there are
      25  			     at least N_L bytes left starting at H[J].
      26  			     H is 'unsigned char *', H_L, J, and N_L
      27  			     are 'size_t'; H_L is an lvalue.  For
      28  			     NUL-terminated searches, H_L can be
      29  			     modified each iteration to avoid having
      30  			     to compute the end of H up front.
      31  
      32    For case-insensitivity, you may optionally define:
      33       CMP_FUNC(p1, p2, l)     A macro that returns 0 iff the first L
      34  			     characters of P1 and P2 are equal.
      35       CANON_ELEMENT(c)        A macro that canonicalizes an element right after
      36  			     it has been fetched from one of the two strings.
      37  			     The argument is an 'unsigned char'; the result
      38  			     must be an 'unsigned char' as well.
      39  
      40    Other macros you may optionally define:
      41       RET0_IF_0(a)            Documented below at default definition.
      42       CHECK_EOL               Same.
      43  
      44    This file undefines the macros listed above, and defines
      45    LONG_NEEDLE_THRESHOLD.
      46  */
      47  
      48  #include <limits.h>
      49  #include <stdint.h>
      50  #include <sys/param.h>                  /* Defines MAX.  */
      51  
      52  /* We use the Two-Way string matching algorithm, which guarantees
      53     linear complexity with constant space.  Additionally, for long
      54     needles, we also use a bad character shift table similar to the
      55     Boyer-Moore algorithm to achieve improved (potentially sub-linear)
      56     performance.
      57  
      58     See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
      59     and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
      60  */
      61  
      62  /* Point at which computing a bad-byte shift table is likely to be
      63     worthwhile.  Small needles should not compute a table, since it
      64     adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
      65     speedup no greater than a factor of NEEDLE_LEN.  The larger the
      66     needle, the better the potential performance gain.  On the other
      67     hand, on non-POSIX systems with CHAR_BIT larger than eight, the
      68     memory required for the table is prohibitive.  */
      69  #if CHAR_BIT < 10
      70  # define LONG_NEEDLE_THRESHOLD 32U
      71  #else
      72  # define LONG_NEEDLE_THRESHOLD SIZE_MAX
      73  #endif
      74  
      75  #ifndef CANON_ELEMENT
      76  # define CANON_ELEMENT(c) c
      77  #endif
      78  #ifndef CMP_FUNC
      79  # define CMP_FUNC memcmp
      80  #endif
      81  
      82  /* Check for end-of-line in strstr and strcasestr routines.
      83     We piggy-back matching procedure for detecting EOL where possible,
      84     and use AVAILABLE macro otherwise.  */
      85  #ifndef CHECK_EOL
      86  # define CHECK_EOL (0)
      87  #endif
      88  
      89  /* Return NULL if argument is '\0'.  */
      90  #ifndef RET0_IF_0
      91  # define RET0_IF_0(a) /* nothing */
      92  #endif
      93  
      94  /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
      95     Return the index of the first byte in the right half, and set
      96     *PERIOD to the global period of the right half.
      97  
      98     The global period of a string is the smallest index (possibly its
      99     length) at which all remaining bytes in the string are repetitions
     100     of the prefix (the last repetition may be a subset of the prefix).
     101  
     102     When NEEDLE is factored into two halves, a local period is the
     103     length of the smallest word that shares a suffix with the left half
     104     and shares a prefix with the right half.  All factorizations of a
     105     non-empty NEEDLE have a local period of at least 1 and no greater
     106     than NEEDLE_LEN.
     107  
     108     A critical factorization has the property that the local period
     109     equals the global period.  All strings have at least one critical
     110     factorization with the left half smaller than the global period.
     111  
     112     Given an ordered alphabet, a critical factorization can be computed
     113     in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
     114     larger of two ordered maximal suffixes.  The ordered maximal
     115     suffixes are determined by lexicographic comparison of
     116     periodicity.  */
     117  static size_t
     118  critical_factorization (const unsigned char *needle, size_t needle_len,
     119  			size_t *period)
     120  {
     121    /* Index of last byte of left half, or SIZE_MAX.  */
     122    size_t max_suffix, max_suffix_rev;
     123    size_t j; /* Index into NEEDLE for current candidate suffix.  */
     124    size_t k; /* Offset into current period.  */
     125    size_t p; /* Intermediate period.  */
     126    unsigned char a, b; /* Current comparison bytes.  */
     127  
     128    /* Invariants:
     129       0 <= j < NEEDLE_LEN - 1
     130       -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
     131       min(max_suffix, max_suffix_rev) < global period of NEEDLE
     132       1 <= p <= global period of NEEDLE
     133       p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
     134       1 <= k <= p
     135    */
     136  
     137    /* Perform lexicographic search.  */
     138    max_suffix = SIZE_MAX;
     139    j = 0;
     140    k = p = 1;
     141    while (j + k < needle_len)
     142      {
     143        a = CANON_ELEMENT (needle[j + k]);
     144        b = CANON_ELEMENT (needle[max_suffix + k]);
     145        if (a < b)
     146  	{
     147  	  /* Suffix is smaller, period is entire prefix so far.  */
     148  	  j += k;
     149  	  k = 1;
     150  	  p = j - max_suffix;
     151  	}
     152        else if (a == b)
     153  	{
     154  	  /* Advance through repetition of the current period.  */
     155  	  if (k != p)
     156  	    ++k;
     157  	  else
     158  	    {
     159  	      j += p;
     160  	      k = 1;
     161  	    }
     162  	}
     163        else /* b < a */
     164  	{
     165  	  /* Suffix is larger, start over from current location.  */
     166  	  max_suffix = j++;
     167  	  k = p = 1;
     168  	}
     169      }
     170    *period = p;
     171  
     172    /* Perform reverse lexicographic search.  */
     173    max_suffix_rev = SIZE_MAX;
     174    j = 0;
     175    k = p = 1;
     176    while (j + k < needle_len)
     177      {
     178        a = CANON_ELEMENT (needle[j + k]);
     179        b = CANON_ELEMENT (needle[max_suffix_rev + k]);
     180        if (b < a)
     181  	{
     182  	  /* Suffix is smaller, period is entire prefix so far.  */
     183  	  j += k;
     184  	  k = 1;
     185  	  p = j - max_suffix_rev;
     186  	}
     187        else if (a == b)
     188  	{
     189  	  /* Advance through repetition of the current period.  */
     190  	  if (k != p)
     191  	    ++k;
     192  	  else
     193  	    {
     194  	      j += p;
     195  	      k = 1;
     196  	    }
     197  	}
     198        else /* a < b */
     199  	{
     200  	  /* Suffix is larger, start over from current location.  */
     201  	  max_suffix_rev = j++;
     202  	  k = p = 1;
     203  	}
     204      }
     205  
     206    /* Choose the longer suffix.  Return the first byte of the right
     207       half, rather than the last byte of the left half.  */
     208    if (max_suffix_rev + 1 < max_suffix + 1)
     209      return max_suffix + 1;
     210    *period = p;
     211    return max_suffix_rev + 1;
     212  }
     213  
     214  /* Return the first location of non-empty NEEDLE within HAYSTACK, or
     215     NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
     216     method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
     217     Performance is guaranteed to be linear, with an initialization cost
     218     of 2 * NEEDLE_LEN comparisons.
     219  
     220     If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
     221     most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
     222     If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
     223     HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.  */
     224  static inline RETURN_TYPE
     225  two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
     226  		      const unsigned char *needle, size_t needle_len)
     227  {
     228    size_t i; /* Index into current byte of NEEDLE.  */
     229    size_t j; /* Index into current window of HAYSTACK.  */
     230    size_t period; /* The period of the right half of needle.  */
     231    size_t suffix; /* The index of the right half of needle.  */
     232  
     233    /* Factor the needle into two halves, such that the left half is
     234       smaller than the global period, and the right half is
     235       periodic (with a period as large as NEEDLE_LEN - suffix).  */
     236    suffix = critical_factorization (needle, needle_len, &period);
     237  
     238    /* Perform the search.  Each iteration compares the right half
     239       first.  */
     240    if (CMP_FUNC (needle, needle + period, suffix) == 0)
     241      {
     242        /* Entire needle is periodic; a mismatch can only advance by the
     243  	 period, so use memory to avoid rescanning known occurrences
     244  	 of the period.  */
     245        size_t memory = 0;
     246        j = 0;
     247        while (AVAILABLE (haystack, haystack_len, j, needle_len))
     248  	{
     249  	  const unsigned char *pneedle;
     250  	  const unsigned char *phaystack;
     251  
     252  	  /* Scan for matches in right half.  */
     253  	  i = MAX (suffix, memory);
     254  	  pneedle = &needle[i];
     255  	  phaystack = &haystack[i + j];
     256  	  while (i < needle_len && (CANON_ELEMENT (*pneedle++)
     257  				    == CANON_ELEMENT (*phaystack++)))
     258  	    ++i;
     259  	  if (needle_len <= i)
     260  	    {
     261  	      /* Scan for matches in left half.  */
     262  	      i = suffix - 1;
     263  	      pneedle = &needle[i];
     264  	      phaystack = &haystack[i + j];
     265  	      while (memory < i + 1 && (CANON_ELEMENT (*pneedle--)
     266  					== CANON_ELEMENT (*phaystack--)))
     267  		--i;
     268  	      if (i + 1 < memory + 1)
     269  		return (RETURN_TYPE) (haystack + j);
     270  	      /* No match, so remember how many repetitions of period
     271  		 on the right half were scanned.  */
     272  	      j += period;
     273  	      memory = needle_len - period;
     274  	    }
     275  	  else
     276  	    {
     277  	      j += i - suffix + 1;
     278  	      memory = 0;
     279  	    }
     280  	}
     281      }
     282    else
     283      {
     284        const unsigned char *phaystack;
     285        /* The comparison always starts from needle[suffix], so cache it
     286  	 and use an optimized first-character loop.  */
     287        unsigned char needle_suffix = CANON_ELEMENT (needle[suffix]);
     288  
     289        /* The two halves of needle are distinct; no extra memory is
     290  	 required, and any mismatch results in a maximal shift.  */
     291        period = MAX (suffix, needle_len - suffix) + 1;
     292        j = 0;
     293        while (AVAILABLE (haystack, haystack_len, j, needle_len))
     294  	{
     295  	  unsigned char haystack_char;
     296  	  const unsigned char *pneedle;
     297  
     298  	  phaystack = &haystack[suffix + j];
     299  
     300  #ifdef FASTSEARCH
     301  	  if (*phaystack++ != needle_suffix)
     302  	    {
     303  	      phaystack = FASTSEARCH (phaystack, needle_suffix,
     304  				      haystack_len - needle_len - j);
     305  	      if (phaystack == NULL)
     306  		goto ret0;
     307  	      j = phaystack - &haystack[suffix];
     308  	      phaystack++;
     309  	    }
     310  #else
     311  	  while (needle_suffix
     312  	      != (haystack_char = CANON_ELEMENT (*phaystack++)))
     313  	    {
     314  	      RET0_IF_0 (haystack_char);
     315  # if !CHECK_EOL
     316  	      ++j;
     317  	      if (!AVAILABLE (haystack, haystack_len, j, needle_len))
     318  		goto ret0;
     319  # endif
     320  	    }
     321  
     322  # if CHECK_EOL
     323  	  /* Calculate J if it wasn't kept up-to-date in the first-character
     324  	     loop.  */
     325  	  j = phaystack - &haystack[suffix] - 1;
     326  # endif
     327  #endif
     328  	  /* Scan for matches in right half.  */
     329  	  i = suffix + 1;
     330  	  pneedle = &needle[i];
     331  	  while (i < needle_len)
     332  	    {
     333  	      if (CANON_ELEMENT (*pneedle++)
     334  		  != (haystack_char = CANON_ELEMENT (*phaystack++)))
     335  		{
     336  		  RET0_IF_0 (haystack_char);
     337  		  break;
     338  		}
     339  	      ++i;
     340  	    }
     341  #if CHECK_EOL
     342  	  /* Update minimal length of haystack.  */
     343  	  if (phaystack > haystack + haystack_len)
     344  	    haystack_len = phaystack - haystack;
     345  #endif
     346  	  if (needle_len <= i)
     347  	    {
     348  	      /* Scan for matches in left half.  */
     349  	      i = suffix - 1;
     350  	      pneedle = &needle[i];
     351  	      phaystack = &haystack[i + j];
     352  	      while (i != SIZE_MAX)
     353  		{
     354  		  if (CANON_ELEMENT (*pneedle--)
     355  		      != (haystack_char = CANON_ELEMENT (*phaystack--)))
     356  		    {
     357  		      RET0_IF_0 (haystack_char);
     358  		      break;
     359  		    }
     360  		  --i;
     361  		}
     362  	      if (i == SIZE_MAX)
     363  		return (RETURN_TYPE) (haystack + j);
     364  	      j += period;
     365  	    }
     366  	  else
     367  	    j += i - suffix + 1;
     368  	}
     369      }
     370   ret0: __attribute__ ((unused))
     371    return NULL;
     372  }
     373  
     374  /* Return the first location of non-empty NEEDLE within HAYSTACK, or
     375     NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
     376     method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
     377     Performance is guaranteed to be linear, with an initialization cost
     378     of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
     379  
     380     If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
     381     most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
     382     and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
     383     If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
     384     HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
     385     sublinear performance is not possible.
     386  
     387     Since this function is large and complex, block inlining to avoid
     388     slowing down the common case of small needles.  */
     389  __attribute__((noinline)) static RETURN_TYPE
     390  two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
     391  		     const unsigned char *needle, size_t needle_len)
     392  {
     393    size_t i; /* Index into current byte of NEEDLE.  */
     394    size_t j; /* Index into current window of HAYSTACK.  */
     395    size_t period; /* The period of the right half of needle.  */
     396    size_t suffix; /* The index of the right half of needle.  */
     397    size_t shift_table[1U << CHAR_BIT]; /* See below.  */
     398  
     399    /* Factor the needle into two halves, such that the left half is
     400       smaller than the global period, and the right half is
     401       periodic (with a period as large as NEEDLE_LEN - suffix).  */
     402    suffix = critical_factorization (needle, needle_len, &period);
     403  
     404    /* Populate shift_table.  For each possible byte value c,
     405       shift_table[c] is the distance from the last occurrence of c to
     406       the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
     407       shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0.  */
     408    for (i = 0; i < 1U << CHAR_BIT; i++)
     409      shift_table[i] = needle_len;
     410    for (i = 0; i < needle_len; i++)
     411      shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
     412  
     413    /* Perform the search.  Each iteration compares the right half
     414       first.  */
     415    if (CMP_FUNC (needle, needle + period, suffix) == 0)
     416      {
     417        /* Entire needle is periodic; a mismatch can only advance by the
     418  	 period, so use memory to avoid rescanning known occurrences
     419  	 of the period.  */
     420        size_t memory = 0;
     421        size_t shift;
     422        j = 0;
     423        while (AVAILABLE (haystack, haystack_len, j, needle_len))
     424  	{
     425  	  const unsigned char *pneedle;
     426  	  const unsigned char *phaystack;
     427  
     428  	  /* Check the last byte first; if it does not match, then
     429  	     shift to the next possible match location.  */
     430  	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
     431  	  if (0 < shift)
     432  	    {
     433  	      if (memory && shift < period)
     434  		{
     435  		  /* Since needle is periodic, but the last period has
     436  		     a byte out of place, there can be no match until
     437  		     after the mismatch.  */
     438  		  shift = needle_len - period;
     439  		}
     440  	      memory = 0;
     441  	      j += shift;
     442  	      continue;
     443  	    }
     444  	  /* Scan for matches in right half.  The last byte has
     445  	     already been matched, by virtue of the shift table.  */
     446  	  i = MAX (suffix, memory);
     447  	  pneedle = &needle[i];
     448  	  phaystack = &haystack[i + j];
     449  	  while (i < needle_len - 1 && (CANON_ELEMENT (*pneedle++)
     450  					== CANON_ELEMENT (*phaystack++)))
     451  	    ++i;
     452  	  if (needle_len - 1 <= i)
     453  	    {
     454  	      /* Scan for matches in left half.  */
     455  	      i = suffix - 1;
     456  	      pneedle = &needle[i];
     457  	      phaystack = &haystack[i + j];
     458  	      while (memory < i + 1 && (CANON_ELEMENT (*pneedle--)
     459  					== CANON_ELEMENT (*phaystack--)))
     460  		--i;
     461  	      if (i + 1 < memory + 1)
     462  		return (RETURN_TYPE) (haystack + j);
     463  	      /* No match, so remember how many repetitions of period
     464  		 on the right half were scanned.  */
     465  	      j += period;
     466  	      memory = needle_len - period;
     467  	    }
     468  	  else
     469  	    {
     470  	      j += i - suffix + 1;
     471  	      memory = 0;
     472  	    }
     473  	}
     474      }
     475    else
     476      {
     477        /* The two halves of needle are distinct; no extra memory is
     478  	 required, and any mismatch results in a maximal shift.  */
     479        size_t shift;
     480        period = MAX (suffix, needle_len - suffix) + 1;
     481        j = 0;
     482        while (AVAILABLE (haystack, haystack_len, j, needle_len))
     483  	{
     484  	  const unsigned char *pneedle;
     485  	  const unsigned char *phaystack;
     486  
     487  	  /* Check the last byte first; if it does not match, then
     488  	     shift to the next possible match location.  */
     489  	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
     490  	  if (0 < shift)
     491  	    {
     492  	      j += shift;
     493  	      continue;
     494  	    }
     495  	  /* Scan for matches in right half.  The last byte has
     496  	     already been matched, by virtue of the shift table.  */
     497  	  i = suffix;
     498  	  pneedle = &needle[i];
     499  	  phaystack = &haystack[i + j];
     500  	  while (i < needle_len - 1 && (CANON_ELEMENT (*pneedle++)
     501  					== CANON_ELEMENT (*phaystack++)))
     502  	    ++i;
     503  	  if (needle_len - 1 <= i)
     504  	    {
     505  	      /* Scan for matches in left half.  */
     506  	      i = suffix - 1;
     507  	      pneedle = &needle[i];
     508  	      phaystack = &haystack[i + j];
     509  	      while (i != SIZE_MAX && (CANON_ELEMENT (*pneedle--)
     510  				       == CANON_ELEMENT (*phaystack--)))
     511  		--i;
     512  	      if (i == SIZE_MAX)
     513  		return (RETURN_TYPE) (haystack + j);
     514  	      j += period;
     515  	    }
     516  	  else
     517  	    j += i - suffix + 1;
     518  	}
     519      }
     520    return NULL;
     521  }
     522  
     523  #undef AVAILABLE
     524  #undef CANON_ELEMENT
     525  #undef CMP_FUNC
     526  #undef RET0_IF_0
     527  #undef RETURN_TYPE
     528  #undef CHECK_EOL