(root)/
coreutils-9.4/
gnulib-tests/
str-two-way.h
       1  /* Byte-wise substring search, using the Two-Way algorithm.
       2     Copyright (C) 2008-2023 Free Software Foundation, Inc.
       3     This file is part of the GNU C Library.
       4     Written by Eric Blake <ebb9@byu.net>, 2008.
       5  
       6     This file is free software: you can redistribute it and/or modify
       7     it under the terms of the GNU Lesser General Public License as
       8     published by the Free Software Foundation; either version 2.1 of the
       9     License, or (at your option) any later version.
      10  
      11     This file is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14     GNU Lesser General Public License for more details.
      15  
      16     You should have received a copy of the GNU Lesser General Public License
      17     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      18  
      19  /* Before including this file, you need to include <config.h> and
      20     <string.h>, and define:
      21       RETURN_TYPE             A macro that expands to the return type.
      22       AVAILABLE(h, h_l, j, n_l)
      23                               A macro that returns nonzero if there are
      24                               at least N_L bytes left starting at H[J].
      25                               H is 'unsigned char *', H_L, J, and N_L
      26                               are 'size_t'; H_L is an lvalue.  For
      27                               NUL-terminated searches, H_L can be
      28                               modified each iteration to avoid having
      29                               to compute the end of H up front.
      30  
      31    For case-insensitivity, you may optionally define:
      32       CMP_FUNC(p1, p2, l)     A macro that returns 0 iff the first L
      33                               characters of P1 and P2 are equal.
      34       CANON_ELEMENT(c)        A macro that canonicalizes an element right after
      35                               it has been fetched from one of the two strings.
      36                               The argument is an 'unsigned char'; the result
      37                               must be an 'unsigned char' as well.
      38  
      39    This file undefines the macros documented above, and defines
      40    LONG_NEEDLE_THRESHOLD.
      41  */
      42  
      43  #include <limits.h>
      44  #include <stdint.h>
      45  
      46  /* We use the Two-Way string matching algorithm (also known as
      47     Chrochemore-Perrin), which guarantees linear complexity with
      48     constant space.  Additionally, for long needles, we also use a bad
      49     character shift table similar to the Boyer-Moore algorithm to
      50     achieve improved (potentially sub-linear) performance.
      51  
      52     See https://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260,
      53     https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm,
      54     https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf
      55  */
      56  
      57  /* Point at which computing a bad-byte shift table is likely to be
      58     worthwhile.  Small needles should not compute a table, since it
      59     adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
      60     speedup no greater than a factor of NEEDLE_LEN.  The larger the
      61     needle, the better the potential performance gain.  On the other
      62     hand, on non-POSIX systems with CHAR_BIT larger than eight, the
      63     memory required for the table is prohibitive.  */
      64  #if CHAR_BIT < 10
      65  # define LONG_NEEDLE_THRESHOLD 32U
      66  #else
      67  # define LONG_NEEDLE_THRESHOLD SIZE_MAX
      68  #endif
      69  
      70  #ifndef MAX
      71  # define MAX(a, b) ((a < b) ? (b) : (a))
      72  #endif
      73  
      74  #ifndef CANON_ELEMENT
      75  # define CANON_ELEMENT(c) c
      76  #endif
      77  #ifndef CMP_FUNC
      78  # define CMP_FUNC memcmp
      79  #endif
      80  
      81  /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
      82     Return the index of the first byte in the right half, and set
      83     *PERIOD to the global period of the right half.
      84  
      85     The global period of a string is the smallest index (possibly its
      86     length) at which all remaining bytes in the string are repetitions
      87     of the prefix (the last repetition may be a subset of the prefix).
      88  
      89     When NEEDLE is factored into two halves, a local period is the
      90     length of the smallest word that shares a suffix with the left half
      91     and shares a prefix with the right half.  All factorizations of a
      92     non-empty NEEDLE have a local period of at least 1 and no greater
      93     than NEEDLE_LEN.
      94  
      95     A critical factorization has the property that the local period
      96     equals the global period.  All strings have at least one critical
      97     factorization with the left half smaller than the global period.
      98     And while some strings have more than one critical factorization,
      99     it is provable that with an ordered alphabet, at least one of the
     100     critical factorizations corresponds to a maximal suffix.
     101  
     102     Given an ordered alphabet, a critical factorization can be computed
     103     in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
     104     shorter of two ordered maximal suffixes.  The ordered maximal
     105     suffixes are determined by lexicographic comparison while tracking
     106     periodicity.  */
     107  static size_t
     108  critical_factorization (const unsigned char *needle, size_t needle_len,
     109                          size_t *period)
     110  {
     111    /* Index of last byte of left half, or SIZE_MAX.  */
     112    size_t max_suffix, max_suffix_rev;
     113    size_t j; /* Index into NEEDLE for current candidate suffix.  */
     114    size_t k; /* Offset into current period.  */
     115    size_t p; /* Intermediate period.  */
     116    unsigned char a, b; /* Current comparison bytes.  */
     117  
     118    /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered
     119       out 0-length needles.  */
     120    if (needle_len < 3)
     121      {
     122        *period = 1;
     123        return needle_len - 1;
     124      }
     125  
     126    /* Invariants:
     127       0 <= j < NEEDLE_LEN - 1
     128       -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
     129       min(max_suffix, max_suffix_rev) < global period of NEEDLE
     130       1 <= p <= global period of NEEDLE
     131       p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
     132       1 <= k <= p
     133    */
     134  
     135    /* Perform lexicographic search.  */
     136    max_suffix = SIZE_MAX;
     137    j = 0;
     138    k = p = 1;
     139    while (j + k < needle_len)
     140      {
     141        a = CANON_ELEMENT (needle[j + k]);
     142        b = CANON_ELEMENT (needle[max_suffix + k]);
     143        if (a < b)
     144          {
     145            /* Suffix is smaller, period is entire prefix so far.  */
     146            j += k;
     147            k = 1;
     148            p = j - max_suffix;
     149          }
     150        else if (a == b)
     151          {
     152            /* Advance through repetition of the current period.  */
     153            if (k != p)
     154              ++k;
     155            else
     156              {
     157                j += p;
     158                k = 1;
     159              }
     160          }
     161        else /* b < a */
     162          {
     163            /* Suffix is larger, start over from current location.  */
     164            max_suffix = j++;
     165            k = p = 1;
     166          }
     167      }
     168    *period = p;
     169  
     170    /* Perform reverse lexicographic search.  */
     171    max_suffix_rev = SIZE_MAX;
     172    j = 0;
     173    k = p = 1;
     174    while (j + k < needle_len)
     175      {
     176        a = CANON_ELEMENT (needle[j + k]);
     177        b = CANON_ELEMENT (needle[max_suffix_rev + k]);
     178        if (b < a)
     179          {
     180            /* Suffix is smaller, period is entire prefix so far.  */
     181            j += k;
     182            k = 1;
     183            p = j - max_suffix_rev;
     184          }
     185        else if (a == b)
     186          {
     187            /* Advance through repetition of the current period.  */
     188            if (k != p)
     189              ++k;
     190            else
     191              {
     192                j += p;
     193                k = 1;
     194              }
     195          }
     196        else /* a < b */
     197          {
     198            /* Suffix is larger, start over from current location.  */
     199            max_suffix_rev = j++;
     200            k = p = 1;
     201          }
     202      }
     203  
     204    /* Choose the shorter suffix.  Return the index of the first byte of
     205       the right half, rather than the last byte of the left half.
     206  
     207       For some examples, 'banana' has two critical factorizations, both
     208       exposed by the two lexicographic extreme suffixes of 'anana' and
     209       'nana', where both suffixes have a period of 2.  On the other
     210       hand, with 'aab' and 'bba', both strings have a single critical
     211       factorization of the last byte, with the suffix having a period
     212       of 1.  While the maximal lexicographic suffix of 'aab' is 'b',
     213       the maximal lexicographic suffix of 'bba' is 'ba', which is not a
     214       critical factorization.  Conversely, the maximal reverse
     215       lexicographic suffix of 'a' works for 'bba', but not 'ab' for
     216       'aab'.  The shorter suffix of the two will always be a critical
     217       factorization.  */
     218    if (max_suffix_rev + 1 < max_suffix + 1)
     219      return max_suffix + 1;
     220    *period = p;
     221    return max_suffix_rev + 1;
     222  }
     223  
     224  /* Return the first location of non-empty NEEDLE within HAYSTACK, or
     225     NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
     226     method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
     227     Performance is guaranteed to be linear, with an initialization cost
     228     of 2 * NEEDLE_LEN comparisons.
     229  
     230     If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
     231     most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
     232     If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
     233     HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.  */
     234  static RETURN_TYPE _GL_ATTRIBUTE_PURE
     235  two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
     236                        const unsigned char *needle, size_t needle_len)
     237  {
     238    size_t i; /* Index into current byte of NEEDLE.  */
     239    size_t j; /* Index into current window of HAYSTACK.  */
     240    size_t period; /* The period of the right half of needle.  */
     241    size_t suffix; /* The index of the right half of needle.  */
     242  
     243    /* Factor the needle into two halves, such that the left half is
     244       smaller than the global period, and the right half is
     245       periodic (with a period as large as NEEDLE_LEN - suffix).  */
     246    suffix = critical_factorization (needle, needle_len, &period);
     247  
     248    /* Perform the search.  Each iteration compares the right half
     249       first.  */
     250    if (CMP_FUNC (needle, needle + period, suffix) == 0)
     251      {
     252        /* Entire needle is periodic; a mismatch in the left half can
     253           only advance by the period, so use memory to avoid rescanning
     254           known occurrences of the period in the right half.  */
     255        size_t memory = 0;
     256        j = 0;
     257        while (AVAILABLE (haystack, haystack_len, j, needle_len))
     258          {
     259            /* Scan for matches in right half.  */
     260            i = MAX (suffix, memory);
     261            while (i < needle_len && (CANON_ELEMENT (needle[i])
     262                                      == CANON_ELEMENT (haystack[i + j])))
     263              ++i;
     264            if (needle_len <= i)
     265              {
     266                /* Scan for matches in left half.  */
     267                i = suffix - 1;
     268                while (memory < i + 1 && (CANON_ELEMENT (needle[i])
     269                                          == CANON_ELEMENT (haystack[i + j])))
     270                  --i;
     271                if (i + 1 < memory + 1)
     272                  return (RETURN_TYPE) (haystack + j);
     273                /* No match, so remember how many repetitions of period
     274                   on the right half were scanned.  */
     275                j += period;
     276                memory = needle_len - period;
     277              }
     278            else
     279              {
     280                j += i - suffix + 1;
     281                memory = 0;
     282              }
     283          }
     284      }
     285    else
     286      {
     287        /* The two halves of needle are distinct; no extra memory is
     288           required, and any mismatch results in a maximal shift.  */
     289        period = MAX (suffix, needle_len - suffix) + 1;
     290        j = 0;
     291        while (AVAILABLE (haystack, haystack_len, j, needle_len))
     292          {
     293            /* Scan for matches in right half.  */
     294            i = suffix;
     295            while (i < needle_len && (CANON_ELEMENT (needle[i])
     296                                      == CANON_ELEMENT (haystack[i + j])))
     297              ++i;
     298            if (needle_len <= i)
     299              {
     300                /* Scan for matches in left half.  */
     301                i = suffix - 1;
     302                while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
     303                                         == CANON_ELEMENT (haystack[i + j])))
     304                  --i;
     305                if (i == SIZE_MAX)
     306                  return (RETURN_TYPE) (haystack + j);
     307                j += period;
     308              }
     309            else
     310              j += i - suffix + 1;
     311          }
     312      }
     313    return NULL;
     314  }
     315  
     316  /* Return the first location of non-empty NEEDLE within HAYSTACK, or
     317     NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
     318     method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
     319     Performance is guaranteed to be linear, with an initialization cost
     320     of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
     321  
     322     If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
     323     most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
     324     and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
     325     If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
     326     HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
     327     sublinear performance is not possible.  */
     328  static RETURN_TYPE _GL_ATTRIBUTE_PURE
     329  two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
     330                       const unsigned char *needle, size_t needle_len)
     331  {
     332    size_t i; /* Index into current byte of NEEDLE.  */
     333    size_t j; /* Index into current window of HAYSTACK.  */
     334    size_t period; /* The period of the right half of needle.  */
     335    size_t suffix; /* The index of the right half of needle.  */
     336    size_t shift_table[1U << CHAR_BIT]; /* See below.  */
     337  
     338    /* Factor the needle into two halves, such that the left half is
     339       smaller than the global period, and the right half is
     340       periodic (with a period as large as NEEDLE_LEN - suffix).  */
     341    suffix = critical_factorization (needle, needle_len, &period);
     342  
     343    /* Populate shift_table.  For each possible byte value c,
     344       shift_table[c] is the distance from the last occurrence of c to
     345       the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
     346       shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0.  */
     347    for (i = 0; i < 1U << CHAR_BIT; i++)
     348      shift_table[i] = needle_len;
     349    for (i = 0; i < needle_len; i++)
     350      shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
     351  
     352    /* Perform the search.  Each iteration compares the right half
     353       first.  */
     354    if (CMP_FUNC (needle, needle + period, suffix) == 0)
     355      {
     356        /* Entire needle is periodic; a mismatch in the left half can
     357           only advance by the period, so use memory to avoid rescanning
     358           known occurrences of the period in the right half.  */
     359        size_t memory = 0;
     360        size_t shift;
     361        j = 0;
     362        while (AVAILABLE (haystack, haystack_len, j, needle_len))
     363          {
     364            /* Check the last byte first; if it does not match, then
     365               shift to the next possible match location.  */
     366            shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
     367            if (0 < shift)
     368              {
     369                if (memory && shift < period)
     370                  {
     371                    /* Since needle is periodic, but the last period has
     372                       a byte out of place, there can be no match until
     373                       after the mismatch.  */
     374                    shift = needle_len - period;
     375                  }
     376                memory = 0;
     377                j += shift;
     378                continue;
     379              }
     380            /* Scan for matches in right half.  The last byte has
     381               already been matched, by virtue of the shift table.  */
     382            i = MAX (suffix, memory);
     383            while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
     384                                          == CANON_ELEMENT (haystack[i + j])))
     385              ++i;
     386            if (needle_len - 1 <= i)
     387              {
     388                /* Scan for matches in left half.  */
     389                i = suffix - 1;
     390                while (memory < i + 1 && (CANON_ELEMENT (needle[i])
     391                                          == CANON_ELEMENT (haystack[i + j])))
     392                  --i;
     393                if (i + 1 < memory + 1)
     394                  return (RETURN_TYPE) (haystack + j);
     395                /* No match, so remember how many repetitions of period
     396                   on the right half were scanned.  */
     397                j += period;
     398                memory = needle_len - period;
     399              }
     400            else
     401              {
     402                j += i - suffix + 1;
     403                memory = 0;
     404              }
     405          }
     406      }
     407    else
     408      {
     409        /* The two halves of needle are distinct; no extra memory is
     410           required, and any mismatch results in a maximal shift.  */
     411        size_t shift;
     412        period = MAX (suffix, needle_len - suffix) + 1;
     413        j = 0;
     414        while (AVAILABLE (haystack, haystack_len, j, needle_len))
     415          {
     416            /* Check the last byte first; if it does not match, then
     417               shift to the next possible match location.  */
     418            shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
     419            if (0 < shift)
     420              {
     421                j += shift;
     422                continue;
     423              }
     424            /* Scan for matches in right half.  The last byte has
     425               already been matched, by virtue of the shift table.  */
     426            i = suffix;
     427            while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
     428                                          == CANON_ELEMENT (haystack[i + j])))
     429              ++i;
     430            if (needle_len - 1 <= i)
     431              {
     432                /* Scan for matches in left half.  */
     433                i = suffix - 1;
     434                while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
     435                                         == CANON_ELEMENT (haystack[i + j])))
     436                  --i;
     437                if (i == SIZE_MAX)
     438                  return (RETURN_TYPE) (haystack + j);
     439                j += period;
     440              }
     441            else
     442              j += i - suffix + 1;
     443          }
     444      }
     445    return NULL;
     446  }
     447  
     448  #undef AVAILABLE
     449  #undef CANON_ELEMENT
     450  #undef CMP_FUNC
     451  #undef MAX
     452  #undef RETURN_TYPE