(root)/
grep-3.11/
lib/
mbsstr.c
       1  /* Searching in a string.  -*- coding: utf-8 -*-
       2     Copyright (C) 2005-2023 Free Software Foundation, Inc.
       3     Written by Bruno Haible <bruno@clisp.org>, 2005.
       4  
       5     This file is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU Lesser General Public License as
       7     published by the Free Software Foundation, either version 3 of the
       8     License, or (at your option) any later version.
       9  
      10     This file is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU Lesser General Public License for more details.
      14  
      15     You should have received a copy of the GNU Lesser General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      17  
      18  #include <config.h>
      19  
      20  /* Specification.  */
      21  #include <string.h>
      22  
      23  #include <stddef.h>  /* for NULL, in case a nonstandard string.h lacks it */
      24  #include <stdlib.h>
      25  
      26  #include "malloca.h"
      27  #include "mbuiter.h"
      28  
      29  /* Knuth-Morris-Pratt algorithm.  */
      30  #define UNIT unsigned char
      31  #define CANON_ELEMENT(c) c
      32  #include "str-kmp.h"
      33  
      34  /* Knuth-Morris-Pratt algorithm.
      35     See https://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm
      36     Return a boolean indicating success:
      37     Return true and set *RESULTP if the search was completed.
      38     Return false if it was aborted because not enough memory was available.  */
      39  static bool
      40  knuth_morris_pratt_multibyte (const char *haystack, const char *needle,
      41                                const char **resultp)
      42  {
      43    size_t m = mbslen (needle);
      44    mbchar_t *needle_mbchars;
      45    size_t *table;
      46  
      47    /* Allocate room for needle_mbchars and the table.  */
      48    void *memory = nmalloca (m, sizeof (mbchar_t) + sizeof (size_t));
      49    void *table_memory;
      50    if (memory == NULL)
      51      return false;
      52    needle_mbchars = memory;
      53    table_memory = needle_mbchars + m;
      54    table = table_memory;
      55  
      56    /* Fill needle_mbchars.  */
      57    {
      58      mbui_iterator_t iter;
      59      size_t j;
      60  
      61      j = 0;
      62      for (mbui_init (iter, needle); mbui_avail (iter); mbui_advance (iter), j++)
      63        mb_copy (&needle_mbchars[j], &mbui_cur (iter));
      64    }
      65  
      66    /* Fill the table.
      67       For 0 < i < m:
      68         0 < table[i] <= i is defined such that
      69         forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x],
      70         and table[i] is as large as possible with this property.
      71       This implies:
      72       1) For 0 < i < m:
      73            If table[i] < i,
      74            needle[table[i]..i-1] = needle[0..i-1-table[i]].
      75       2) For 0 < i < m:
      76            rhaystack[0..i-1] == needle[0..i-1]
      77            and exists h, i <= h < m: rhaystack[h] != needle[h]
      78            implies
      79            forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1].
      80       table[0] remains uninitialized.  */
      81    {
      82      size_t i, j;
      83  
      84      /* i = 1: Nothing to verify for x = 0.  */
      85      table[1] = 1;
      86      j = 0;
      87  
      88      for (i = 2; i < m; i++)
      89        {
      90          /* Here: j = i-1 - table[i-1].
      91             The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold
      92             for x < table[i-1], by induction.
      93             Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1].  */
      94          mbchar_t *b = &needle_mbchars[i - 1];
      95  
      96          for (;;)
      97            {
      98              /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x]
      99                 is known to hold for x < i-1-j.
     100                 Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1].  */
     101              if (mb_equal (*b, needle_mbchars[j]))
     102                {
     103                  /* Set table[i] := i-1-j.  */
     104                  table[i] = i - ++j;
     105                  break;
     106                }
     107              /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
     108                 for x = i-1-j, because
     109                   needle[i-1] != needle[j] = needle[i-1-x].  */
     110              if (j == 0)
     111                {
     112                  /* The inequality holds for all possible x.  */
     113                  table[i] = i;
     114                  break;
     115                }
     116              /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
     117                 for i-1-j < x < i-1-j+table[j], because for these x:
     118                   needle[x..i-2]
     119                   = needle[x-(i-1-j)..j-1]
     120                   != needle[0..j-1-(x-(i-1-j))]  (by definition of table[j])
     121                      = needle[0..i-2-x],
     122                 hence needle[x..i-1] != needle[0..i-1-x].
     123                 Furthermore
     124                   needle[i-1-j+table[j]..i-2]
     125                   = needle[table[j]..j-1]
     126                   = needle[0..j-1-table[j]]  (by definition of table[j]).  */
     127              j = j - table[j];
     128            }
     129          /* Here: j = i - table[i].  */
     130        }
     131    }
     132  
     133    /* Search, using the table to accelerate the processing.  */
     134    {
     135      size_t j;
     136      mbui_iterator_t rhaystack;
     137      mbui_iterator_t phaystack;
     138  
     139      *resultp = NULL;
     140      j = 0;
     141      mbui_init (rhaystack, haystack);
     142      mbui_init (phaystack, haystack);
     143      /* Invariant: phaystack = rhaystack + j.  */
     144      while (mbui_avail (phaystack))
     145        if (mb_equal (needle_mbchars[j], mbui_cur (phaystack)))
     146          {
     147            j++;
     148            mbui_advance (phaystack);
     149            if (j == m)
     150              {
     151                /* The entire needle has been found.  */
     152                *resultp = mbui_cur_ptr (rhaystack);
     153                break;
     154              }
     155          }
     156        else if (j > 0)
     157          {
     158            /* Found a match of needle[0..j-1], mismatch at needle[j].  */
     159            size_t count = table[j];
     160            j -= count;
     161            for (; count > 0; count--)
     162              {
     163                if (!mbui_avail (rhaystack))
     164                  abort ();
     165                mbui_advance (rhaystack);
     166              }
     167          }
     168        else
     169          {
     170            /* Found a mismatch at needle[0] already.  */
     171            if (!mbui_avail (rhaystack))
     172              abort ();
     173            mbui_advance (rhaystack);
     174            mbui_advance (phaystack);
     175          }
     176    }
     177  
     178    freea (memory);
     179    return true;
     180  }
     181  
     182  /* Find the first occurrence of the character string NEEDLE in the character
     183     string HAYSTACK.  Return NULL if NEEDLE is not found in HAYSTACK.  */
     184  char *
     185  mbsstr (const char *haystack, const char *needle)
     186  {
     187    /* Be careful not to look at the entire extent of haystack or needle
     188       until needed.  This is useful because of these two cases:
     189         - haystack may be very long, and a match of needle found early,
     190         - needle may be very long, and not even a short initial segment of
     191           needle may be found in haystack.  */
     192    if (MB_CUR_MAX > 1)
     193      {
     194        mbui_iterator_t iter_needle;
     195  
     196        mbui_init (iter_needle, needle);
     197        if (mbui_avail (iter_needle))
     198          {
     199            /* Minimizing the worst-case complexity:
     200               Let n = mbslen(haystack), m = mbslen(needle).
     201               The naïve algorithm is O(n*m) worst-case.
     202               The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
     203               memory allocation.
     204               To achieve linear complexity and yet amortize the cost of the
     205               memory allocation, we activate the Knuth-Morris-Pratt algorithm
     206               only once the naïve algorithm has already run for some time; more
     207               precisely, when
     208                 - the outer loop count is >= 10,
     209                 - the average number of comparisons per outer loop is >= 5,
     210                 - the total number of comparisons is >= m.
     211               But we try it only once.  If the memory allocation attempt failed,
     212               we don't retry it.  */
     213            bool try_kmp = true;
     214            size_t outer_loop_count = 0;
     215            size_t comparison_count = 0;
     216            size_t last_ccount = 0;                  /* last comparison count */
     217            mbui_iterator_t iter_needle_last_ccount; /* = needle + last_ccount */
     218  
     219            mbui_iterator_t iter_haystack;
     220  
     221            mbui_init (iter_needle_last_ccount, needle);
     222            mbui_init (iter_haystack, haystack);
     223            for (;; mbui_advance (iter_haystack))
     224              {
     225                if (!mbui_avail (iter_haystack))
     226                  /* No match.  */
     227                  return NULL;
     228  
     229                /* See whether it's advisable to use an asymptotically faster
     230                   algorithm.  */
     231                if (try_kmp
     232                    && outer_loop_count >= 10
     233                    && comparison_count >= 5 * outer_loop_count)
     234                  {
     235                    /* See if needle + comparison_count now reaches the end of
     236                       needle.  */
     237                    size_t count = comparison_count - last_ccount;
     238                    for (;
     239                         count > 0 && mbui_avail (iter_needle_last_ccount);
     240                         count--)
     241                      mbui_advance (iter_needle_last_ccount);
     242                    last_ccount = comparison_count;
     243                    if (!mbui_avail (iter_needle_last_ccount))
     244                      {
     245                        /* Try the Knuth-Morris-Pratt algorithm.  */
     246                        const char *result;
     247                        bool success =
     248                          knuth_morris_pratt_multibyte (haystack, needle,
     249                                                        &result);
     250                        if (success)
     251                          return (char *) result;
     252                        try_kmp = false;
     253                      }
     254                  }
     255  
     256                outer_loop_count++;
     257                comparison_count++;
     258                if (mb_equal (mbui_cur (iter_haystack), mbui_cur (iter_needle)))
     259                  /* The first character matches.  */
     260                  {
     261                    mbui_iterator_t rhaystack;
     262                    mbui_iterator_t rneedle;
     263  
     264                    memcpy (&rhaystack, &iter_haystack, sizeof (mbui_iterator_t));
     265                    mbui_advance (rhaystack);
     266  
     267                    mbui_init (rneedle, needle);
     268                    if (!mbui_avail (rneedle))
     269                      abort ();
     270                    mbui_advance (rneedle);
     271  
     272                    for (;; mbui_advance (rhaystack), mbui_advance (rneedle))
     273                      {
     274                        if (!mbui_avail (rneedle))
     275                          /* Found a match.  */
     276                          return (char *) mbui_cur_ptr (iter_haystack);
     277                        if (!mbui_avail (rhaystack))
     278                          /* No match.  */
     279                          return NULL;
     280                        comparison_count++;
     281                        if (!mb_equal (mbui_cur (rhaystack), mbui_cur (rneedle)))
     282                          /* Nothing in this round.  */
     283                          break;
     284                      }
     285                  }
     286              }
     287          }
     288        else
     289          return (char *) haystack;
     290      }
     291    else
     292      {
     293        if (*needle != '\0')
     294          {
     295            /* Minimizing the worst-case complexity:
     296               Let n = strlen(haystack), m = strlen(needle).
     297               The naïve algorithm is O(n*m) worst-case.
     298               The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
     299               memory allocation.
     300               To achieve linear complexity and yet amortize the cost of the
     301               memory allocation, we activate the Knuth-Morris-Pratt algorithm
     302               only once the naïve algorithm has already run for some time; more
     303               precisely, when
     304                 - the outer loop count is >= 10,
     305                 - the average number of comparisons per outer loop is >= 5,
     306                 - the total number of comparisons is >= m.
     307               But we try it only once.  If the memory allocation attempt failed,
     308               we don't retry it.  */
     309            bool try_kmp = true;
     310            size_t outer_loop_count = 0;
     311            size_t comparison_count = 0;
     312            size_t last_ccount = 0;                  /* last comparison count */
     313            const char *needle_last_ccount = needle; /* = needle + last_ccount */
     314  
     315            /* Speed up the following searches of needle by caching its first
     316               character.  */
     317            char b = *needle++;
     318  
     319            for (;; haystack++)
     320              {
     321                if (*haystack == '\0')
     322                  /* No match.  */
     323                  return NULL;
     324  
     325                /* See whether it's advisable to use an asymptotically faster
     326                   algorithm.  */
     327                if (try_kmp
     328                    && outer_loop_count >= 10
     329                    && comparison_count >= 5 * outer_loop_count)
     330                  {
     331                    /* See if needle + comparison_count now reaches the end of
     332                       needle.  */
     333                    if (needle_last_ccount != NULL)
     334                      {
     335                        needle_last_ccount +=
     336                          strnlen (needle_last_ccount,
     337                                   comparison_count - last_ccount);
     338                        if (*needle_last_ccount == '\0')
     339                          needle_last_ccount = NULL;
     340                        last_ccount = comparison_count;
     341                      }
     342                    if (needle_last_ccount == NULL)
     343                      {
     344                        /* Try the Knuth-Morris-Pratt algorithm.  */
     345                        const unsigned char *result;
     346                        bool success =
     347                          knuth_morris_pratt ((const unsigned char *) haystack,
     348                                              (const unsigned char *) (needle - 1),
     349                                              strlen (needle - 1),
     350                                              &result);
     351                        if (success)
     352                          return (char *) result;
     353                        try_kmp = false;
     354                      }
     355                  }
     356  
     357                outer_loop_count++;
     358                comparison_count++;
     359                if (*haystack == b)
     360                  /* The first character matches.  */
     361                  {
     362                    const char *rhaystack = haystack + 1;
     363                    const char *rneedle = needle;
     364  
     365                    for (;; rhaystack++, rneedle++)
     366                      {
     367                        if (*rneedle == '\0')
     368                          /* Found a match.  */
     369                          return (char *) haystack;
     370                        if (*rhaystack == '\0')
     371                          /* No match.  */
     372                          return NULL;
     373                        comparison_count++;
     374                        if (*rhaystack != *rneedle)
     375                          /* Nothing in this round.  */
     376                          break;
     377                      }
     378                  }
     379              }
     380          }
     381        else
     382          return (char *) haystack;
     383      }
     384  }