(root)/
m4-1.4.19/
lib/
mbsstr.c
       1  /* Searching in a string.  -*- coding: utf-8 -*-
       2     Copyright (C) 2005-2021 Free Software Foundation, Inc.
       3     Written by Bruno Haible <bruno@clisp.org>, 2005.
       4  
       5     This program is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU General Public License as published by
       7     the Free Software Foundation; either version 3 of the License, or
       8     (at your option) any later version.
       9  
      10     This program is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU General Public License for more details.
      14  
      15     You should have received a copy of the GNU General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      17  
      18  #include <config.h>
      19  
      20  /* Specification.  */
      21  #include <string.h>
      22  
      23  #include <stdbool.h>
      24  #include <stddef.h>  /* for NULL, in case a nonstandard string.h lacks it */
      25  #include <stdlib.h>
      26  
      27  #include "malloca.h"
      28  #include "mbuiter.h"
      29  
      30  /* Knuth-Morris-Pratt algorithm.  */
      31  #define UNIT unsigned char
      32  #define CANON_ELEMENT(c) c
      33  #include "str-kmp.h"
      34  
      35  /* Knuth-Morris-Pratt algorithm.
      36     See https://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm
      37     Return a boolean indicating success:
      38     Return true and set *RESULTP if the search was completed.
      39     Return false if it was aborted because not enough memory was available.  */
      40  static bool
      41  knuth_morris_pratt_multibyte (const char *haystack, const char *needle,
      42                                const char **resultp)
      43  {
      44    size_t m = mbslen (needle);
      45    mbchar_t *needle_mbchars;
      46    size_t *table;
      47  
      48    /* Allocate room for needle_mbchars and the table.  */
      49    void *memory = nmalloca (m, sizeof (mbchar_t) + sizeof (size_t));
      50    void *table_memory;
      51    if (memory == NULL)
      52      return false;
      53    needle_mbchars = memory;
      54    table_memory = needle_mbchars + m;
      55    table = table_memory;
      56  
      57    /* Fill needle_mbchars.  */
      58    {
      59      mbui_iterator_t iter;
      60      size_t j;
      61  
      62      j = 0;
      63      for (mbui_init (iter, needle); mbui_avail (iter); mbui_advance (iter), j++)
      64        mb_copy (&needle_mbchars[j], &mbui_cur (iter));
      65    }
      66  
      67    /* Fill the table.
      68       For 0 < i < m:
      69         0 < table[i] <= i is defined such that
      70         forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x],
      71         and table[i] is as large as possible with this property.
      72       This implies:
      73       1) For 0 < i < m:
      74            If table[i] < i,
      75            needle[table[i]..i-1] = needle[0..i-1-table[i]].
      76       2) For 0 < i < m:
      77            rhaystack[0..i-1] == needle[0..i-1]
      78            and exists h, i <= h < m: rhaystack[h] != needle[h]
      79            implies
      80            forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1].
      81       table[0] remains uninitialized.  */
      82    {
      83      size_t i, j;
      84  
      85      /* i = 1: Nothing to verify for x = 0.  */
      86      table[1] = 1;
      87      j = 0;
      88  
      89      for (i = 2; i < m; i++)
      90        {
      91          /* Here: j = i-1 - table[i-1].
      92             The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold
      93             for x < table[i-1], by induction.
      94             Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1].  */
      95          mbchar_t *b = &needle_mbchars[i - 1];
      96  
      97          for (;;)
      98            {
      99              /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x]
     100                 is known to hold for x < i-1-j.
     101                 Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1].  */
     102              if (mb_equal (*b, needle_mbchars[j]))
     103                {
     104                  /* Set table[i] := i-1-j.  */
     105                  table[i] = i - ++j;
     106                  break;
     107                }
     108              /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
     109                 for x = i-1-j, because
     110                   needle[i-1] != needle[j] = needle[i-1-x].  */
     111              if (j == 0)
     112                {
     113                  /* The inequality holds for all possible x.  */
     114                  table[i] = i;
     115                  break;
     116                }
     117              /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
     118                 for i-1-j < x < i-1-j+table[j], because for these x:
     119                   needle[x..i-2]
     120                   = needle[x-(i-1-j)..j-1]
     121                   != needle[0..j-1-(x-(i-1-j))]  (by definition of table[j])
     122                      = needle[0..i-2-x],
     123                 hence needle[x..i-1] != needle[0..i-1-x].
     124                 Furthermore
     125                   needle[i-1-j+table[j]..i-2]
     126                   = needle[table[j]..j-1]
     127                   = needle[0..j-1-table[j]]  (by definition of table[j]).  */
     128              j = j - table[j];
     129            }
     130          /* Here: j = i - table[i].  */
     131        }
     132    }
     133  
     134    /* Search, using the table to accelerate the processing.  */
     135    {
     136      size_t j;
     137      mbui_iterator_t rhaystack;
     138      mbui_iterator_t phaystack;
     139  
     140      *resultp = NULL;
     141      j = 0;
     142      mbui_init (rhaystack, haystack);
     143      mbui_init (phaystack, haystack);
     144      /* Invariant: phaystack = rhaystack + j.  */
     145      while (mbui_avail (phaystack))
     146        if (mb_equal (needle_mbchars[j], mbui_cur (phaystack)))
     147          {
     148            j++;
     149            mbui_advance (phaystack);
     150            if (j == m)
     151              {
     152                /* The entire needle has been found.  */
     153                *resultp = mbui_cur_ptr (rhaystack);
     154                break;
     155              }
     156          }
     157        else if (j > 0)
     158          {
     159            /* Found a match of needle[0..j-1], mismatch at needle[j].  */
     160            size_t count = table[j];
     161            j -= count;
     162            for (; count > 0; count--)
     163              {
     164                if (!mbui_avail (rhaystack))
     165                  abort ();
     166                mbui_advance (rhaystack);
     167              }
     168          }
     169        else
     170          {
     171            /* Found a mismatch at needle[0] already.  */
     172            if (!mbui_avail (rhaystack))
     173              abort ();
     174            mbui_advance (rhaystack);
     175            mbui_advance (phaystack);
     176          }
     177    }
     178  
     179    freea (memory);
     180    return true;
     181  }
     182  
     183  /* Find the first occurrence of the character string NEEDLE in the character
     184     string HAYSTACK.  Return NULL if NEEDLE is not found in HAYSTACK.  */
     185  char *
     186  mbsstr (const char *haystack, const char *needle)
     187  {
     188    /* Be careful not to look at the entire extent of haystack or needle
     189       until needed.  This is useful because of these two cases:
     190         - haystack may be very long, and a match of needle found early,
     191         - needle may be very long, and not even a short initial segment of
     192           needle may be found in haystack.  */
     193    if (MB_CUR_MAX > 1)
     194      {
     195        mbui_iterator_t iter_needle;
     196  
     197        mbui_init (iter_needle, needle);
     198        if (mbui_avail (iter_needle))
     199          {
     200            /* Minimizing the worst-case complexity:
     201               Let n = mbslen(haystack), m = mbslen(needle).
     202               The naïve algorithm is O(n*m) worst-case.
     203               The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
     204               memory allocation.
     205               To achieve linear complexity and yet amortize the cost of the
     206               memory allocation, we activate the Knuth-Morris-Pratt algorithm
     207               only once the naïve algorithm has already run for some time; more
     208               precisely, when
     209                 - the outer loop count is >= 10,
     210                 - the average number of comparisons per outer loop is >= 5,
     211                 - the total number of comparisons is >= m.
     212               But we try it only once.  If the memory allocation attempt failed,
     213               we don't retry it.  */
     214            bool try_kmp = true;
     215            size_t outer_loop_count = 0;
     216            size_t comparison_count = 0;
     217            size_t last_ccount = 0;                  /* last comparison count */
     218            mbui_iterator_t iter_needle_last_ccount; /* = needle + last_ccount */
     219  
     220            mbui_iterator_t iter_haystack;
     221  
     222            mbui_init (iter_needle_last_ccount, needle);
     223            mbui_init (iter_haystack, haystack);
     224            for (;; mbui_advance (iter_haystack))
     225              {
     226                if (!mbui_avail (iter_haystack))
     227                  /* No match.  */
     228                  return NULL;
     229  
     230                /* See whether it's advisable to use an asymptotically faster
     231                   algorithm.  */
     232                if (try_kmp
     233                    && outer_loop_count >= 10
     234                    && comparison_count >= 5 * outer_loop_count)
     235                  {
     236                    /* See if needle + comparison_count now reaches the end of
     237                       needle.  */
     238                    size_t count = comparison_count - last_ccount;
     239                    for (;
     240                         count > 0 && mbui_avail (iter_needle_last_ccount);
     241                         count--)
     242                      mbui_advance (iter_needle_last_ccount);
     243                    last_ccount = comparison_count;
     244                    if (!mbui_avail (iter_needle_last_ccount))
     245                      {
     246                        /* Try the Knuth-Morris-Pratt algorithm.  */
     247                        const char *result;
     248                        bool success =
     249                          knuth_morris_pratt_multibyte (haystack, needle,
     250                                                        &result);
     251                        if (success)
     252                          return (char *) result;
     253                        try_kmp = false;
     254                      }
     255                  }
     256  
     257                outer_loop_count++;
     258                comparison_count++;
     259                if (mb_equal (mbui_cur (iter_haystack), mbui_cur (iter_needle)))
     260                  /* The first character matches.  */
     261                  {
     262                    mbui_iterator_t rhaystack;
     263                    mbui_iterator_t rneedle;
     264  
     265                    memcpy (&rhaystack, &iter_haystack, sizeof (mbui_iterator_t));
     266                    mbui_advance (rhaystack);
     267  
     268                    mbui_init (rneedle, needle);
     269                    if (!mbui_avail (rneedle))
     270                      abort ();
     271                    mbui_advance (rneedle);
     272  
     273                    for (;; mbui_advance (rhaystack), mbui_advance (rneedle))
     274                      {
     275                        if (!mbui_avail (rneedle))
     276                          /* Found a match.  */
     277                          return (char *) mbui_cur_ptr (iter_haystack);
     278                        if (!mbui_avail (rhaystack))
     279                          /* No match.  */
     280                          return NULL;
     281                        comparison_count++;
     282                        if (!mb_equal (mbui_cur (rhaystack), mbui_cur (rneedle)))
     283                          /* Nothing in this round.  */
     284                          break;
     285                      }
     286                  }
     287              }
     288          }
     289        else
     290          return (char *) haystack;
     291      }
     292    else
     293      {
     294        if (*needle != '\0')
     295          {
     296            /* Minimizing the worst-case complexity:
     297               Let n = strlen(haystack), m = strlen(needle).
     298               The naïve algorithm is O(n*m) worst-case.
     299               The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
     300               memory allocation.
     301               To achieve linear complexity and yet amortize the cost of the
     302               memory allocation, we activate the Knuth-Morris-Pratt algorithm
     303               only once the naïve algorithm has already run for some time; more
     304               precisely, when
     305                 - the outer loop count is >= 10,
     306                 - the average number of comparisons per outer loop is >= 5,
     307                 - the total number of comparisons is >= m.
     308               But we try it only once.  If the memory allocation attempt failed,
     309               we don't retry it.  */
     310            bool try_kmp = true;
     311            size_t outer_loop_count = 0;
     312            size_t comparison_count = 0;
     313            size_t last_ccount = 0;                  /* last comparison count */
     314            const char *needle_last_ccount = needle; /* = needle + last_ccount */
     315  
     316            /* Speed up the following searches of needle by caching its first
     317               character.  */
     318            char b = *needle++;
     319  
     320            for (;; haystack++)
     321              {
     322                if (*haystack == '\0')
     323                  /* No match.  */
     324                  return NULL;
     325  
     326                /* See whether it's advisable to use an asymptotically faster
     327                   algorithm.  */
     328                if (try_kmp
     329                    && outer_loop_count >= 10
     330                    && comparison_count >= 5 * outer_loop_count)
     331                  {
     332                    /* See if needle + comparison_count now reaches the end of
     333                       needle.  */
     334                    if (needle_last_ccount != NULL)
     335                      {
     336                        needle_last_ccount +=
     337                          strnlen (needle_last_ccount,
     338                                   comparison_count - last_ccount);
     339                        if (*needle_last_ccount == '\0')
     340                          needle_last_ccount = NULL;
     341                        last_ccount = comparison_count;
     342                      }
     343                    if (needle_last_ccount == NULL)
     344                      {
     345                        /* Try the Knuth-Morris-Pratt algorithm.  */
     346                        const unsigned char *result;
     347                        bool success =
     348                          knuth_morris_pratt ((const unsigned char *) haystack,
     349                                              (const unsigned char *) (needle - 1),
     350                                              strlen (needle - 1),
     351                                              &result);
     352                        if (success)
     353                          return (char *) result;
     354                        try_kmp = false;
     355                      }
     356                  }
     357  
     358                outer_loop_count++;
     359                comparison_count++;
     360                if (*haystack == b)
     361                  /* The first character matches.  */
     362                  {
     363                    const char *rhaystack = haystack + 1;
     364                    const char *rneedle = needle;
     365  
     366                    for (;; rhaystack++, rneedle++)
     367                      {
     368                        if (*rneedle == '\0')
     369                          /* Found a match.  */
     370                          return (char *) haystack;
     371                        if (*rhaystack == '\0')
     372                          /* No match.  */
     373                          return NULL;
     374                        comparison_count++;
     375                        if (*rhaystack != *rneedle)
     376                          /* Nothing in this round.  */
     377                          break;
     378                      }
     379                  }
     380              }
     381          }
     382        else
     383          return (char *) haystack;
     384      }
     385  }