(root)/
gettext-0.22.4/
gettext-tools/
gnulib-lib/
mbsstr.c
       1  /* Searching in a string.  -*- coding: utf-8 -*-
       2     Copyright (C) 2005-2023 Free Software Foundation, Inc.
       3     Written by Bruno Haible <bruno@clisp.org>, 2005.
       4  
       5     This file is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU Lesser General Public License as
       7     published by the Free Software Foundation, either version 3 of the
       8     License, or (at your option) any later version.
       9  
      10     This file is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU Lesser General Public License for more details.
      14  
      15     You should have received a copy of the GNU Lesser General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      17  
      18  #include <config.h>
      19  
      20  /* Specification.  */
      21  #include <string.h>
      22  
      23  #include <stddef.h>  /* for NULL, in case a nonstandard string.h lacks it */
      24  #include <stdlib.h>
      25  
      26  #include "malloca.h"
      27  
      28  #if GNULIB_MCEL_PREFER
      29  # include "mcel.h"
      30  typedef mcel_t mbchar_t;
      31  static bool mb_equal (mcel_t a, mcel_t b) { return mcel_cmp (a, b) == 0; }
      32  #else
      33  # include "mbuiter.h"
      34  #endif
      35  
      36  /* Knuth-Morris-Pratt algorithm.  */
      37  #define UNIT unsigned char
      38  #define CANON_ELEMENT(c) c
      39  #include "str-kmp.h"
      40  
      41  /* Knuth-Morris-Pratt algorithm.
      42     See https://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm
      43     Return a boolean indicating success:
      44     Return true and set *RESULTP if the search was completed.
      45     Return false if it was aborted because not enough memory was available.  */
      46  static bool
      47  knuth_morris_pratt_multibyte (const char *haystack, const char *needle,
      48                                const char **resultp)
      49  {
      50    size_t m = mbslen (needle);
      51    mbchar_t *needle_mbchars;
      52    size_t extra_align = (alignof (mbchar_t) < alignof (size_t)
      53                          ? alignof (size_t) - alignof (mbchar_t)
      54                          : 0);
      55  
      56    /* Allocate room for needle_mbchars and the table.  */
      57    void *memory = nmalloca (m + !!extra_align,
      58                             sizeof (mbchar_t) + sizeof (size_t));
      59    void *table_memory;
      60    if (memory == NULL)
      61      return false;
      62    needle_mbchars = memory;
      63    table_memory = needle_mbchars + m;
      64    char *aligned = table_memory;
      65    aligned += extra_align;
      66    aligned -= (uintptr_t) aligned % alignof (size_t);
      67    size_t *table = table_memory = aligned;
      68  
      69    /* Fill needle_mbchars.  */
      70  #if GNULIB_MCEL_PREFER
      71    for (size_t j = 0; *needle; needle += needle_mbchars[j++].len)
      72      needle_mbchars[j] = mcel_scanz (needle);
      73  #else
      74    {
      75      mbui_iterator_t iter;
      76      size_t j;
      77  
      78      j = 0;
      79      for (mbui_init (iter, needle); mbui_avail (iter); mbui_advance (iter), j++)
      80        mb_copy (&needle_mbchars[j], &mbui_cur (iter));
      81    }
      82  #endif
      83  
      84    /* Fill the table.
      85       For 0 < i < m:
      86         0 < table[i] <= i is defined such that
      87         forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x],
      88         and table[i] is as large as possible with this property.
      89       This implies:
      90       1) For 0 < i < m:
      91            If table[i] < i,
      92            needle[table[i]..i-1] = needle[0..i-1-table[i]].
      93       2) For 0 < i < m:
      94            rhaystack[0..i-1] == needle[0..i-1]
      95            and exists h, i <= h < m: rhaystack[h] != needle[h]
      96            implies
      97            forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1].
      98       table[0] remains uninitialized.  */
      99    {
     100      size_t i, j;
     101  
     102      /* i = 1: Nothing to verify for x = 0.  */
     103      table[1] = 1;
     104      j = 0;
     105  
     106      for (i = 2; i < m; i++)
     107        {
     108          /* Here: j = i-1 - table[i-1].
     109             The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold
     110             for x < table[i-1], by induction.
     111             Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1].  */
     112          mbchar_t *b = &needle_mbchars[i - 1];
     113  
     114          for (;;)
     115            {
     116              /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x]
     117                 is known to hold for x < i-1-j.
     118                 Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1].  */
     119              if (mb_equal (*b, needle_mbchars[j]))
     120                {
     121                  /* Set table[i] := i-1-j.  */
     122                  table[i] = i - ++j;
     123                  break;
     124                }
     125              /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
     126                 for x = i-1-j, because
     127                   needle[i-1] != needle[j] = needle[i-1-x].  */
     128              if (j == 0)
     129                {
     130                  /* The inequality holds for all possible x.  */
     131                  table[i] = i;
     132                  break;
     133                }
     134              /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
     135                 for i-1-j < x < i-1-j+table[j], because for these x:
     136                   needle[x..i-2]
     137                   = needle[x-(i-1-j)..j-1]
     138                   != needle[0..j-1-(x-(i-1-j))]  (by definition of table[j])
     139                      = needle[0..i-2-x],
     140                 hence needle[x..i-1] != needle[0..i-1-x].
     141                 Furthermore
     142                   needle[i-1-j+table[j]..i-2]
     143                   = needle[table[j]..j-1]
     144                   = needle[0..j-1-table[j]]  (by definition of table[j]).  */
     145              j = j - table[j];
     146            }
     147          /* Here: j = i - table[i].  */
     148        }
     149    }
     150  
     151    /* Search, using the table to accelerate the processing.  */
     152    {
     153  #if GNULIB_MCEL_PREFER
     154      size_t j;
     155      char const *rhaystack = haystack;
     156      char const *phaystack = haystack;
     157  
     158      j = 0;
     159      /* Invariant: phaystack = rhaystack + j.  */
     160      for (;;)
     161        {
     162          if (!*phaystack)
     163            {
     164              rhaystack = NULL;
     165              break;
     166            }
     167          mcel_t g = mcel_scanz (phaystack);
     168          if (mcel_cmp (needle_mbchars[j], g) == 0)
     169            {
     170              j++;
     171              /* Exit loop successfully if the entire needle has been found.  */
     172              if (j == m)
     173                break;
     174              phaystack += g.len;
     175            }
     176          else if (j == 0)
     177            {
     178              /* Found a mismatch at needle[0] already.  */
     179              rhaystack += mcel_scanz (rhaystack).len;
     180              phaystack += g.len;
     181            }
     182          else
     183            {
     184              /* Found a match of needle[0..j-1], mismatch at needle[j].  */
     185              size_t count = table[j];
     186              j -= count;
     187              for (; count != 0; count--)
     188                rhaystack += mcel_scanz (rhaystack).len;
     189            }
     190        }
     191      *resultp = rhaystack;
     192  #else
     193      size_t j;
     194      mbui_iterator_t rhaystack;
     195      mbui_iterator_t phaystack;
     196  
     197      *resultp = NULL;
     198      j = 0;
     199      mbui_init (rhaystack, haystack);
     200      mbui_init (phaystack, haystack);
     201      /* Invariant: phaystack = rhaystack + j.  */
     202      while (mbui_avail (phaystack))
     203        if (mb_equal (needle_mbchars[j], mbui_cur (phaystack)))
     204          {
     205            j++;
     206            mbui_advance (phaystack);
     207            if (j == m)
     208              {
     209                /* The entire needle has been found.  */
     210                *resultp = mbui_cur_ptr (rhaystack);
     211                break;
     212              }
     213          }
     214        else if (j > 0)
     215          {
     216            /* Found a match of needle[0..j-1], mismatch at needle[j].  */
     217            size_t count = table[j];
     218            j -= count;
     219            for (; count > 0; count--)
     220              {
     221                if (!mbui_avail (rhaystack))
     222                  abort ();
     223                mbui_advance (rhaystack);
     224              }
     225          }
     226        else
     227          {
     228            /* Found a mismatch at needle[0] already.  */
     229            if (!mbui_avail (rhaystack))
     230              abort ();
     231            mbui_advance (rhaystack);
     232            mbui_advance (phaystack);
     233          }
     234  #endif
     235    }
     236  
     237    freea (memory);
     238    return true;
     239  }
     240  
     241  /* Find the first occurrence of the character string NEEDLE in the character
     242     string HAYSTACK.  Return NULL if NEEDLE is not found in HAYSTACK.  */
     243  char *
     244  mbsstr (const char *haystack, const char *needle)
     245  {
     246    /* Be careful not to look at the entire extent of haystack or needle
     247       until needed.  This is useful because of these two cases:
     248         - haystack may be very long, and a match of needle found early,
     249         - needle may be very long, and not even a short initial segment of
     250           needle may be found in haystack.  */
     251    if (MB_CUR_MAX > 1)
     252      {
     253  #if GNULIB_MCEL_PREFER
     254        if (!*needle)
     255          return (char *) haystack;
     256  
     257        mcel_t ng = mcel_scanz (needle);
     258  
     259        /* Minimizing the worst-case complexity:
     260           Let n = mbslen(haystack), m = mbslen(needle).
     261           The naïve algorithm is O(n*m) worst-case.
     262           The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
     263           memory allocation.
     264           To achieve linear complexity and yet amortize the cost of the
     265           memory allocation, we activate the Knuth-Morris-Pratt algorithm
     266           only once the naïve algorithm has already run for some time; more
     267           precisely, when
     268             - the outer loop count is >= 10,
     269             - the average number of comparisons per outer loop is >= 5,
     270             - the total number of comparisons is >= m.
     271           But we try it only once.  If the memory allocation attempt failed,
     272           we don't retry it.  */
     273        bool try_kmp = true;
     274        size_t outer_loop_count = 0;
     275        size_t comparison_count = 0;
     276  
     277        /* Last comparison count, and needle + last_ccount.  */
     278        size_t last_ccount = 0;
     279        char const *iter_needle_last_ccount = needle;
     280  
     281        char const *iter_haystack = haystack;
     282  
     283        for (mcel_t hg; *iter_haystack; iter_haystack += hg.len)
     284          {
     285            /* See whether it's advisable to use an asymptotically faster
     286               algorithm.  */
     287            if (try_kmp
     288                && outer_loop_count >= 10
     289                && comparison_count >= 5 * outer_loop_count)
     290              {
     291                /* See if needle + comparison_count now reaches the end of
     292                   needle.  */
     293                size_t count = comparison_count - last_ccount;
     294                for (;
     295                     count > 0 && *iter_needle_last_ccount;
     296                     count--)
     297                  iter_needle_last_ccount
     298                    += mcel_scanz (iter_needle_last_ccount).len;
     299                last_ccount = comparison_count;
     300                if (!*iter_needle_last_ccount)
     301                  {
     302                    char const *result;
     303                    if (knuth_morris_pratt_multibyte (haystack, needle,
     304                                                      &result))
     305                      return (char *) result;
     306                    try_kmp = false;
     307                  }
     308              }
     309  
     310            outer_loop_count++;
     311            comparison_count++;
     312            hg = mcel_scanz (iter_haystack);
     313            if (mcel_cmp (hg, ng) == 0)
     314              /* The first character matches.  */
     315              {
     316                char const *rhaystack = iter_haystack + hg.len;
     317                char const *rneedle = needle + ng.len;
     318                mcel_t rhg, rng;
     319                do
     320                  {
     321                    if (!*rneedle)
     322                      return (char *) iter_haystack;
     323                    if (!*rhaystack)
     324                      return NULL;
     325                    rhg = mcel_scanz (rhaystack); rhaystack += rhg.len;
     326                    rng = mcel_scanz (rneedle); rneedle += rng.len;
     327                    comparison_count++;
     328                  }
     329                while (mcel_cmp (rhg, rng) == 0);
     330              }
     331          }
     332  
     333        return NULL;
     334  #else
     335        mbui_iterator_t iter_needle;
     336  
     337        mbui_init (iter_needle, needle);
     338        if (mbui_avail (iter_needle))
     339          {
     340            /* Minimizing the worst-case complexity:
     341               Let n = mbslen(haystack), m = mbslen(needle).
     342               The naïve algorithm is O(n*m) worst-case.
     343               The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
     344               memory allocation.
     345               To achieve linear complexity and yet amortize the cost of the
     346               memory allocation, we activate the Knuth-Morris-Pratt algorithm
     347               only once the naïve algorithm has already run for some time; more
     348               precisely, when
     349                 - the outer loop count is >= 10,
     350                 - the average number of comparisons per outer loop is >= 5,
     351                 - the total number of comparisons is >= m.
     352               But we try it only once.  If the memory allocation attempt failed,
     353               we don't retry it.  */
     354            bool try_kmp = true;
     355            size_t outer_loop_count = 0;
     356            size_t comparison_count = 0;
     357            size_t last_ccount = 0;                  /* last comparison count */
     358            mbui_iterator_t iter_needle_last_ccount; /* = needle + last_ccount */
     359  
     360            mbui_iterator_t iter_haystack;
     361  
     362            mbui_init (iter_needle_last_ccount, needle);
     363            mbui_init (iter_haystack, haystack);
     364            for (;; mbui_advance (iter_haystack))
     365              {
     366                if (!mbui_avail (iter_haystack))
     367                  /* No match.  */
     368                  return NULL;
     369  
     370                /* See whether it's advisable to use an asymptotically faster
     371                   algorithm.  */
     372                if (try_kmp
     373                    && outer_loop_count >= 10
     374                    && comparison_count >= 5 * outer_loop_count)
     375                  {
     376                    /* See if needle + comparison_count now reaches the end of
     377                       needle.  */
     378                    size_t count = comparison_count - last_ccount;
     379                    for (;
     380                         count > 0 && mbui_avail (iter_needle_last_ccount);
     381                         count--)
     382                      mbui_advance (iter_needle_last_ccount);
     383                    last_ccount = comparison_count;
     384                    if (!mbui_avail (iter_needle_last_ccount))
     385                      {
     386                        /* Try the Knuth-Morris-Pratt algorithm.  */
     387                        const char *result;
     388                        bool success =
     389                          knuth_morris_pratt_multibyte (haystack, needle,
     390                                                        &result);
     391                        if (success)
     392                          return (char *) result;
     393                        try_kmp = false;
     394                      }
     395                  }
     396  
     397                outer_loop_count++;
     398                comparison_count++;
     399                if (mb_equal (mbui_cur (iter_haystack), mbui_cur (iter_needle)))
     400                  /* The first character matches.  */
     401                  {
     402                    mbui_iterator_t rhaystack;
     403                    mbui_iterator_t rneedle;
     404  
     405                    memcpy (&rhaystack, &iter_haystack, sizeof (mbui_iterator_t));
     406                    mbui_advance (rhaystack);
     407  
     408                    mbui_init (rneedle, needle);
     409                    if (!mbui_avail (rneedle))
     410                      abort ();
     411                    mbui_advance (rneedle);
     412  
     413                    for (;; mbui_advance (rhaystack), mbui_advance (rneedle))
     414                      {
     415                        if (!mbui_avail (rneedle))
     416                          /* Found a match.  */
     417                          return (char *) mbui_cur_ptr (iter_haystack);
     418                        if (!mbui_avail (rhaystack))
     419                          /* No match.  */
     420                          return NULL;
     421                        comparison_count++;
     422                        if (!mb_equal (mbui_cur (rhaystack), mbui_cur (rneedle)))
     423                          /* Nothing in this round.  */
     424                          break;
     425                      }
     426                  }
     427              }
     428          }
     429        else
     430          return (char *) haystack;
     431  #endif
     432      }
     433    else
     434      {
     435        if (*needle != '\0')
     436          {
     437            /* Minimizing the worst-case complexity:
     438               Let n = strlen(haystack), m = strlen(needle).
     439               The naïve algorithm is O(n*m) worst-case.
     440               The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
     441               memory allocation.
     442               To achieve linear complexity and yet amortize the cost of the
     443               memory allocation, we activate the Knuth-Morris-Pratt algorithm
     444               only once the naïve algorithm has already run for some time; more
     445               precisely, when
     446                 - the outer loop count is >= 10,
     447                 - the average number of comparisons per outer loop is >= 5,
     448                 - the total number of comparisons is >= m.
     449               But we try it only once.  If the memory allocation attempt failed,
     450               we don't retry it.  */
     451            bool try_kmp = true;
     452            size_t outer_loop_count = 0;
     453            size_t comparison_count = 0;
     454            size_t last_ccount = 0;                  /* last comparison count */
     455            const char *needle_last_ccount = needle; /* = needle + last_ccount */
     456  
     457            /* Speed up the following searches of needle by caching its first
     458               character.  */
     459            char b = *needle++;
     460  
     461            for (;; haystack++)
     462              {
     463                if (*haystack == '\0')
     464                  /* No match.  */
     465                  return NULL;
     466  
     467                /* See whether it's advisable to use an asymptotically faster
     468                   algorithm.  */
     469                if (try_kmp
     470                    && outer_loop_count >= 10
     471                    && comparison_count >= 5 * outer_loop_count)
     472                  {
     473                    /* See if needle + comparison_count now reaches the end of
     474                       needle.  */
     475                    if (needle_last_ccount != NULL)
     476                      {
     477                        needle_last_ccount +=
     478                          strnlen (needle_last_ccount,
     479                                   comparison_count - last_ccount);
     480                        if (*needle_last_ccount == '\0')
     481                          needle_last_ccount = NULL;
     482                        last_ccount = comparison_count;
     483                      }
     484                    if (needle_last_ccount == NULL)
     485                      {
     486                        /* Try the Knuth-Morris-Pratt algorithm.  */
     487                        const unsigned char *result;
     488                        bool success =
     489                          knuth_morris_pratt ((const unsigned char *) haystack,
     490                                              (const unsigned char *) (needle - 1),
     491                                              strlen (needle - 1),
     492                                              &result);
     493                        if (success)
     494                          return (char *) result;
     495                        try_kmp = false;
     496                      }
     497                  }
     498  
     499                outer_loop_count++;
     500                comparison_count++;
     501                if (*haystack == b)
     502                  /* The first character matches.  */
     503                  {
     504                    const char *rhaystack = haystack + 1;
     505                    const char *rneedle = needle;
     506  
     507                    for (;; rhaystack++, rneedle++)
     508                      {
     509                        if (*rneedle == '\0')
     510                          /* Found a match.  */
     511                          return (char *) haystack;
     512                        if (*rhaystack == '\0')
     513                          /* No match.  */
     514                          return NULL;
     515                        comparison_count++;
     516                        if (*rhaystack != *rneedle)
     517                          /* Nothing in this round.  */
     518                          break;
     519                      }
     520                  }
     521              }
     522          }
     523        else
     524          return (char *) haystack;
     525      }
     526  }