(root)/
findutils-4.9.0/
gl/
lib/
mbscasestr.c
       1  /* Case-insensitive searching in a string.  -*- coding: utf-8 -*-
       2     Copyright (C) 2005-2022 Free Software Foundation, Inc.
       3     Written by Bruno Haible <bruno@clisp.org>, 2005.
       4  
       5     This file is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU Lesser General Public License as
       7     published by the Free Software Foundation, either version 3 of the
       8     License, or (at your option) any later version.
       9  
      10     This file is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU Lesser General Public License for more details.
      14  
      15     You should have received a copy of the GNU Lesser General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      17  
      18  #include <config.h>
      19  
      20  /* Specification.  */
      21  #include <string.h>
      22  
      23  #include <ctype.h>
      24  #include <stdbool.h>
      25  #include <stddef.h>  /* for NULL, in case a nonstandard string.h lacks it */
      26  #include <stdlib.h>
      27  
      28  #include "malloca.h"
      29  #include "mbuiter.h"
      30  
      31  #define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch))
      32  
      33  /* Knuth-Morris-Pratt algorithm.  */
      34  #define UNIT unsigned char
      35  #define CANON_ELEMENT(c) TOLOWER (c)
      36  #include "str-kmp.h"
      37  
      38  /* Knuth-Morris-Pratt algorithm.
      39     See https://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm
      40     Return a boolean indicating success:
      41     Return true and set *RESULTP if the search was completed.
      42     Return false if it was aborted because not enough memory was available.  */
      43  static bool
      44  knuth_morris_pratt_multibyte (const char *haystack, const char *needle,
      45                                const char **resultp)
      46  {
      47    size_t m = mbslen (needle);
      48    mbchar_t *needle_mbchars;
      49    size_t *table;
      50  
      51    /* Allocate room for needle_mbchars and the table.  */
      52    char *memory = (char *) nmalloca (m, sizeof (mbchar_t) + sizeof (size_t));
      53    if (memory == NULL)
      54      return false;
      55    needle_mbchars = (mbchar_t *) memory;
      56    table = (size_t *) (memory + m * sizeof (mbchar_t));
      57  
      58    /* Fill needle_mbchars.  */
      59    {
      60      mbui_iterator_t iter;
      61      size_t j;
      62  
      63      j = 0;
      64      for (mbui_init (iter, needle); mbui_avail (iter); mbui_advance (iter), j++)
      65        {
      66          mb_copy (&needle_mbchars[j], &mbui_cur (iter));
      67          if (needle_mbchars[j].wc_valid)
      68            needle_mbchars[j].wc = towlower (needle_mbchars[j].wc);
      69        }
      70    }
      71  
      72    /* Fill the table.
      73       For 0 < i < m:
      74         0 < table[i] <= i is defined such that
      75         forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x],
      76         and table[i] is as large as possible with this property.
      77       This implies:
      78       1) For 0 < i < m:
      79            If table[i] < i,
      80            needle[table[i]..i-1] = needle[0..i-1-table[i]].
      81       2) For 0 < i < m:
      82            rhaystack[0..i-1] == needle[0..i-1]
      83            and exists h, i <= h < m: rhaystack[h] != needle[h]
      84            implies
      85            forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1].
      86       table[0] remains uninitialized.  */
      87    {
      88      size_t i, j;
      89  
      90      /* i = 1: Nothing to verify for x = 0.  */
      91      table[1] = 1;
      92      j = 0;
      93  
      94      for (i = 2; i < m; i++)
      95        {
      96          /* Here: j = i-1 - table[i-1].
      97             The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold
      98             for x < table[i-1], by induction.
      99             Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1].  */
     100          mbchar_t *b = &needle_mbchars[i - 1];
     101  
     102          for (;;)
     103            {
     104              /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x]
     105                 is known to hold for x < i-1-j.
     106                 Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1].  */
     107              if (mb_equal (*b, needle_mbchars[j]))
     108                {
     109                  /* Set table[i] := i-1-j.  */
     110                  table[i] = i - ++j;
     111                  break;
     112                }
     113              /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
     114                 for x = i-1-j, because
     115                   needle[i-1] != needle[j] = needle[i-1-x].  */
     116              if (j == 0)
     117                {
     118                  /* The inequality holds for all possible x.  */
     119                  table[i] = i;
     120                  break;
     121                }
     122              /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
     123                 for i-1-j < x < i-1-j+table[j], because for these x:
     124                   needle[x..i-2]
     125                   = needle[x-(i-1-j)..j-1]
     126                   != needle[0..j-1-(x-(i-1-j))]  (by definition of table[j])
     127                      = needle[0..i-2-x],
     128                 hence needle[x..i-1] != needle[0..i-1-x].
     129                 Furthermore
     130                   needle[i-1-j+table[j]..i-2]
     131                   = needle[table[j]..j-1]
     132                   = needle[0..j-1-table[j]]  (by definition of table[j]).  */
     133              j = j - table[j];
     134            }
     135          /* Here: j = i - table[i].  */
     136        }
     137    }
     138  
     139    /* Search, using the table to accelerate the processing.  */
     140    {
     141      size_t j;
     142      mbui_iterator_t rhaystack;
     143      mbui_iterator_t phaystack;
     144  
     145      *resultp = NULL;
     146      j = 0;
     147      mbui_init (rhaystack, haystack);
     148      mbui_init (phaystack, haystack);
     149      /* Invariant: phaystack = rhaystack + j.  */
     150      while (mbui_avail (phaystack))
     151        {
     152          mbchar_t c;
     153  
     154          mb_copy (&c, &mbui_cur (phaystack));
     155          if (c.wc_valid)
     156            c.wc = towlower (c.wc);
     157          if (mb_equal (needle_mbchars[j], c))
     158            {
     159              j++;
     160              mbui_advance (phaystack);
     161              if (j == m)
     162                {
     163                  /* The entire needle has been found.  */
     164                  *resultp = mbui_cur_ptr (rhaystack);
     165                  break;
     166                }
     167            }
     168          else if (j > 0)
     169            {
     170              /* Found a match of needle[0..j-1], mismatch at needle[j].  */
     171              size_t count = table[j];
     172              j -= count;
     173              for (; count > 0; count--)
     174                {
     175                  if (!mbui_avail (rhaystack))
     176                    abort ();
     177                  mbui_advance (rhaystack);
     178                }
     179            }
     180          else
     181            {
     182              /* Found a mismatch at needle[0] already.  */
     183              if (!mbui_avail (rhaystack))
     184                abort ();
     185              mbui_advance (rhaystack);
     186              mbui_advance (phaystack);
     187            }
     188        }
     189    }
     190  
     191    freea (memory);
     192    return true;
     193  }
     194  
     195  /* Find the first occurrence of the character string NEEDLE in the character
     196     string HAYSTACK, using case-insensitive comparison.
     197     Note: This function may, in multibyte locales, return success even if
     198     strlen (haystack) < strlen (needle) !  */
     199  char *
     200  mbscasestr (const char *haystack, const char *needle)
     201  {
     202    /* Be careful not to look at the entire extent of haystack or needle
     203       until needed.  This is useful because of these two cases:
     204         - haystack may be very long, and a match of needle found early,
     205         - needle may be very long, and not even a short initial segment of
     206           needle may be found in haystack.  */
     207    if (MB_CUR_MAX > 1)
     208      {
     209        mbui_iterator_t iter_needle;
     210  
     211        mbui_init (iter_needle, needle);
     212        if (mbui_avail (iter_needle))
     213          {
     214            /* Minimizing the worst-case complexity:
     215               Let n = mbslen(haystack), m = mbslen(needle).
     216               The naïve algorithm is O(n*m) worst-case.
     217               The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
     218               memory allocation.
     219               To achieve linear complexity and yet amortize the cost of the
     220               memory allocation, we activate the Knuth-Morris-Pratt algorithm
     221               only once the naïve algorithm has already run for some time; more
     222               precisely, when
     223                 - the outer loop count is >= 10,
     224                 - the average number of comparisons per outer loop is >= 5,
     225                 - the total number of comparisons is >= m.
     226               But we try it only once.  If the memory allocation attempt failed,
     227               we don't retry it.  */
     228            bool try_kmp = true;
     229            size_t outer_loop_count = 0;
     230            size_t comparison_count = 0;
     231            size_t last_ccount = 0;                  /* last comparison count */
     232            mbui_iterator_t iter_needle_last_ccount; /* = needle + last_ccount */
     233  
     234            mbchar_t b;
     235            mbui_iterator_t iter_haystack;
     236  
     237            mbui_init (iter_needle_last_ccount, needle);
     238  
     239            mb_copy (&b, &mbui_cur (iter_needle));
     240            if (b.wc_valid)
     241              b.wc = towlower (b.wc);
     242  
     243            mbui_init (iter_haystack, haystack);
     244            for (;; mbui_advance (iter_haystack))
     245              {
     246                mbchar_t c;
     247  
     248                if (!mbui_avail (iter_haystack))
     249                  /* No match.  */
     250                  return NULL;
     251  
     252                /* See whether it's advisable to use an asymptotically faster
     253                   algorithm.  */
     254                if (try_kmp
     255                    && outer_loop_count >= 10
     256                    && comparison_count >= 5 * outer_loop_count)
     257                  {
     258                    /* See if needle + comparison_count now reaches the end of
     259                       needle.  */
     260                    size_t count = comparison_count - last_ccount;
     261                    for (;
     262                         count > 0 && mbui_avail (iter_needle_last_ccount);
     263                         count--)
     264                      mbui_advance (iter_needle_last_ccount);
     265                    last_ccount = comparison_count;
     266                    if (!mbui_avail (iter_needle_last_ccount))
     267                      {
     268                        /* Try the Knuth-Morris-Pratt algorithm.  */
     269                        const char *result;
     270                        bool success =
     271                          knuth_morris_pratt_multibyte (haystack, needle,
     272                                                        &result);
     273                        if (success)
     274                          return (char *) result;
     275                        try_kmp = false;
     276                      }
     277                  }
     278  
     279                outer_loop_count++;
     280                comparison_count++;
     281                mb_copy (&c, &mbui_cur (iter_haystack));
     282                if (c.wc_valid)
     283                  c.wc = towlower (c.wc);
     284                if (mb_equal (c, b))
     285                  /* The first character matches.  */
     286                  {
     287                    mbui_iterator_t rhaystack;
     288                    mbui_iterator_t rneedle;
     289  
     290                    memcpy (&rhaystack, &iter_haystack, sizeof (mbui_iterator_t));
     291                    mbui_advance (rhaystack);
     292  
     293                    mbui_init (rneedle, needle);
     294                    if (!mbui_avail (rneedle))
     295                      abort ();
     296                    mbui_advance (rneedle);
     297  
     298                    for (;; mbui_advance (rhaystack), mbui_advance (rneedle))
     299                      {
     300                        if (!mbui_avail (rneedle))
     301                          /* Found a match.  */
     302                          return (char *) mbui_cur_ptr (iter_haystack);
     303                        if (!mbui_avail (rhaystack))
     304                          /* No match.  */
     305                          return NULL;
     306                        comparison_count++;
     307                        if (!mb_caseequal (mbui_cur (rhaystack),
     308                                           mbui_cur (rneedle)))
     309                          /* Nothing in this round.  */
     310                          break;
     311                      }
     312                  }
     313              }
     314          }
     315        else
     316          return (char *) haystack;
     317      }
     318    else
     319      {
     320        if (*needle != '\0')
     321          {
     322            /* Minimizing the worst-case complexity:
     323               Let n = strlen(haystack), m = strlen(needle).
     324               The naïve algorithm is O(n*m) worst-case.
     325               The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
     326               memory allocation.
     327               To achieve linear complexity and yet amortize the cost of the
     328               memory allocation, we activate the Knuth-Morris-Pratt algorithm
     329               only once the naïve algorithm has already run for some time; more
     330               precisely, when
     331                 - the outer loop count is >= 10,
     332                 - the average number of comparisons per outer loop is >= 5,
     333                 - the total number of comparisons is >= m.
     334               But we try it only once.  If the memory allocation attempt failed,
     335               we don't retry it.  */
     336            bool try_kmp = true;
     337            size_t outer_loop_count = 0;
     338            size_t comparison_count = 0;
     339            size_t last_ccount = 0;                  /* last comparison count */
     340            const char *needle_last_ccount = needle; /* = needle + last_ccount */
     341  
     342            /* Speed up the following searches of needle by caching its first
     343               character.  */
     344            unsigned char b = TOLOWER ((unsigned char) *needle);
     345  
     346            needle++;
     347            for (;; haystack++)
     348              {
     349                if (*haystack == '\0')
     350                  /* No match.  */
     351                  return NULL;
     352  
     353                /* See whether it's advisable to use an asymptotically faster
     354                   algorithm.  */
     355                if (try_kmp
     356                    && outer_loop_count >= 10
     357                    && comparison_count >= 5 * outer_loop_count)
     358                  {
     359                    /* See if needle + comparison_count now reaches the end of
     360                       needle.  */
     361                    if (needle_last_ccount != NULL)
     362                      {
     363                        needle_last_ccount +=
     364                          strnlen (needle_last_ccount,
     365                                   comparison_count - last_ccount);
     366                        if (*needle_last_ccount == '\0')
     367                          needle_last_ccount = NULL;
     368                        last_ccount = comparison_count;
     369                      }
     370                    if (needle_last_ccount == NULL)
     371                      {
     372                        /* Try the Knuth-Morris-Pratt algorithm.  */
     373                        const unsigned char *result;
     374                        bool success =
     375                          knuth_morris_pratt ((const unsigned char *) haystack,
     376                                              (const unsigned char *) (needle - 1),
     377                                              strlen (needle - 1),
     378                                              &result);
     379                        if (success)
     380                          return (char *) result;
     381                        try_kmp = false;
     382                      }
     383                  }
     384  
     385                outer_loop_count++;
     386                comparison_count++;
     387                if (TOLOWER ((unsigned char) *haystack) == b)
     388                  /* The first character matches.  */
     389                  {
     390                    const char *rhaystack = haystack + 1;
     391                    const char *rneedle = needle;
     392  
     393                    for (;; rhaystack++, rneedle++)
     394                      {
     395                        if (*rneedle == '\0')
     396                          /* Found a match.  */
     397                          return (char *) haystack;
     398                        if (*rhaystack == '\0')
     399                          /* No match.  */
     400                          return NULL;
     401                        comparison_count++;
     402                        if (TOLOWER ((unsigned char) *rhaystack)
     403                            != TOLOWER ((unsigned char) *rneedle))
     404                          /* Nothing in this round.  */
     405                          break;
     406                      }
     407                  }
     408              }
     409          }
     410        else
     411          return (char *) haystack;
     412      }
     413  }