(root)/
grep-3.11/
src/
kwsearch.c
       1  /* kwsearch.c - searching subroutines using kwset for grep.
       2     Copyright 1992, 1998, 2000, 2007, 2009-2023 Free Software Foundation, Inc.
       3  
       4     This program is free software; you can redistribute it and/or modify
       5     it under the terms of the GNU General Public License as published by
       6     the Free Software Foundation; either version 3, or (at your option)
       7     any later version.
       8  
       9     This program is distributed in the hope that it will be useful,
      10     but WITHOUT ANY WARRANTY; without even the implied warranty of
      11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12     GNU General Public License for more details.
      13  
      14     You should have received a copy of the GNU General Public License
      15     along with this program; if not, write to the Free Software
      16     Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
      17     02110-1301, USA.  */
      18  
      19  /* Written August 1992 by Mike Haertel. */
      20  
      21  #include <config.h>
      22  #include "search.h"
      23  
      24  /* A compiled -F pattern list.  */
      25  
      26  struct kwsearch
      27  {
      28    /* The kwset for this pattern list.  */
      29    kwset_t kwset;
      30  
      31    /* The number of user-specified patterns.  This is less than
      32       'kwswords (kwset)' when some extra one-character words have been
      33       appended, one for each troublesome character that will require a
      34       DFA search.  */
      35    idx_t words;
      36  
      37    /* The user's pattern and its size in bytes.  */
      38    char *pattern;
      39    idx_t size;
      40  
      41    /* The user's pattern compiled as a regular expression,
      42       or null if it has not been compiled.  */
      43    void *re;
      44  };
      45  
      46  /* Compile the -F style PATTERN, containing SIZE bytes that are
      47     followed by '\n'.  Return a description of the compiled pattern.  */
      48  
      49  void *
      50  Fcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
      51  {
      52    kwset_t kwset;
      53    char *buf = NULL;
      54    idx_t bufalloc = 0;
      55  
      56    kwset = kwsinit (true);
      57  
      58    char const *p = pattern;
      59    do
      60      {
      61        char const *sep = rawmemchr (p, '\n');
      62        idx_t len = sep - p;
      63  
      64        if (match_lines)
      65          {
      66            if (eolbyte == '\n' && pattern < p)
      67              p--;
      68            else
      69              {
      70                if (bufalloc < len + 2)
      71                  {
      72                    free (buf);
      73                    bufalloc = len;
      74                    buf = xpalloc (NULL, &bufalloc, 2, -1, 1);
      75                    buf[0] = eolbyte;
      76                  }
      77                memcpy (buf + 1, p, len);
      78                buf[len + 1] = eolbyte;
      79                p = buf;
      80              }
      81            len += 2;
      82          }
      83        kwsincr (kwset, p, len);
      84  
      85        p = sep + 1;
      86      }
      87    while (p <= pattern + size);
      88  
      89    free (buf);
      90  
      91    idx_t words = kwswords (kwset);
      92    kwsprep (kwset);
      93  
      94    struct kwsearch *kwsearch = xmalloc (sizeof *kwsearch);
      95    kwsearch->kwset = kwset;
      96    kwsearch->words = words;
      97    kwsearch->pattern = pattern;
      98    kwsearch->size = size;
      99    kwsearch->re = NULL;
     100    return kwsearch;
     101  }
     102  
     103  /* Use the compiled pattern VCP to search the buffer BUF of size SIZE.
     104     If found, return the offset of the first match and store its
     105     size into *MATCH_SIZE.  If not found, return -1.
     106     If START_PTR is nonnull, start searching there.  */
     107  ptrdiff_t
     108  Fexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
     109            char const *start_ptr)
     110  {
     111    char const *beg, *end, *mb_start;
     112    idx_t len;
     113    char eol = eolbyte;
     114    struct kwsearch *kwsearch = vcp;
     115    kwset_t kwset = kwsearch->kwset;
     116    bool mb_check = localeinfo.multibyte & !localeinfo.using_utf8 & !match_lines;
     117    bool longest = (mb_check | !!start_ptr | match_words) & !match_lines;
     118  
     119    for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
     120      {
     121        struct kwsmatch kwsmatch;
     122        ptrdiff_t offset = kwsexec (kwset, beg - match_lines,
     123                                    buf + size - beg + match_lines, &kwsmatch,
     124                                    longest);
     125        if (offset < 0)
     126          break;
     127        len = kwsmatch.size - 2 * match_lines;
     128  
     129        idx_t mbclen = 0;
     130        if (mb_check
     131            && mb_goback (&mb_start, &mbclen, beg + offset, buf + size) != 0)
     132          {
     133            /* We have matched a single byte that is not at the beginning of a
     134               multibyte character.  mb_goback has advanced MB_START past that
     135               multibyte character.  Now, we want to position BEG so that the
     136               next kwsexec search starts there.  Thus, to compensate for the
     137               for-loop's BEG++, above, subtract one here.  This code is
     138               unusually hard to reach, and exceptionally, let's show how to
     139               trigger it here:
     140  
     141                 printf '\203AA\n'|LC_ALL=ja_JP.SHIFT_JIS src/grep -F A
     142  
     143               That assumes the named locale is installed.
     144               Note that your system's shift-JIS locale may have a different
     145               name, possibly including "sjis".  */
     146            beg = mb_start - 1;
     147            continue;
     148          }
     149        beg += offset;
     150        if (!!start_ptr & !match_words)
     151          goto success_in_beg_and_len;
     152        if (match_lines)
     153          {
     154            len += start_ptr == NULL;
     155            goto success_in_beg_and_len;
     156          }
     157        if (! match_words)
     158          goto success;
     159  
     160        /* We need a preceding mb_start pointer.  Use the beginning of line
     161           if there is a preceding newline.  */
     162        if (mbclen == 0)
     163          {
     164            char const *nl = memrchr (mb_start, eol, beg - mb_start);
     165            if (nl)
     166              mb_start = nl + 1;
     167          }
     168  
     169        /* Succeed if neither the preceding nor the following character is a
     170           word constituent.  If the preceding is not, yet the following
     171           character IS a word constituent, keep trying with shorter matches.  */
     172        if (mbclen > 0
     173            ? ! wordchar_next (beg - mbclen, buf + size)
     174            : ! wordchar_prev (mb_start, beg, buf + size))
     175          for (;;)
     176            {
     177              if (! wordchar_next (beg + len, buf + size))
     178                {
     179                  if (start_ptr)
     180                    goto success_in_beg_and_len;
     181                  else
     182                    goto success;
     183                }
     184              if (!start_ptr && !localeinfo.multibyte)
     185                {
     186                  if (! kwsearch->re)
     187                    {
     188                      fgrep_to_grep_pattern (&kwsearch->pattern, &kwsearch->size);
     189                      kwsearch->re = GEAcompile (kwsearch->pattern,
     190                                                 kwsearch->size,
     191                                                 RE_SYNTAX_GREP, !!start_ptr);
     192                    }
     193                  if (beg + len < buf + size)
     194                    {
     195                      end = rawmemchr (beg + len, eol);
     196                      end++;
     197                    }
     198                  else
     199                    end = buf + size;
     200  
     201                  if (0 <= EGexecute (kwsearch->re, beg, end - beg,
     202                                      match_size, NULL))
     203                    goto success_match_words;
     204                  beg = end - 1;
     205                  break;
     206                }
     207              if (!len)
     208                break;
     209  
     210              struct kwsmatch shorter_match;
     211              if (kwsexec (kwset, beg, --len, &shorter_match, true) != 0)
     212                break;
     213              len = shorter_match.size;
     214            }
     215  
     216        /* No word match was found at BEG.  Skip past word constituents,
     217           since they cannot precede the next match and not skipping
     218           them could make things much slower.  */
     219        beg += wordchars_size (beg, buf + size);
     220        mb_start = beg;
     221      }
     222  
     223    return -1;
     224  
     225   success:
     226    if (beg + len < buf + size)
     227      {
     228        end = rawmemchr (beg + len, eol);
     229        end++;
     230      }
     231    else
     232      end = buf + size;
     233   success_match_words:
     234    beg = memrchr (buf, eol, beg - buf);
     235    beg = beg ? beg + 1 : buf;
     236    len = end - beg;
     237   success_in_beg_and_len:;
     238    *match_size = len;
     239    return beg - buf;
     240  }