(root)/
grep-3.11/
src/
pcresearch.c
       1  /* pcresearch.c - searching subroutines using PCRE for grep.
       2     Copyright 2000, 2007, 2009-2023 Free Software Foundation, Inc.
       3  
       4     This program is free software; you can redistribute it and/or modify
       5     it under the terms of the GNU General Public License as published by
       6     the Free Software Foundation; either version 3, or (at your option)
       7     any later version.
       8  
       9     This program is distributed in the hope that it will be useful,
      10     but WITHOUT ANY WARRANTY; without even the implied warranty of
      11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12     GNU General Public License for more details.
      13  
      14     You should have received a copy of the GNU General Public License
      15     along with this program; if not, write to the Free Software
      16     Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
      17     02110-1301, USA.  */
      18  
      19  #include <config.h>
      20  
      21  #include "search.h"
      22  #include "die.h"
      23  
      24  #include <stdckdint.h>
      25  
      26  #define PCRE2_CODE_UNIT_WIDTH 8
      27  #include <pcre2.h>
      28  
      29  /* For older PCRE2.  */
      30  #ifndef PCRE2_SIZE_MAX
      31  # define PCRE2_SIZE_MAX SIZE_MAX
      32  #endif
      33  #ifndef PCRE2_CONFIG_DEPTHLIMIT
      34  # define PCRE2_CONFIG_DEPTHLIMIT PCRE2_CONFIG_RECURSIONLIMIT
      35  # define PCRE2_ERROR_DEPTHLIMIT PCRE2_ERROR_RECURSIONLIMIT
      36  # define pcre2_set_depth_limit pcre2_set_recursion_limit
      37  #endif
      38  #ifndef PCRE2_EXTRA_ASCII_BSD
      39  # define PCRE2_EXTRA_ASCII_BSD 0
      40  #endif
      41  
      42  /* Use PCRE2_MATCH_INVALID_UTF if supported and not buggy;
      43     see <https://github.com/PCRE2Project/pcre2/issues/224>.
      44     Assume the bug will be fixed after PCRE2 10.42.  */
      45  #if defined PCRE2_MATCH_INVALID_UTF && 10 < PCRE2_MAJOR + (42 < PCRE2_MINOR)
      46  enum { MATCH_INVALID_UTF = PCRE2_MATCH_INVALID_UTF };
      47  #else
      48  enum { MATCH_INVALID_UTF = 0 };
      49  #endif
      50  
      51  struct pcre_comp
      52  {
      53    /* General context for PCRE operations.  */
      54    pcre2_general_context *gcontext;
      55  
      56    /* Compiled internal form of a Perl regular expression.  */
      57    pcre2_code *cre;
      58  
      59    /* Match context and data block.  */
      60    pcre2_match_context *mcontext;
      61    pcre2_match_data *data;
      62  
      63    /* The JIT stack and its maximum size.  */
      64    pcre2_jit_stack *jit_stack;
      65    idx_t jit_stack_size;
      66  
      67    /* Table, indexed by ! (flag & PCRE2_NOTBOL), of whether the empty
      68       string matches when that flag is used.  */
      69    int empty_match[2];
      70  };
      71  
      72  /* Memory allocation functions for PCRE.  */
      73  static void *
      74  private_malloc (PCRE2_SIZE size, _GL_UNUSED void *unused)
      75  {
      76    if (IDX_MAX < size)
      77      xalloc_die ();
      78    return ximalloc (size);
      79  }
      80  static void
      81  private_free (void *ptr, _GL_UNUSED void *unused)
      82  {
      83    free (ptr);
      84  }
      85  
      86  void
      87  Pprint_version (void)
      88  {
      89    char buf[128];
      90    if (sizeof buf <= pcre2_config (PCRE2_CONFIG_VERSION, buf))
      91      abort ();
      92    printf (_("\ngrep -P uses PCRE2 %s\n"), buf);
      93  }
      94  
      95  /* Match the already-compiled PCRE pattern against the data in SUBJECT,
      96     of size SEARCH_BYTES and starting with offset SEARCH_OFFSET, with
      97     options OPTIONS.
      98     Return the (nonnegative) match count or a (negative) error number.  */
      99  static int
     100  jit_exec (struct pcre_comp *pc, char const *subject, idx_t search_bytes,
     101            idx_t search_offset, int options)
     102  {
     103    while (true)
     104      {
     105        /* STACK_GROWTH_RATE is taken from PCRE's src/pcre2_jit_compile.c.
     106           Going over the jitstack_max limit could trigger an int
     107           overflow bug.  */
     108        int STACK_GROWTH_RATE = 8192;
     109        idx_t jitstack_max = MIN (IDX_MAX, SIZE_MAX - (STACK_GROWTH_RATE - 1));
     110  
     111        int e = pcre2_match (pc->cre, (PCRE2_SPTR) subject, search_bytes,
     112                             search_offset, options, pc->data, pc->mcontext);
     113        if (e == PCRE2_ERROR_JIT_STACKLIMIT
     114            && pc->jit_stack_size <= jitstack_max / 2)
     115          {
     116            idx_t old_size = pc->jit_stack_size;
     117            idx_t new_size = pc->jit_stack_size = old_size * 2;
     118            pcre2_jit_stack_free (pc->jit_stack);
     119            pc->jit_stack = pcre2_jit_stack_create (old_size, new_size,
     120                                                    pc->gcontext);
     121            if (!pc->jit_stack)
     122              xalloc_die ();
     123            if (!pc->mcontext)
     124              pc->mcontext = pcre2_match_context_create (pc->gcontext);
     125            pcre2_jit_stack_assign (pc->mcontext, NULL, pc->jit_stack);
     126          }
     127        else if (e == PCRE2_ERROR_DEPTHLIMIT)
     128          {
     129            uint32_t lim;
     130            pcre2_config (PCRE2_CONFIG_DEPTHLIMIT, &lim);
     131            if (ckd_mul (&lim, lim, 2))
     132              return e;
     133            if (!pc->mcontext)
     134              pc->mcontext = pcre2_match_context_create (pc->gcontext);
     135            pcre2_set_depth_limit (pc->mcontext, lim);
     136          }
     137        else
     138          return e;
     139      }
     140  }
     141  
     142  /* Return true if E is an error code for bad UTF-8.  */
     143  static bool
     144  bad_utf8_from_pcre2 (int e)
     145  {
     146    return PCRE2_ERROR_UTF8_ERR21 <= e && e <= PCRE2_ERROR_UTF8_ERR1;
     147  }
     148  
     149  /* Compile the -P style PATTERN, containing SIZE bytes that are
     150     followed by '\n'.  Return a description of the compiled pattern.  */
     151  
     152  void *
     153  Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
     154  {
     155    PCRE2_SIZE e;
     156    int ec;
     157    int flags = PCRE2_DOLLAR_ENDONLY | (match_icase ? PCRE2_CASELESS : 0);
     158    char *patlim = pattern + size;
     159    struct pcre_comp *pc = ximalloc (sizeof *pc);
     160    pcre2_general_context *gcontext = pc->gcontext
     161      = pcre2_general_context_create (private_malloc, private_free, NULL);
     162    pcre2_compile_context *ccontext = pcre2_compile_context_create (gcontext);
     163  
     164    if (localeinfo.multibyte)
     165      {
     166        uint32_t unicode;
     167        if (pcre2_config (PCRE2_CONFIG_UNICODE, &unicode) < 0 || !unicode)
     168          die (EXIT_TROUBLE, 0,
     169               _("-P supports only unibyte locales on this platform"));
     170        if (! localeinfo.using_utf8)
     171          die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
     172  
     173        flags |= PCRE2_UTF;
     174  
     175        /* If supported, consider invalid UTF-8 as a barrier not an error.  */
     176        flags |= MATCH_INVALID_UTF;
     177  
     178        /* If PCRE2_EXTRA_ASCII_BSD is available, use PCRE2_UCP
     179           so that \d does not have the undesirable effect of matching
     180           non-ASCII digits.  Otherwise (i.e., with PCRE2 10.42 and earlier),
     181           escapes like \w have only their ASCII interpretations,
     182           but that's better than the confusion that would ensue if \d
     183           matched non-ASCII digits.  */
     184        flags |= PCRE2_EXTRA_ASCII_BSD ? PCRE2_UCP : 0;
     185  
     186  #if 0
     187        /* Do not match individual code units but only UTF-8.  */
     188        flags |= PCRE2_NEVER_BACKSLASH_C;
     189  #endif
     190      }
     191  
     192    /* FIXME: Remove this restriction.  */
     193    if (rawmemchr (pattern, '\n') != patlim)
     194      die (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
     195  
     196  #ifdef PCRE2_EXTRA_MATCH_LINE
     197    uint32_t extra_options = (PCRE2_EXTRA_ASCII_BSD
     198                              | (match_lines ? PCRE2_EXTRA_MATCH_LINE : 0));
     199    pcre2_set_compile_extra_options (ccontext, extra_options);
     200  #endif
     201  
     202    void *re_storage = NULL;
     203    if (match_lines)
     204      {
     205  #ifndef PCRE2_EXTRA_MATCH_LINE
     206        static char const /* These sizes omit trailing NUL.  */
     207          xprefix[4] = "^(?:", xsuffix[2] = ")$";
     208        idx_t re_size = size + sizeof xprefix + sizeof xsuffix;
     209        char *re = re_storage = ximalloc (re_size);
     210        char *rez = mempcpy (re, xprefix, sizeof xprefix);
     211        rez = mempcpy (rez, pattern, size);
     212        memcpy (rez, xsuffix, sizeof xsuffix);
     213        pattern = re;
     214        size = re_size;
     215  #endif
     216      }
     217    else if (match_words)
     218      {
     219        /* PCRE2_EXTRA_MATCH_WORD is incompatible with grep -w;
     220           do things the grep way.  */
     221        static char const /* These sizes omit trailing NUL.  */
     222          wprefix[10] = "(?<!\\w)(?:", wsuffix[7] = ")(?!\\w)";
     223        idx_t re_size = size + sizeof wprefix + sizeof wsuffix;
     224        char *re = re_storage = ximalloc (re_size);
     225        char *rez = mempcpy (re, wprefix, sizeof wprefix);
     226        rez = mempcpy (rez, pattern, size);
     227        memcpy (rez, wsuffix, sizeof wsuffix);
     228        pattern = re;
     229        size = re_size;
     230      }
     231  
     232    if (!localeinfo.multibyte)
     233      pcre2_set_character_tables (ccontext, pcre2_maketables (gcontext));
     234  
     235    pc->cre = pcre2_compile ((PCRE2_SPTR) pattern, size, flags,
     236                             &ec, &e, ccontext);
     237    if (!pc->cre)
     238      {
     239        enum { ERRBUFSIZ = 256 }; /* Taken from pcre2grep.c ERRBUFSIZ.  */
     240        PCRE2_UCHAR8 ep[ERRBUFSIZ];
     241        pcre2_get_error_message (ec, ep, sizeof ep);
     242        die (EXIT_TROUBLE, 0, "%s", ep);
     243      }
     244  
     245    free (re_storage);
     246    pcre2_compile_context_free (ccontext);
     247  
     248    pc->mcontext = NULL;
     249    pc->data = pcre2_match_data_create_from_pattern (pc->cre, gcontext);
     250  
     251    /* Ignore any failure return from pcre2_jit_compile, as that merely
     252       means JIT won't be used during matching.  */
     253    pcre2_jit_compile (pc->cre, PCRE2_JIT_COMPLETE);
     254  
     255    /* The PCRE documentation says that a 32 KiB stack is the default.  */
     256    pc->jit_stack = NULL;
     257    pc->jit_stack_size = 32 << 10;
     258  
     259    pc->empty_match[false] = jit_exec (pc, "", 0, 0, PCRE2_NOTBOL);
     260    pc->empty_match[true] = jit_exec (pc, "", 0, 0, 0);
     261  
     262    return pc;
     263  }
     264  
     265  ptrdiff_t
     266  Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
     267            char const *start_ptr)
     268  {
     269    char const *p = start_ptr ? start_ptr : buf;
     270    bool bol = p[-1] == eolbyte;
     271    char const *line_start = buf;
     272    int e = PCRE2_ERROR_NOMATCH;
     273    char const *line_end;
     274    struct pcre_comp *pc = vcp;
     275    PCRE2_SIZE *sub = pcre2_get_ovector_pointer (pc->data);
     276  
     277    /* The search address to pass to PCRE.  This is the start of
     278       the buffer, or just past the most-recently discovered encoding
     279       error or line end.  */
     280    char const *subject = buf;
     281  
     282    do
     283      {
     284        /* Search line by line.  Although this formerly used something like
     285           PCRE2_MULTILINE for performance, the performance wasn't always
     286           better and the correctness issues were too puzzling.  See
     287           Bug#22655.  */
     288        line_end = rawmemchr (p, eolbyte);
     289        if (PCRE2_SIZE_MAX < line_end - p)
     290          die (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
     291  
     292        for (;;)
     293          {
     294            /* Skip past bytes that are easily determined to be encoding
     295               errors, treating them as data that cannot match.  This is
     296               faster than having PCRE check them.  */
     297            while (localeinfo.sbclen[to_uchar (*p)] == -1)
     298              {
     299                p++;
     300                subject = p;
     301                bol = false;
     302              }
     303  
     304            idx_t search_offset = p - subject;
     305  
     306            /* Check for an empty match; this is faster than letting
     307               PCRE do it.  */
     308            if (p == line_end)
     309              {
     310                sub[0] = sub[1] = search_offset;
     311                e = pc->empty_match[bol];
     312                break;
     313              }
     314  
     315            int options = 0;
     316            if (!bol)
     317              options |= PCRE2_NOTBOL;
     318  
     319            e = jit_exec (pc, subject, line_end - subject,
     320                          search_offset, options);
     321            if (MATCH_INVALID_UTF || !bad_utf8_from_pcre2 (e))
     322              break;
     323  
     324            idx_t valid_bytes = pcre2_get_startchar (pc->data);
     325  
     326            if (search_offset <= valid_bytes)
     327              {
     328                /* Try to match the string before the encoding error.  */
     329                if (valid_bytes == 0)
     330                  {
     331                    /* Handle the empty-match case specially, for speed.
     332                       This optimization is valid if VALID_BYTES is zero,
     333                       which means SEARCH_OFFSET is also zero.  */
     334                    sub[0] = valid_bytes;
     335                    sub[1] = 0;
     336                    e = pc->empty_match[bol];
     337                  }
     338                else
     339                  e = jit_exec (pc, subject, valid_bytes, search_offset,
     340                                options | PCRE2_NO_UTF_CHECK | PCRE2_NOTEOL);
     341  
     342                if (e != PCRE2_ERROR_NOMATCH)
     343                  break;
     344  
     345                /* Treat the encoding error as data that cannot match.  */
     346                p = subject + valid_bytes + 1;
     347                bol = false;
     348              }
     349  
     350            subject += valid_bytes + 1;
     351          }
     352  
     353        if (e != PCRE2_ERROR_NOMATCH)
     354          break;
     355        bol = true;
     356        p = subject = line_start = line_end + 1;
     357      }
     358    while (p < buf + size);
     359  
     360    if (e <= 0)
     361      {
     362        switch (e)
     363          {
     364          case PCRE2_ERROR_NOMATCH:
     365            break;
     366  
     367          case PCRE2_ERROR_NOMEMORY:
     368            die (EXIT_TROUBLE, 0, _("%s: memory exhausted"), input_filename ());
     369  
     370          case PCRE2_ERROR_JIT_STACKLIMIT:
     371            die (EXIT_TROUBLE, 0, _("%s: exhausted PCRE JIT stack"),
     372                 input_filename ());
     373  
     374          case PCRE2_ERROR_MATCHLIMIT:
     375            die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's backtracking limit"),
     376                 input_filename ());
     377  
     378          case PCRE2_ERROR_DEPTHLIMIT:
     379            die (EXIT_TROUBLE, 0,
     380                 _("%s: exceeded PCRE's nested backtracking limit"),
     381                 input_filename ());
     382  
     383          case PCRE2_ERROR_RECURSELOOP:
     384            die (EXIT_TROUBLE, 0, _("%s: PCRE detected recurse loop"),
     385                 input_filename ());
     386  
     387  #ifdef PCRE2_ERROR_HEAPLIMIT
     388          case PCRE2_ERROR_HEAPLIMIT:
     389            die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's heap limit"),
     390                 input_filename ());
     391  #endif
     392  
     393          default:
     394            /* For now, we lump all remaining PCRE failures into this basket.
     395               If anyone cares to provide sample grep usage that can trigger
     396               particular PCRE errors, we can add to the list (above) of more
     397               detailed diagnostics.  */
     398            die (EXIT_TROUBLE, 0, _("%s: internal PCRE error: %d"),
     399                 input_filename (), e);
     400          }
     401  
     402        return -1;
     403      }
     404    else
     405      {
     406        char const *matchbeg = subject + sub[0];
     407        char const *matchend = subject + sub[1];
     408        char const *beg;
     409        char const *end;
     410        if (start_ptr)
     411          {
     412            beg = matchbeg;
     413            end = matchend;
     414          }
     415        else
     416          {
     417            beg = line_start;
     418            end = line_end + 1;
     419          }
     420        *match_size = end - beg;
     421        return beg - buf;
     422      }
     423  }