(root)/
gettext-0.22.4/
gettext-tools/
libgrep/
m-regex.c
       1  /* Pattern Matchers for Regular Expressions.
       2     Copyright (C) 1992, 1998, 2000, 2005-2006, 2010, 2013 Free Software
       3     Foundation, Inc.
       4  
       5     This program is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU General Public License as published by
       7     the Free Software Foundation; either version 3 of the License, or
       8     (at your option) any later version.
       9  
      10     This program is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU General Public License for more details.
      14  
      15     You should have received a copy of the GNU General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      17  
      18  #ifdef HAVE_CONFIG_H
      19  # include <config.h>
      20  #endif
      21  
      22  /* Specification.  */
      23  #include "libgrep.h"
      24  
      25  #include <ctype.h>
      26  #include <stdbool.h>
      27  #include <stdlib.h>
      28  #include <string.h>
      29  #include <regex.h>
      30  
      31  #include "error.h"
      32  #include "exitfail.h"
      33  #include "xalloc.h"
      34  
      35  #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
      36  # define IN_CTYPE_DOMAIN(c) 1
      37  #else
      38  # define IN_CTYPE_DOMAIN(c) isascii(c)
      39  #endif
      40  #define ISALNUM(C) (IN_CTYPE_DOMAIN (C) && isalnum (C))
      41  #define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
      42  
      43  struct patterns
      44  {
      45    /* Regex compiled regexp. */
      46    struct re_pattern_buffer regexbuf;
      47    struct re_registers regs; /* This is here on account of a BRAIN-DEAD
      48                                 Q@#%!# library interface in regex.c.  */
      49  };
      50  
      51  struct compiled_regex {
      52    bool match_words;
      53    bool match_lines;
      54    char eolbyte;
      55  
      56    /* The Regex compiled patterns.  */
      57    struct patterns *patterns;
      58    size_t pcount;
      59  };
      60  
      61  static void *
      62  compile (const char *pattern, size_t pattern_size,
      63           bool match_icase, bool match_words, bool match_lines, char eolbyte,
      64           reg_syntax_t syntax)
      65  {
      66    struct compiled_regex *cregex;
      67  
      68    cregex = XMALLOC (struct compiled_regex);
      69    memset (cregex, '\0', sizeof (struct compiled_regex));
      70    cregex->match_words = match_words;
      71    cregex->match_lines = match_lines;
      72    cregex->eolbyte = eolbyte;
      73    cregex->patterns = NULL;
      74    cregex->pcount = 0;
      75  
      76    re_set_syntax (syntax);
      77  
      78    /* For GNU regex compiler we have to pass the patterns separately to detect
      79       errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and "]"
      80       GNU regex should have raised a syntax error.  The same for backref, where
      81       the backref should have been local to each pattern.  */
      82    {
      83      const char *sep;
      84      size_t total = pattern_size;
      85      const char *motif = pattern;
      86  
      87      do
      88        {
      89          size_t len;
      90          const char *err;
      91  
      92          sep = (const char *) memchr (motif, '\n', total);
      93          if (sep)
      94            {
      95              len = sep - motif;
      96              sep++;
      97              total -= (len + 1);
      98            }
      99          else
     100            {
     101              len = total;
     102              total = 0;
     103            }
     104  
     105          cregex->patterns = xrealloc (cregex->patterns, (cregex->pcount + 1) * sizeof (struct patterns));
     106          memset (&cregex->patterns[cregex->pcount], '\0', sizeof (struct patterns));
     107  
     108          if ((err = re_compile_pattern (motif, len,
     109                                         &cregex->patterns[cregex->pcount].regexbuf)) != NULL)
     110            error (exit_failure, 0, "%s", err);
     111          cregex->pcount++;
     112  
     113          motif = sep;
     114        }
     115      while (sep && total != 0);
     116    }
     117  
     118    return cregex;
     119  }
     120  
     121  static void *
     122  Gcompile (const char *pattern, size_t pattern_size,
     123            bool match_icase, bool match_words, bool match_lines, char eolbyte)
     124  {
     125    return compile (pattern, pattern_size,
     126                    match_icase, match_words, match_lines, eolbyte,
     127                    RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
     128  }
     129  
     130  static void *
     131  Ecompile (const char *pattern, size_t pattern_size,
     132            bool match_icase, bool match_words, bool match_lines, char eolbyte)
     133  {
     134    return compile (pattern, pattern_size,
     135                    match_icase, match_words, match_lines, eolbyte,
     136                    RE_SYNTAX_POSIX_EGREP);
     137  }
     138  
     139  static void *
     140  AWKcompile (const char *pattern, size_t pattern_size,
     141              bool match_icase, bool match_words, bool match_lines, char eolbyte)
     142  {
     143    return compile (pattern, pattern_size,
     144                    match_icase, match_words, match_lines, eolbyte,
     145                    RE_SYNTAX_AWK);
     146  }
     147  
     148  static size_t
     149  EGexecute (const void *compiled_pattern,
     150             const char *buf, size_t buf_size,
     151             size_t *match_size, bool exact)
     152  {
     153    struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern;
     154    char eol = cregex->eolbyte;
     155    register const char *buflim = buf + buf_size;
     156    register const char *beg;
     157    register const char *end;
     158  
     159    for (beg = buf; beg < buflim; beg = end)
     160      {
     161        size_t i;
     162  
     163        end = (const char *) memchr (beg, eol, buflim - beg);
     164        if (end == NULL)
     165          end = buflim;
     166        /* Here, either end < buflim && *end == eol, or end == buflim.  */
     167  
     168        for (i = 0; i < cregex->pcount; i++)
     169          {
     170            int start, len;
     171  
     172            cregex->patterns[i].regexbuf.not_eol = 0;
     173            if (0 <= (start = re_search (&cregex->patterns[i].regexbuf, beg,
     174                                         end - beg, 0,
     175                                         end - beg, &cregex->patterns[i].regs)))
     176              {
     177                len = cregex->patterns[i].regs.end[0] - start;
     178                if (exact)
     179                  {
     180                    *match_size = len;
     181                    return start;
     182                  }
     183                if (cregex->match_lines)
     184                  {
     185                    if (len == end - beg) /* implies start == 0 */
     186                      goto success;
     187                  }
     188                else if (cregex->match_words)
     189                  {
     190                    /* If -w, check if the match aligns with word boundaries.
     191                       We do this iteratively because:
     192                       (a) the line may contain more than one occurence of the
     193                           pattern, and
     194                       (b) Several alternatives in the pattern might be valid at
     195                           a given point, and we may need to consider a shorter
     196                           one to find a word boundary.  */
     197                    while (start >= 0)
     198                      {
     199                        if ((start == 0 || !IS_WORD_CONSTITUENT ((unsigned char) beg[start - 1]))
     200                            && (start + len == end - beg
     201                                || !IS_WORD_CONSTITUENT ((unsigned char) beg[start + len])))
     202                          goto success;
     203                        if (len > 0)
     204                          {
     205                            /* Try a shorter length anchored at the same place. */
     206                            --len;
     207                            cregex->patterns[i].regexbuf.not_eol = 1;
     208                            len = re_match (&cregex->patterns[i].regexbuf, beg,
     209                                            start + len, start,
     210                                            &cregex->patterns[i].regs);
     211                          }
     212                        if (len <= 0)
     213                          {
     214                            /* Try looking further on. */
     215                            if (start == end - beg)
     216                              break;
     217                            ++start;
     218                            cregex->patterns[i].regexbuf.not_eol = 0;
     219                            start = re_search (&cregex->patterns[i].regexbuf, beg,
     220                                               end - beg,
     221                                               start, end - beg - start,
     222                                               &cregex->patterns[i].regs);
     223                            len = cregex->patterns[i].regs.end[0] - start;
     224                          }
     225                      }
     226                  }
     227                else
     228                  goto success;
     229              }
     230          }
     231  
     232        if (end < buflim)
     233          end++;
     234      }
     235    return (size_t) -1;
     236  
     237   success:
     238    *match_size = end - beg;
     239    return beg - buf;
     240  }
     241  
     242  static void
     243  EGfree (void *compiled_pattern)
     244  {
     245    struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern;
     246  
     247    free (cregex->patterns);
     248    free (cregex);
     249  }
     250  
     251  /* POSIX Basic Regular Expressions */
     252  matcher_t matcher_grep =
     253    {
     254      Gcompile,
     255      EGexecute,
     256      EGfree
     257    };
     258  
     259  /* POSIX Extended Regular Expressions */
     260  matcher_t matcher_egrep =
     261    {
     262      Ecompile,
     263      EGexecute,
     264      EGfree
     265    };
     266  
     267  /* AWK Regular Expressions */
     268  matcher_t matcher_awk =
     269    {
     270      AWKcompile,
     271      EGexecute,
     272      EGfree
     273    };