1  /*
       2   * re.c - compile regular expressions.
       3   */
       4  
       5  /*
       6   * Copyright (C) 1991-2019, 2021, 2022, the Free Software Foundation, Inc.
       7   *
       8   * This file is part of GAWK, the GNU implementation of the
       9   * AWK Programming Language.
      10   *
      11   * GAWK is free software; you can redistribute it and/or modify
      12   * it under the terms of the GNU General Public License as published by
      13   * the Free Software Foundation; either version 3 of the License, or
      14   * (at your option) any later version.
      15   *
      16   * GAWK is distributed in the hope that it will be useful,
      17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
      18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      19   * GNU General Public License for more details.
      20   *
      21   * You should have received a copy of the GNU General Public License
      22   * along with this program; if not, write to the Free Software
      23   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
      24   */
      25  
      26  #include "awk.h"
      27  
      28  #include "localeinfo.h"
      29  
      30  static reg_syntax_t syn;
      31  static void check_bracket_exp(char *s, size_t len);
      32  const char *regexflags2str(int flags);
      33  
      34  static struct localeinfo localeinfo;
      35  
      36  /* make_regexp --- generate compiled regular expressions */
      37  
      38  Regexp *
      39  make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
      40  {
      41  	static char metas[] = ".*+(){}[]|?^$\\";
      42  	Regexp *rp;
      43  	const char *rerr;
      44  	const char *src = s;
      45  	static char *buf = NULL;
      46  	static size_t buflen;
      47  	const char *end = s + len;
      48  	char *dest;
      49  	int c, c2;
      50  	static bool first = true;
      51  	static bool no_dfa = false;
      52  	int i;
      53  	static struct dfa* dfaregs[2] = { NULL, NULL };
      54  	static bool nul_warned = false;
      55  
      56  	if (do_lint && ! nul_warned && memchr(s, '\0', len) != NULL) {
      57  		nul_warned = true;
      58  		lintwarn(_("behavior of matching a regexp containing NUL characters is not defined by POSIX"));
      59  	}
      60  
      61  	/*
      62  	 * The number of bytes in the current multibyte character.
      63  	 * It is 0, when the current character is a singlebyte character.
      64  	 */
      65  	size_t is_multibyte = 0;
      66  	mbstate_t mbs;
      67  
      68  	memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize.  */
      69  
      70  	if (first) {
      71  		/* for debugging and testing */
      72  		no_dfa = (getenv("GAWK_NO_DFA") != NULL);
      73  		/* don't set first to false here, we do it below */
      74  	}
      75  
      76  	/* always check */
      77  	check_bracket_exp((char *) s, len);
      78  
      79  	/* Handle escaped characters first. */
      80  
      81  	/*
      82  	 * Build a copy of the string (in buf) with the
      83  	 * escaped characters translated, and generate the regex
      84  	 * from that.
      85  	 */
      86  	if (buf == NULL) {
      87  		emalloc(buf, char *, len + 1, "make_regexp");
      88  		buflen = len;
      89  	} else if (len > buflen) {
      90  		erealloc(buf, char *, len + 1, "make_regexp");
      91  		buflen = len;
      92  	}
      93  	dest = buf;
      94  
      95  	while (src < end) {
      96  		if (gawk_mb_cur_max > 1 && ! is_multibyte) {
      97  			/* The previous byte is a singlebyte character, or last byte
      98  			   of a multibyte character.  We check the next character.  */
      99  			is_multibyte = mbrlen(src, end - src, &mbs);
     100  			if (   is_multibyte == 1
     101  			    || is_multibyte == (size_t) -1
     102  			    || is_multibyte == (size_t) -2
     103  			    || is_multibyte == 0) {
     104  				/* We treat it as a single-byte character.  */
     105  				is_multibyte = 0;
     106  			}
     107  		}
     108  
     109  		const char *ok_to_escape;
     110  		if (do_posix)
     111  			ok_to_escape = "{}()|*+?.^$\\[]/-";
     112  		else if (do_traditional)
     113  			ok_to_escape = "()|*+?.^$\\[]/-";
     114  		else
     115  			ok_to_escape = "<>`'BywWsS{}()|*+?.^$\\[]/-";
     116  
     117  		/* We skip multibyte character, since it must not be a special
     118  		   character.  */
     119  		if ((gawk_mb_cur_max == 1 || ! is_multibyte) &&
     120  		    (*src == '\\')) {
     121  			c = *++src;
     122  			switch (c) {
     123  			case '\0':	/* \\ before \0, either dynamic data or real end of string */
     124  				if (src >= s + len)
     125  					*dest++ = '\\';	// at end of string, will fatal below
     126  				else
     127  					fatal(_("invalid NUL byte in dynamic regexp"));
     128  				break;
     129  			case 'a':
     130  			case 'b':
     131  			case 'f':
     132  			case 'n':
     133  			case 'r':
     134  			case 't':
     135  			case 'v':
     136  			case 'x':
     137  			case '0':
     138  			case '1':
     139  			case '2':
     140  			case '3':
     141  			case '4':
     142  			case '5':
     143  			case '6':
     144  			case '7':
     145  				c2 = parse_escape(&src);
     146  				if (c2 < 0)
     147  					cant_happen("received bad result %d from parse_escape()", c2);
     148  				/*
     149  				 * Unix awk treats octal (and hex?) chars
     150  				 * literally in re's, so escape regexp
     151  				 * metacharacters.
     152  				 */
     153  				if (do_traditional
     154  				    && ! do_posix
     155  				    && (isdigit(c) || c == 'x')
     156  				    && strchr("()|*+?.^$\\[]", c2) != NULL)
     157  					*dest++ = '\\';
     158  				*dest++ = (char) c2;
     159  				if (do_lint
     160  				    && ! nul_warned
     161  				    && c2 == '\0') {
     162  					nul_warned = true;
     163  					lintwarn(_("behavior of matching a regexp containing NUL characters is not defined by POSIX"));
     164  				}
     165  				break;
     166  			case '8':
     167  			case '9':	/* a\9b not valid */
     168  				*dest++ = c;
     169  				src++;
     170  			{
     171  				static bool warned[2];
     172  
     173  				if (! warned[c - '8']) {
     174  					warning(_("regexp escape sequence `\\%c' treated as plain `%c'"), c, c);
     175  					warned[c - '8'] = true;
     176  				}
     177  			}
     178  				break;
     179  			case 'y':	/* normally \b */
     180  				/* gnu regex op */
     181  				if (! do_traditional) {
     182  					*dest++ = '\\';
     183  					*dest++ = 'b';
     184  					src++;
     185  					break;
     186  				}
     187  				/* else, fall through */
     188  			default:
     189  				if (strchr(ok_to_escape, c) == NULL) {
     190  					static bool warned[256];
     191  
     192  					if (! warned[c & 0xFF]) {
     193  						warning(_("regexp escape sequence `\\%c' is not a known regexp operator"), c);
     194  						warned[c & 0xFF] = true;
     195  					}
     196  				}
     197  				*dest++ = '\\';
     198  				*dest++ = (char) c;
     199  				src++;
     200  				break;
     201  			} /* switch */
     202  		} else {
     203  			c = *src;
     204  			*dest++ = *src++;	/* not '\\' */
     205  		}
     206  		if (gawk_mb_cur_max > 1 && is_multibyte)
     207  			is_multibyte--;
     208  	} /* while */
     209  
     210  	*dest = '\0';
     211  	len = dest - buf;
     212  
     213  	ezalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
     214  	rp->pat.allocated = 0;	/* regex will allocate the buffer */
     215  	emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
     216  
     217  	/*
     218  	 * Lo these many years ago, had I known what a P.I.T.A. IGNORECASE
     219  	 * was going to turn out to be, I wouldn't have bothered with it.
     220  	 *
     221  	 * In the case where we have a multibyte character set, we have no
     222  	 * choice but to use RE_ICASE, since the casetable is for single-byte
     223  	 * character sets only.
     224  	 *
     225  	 * On the other hand, if we do have a single-byte character set,
     226  	 * using the casetable should give  a performance improvement, since
     227  	 * it's computed only once, not each time a regex is compiled.  We
     228  	 * also think it's probably better for portability.  See the
     229  	 * discussion by the definition of casetable[] in eval.c.
     230  	 */
     231  
     232  	ignorecase = !! ignorecase;	/* force to 1 or 0 */
     233  	if (ignorecase) {
     234  		if (gawk_mb_cur_max > 1) {
     235  			syn |= RE_ICASE;
     236  			rp->pat.translate = NULL;
     237  		} else {
     238  			syn &= ~RE_ICASE;
     239  			rp->pat.translate = (RE_TRANSLATE_TYPE) casetable;
     240  		}
     241  	} else {
     242  		rp->pat.translate = NULL;
     243  		syn &= ~RE_ICASE;
     244  	}
     245  
     246  	/* initialize dfas to hold syntax */
     247  	if (first) {
     248  		first = false;
     249  		dfaregs[0] = dfaalloc();
     250  		dfaregs[1] = dfaalloc();
     251  		dfasyntax(dfaregs[0], & localeinfo, syn, DFA_ANCHOR);
     252  		dfasyntax(dfaregs[1], & localeinfo, syn | RE_ICASE, DFA_ANCHOR);
     253  	}
     254  
     255  	re_set_syntax(syn);
     256  
     257  	if ((rerr = re_compile_pattern(buf, len, &(rp->pat))) != NULL) {
     258  		refree(rp);
     259  		if (! canfatal) {
     260  			/* rerr already gettextized inside regex routines */
     261  			error("%s: /%.*s/", rerr, (int) len, s);
     262   			return NULL;
     263  		}
     264  		fatal("invalid regexp: %s: /%.*s/", rerr, (int) len, s);
     265  	}
     266  
     267  	/* gack. this must be done *after* re_compile_pattern */
     268  	rp->pat.newline_anchor = false; /* don't get \n in middle of string */
     269  	if (dfa && ! no_dfa) {
     270  		rp->dfareg = dfaalloc();
     271  		dfacopysyntax(rp->dfareg, dfaregs[ignorecase]);
     272  		dfacomp(buf, len, rp->dfareg, true);
     273  	} else
     274  		rp->dfareg = NULL;
     275  
     276  	/* Additional flags that help with RS as regexp. */
     277  	for (i = 0; i < len; i++) {
     278  		if (strchr(metas, buf[i]) != NULL) {
     279  			rp->has_meta = true;
     280  			break;
     281  		}
     282  	}
     283  
     284  	for (i = len - 1; i >= 0; i--) {
     285  		if (strchr("*+|?{}", buf[i]) != NULL) {
     286  			rp->maybe_long = true;
     287  			break;
     288  		}
     289  	}
     290  
     291  	return rp;
     292  }
     293  
     294  /* research --- do a regexp search. use dfa if possible */
     295  
     296  int
     297  research(Regexp *rp, char *str, int start,
     298  	 size_t len, int flags)
     299  {
     300  	const char *ret = str;
     301  	bool try_backref = false;
     302  	int need_start;
     303  	int no_bol;
     304  	int res;
     305  
     306  	need_start = ((flags & RE_NEED_START) != 0);
     307  	no_bol = ((flags & RE_NO_BOL) != 0);
     308  
     309  	if (no_bol)
     310  		rp->pat.not_bol = 1;
     311  
     312  	/*
     313  	 * Always do dfa search if can; if it fails, then even if
     314  	 * need_start is true, we won't bother with the regex search.
     315  	 *
     316  	 * The dfa matcher doesn't have a no_bol flag, so don't bother
     317  	 * trying it in that case.
     318  	 *
     319  	 * 7/2008: Skip the dfa matcher if need_start. The dfa matcher
     320  	 * has bugs in certain multibyte cases and it's too difficult
     321  	 * to try to special case things.
     322  	 * 7/2017: Apparently there are some cases where DFA gets
     323  	 * stuck, even in the C locale, so we use dfa only if not need_start.
     324  	 *
     325  	 * Should that issue ever get resolved, note this comment:
     326  	 *
     327  	 * 7/2016: The dfa matcher can't handle a case where searching
     328  	 * starts in the middle of a string, so don't bother trying it
     329  	 * in that case.
     330  	 *	if (rp->dfa && ! no_bol && start == 0) ...
     331  	 */
     332  	if (rp->dfareg != NULL && ! no_bol && ! need_start) {
     333  		struct dfa *superset = dfasuperset(rp->dfareg);
     334  		if (superset)
     335  			ret = dfaexec(superset, str+start, str+start+len,
     336  							true, NULL, NULL);
     337  
     338  		if (ret && (! need_start
     339  				|| (! superset && dfaisfast(rp->dfareg))))
     340  			ret = dfaexec(rp->dfareg, str+start, str+start+len,
     341  						true, NULL, &try_backref);
     342  	}
     343  
     344  	if (ret) {
     345  		if (   rp->dfareg == NULL
     346  			|| start != 0
     347  			|| no_bol
     348  			|| need_start
     349  			|| try_backref) {
     350  			/*
     351  			 * Passing NULL as last arg speeds up search for cases
     352  			 * where we don't need the start/end info.
     353  			 */
     354  			res = re_search(&(rp->pat), str, start+len,
     355  				start, len, need_start ? &(rp->regs) : NULL);
     356  		} else
     357  			res = 1;
     358  	} else
     359  		res = -1;
     360  
     361  	rp->pat.not_bol = 0;
     362  	return res;
     363  }
     364  
     365  /* refree --- free up the dynamic memory used by a compiled regexp */
     366  
     367  void
     368  refree(Regexp *rp)
     369  {
     370  	if (rp == NULL)
     371  		return;
     372  	rp->pat.translate = NULL;
     373  	regfree(& rp->pat);
     374  	if (rp->regs.start)
     375  		free(rp->regs.start);
     376  	if (rp->regs.end)
     377  		free(rp->regs.end);
     378  	if (rp->dfareg != NULL) {
     379  		dfafree(rp->dfareg);
     380  		free(rp->dfareg);
     381  	}
     382  	efree(rp);
     383  }
     384  
     385  /* dfaerror --- print an error message for the dfa routines */
     386  
     387  void
     388  dfaerror(const char *s)
     389  {
     390  	fatal("%s", s);
     391  	exit(EXIT_FATAL);
     392  }
     393  
     394  /* re_cache_get --- populate regexp cache if empty */
     395  
     396  static inline Regexp *
     397  re_cache_get(NODE *t)
     398  {
     399  	if (t->re_reg[IGNORECASE] == NULL)
     400  		t->re_reg[IGNORECASE] = make_regexp(t->re_exp->stptr, t->re_exp->stlen, IGNORECASE, t->re_cnt, true);
     401  	return t->re_reg[IGNORECASE];
     402  }
     403  
     404  /* re_update --- recompile a dynamic regexp */
     405  
     406  Regexp *
     407  re_update(NODE *t)
     408  {
     409  	NODE *t1;
     410  
     411  	if (t->type == Node_val && (t->flags & REGEX) != 0)
     412  		return re_cache_get(t->typed_re);
     413  
     414  	if ((t->re_flags & CONSTANT) != 0) {
     415  		/* it's a constant, so just return it as is */
     416  		assert(t->type == Node_regex);
     417  		return re_cache_get(t);
     418  	}
     419  	t1 = t->re_exp;
     420  	if (t->re_text != NULL) {
     421  		/* if contents haven't changed, just return it */
     422  		if (cmp_nodes(t->re_text, t1, true) == 0)
     423  			return re_cache_get(t);
     424  		/* things changed, fall through to recompile */
     425  		unref(t->re_text);
     426  	}
     427  	/* get fresh copy of the text of the regexp */
     428  	t->re_text = dupnode(t1);
     429  
     430  	/* text changed */
     431  
     432  	/* free old */
     433  	if (t->re_reg[0] != NULL) {
     434  		refree(t->re_reg[0]);
     435  		t->re_reg[0] = NULL;
     436  	}
     437  	if (t->re_reg[1] != NULL) {
     438  		refree(t->re_reg[1]);
     439  		t->re_reg[1] = NULL;
     440  	}
     441  	if (t->re_cnt > 0 && ++t->re_cnt > 10)
     442  		/*
     443  		 * The regex appears to update frequently, so disable DFA
     444  		 * matching (which trades off expensive upfront compilation
     445  		 * overhead for faster subsequent matching).
     446  		 */
     447  		t->re_cnt = 0;
     448  	if (t->re_text == NULL) {
     449  		/* reset regexp text if needed */
     450  		t1 = t->re_exp;
     451  		unref(t->re_text);
     452  		t->re_text = dupnode(t1);
     453  	}
     454  	return re_cache_get(t);
     455  }
     456  
     457  /* resetup --- choose what kind of regexps we match */
     458  
     459  void
     460  resetup()
     461  {
     462  	// init localeinfo for dfa
     463  	init_localeinfo(& localeinfo);
     464  
     465  	/*
     466  	 * Syntax bits: _that_ is yet another mind trip.  Recreational drugs
     467  	 * are helpful for recovering from the experience.
     468  	 *
     469  	 *	Aharon Robbins <arnold@skeeve.com>
     470  	 *	Sun, 21 Oct 2007 23:55:33 +0200
     471  	 */
     472  	if (do_posix)
     473  		syn = RE_SYNTAX_POSIX_AWK;	/* strict POSIX re's */
     474  	else if (do_traditional)
     475  		syn = RE_SYNTAX_AWK;		/* traditional Unix awk re's */
     476  	else
     477  		syn = RE_SYNTAX_GNU_AWK;	/* POSIX re's + GNU ops */
     478  
     479  	/*
     480  	 * Interval expressions are now on by default, as POSIX is
     481  	 * wide-spread enough that people want it.
     482  	 *
     483  	 * 2/2022: BWK awk has supported interval expressions since
     484  	 * March 2019, with an important fix added in Januay 2020.
     485  	 * So we add that support even for --traditional. It's easier to
     486  	 * do it here than to try to get the GLIBC / GNULIB folks to change
     487  	 * the definition of RE_SYNTAX_AWK, which likely would cause
     488  	 * binary compatibility issues.
     489  	 */
     490  	if (do_traditional)
     491  		syn |= RE_INTERVALS | RE_INVALID_INTERVAL_ORD | RE_NO_BK_BRACES;
     492  
     493  	(void) re_set_syntax(syn);
     494  }
     495  
     496  /* using_utf8 --- are we using utf8 */
     497  
     498  bool
     499  using_utf8(void)
     500  {
     501  	return localeinfo.using_utf8;
     502  }
     503  
     504  /* reisstring --- return true if the RE match is a simple string match */
     505  
     506  int
     507  reisstring(const char *text, size_t len, Regexp *re, const char *buf)
     508  {
     509  	int res;
     510  	const char *matched;
     511  
     512  	/* simple checking for meta characters in re */
     513  	if (re->has_meta)
     514  		return false;	/* give up early, can't be string match */
     515  
     516  	/* make accessable to gdb */
     517  	matched = &buf[RESTART(re, buf)];
     518  
     519  	res = (memcmp(text, matched, len) == 0);
     520  
     521  	return res;
     522  }
     523  
     524  /* reflags2str --- make a regex flags value readable */
     525  
     526  const char *
     527  reflags2str(int flagval)
     528  {
     529  	static const struct flagtab values[] = {
     530  		{ RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" },
     531  		{ RE_BK_PLUS_QM, "RE_BK_PLUS_QM" },
     532  		{ RE_CHAR_CLASSES, "RE_CHAR_CLASSES" },
     533  		{ RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" },
     534  		{ RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" },
     535  		{ RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" },
     536  		{ RE_DOT_NEWLINE, "RE_DOT_NEWLINE" },
     537  		{ RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" },
     538  		{ RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" },
     539  		{ RE_INTERVALS, "RE_INTERVALS" },
     540  		{ RE_LIMITED_OPS, "RE_LIMITED_OPS" },
     541  		{ RE_NEWLINE_ALT, "RE_NEWLINE_ALT" },
     542  		{ RE_NO_BK_BRACES, "RE_NO_BK_BRACES" },
     543  		{ RE_NO_BK_PARENS, "RE_NO_BK_PARENS" },
     544  		{ RE_NO_BK_REFS, "RE_NO_BK_REFS" },
     545  		{ RE_NO_BK_VBAR, "RE_NO_BK_VBAR" },
     546  		{ RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" },
     547  		{ RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" },
     548  		{ RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" },
     549  		{ RE_NO_GNU_OPS, "RE_NO_GNU_OPS" },
     550  		{ RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" },
     551  		{ RE_ICASE, "RE_ICASE" },
     552  		{ RE_CARET_ANCHORS_HERE, "RE_CARET_ANCHORS_HERE" },
     553  		{ RE_CONTEXT_INVALID_DUP, "RE_CONTEXT_INVALID_DUP" },
     554  		{ RE_NO_SUB, "RE_NO_SUB" },
     555  		{ 0,	NULL },
     556  	};
     557  
     558  	if (flagval == RE_SYNTAX_EMACS) /* == 0 */
     559  		return "RE_SYNTAX_EMACS";
     560  
     561  	return genflags2str(flagval, values);
     562  }
     563  
     564  /*
     565   * dfawarn() is called by the dfa routines whenever a regex is compiled
     566   * must supply a dfawarn.
     567   */
     568  
     569  void
     570  dfawarn(const char *dfa_warning)
     571  {
     572  	/*
     573  	 * This routine does nothing, since gawk does its own
     574  	 * (better) check for bad [[:foo:]] syntax.
     575  	 */
     576  }
     577  
     578  /* check_bracket_exp --- look for /[:space:]/ that should be /[[:space:]]/ */
     579  
     580  static void
     581  check_bracket_exp(char *s, size_t length)
     582  {
     583  	static struct reclass {
     584  		const char *name;
     585  		size_t len;
     586  		bool warned;
     587  	} classes[] = {
     588  		/*
     589  		 * Ordered by what we hope is frequency,
     590  		 * since it's linear searched.
     591  		 */
     592  		{ "[:alpha:]", 9, false },
     593  		{ "[:digit:]", 9, false },
     594  		{ "[:alnum:]", 9, false },
     595  		{ "[:upper:]", 9, false },
     596  		{ "[:lower:]", 9, false },
     597  		{ "[:space:]", 9, false },
     598  		{ "[:xdigit:]", 10, false },
     599  		{ "[:punct:]", 9, false },
     600  		{ "[:print:]", 9, false },
     601  		{ "[:graph:]", 9, false },
     602  		{ "[:cntrl:]", 9, false },
     603  		{ "[:blank:]", 9, false },
     604  		{ NULL, 0 }
     605  	};
     606  	int i;
     607  	bool found = false;
     608  	char save;
     609  	char *sp, *sp2, *end;
     610  	int len;
     611  	int count = 0;
     612  
     613  	if (length == 0)
     614  		return;
     615  
     616  	end = s + length;
     617  	save = s[length];
     618  	s[length] = '\0';
     619  	sp = s;
     620  
     621  again:
     622  	sp = sp2 = (char *) memchr(sp, '[', (end - sp));
     623  	if (sp == NULL)
     624  		goto done;
     625  
     626  	for (count++, sp++; *sp != '\0'; sp++) {
     627  		if (*sp == '[')
     628  			count++;
     629  		/*
     630  		 * ] as first char after open [ is skipped
     631  		 * \] is skipped
     632  		 * [^]] is skipped
     633  		 */
     634  		if (*sp == ']' && sp > sp2) {
     635  			 if (sp[-1] != '['
     636  			     && sp[-1] != '\\')
     637  				 ;
     638  			 else if ((sp - sp2) >= 2
     639  				  && sp[-1] == '^' && sp[-2] == '[')
     640  				 ;
     641  			 else
     642  				count--;
     643  		}
     644  
     645  		if (count == 0) {
     646  			sp++;	/* skip past ']' */
     647  			break;
     648  		}
     649  	}
     650  
     651  	if (count > 0) {	/* bad regex, give up */
     652  		goto done;
     653  	}
     654  
     655  	/* sp2 has start */
     656  
     657  	for (i = 0; classes[i].name != NULL; i++) {
     658  		if (classes[i].warned)
     659  			continue;
     660  		len = classes[i].len;
     661  		if (   len == (sp - sp2)
     662  		    && memcmp(sp2, classes[i].name, len) == 0) {
     663  			found = true;
     664  			break;
     665  		}
     666  	}
     667  
     668  	if (found && ! classes[i].warned) {
     669  		warning(_("regexp component `%.*s' should probably be `[%.*s]'"),
     670  				len, sp2, len, sp2);
     671  		classes[i].warned = true;
     672  	}
     673  
     674  	if (sp < end) {
     675  		found = false;
     676  		goto again;
     677  	}
     678  done:
     679  	s[length] = save;
     680  }