1  /* Extended regular expression matching and search library.
       2     Copyright (C) 2002-2023 Free Software Foundation, Inc.
       3     This file is part of the GNU C Library.
       4     Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
       5  
       6     The GNU C Library is free software; you can redistribute it and/or
       7     modify it under the terms of the GNU Lesser General Public
       8     License as published by the Free Software Foundation; either
       9     version 2.1 of the License, or (at your option) any later version.
      10  
      11     The GNU C Library is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      14     Lesser General Public License for more details.
      15  
      16     You should have received a copy of the GNU Lesser General Public
      17     License along with the GNU C Library; if not, see
      18     <https://www.gnu.org/licenses/>.  */
      19  
      20  #ifdef _LIBC
      21  # include <locale/weight.h>
      22  #endif
      23  
/* Forward declarations of the file-local (static) helpers used by the
   entry points in this file.  Declaring them all up front lets the
   definitions appear in any order.  */
static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
					  size_t length, reg_syntax_t syntax);
static void re_compile_fastmap_iter (regex_t *bufp,
				     const re_dfastate_t *init_state,
				     char *fastmap);
static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
static void free_charset (re_charset_t *cset);
static void free_workarea_compile (regex_t *preg);
static reg_errcode_t create_initial_state (re_dfa_t *dfa);
static void optimize_utf8 (re_dfa_t *dfa);
static reg_errcode_t analyze (regex_t *preg);
static reg_errcode_t preorder (bin_tree_t *root,
			       reg_errcode_t (fn (void *, bin_tree_t *)),
			       void *extra);
static reg_errcode_t postorder (bin_tree_t *root,
				reg_errcode_t (fn (void *, bin_tree_t *)),
				void *extra);
static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
				 bin_tree_t *node);
static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
				   unsigned int constraint);
static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
					 Idx node, bool root);
static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
static Idx fetch_number (re_string_t *input, re_token_t *token,
			 reg_syntax_t syntax);
static int peek_token (re_token_t *token, re_string_t *input,
			reg_syntax_t syntax);
static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
			  reg_syntax_t syntax, reg_errcode_t *err);
static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
				  re_token_t *token, reg_syntax_t syntax,
				  Idx nest, reg_errcode_t *err);
static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
				 re_token_t *token, reg_syntax_t syntax,
				 Idx nest, reg_errcode_t *err);
static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
				     re_token_t *token, reg_syntax_t syntax,
				     Idx nest, reg_errcode_t *err);
static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
				  re_token_t *token, reg_syntax_t syntax,
				  Idx nest, reg_errcode_t *err);
static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
				 re_dfa_t *dfa, re_token_t *token,
				 reg_syntax_t syntax, reg_errcode_t *err);
static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
				      re_token_t *token, reg_syntax_t syntax,
				      reg_errcode_t *err);
static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
					    re_string_t *regexp,
					    re_token_t *token, int token_len,
					    re_dfa_t *dfa,
					    reg_syntax_t syntax,
					    bool accept_hyphen);
static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
					  re_string_t *regexp,
					  re_token_t *token);
static reg_errcode_t build_equiv_class (bitset_t sbcset,
					re_charset_t *mbcset,
					Idx *equiv_class_alloc,
					const unsigned char *name);
static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
				      bitset_t sbcset,
				      re_charset_t *mbcset,
				      Idx *char_class_alloc,
				      const char *class_name,
				      reg_syntax_t syntax);
static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
				       RE_TRANSLATE_TYPE trans,
				       const char *class_name,
				       const char *extra,
				       bool non_match, reg_errcode_t *err);
static bin_tree_t *create_tree (re_dfa_t *dfa,
				bin_tree_t *left, bin_tree_t *right,
				re_token_type_t type);
static bin_tree_t *create_token_tree (re_dfa_t *dfa,
				      bin_tree_t *left, bin_tree_t *right,
				      const re_token_t *token);
static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
static void free_token (re_token_t *node);
static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
     113  
/* This table gives an error message for each of the error codes listed
   in regex.h.  Obviously the order here has to be the same as there.
   POSIX doesn't require that we do anything for REG_NOERROR,
   but why not be nice?

   All messages live in one char array: adjacent string literals are
   concatenated, with an explicit "\0" separating each message from the
   next.  Each *_IDX macro is the byte offset of its message, computed
   as the previous offset plus the sizeof of the previous literal
   (sizeof counts the terminating NUL, which accounts for the "\0"
   separator).  The literal inside each sizeof must therefore stay
   identical to the message it measures.  The last message needs no
   explicit "\0"; the implicit terminator of the concatenated literal
   ends the array.  (gettext_noop presumably expands to its argument and
   only marks the string for translation -- it is defined elsewhere.)  */

static const char __re_error_msgid[] =
  {
#define REG_NOERROR_IDX	0
    gettext_noop ("Success")	/* REG_NOERROR */
    "\0"
#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
    gettext_noop ("No match")	/* REG_NOMATCH */
    "\0"
#define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
    "\0"
#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
    "\0"
#define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
    "\0"
#define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
    "\0"
#define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
    "\0"
#define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
    gettext_noop ("Unmatched [, [^, [:, [., or [=")	/* REG_EBRACK */
    "\0"
#define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
    "\0"
#define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
    "\0"
#define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
    "\0"
#define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
    gettext_noop ("Invalid range end")	/* REG_ERANGE */
    "\0"
#define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
    gettext_noop ("Memory exhausted") /* REG_ESPACE */
    "\0"
#define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
    "\0"
#define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
    gettext_noop ("Premature end of regular expression") /* REG_EEND */
    "\0"
#define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
    gettext_noop ("Regular expression too big") /* REG_ESIZE */
    "\0"
#define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
  };
     172  
/* Maps an error code (used as the array index) to the byte offset of
   its message inside __re_error_msgid.  The entries are listed in the
   same order as the messages in that table, so element N is the offset
   of the N-th message.  */
static const size_t __re_error_msgid_idx[] =
  {
    REG_NOERROR_IDX,
    REG_NOMATCH_IDX,
    REG_BADPAT_IDX,
    REG_ECOLLATE_IDX,
    REG_ECTYPE_IDX,
    REG_EESCAPE_IDX,
    REG_ESUBREG_IDX,
    REG_EBRACK_IDX,
    REG_EPAREN_IDX,
    REG_EBRACE_IDX,
    REG_BADBR_IDX,
    REG_ERANGE_IDX,
    REG_ESPACE_IDX,
    REG_BADRPT_IDX,
    REG_EEND_IDX,
    REG_ESIZE_IDX,
    REG_ERPAREN_IDX
  };
     193  
     194  /* Entry points for GNU code.  */
     195  
     196  /* re_compile_pattern is the GNU regular expression compiler: it
     197     compiles PATTERN (of length LENGTH) and puts the result in BUFP.
     198     Returns 0 if the pattern was valid, otherwise an error string.
     199  
     200     Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
     201     are set in BUFP on entry.  */
     202  
     203  const char *
     204  re_compile_pattern (const char *pattern, size_t length,
     205  		    struct re_pattern_buffer *bufp)
     206  {
     207    reg_errcode_t ret;
     208  
     209    /* And GNU code determines whether or not to get register information
     210       by passing null for the REGS argument to re_match, etc., not by
     211       setting no_sub, unless RE_NO_SUB is set.  */
     212    bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
     213  
     214    /* Match anchors at newline.  */
     215    bufp->newline_anchor = 1;
     216  
     217    ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
     218  
     219    if (!ret)
     220      return NULL;
     221    return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
     222  }
     223  weak_alias (__re_compile_pattern, re_compile_pattern)
     224  
/* The regexp syntax currently in effect for the GNU entry points.  It
   is set by 're_set_syntax' but can also be assigned to directly: each
   pattern buffer stores its own syntax, so this may be changed between
   regex compilations.

   This has no initializer because initialized variables in Emacs
   become read-only after dumping.  */
reg_syntax_t re_syntax_options;
     231  
     232  
     233  /* Specify the precise syntax of regexps for compilation.  This provides
     234     for compatibility for various utilities which historically have
     235     different, incompatible syntaxes.
     236  
     237     The argument SYNTAX is a bit mask comprised of the various bits
     238     defined in regex.h.  We return the old syntax.  */
     239  
     240  reg_syntax_t
     241  re_set_syntax (reg_syntax_t syntax)
     242  {
     243    reg_syntax_t ret = re_syntax_options;
     244  
     245    re_syntax_options = syntax;
     246    return ret;
     247  }
     248  weak_alias (__re_set_syntax, re_set_syntax)
     249  
     250  int
     251  re_compile_fastmap (struct re_pattern_buffer *bufp)
     252  {
     253    re_dfa_t *dfa = bufp->buffer;
     254    char *fastmap = bufp->fastmap;
     255  
     256    memset (fastmap, '\0', sizeof (char) * SBC_MAX);
     257    re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
     258    if (dfa->init_state != dfa->init_state_word)
     259      re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
     260    if (dfa->init_state != dfa->init_state_nl)
     261      re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
     262    if (dfa->init_state != dfa->init_state_begbuf)
     263      re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
     264    bufp->fastmap_accurate = 1;
     265    return 0;
     266  }
     267  weak_alias (__re_compile_fastmap, re_compile_fastmap)
     268  
     269  static __always_inline void
     270  re_set_fastmap (char *fastmap, bool icase, int ch)
     271  {
     272    fastmap[ch] = 1;
     273    if (icase)
     274      fastmap[tolower (ch)] = 1;
     275  }
     276  
/* Helper function for re_compile_fastmap.
   Compile fastmap for the initial_state INIT_STATE.

   Walks every node of INIT_STATE and marks in FASTMAP each byte that
   the node can accept as the first byte of a match.  If a node accepts
   any character (OP_PERIOD, OP_UTF8_PERIOD) or is END_OF_RE, the whole
   map is set and the function returns immediately; for END_OF_RE,
   BUFP->can_be_null is set as well.  */

static void
re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
			 char *fastmap)
{
  re_dfa_t *dfa = bufp->buffer;
  Idx node_cnt;
  /* Byte-wise case folding via tolower is only used when the DFA's
     mb_cur_max is 1; the multibyte case is handled explicitly below.  */
  bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
    {
      Idx node = init_state->nodes.elems[node_cnt];
      re_token_type_t type = dfa->nodes[node].type;

      if (type == CHARACTER)
	{
	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
	    {
	      unsigned char buf[MB_LEN_MAX];
	      unsigned char *p;
	      wchar_t wc;
	      mbstate_t state;

	      /* Collect this byte plus the following CHARACTER nodes
		 flagged mb_partial into BUF.  Note this advances the
		 local NODE, which is not used again in this iteration.  */
	      p = buf;
	      *p++ = dfa->nodes[node].opr.c;
	      while (++node < dfa->nodes_len
		     &&	dfa->nodes[node].type == CHARACTER
		     && dfa->nodes[node].mb_partial)
		*p++ = dfa->nodes[node].opr.c;
	      memset (&state, '\0', sizeof (state));
	      /* If the bytes decode as exactly one wide character, also
		 mark the first byte of its lowercase encoding.  */
	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
			     &state) == p - buf
		  && (__wcrtomb ((char *) buf, __towlower (wc), &state)
		      != (size_t) -1))
		re_set_fastmap (fastmap, false, buf[0]);
	    }
	}
      else if (type == SIMPLE_BRACKET)
	{
	  /* Mark every byte whose bit is set in the node's bitset; CH
	     counts bits across all words.  */
	  int i, ch;
	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
	    {
	      int j;
	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
		if (w & ((bitset_word_t) 1 << j))
		  re_set_fastmap (fastmap, icase, ch);
	    }
	}
      else if (type == COMPLEX_BRACKET)
	{
	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
	  Idx i;

#ifdef _LIBC
	  /* See if we have to try all bytes which start multiple collation
	     elements.
	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
		  collation element, and don't catch 'b' since 'b' is
		  the only collation element which starts from 'b' (and
		  it is caught by SIMPLE_BRACKET).  */
	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
		  && (cset->ncoll_syms || cset->nranges))
		{
		  const int32_t *table = (const int32_t *)
		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
		  /* Bytes with a negative table entry are the ones marked;
		     NOTE(review): presumably a negative entry denotes a
		     byte starting several collation elements -- confirm
		     against the locale table format.  */
		  for (i = 0; i < SBC_MAX; ++i)
		    if (table[i] < 0)
		      re_set_fastmap (fastmap, icase, i);
		}
#endif /* _LIBC */

	  /* See if we have to start the match at all multibyte characters,
	     i.e. where we would not find an invalid sequence.  This only
	     applies to multibyte character sets; for single byte character
	     sets, the SIMPLE_BRACKET again suffices.  */
	  if (dfa->mb_cur_max > 1
	      && (cset->nchar_classes || cset->non_match || cset->nranges
#ifdef _LIBC
		  || cset->nequiv_classes
#endif /* _LIBC */
		 ))
	    {
	      /* Try each of the 256 byte values (the loop ends when C
		 wraps back to 0) and mark those for which __mbrtowc
		 reports an incomplete sequence, (size_t) -2.  */
	      unsigned char c = 0;
	      do
		{
		  mbstate_t mbs;
		  memset (&mbs, 0, sizeof (mbs));
		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
		    re_set_fastmap (fastmap, false, (int) c);
		}
	      while (++c != 0);
	    }

	  else
	    {
	      /* ... Else catch all bytes which can start the mbchars.  */
	      for (i = 0; i < cset->nmbchars; ++i)
		{
		  char buf[256];
		  mbstate_t state;
		  memset (&state, '\0', sizeof (state));
		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
		  /* With case-insensitive matching in a multibyte locale,
		     also mark the first byte of the lowercase form.  */
		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
		    {
		      if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
			  != (size_t) -1)
			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
		    }
		}
	    }
	}
      else if (type == OP_PERIOD || type == OP_UTF8_PERIOD || type == END_OF_RE)
	{
	  /* Any byte may start a match: set the whole map and stop.  */
	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
	  if (type == END_OF_RE)
	    bufp->can_be_null = 1;
	  return;
	}
    }
}
     401  
     402  /* Entry point for POSIX code.  */
     403  /* regcomp takes a regular expression as a string and compiles it.
     404  
     405     PREG is a regex_t *.  We do not expect any fields to be initialized,
     406     since POSIX says we shouldn't.  Thus, we set
     407  
     408       'buffer' to the compiled pattern;
     409       'used' to the length of the compiled pattern;
     410       'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
     411         REG_EXTENDED bit in CFLAGS is set; otherwise, to
     412         RE_SYNTAX_POSIX_BASIC;
     413       'newline_anchor' to REG_NEWLINE being set in CFLAGS;
     414       'fastmap' to an allocated space for the fastmap;
     415       'fastmap_accurate' to zero;
     416       're_nsub' to the number of subexpressions in PATTERN.
     417  
     418     PATTERN is the address of the pattern string.
     419  
     420     CFLAGS is a series of bits which affect compilation.
     421  
     422       If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
     423       use POSIX basic syntax.
     424  
     425       If REG_NEWLINE is set, then . and [^...] don't match newline.
     426       Also, regexec will try a match beginning after every newline.
     427  
     428       If REG_ICASE is set, then we considers upper- and lowercase
     429       versions of letters to be equivalent when matching.
     430  
     431       If REG_NOSUB is set, then when PREG is passed to regexec, that
     432       routine will report only success or failure, and nothing about the
     433       registers.
     434  
     435     It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
     436     the return codes and their meanings.)  */
     437  
     438  int
     439  regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
     440  {
     441    reg_errcode_t ret;
     442    reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
     443  			 : RE_SYNTAX_POSIX_BASIC);
     444  
     445    preg->buffer = NULL;
     446    preg->allocated = 0;
     447    preg->used = 0;
     448  
     449    /* Try to allocate space for the fastmap.  */
     450    preg->fastmap = re_malloc (char, SBC_MAX);
     451    if (__glibc_unlikely (preg->fastmap == NULL))
     452      return REG_ESPACE;
     453  
     454    syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
     455  
     456    /* If REG_NEWLINE is set, newlines are treated differently.  */
     457    if (cflags & REG_NEWLINE)
     458      { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
     459        syntax &= ~RE_DOT_NEWLINE;
     460        syntax |= RE_HAT_LISTS_NOT_NEWLINE;
     461        /* It also changes the matching behavior.  */
     462        preg->newline_anchor = 1;
     463      }
     464    else
     465      preg->newline_anchor = 0;
     466    preg->no_sub = !!(cflags & REG_NOSUB);
     467    preg->translate = NULL;
     468  
     469    ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
     470  
     471    /* POSIX doesn't distinguish between an unmatched open-group and an
     472       unmatched close-group: both are REG_EPAREN.  */
     473    if (ret == REG_ERPAREN)
     474      ret = REG_EPAREN;
     475  
     476    /* We have already checked preg->fastmap != NULL.  */
     477    if (__glibc_likely (ret == REG_NOERROR))
     478      /* Compute the fastmap now, since regexec cannot modify the pattern
     479         buffer.  This function never fails in this implementation.  */
     480      (void) re_compile_fastmap (preg);
     481    else
     482      {
     483        /* Some error occurred while compiling the expression.  */
     484        re_free (preg->fastmap);
     485        preg->fastmap = NULL;
     486      }
     487  
     488    return (int) ret;
     489  }
     490  libc_hidden_def (__regcomp)
     491  weak_alias (__regcomp, regcomp)
     492  
     493  /* Returns a message corresponding to an error code, ERRCODE, returned
     494     from either regcomp or regexec.   We don't use PREG here.  */
     495  
     496  size_t
     497  regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
     498  	  size_t errbuf_size)
     499  {
     500    const char *msg;
     501    size_t msg_size;
     502    int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
     503  
     504    if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
     505      /* Only error codes returned by the rest of the code should be passed
     506         to this routine.  If we are given anything else, or if other regex
     507         code generates an invalid error code, then the program has a bug.
     508         Dump core so we can fix it.  */
     509      abort ();
     510  
     511    msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
     512  
     513    msg_size = strlen (msg) + 1; /* Includes the null.  */
     514  
     515    if (__glibc_likely (errbuf_size != 0))
     516      {
     517        size_t cpy_size = msg_size;
     518        if (__glibc_unlikely (msg_size > errbuf_size))
     519  	{
     520  	  cpy_size = errbuf_size - 1;
     521  	  errbuf[cpy_size] = '\0';
     522  	}
     523        memcpy (errbuf, msg, cpy_size);
     524      }
     525  
     526    return msg_size;
     527  }
     528  weak_alias (__regerror, regerror)
     529  
     530  
/* This static array is used for the map to single-byte characters when
   UTF-8 is used.  Otherwise we would allocate memory just to initialize
   it the same all the time.  UTF-8 is the preferred encoding so this is
   a worthwhile optimization.

   Two equivalent initializers are provided: with GNU C or Clang (and
   not in strict ANSI mode) a designated range initializer sets the
   leading words to all ones; otherwise the required number of
   BITSET_WORD_MAX elements is spelled out according to how
   BITSET_WORD_BITS compares with ASCII_CHARS, followed by a final,
   possibly right-shifted, word.  Elements not listed are
   zero-initialized.  */
static const bitset_t utf8_sb_map =
{
  /* Set the first 128 bits.  */
#if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
#else
# if 4 * BITSET_WORD_BITS < ASCII_CHARS
#  error "bitset_word_t is narrower than 32 bits"
# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
  BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
  BITSET_WORD_MAX, BITSET_WORD_MAX,
# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
  BITSET_WORD_MAX,
# endif
  (BITSET_WORD_MAX
   >> (SBC_MAX % BITSET_WORD_BITS == 0
       ? 0
       : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
#endif
};
     556  
     557  
     558  static void
     559  free_dfa_content (re_dfa_t *dfa)
     560  {
     561    Idx i, j;
     562  
     563    if (dfa->nodes)
     564      for (i = 0; i < dfa->nodes_len; ++i)
     565        free_token (dfa->nodes + i);
     566    re_free (dfa->nexts);
     567    for (i = 0; i < dfa->nodes_len; ++i)
     568      {
     569        if (dfa->eclosures != NULL)
     570  	re_node_set_free (dfa->eclosures + i);
     571        if (dfa->inveclosures != NULL)
     572  	re_node_set_free (dfa->inveclosures + i);
     573        if (dfa->edests != NULL)
     574  	re_node_set_free (dfa->edests + i);
     575      }
     576    re_free (dfa->edests);
     577    re_free (dfa->eclosures);
     578    re_free (dfa->inveclosures);
     579    re_free (dfa->nodes);
     580  
     581    if (dfa->state_table)
     582      for (i = 0; i <= dfa->state_hash_mask; ++i)
     583        {
     584  	struct re_state_table_entry *entry = dfa->state_table + i;
     585  	for (j = 0; j < entry->num; ++j)
     586  	  {
     587  	    re_dfastate_t *state = entry->array[j];
     588  	    free_state (state);
     589  	  }
     590  	re_free (entry->array);
     591        }
     592    re_free (dfa->state_table);
     593    if (dfa->sb_char != utf8_sb_map)
     594      re_free (dfa->sb_char);
     595    re_free (dfa->subexp_map);
     596  #ifdef DEBUG
     597    re_free (dfa->re_str);
     598  #endif
     599  
     600    re_free (dfa);
     601  }
     602  
     603  
     604  /* Free dynamically allocated space used by PREG.  */
     605  
     606  void
     607  regfree (regex_t *preg)
     608  {
     609    re_dfa_t *dfa = preg->buffer;
     610    if (__glibc_likely (dfa != NULL))
     611      {
     612        lock_fini (dfa->lock);
     613        free_dfa_content (dfa);
     614      }
     615    preg->buffer = NULL;
     616    preg->allocated = 0;
     617  
     618    re_free (preg->fastmap);
     619    preg->fastmap = NULL;
     620  
     621    re_free (preg->translate);
     622    preg->translate = NULL;
     623  }
     624  libc_hidden_def (__regfree)
     625  weak_alias (__regfree, regfree)
     626  
     627  /* Entry points compatible with 4.2 BSD regex library.  We don't define
     628     them unless specifically requested.  */
     629  
     630  #if defined _REGEX_RE_COMP || defined _LIBC
     631  
     632  /* BSD has one and only one pattern buffer.  */
     633  static struct re_pattern_buffer re_comp_buf;
     634  
     635  char *
     636  # ifdef _LIBC
     637  /* Make these definitions weak in libc, so POSIX programs can redefine
     638     these names if they don't use our functions, and still use
     639     regcomp/regexec above without link errors.  */
     640  weak_function
     641  # endif
     642  re_comp (const char *s)
     643  {
     644    reg_errcode_t ret;
     645    char *fastmap;
     646  
     647    if (!s)
     648      {
     649        if (!re_comp_buf.buffer)
     650  	return gettext ("No previous regular expression");
     651        return 0;
     652      }
     653  
     654    if (re_comp_buf.buffer)
     655      {
     656        fastmap = re_comp_buf.fastmap;
     657        re_comp_buf.fastmap = NULL;
     658        __regfree (&re_comp_buf);
     659        memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
     660        re_comp_buf.fastmap = fastmap;
     661      }
     662  
     663    if (re_comp_buf.fastmap == NULL)
     664      {
     665        re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
     666        if (re_comp_buf.fastmap == NULL)
     667  	return (char *) gettext (__re_error_msgid
     668  				 + __re_error_msgid_idx[(int) REG_ESPACE]);
     669      }
     670  
     671    /* Since 're_exec' always passes NULL for the 'regs' argument, we
     672       don't need to initialize the pattern buffer fields which affect it.  */
     673  
     674    /* Match anchors at newlines.  */
     675    re_comp_buf.newline_anchor = 1;
     676  
     677    ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
     678  
     679    if (!ret)
     680      return NULL;
     681  
     682    /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
     683    return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
     684  }
     685  
#ifdef _LIBC
/* Release the storage held by the static re_comp pattern buffer.
   Defined through libc_freeres_fn; presumably this registers the
   function to run when libc frees its internal resources -- the macro
   is defined elsewhere.  */
libc_freeres_fn (free_mem)
{
  __regfree (&re_comp_buf);
}
#endif
     692  
     693  #endif /* _REGEX_RE_COMP */
     694  
     695  /* Internal entry point.
     696     Compile the regular expression PATTERN, whose length is LENGTH.
     697     SYNTAX indicate regular expression's syntax.  */
     698  
     699  static reg_errcode_t
     700  re_compile_internal (regex_t *preg, const char * pattern, size_t length,
     701  		     reg_syntax_t syntax)
     702  {
     703    reg_errcode_t err = REG_NOERROR;
     704    re_dfa_t *dfa;
     705    re_string_t regexp;
     706  
     707    /* Initialize the pattern buffer.  */
     708    preg->fastmap_accurate = 0;
     709    preg->syntax = syntax;
     710    preg->not_bol = preg->not_eol = 0;
     711    preg->used = 0;
     712    preg->re_nsub = 0;
     713    preg->can_be_null = 0;
     714    preg->regs_allocated = REGS_UNALLOCATED;
     715  
     716    /* Initialize the dfa.  */
     717    dfa = preg->buffer;
     718    if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
     719      {
     720        /* If zero allocated, but buffer is non-null, try to realloc
     721  	 enough space.  This loses if buffer's address is bogus, but
     722  	 that is the user's responsibility.  If ->buffer is NULL this
     723  	 is a simple allocation.  */
     724        dfa = re_realloc (preg->buffer, re_dfa_t, 1);
     725        if (dfa == NULL)
     726  	return REG_ESPACE;
     727        preg->allocated = sizeof (re_dfa_t);
     728        preg->buffer = dfa;
     729      }
     730    preg->used = sizeof (re_dfa_t);
     731  
     732    err = init_dfa (dfa, length);
     733    if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
     734      err = REG_ESPACE;
     735    if (__glibc_unlikely (err != REG_NOERROR))
     736      {
     737        free_dfa_content (dfa);
     738        preg->buffer = NULL;
     739        preg->allocated = 0;
     740        return err;
     741      }
     742  #ifdef DEBUG
     743    /* Note: length+1 will not overflow since it is checked in init_dfa.  */
     744    dfa->re_str = re_malloc (char, length + 1);
     745    strncpy (dfa->re_str, pattern, length + 1);
     746  #endif
     747  
     748    err = re_string_construct (&regexp, pattern, length, preg->translate,
     749  			     (syntax & RE_ICASE) != 0, dfa);
     750    if (__glibc_unlikely (err != REG_NOERROR))
     751      {
     752      re_compile_internal_free_return:
     753        free_workarea_compile (preg);
     754        re_string_destruct (&regexp);
     755        lock_fini (dfa->lock);
     756        free_dfa_content (dfa);
     757        preg->buffer = NULL;
     758        preg->allocated = 0;
     759        return err;
     760      }
     761  
     762    /* Parse the regular expression, and build a structure tree.  */
     763    preg->re_nsub = 0;
     764    dfa->str_tree = parse (&regexp, preg, syntax, &err);
     765    if (__glibc_unlikely (dfa->str_tree == NULL))
     766      goto re_compile_internal_free_return;
     767  
     768    /* Analyze the tree and create the nfa.  */
     769    err = analyze (preg);
     770    if (__glibc_unlikely (err != REG_NOERROR))
     771      goto re_compile_internal_free_return;
     772  
     773    /* If possible, do searching in single byte encoding to speed things up.  */
     774    if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
     775      optimize_utf8 (dfa);
     776  
     777    /* Then create the initial state of the dfa.  */
     778    err = create_initial_state (dfa);
     779  
     780    /* Release work areas.  */
     781    free_workarea_compile (preg);
     782    re_string_destruct (&regexp);
     783  
     784    if (__glibc_unlikely (err != REG_NOERROR))
     785      {
     786        lock_fini (dfa->lock);
     787        free_dfa_content (dfa);
     788        preg->buffer = NULL;
     789        preg->allocated = 0;
     790      }
     791  
     792    return err;
     793  }
     794  
     795  /* Initialize DFA.  We use the length of the regular expression PAT_LEN
     796     as the initial length of some arrays.  */
     797  
     798  static reg_errcode_t
     799  init_dfa (re_dfa_t *dfa, size_t pat_len)
     800  {
     801    __re_size_t table_size;
     802  #ifndef _LIBC
     803    const char *codeset_name;
     804  #endif
     805    size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
     806    size_t max_object_size =
     807      MAX (sizeof (struct re_state_table_entry),
     808  	 MAX (sizeof (re_token_t),
     809  	      MAX (sizeof (re_node_set),
     810  		   MAX (sizeof (regmatch_t),
     811  			max_i18n_object_size))));
     812  
     813    memset (dfa, '\0', sizeof (re_dfa_t));
     814  
     815    /* Force allocation of str_tree_storage the first time.  */
     816    dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
     817  
     818    /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
     819       calculation below, and for similar doubling calculations
     820       elsewhere.  And it's <= rather than <, because some of the
     821       doubling calculations add 1 afterwards.  */
     822    if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
     823  			<= pat_len))
     824      return REG_ESPACE;
     825  
     826    dfa->nodes_alloc = pat_len + 1;
     827    dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
     828  
     829    /*  table_size = 2 ^ ceil(log pat_len) */
     830    for (table_size = 1; ; table_size <<= 1)
     831      if (table_size > pat_len)
     832        break;
     833  
     834    dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
     835    dfa->state_hash_mask = table_size - 1;
     836  
     837    dfa->mb_cur_max = MB_CUR_MAX;
     838  #ifdef _LIBC
     839    if (dfa->mb_cur_max == 6
     840        && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
     841      dfa->is_utf8 = 1;
     842    dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
     843  		       != 0);
     844  #else
     845    codeset_name = nl_langinfo (CODESET);
     846    if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
     847        && (codeset_name[1] == 'T' || codeset_name[1] == 't')
     848        && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
     849        && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
     850      dfa->is_utf8 = 1;
     851  
     852    /* We check exhaustively in the loop below if this charset is a
     853       superset of ASCII.  */
     854    dfa->map_notascii = 0;
     855  #endif
     856  
     857    if (dfa->mb_cur_max > 1)
     858      {
     859        if (dfa->is_utf8)
     860  	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
     861        else
     862  	{
     863  	  int i, j, ch;
     864  
     865  	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
     866  	  if (__glibc_unlikely (dfa->sb_char == NULL))
     867  	    return REG_ESPACE;
     868  
     869  	  /* Set the bits corresponding to single byte chars.  */
     870  	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
     871  	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
     872  	      {
     873  		wint_t wch = __btowc (ch);
     874  		if (wch != WEOF)
     875  		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
     876  #ifndef _LIBC
     877  		if (isascii (ch) && wch != ch)
     878  		  dfa->map_notascii = 1;
     879  #endif
     880  	      }
     881  	}
     882      }
     883  
     884    if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
     885      return REG_ESPACE;
     886    return REG_NOERROR;
     887  }
     888  
     889  /* Initialize WORD_CHAR table, which indicate which character is
     890     "word".  In this case "word" means that it is the word construction
     891     character used by some operators like "\<", "\>", etc.  */
     892  
     893  static void
     894  init_word_char (re_dfa_t *dfa)
     895  {
     896    int i = 0;
     897    int j;
     898    int ch = 0;
     899    dfa->word_ops_used = 1;
     900    if (__glibc_likely (dfa->map_notascii == 0))
     901      {
     902        bitset_word_t bits0 = 0x00000000;
     903        bitset_word_t bits1 = 0x03ff0000;
     904        bitset_word_t bits2 = 0x87fffffe;
     905        bitset_word_t bits3 = 0x07fffffe;
     906        if (BITSET_WORD_BITS == 64)
     907  	{
     908  	  /* Pacify gcc -Woverflow on 32-bit platforms.  */
     909  	  dfa->word_char[0] = bits1 << 31 << 1 | bits0;
     910  	  dfa->word_char[1] = bits3 << 31 << 1 | bits2;
     911  	  i = 2;
     912  	}
     913        else if (BITSET_WORD_BITS == 32)
     914  	{
     915  	  dfa->word_char[0] = bits0;
     916  	  dfa->word_char[1] = bits1;
     917  	  dfa->word_char[2] = bits2;
     918  	  dfa->word_char[3] = bits3;
     919  	  i = 4;
     920  	}
     921        else
     922          goto general_case;
     923        ch = 128;
     924  
     925        if (__glibc_likely (dfa->is_utf8))
     926  	{
     927  	  memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
     928  	  return;
     929  	}
     930      }
     931  
     932   general_case:
     933    for (; i < BITSET_WORDS; ++i)
     934      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
     935        if (isalnum (ch) || ch == '_')
     936  	dfa->word_char[i] |= (bitset_word_t) 1 << j;
     937  }
     938  
     939  /* Free the work area which are only used while compiling.  */
     940  
     941  static void
     942  free_workarea_compile (regex_t *preg)
     943  {
     944    re_dfa_t *dfa = preg->buffer;
     945    bin_tree_storage_t *storage, *next;
     946    for (storage = dfa->str_tree_storage; storage; storage = next)
     947      {
     948        next = storage->next;
     949        re_free (storage);
     950      }
     951    dfa->str_tree_storage = NULL;
     952    dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
     953    dfa->str_tree = NULL;
     954    re_free (dfa->org_indices);
     955    dfa->org_indices = NULL;
     956  }
     957  
     958  /* Create initial states for all contexts.  */
     959  
     960  static reg_errcode_t
     961  create_initial_state (re_dfa_t *dfa)
     962  {
     963    Idx first, i;
     964    reg_errcode_t err;
     965    re_node_set init_nodes;
     966  
     967    /* Initial states have the epsilon closure of the node which is
     968       the first node of the regular expression.  */
     969    first = dfa->str_tree->first->node_idx;
     970    dfa->init_node = first;
     971    err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
     972    if (__glibc_unlikely (err != REG_NOERROR))
     973      return err;
     974  
     975    /* The back-references which are in initial states can epsilon transit,
     976       since in this case all of the subexpressions can be null.
     977       Then we add epsilon closures of the nodes which are the next nodes of
     978       the back-references.  */
     979    if (dfa->nbackref > 0)
     980      for (i = 0; i < init_nodes.nelem; ++i)
     981        {
     982  	Idx node_idx = init_nodes.elems[i];
     983  	re_token_type_t type = dfa->nodes[node_idx].type;
     984  
     985  	Idx clexp_idx;
     986  	if (type != OP_BACK_REF)
     987  	  continue;
     988  	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
     989  	  {
     990  	    re_token_t *clexp_node;
     991  	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
     992  	    if (clexp_node->type == OP_CLOSE_SUBEXP
     993  		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
     994  	      break;
     995  	  }
     996  	if (clexp_idx == init_nodes.nelem)
     997  	  continue;
     998  
     999  	if (type == OP_BACK_REF)
    1000  	  {
    1001  	    Idx dest_idx = dfa->edests[node_idx].elems[0];
    1002  	    if (!re_node_set_contains (&init_nodes, dest_idx))
    1003  	      {
    1004  		reg_errcode_t merge_err
    1005                    = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
    1006  		if (merge_err != REG_NOERROR)
    1007  		  return merge_err;
    1008  		i = 0;
    1009  	      }
    1010  	  }
    1011        }
    1012  
    1013    /* It must be the first time to invoke acquire_state.  */
    1014    dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
    1015    /* We don't check ERR here, since the initial state must not be NULL.  */
    1016    if (__glibc_unlikely (dfa->init_state == NULL))
    1017      return err;
    1018    if (dfa->init_state->has_constraint)
    1019      {
    1020        dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
    1021  						       CONTEXT_WORD);
    1022        dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
    1023  						     CONTEXT_NEWLINE);
    1024        dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
    1025  							 &init_nodes,
    1026  							 CONTEXT_NEWLINE
    1027  							 | CONTEXT_BEGBUF);
    1028        if (__glibc_unlikely (dfa->init_state_word == NULL
    1029  			    || dfa->init_state_nl == NULL
    1030  			    || dfa->init_state_begbuf == NULL))
    1031  	return err;
    1032      }
    1033    else
    1034      dfa->init_state_word = dfa->init_state_nl
    1035        = dfa->init_state_begbuf = dfa->init_state;
    1036  
    1037    re_node_set_free (&init_nodes);
    1038    return REG_NOERROR;
    1039  }
    1040  
    1041  /* If it is possible to do searching in single byte encoding instead of UTF-8
    1042     to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
    1043     DFA nodes where needed.  */
    1044  
    1045  static void
    1046  optimize_utf8 (re_dfa_t *dfa)
    1047  {
    1048    Idx node;
    1049    int i;
    1050    bool mb_chars = false;
    1051    bool has_period = false;
    1052  
    1053    for (node = 0; node < dfa->nodes_len; ++node)
    1054      switch (dfa->nodes[node].type)
    1055        {
    1056        case CHARACTER:
    1057  	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
    1058  	  mb_chars = true;
    1059  	break;
    1060        case ANCHOR:
    1061  	switch (dfa->nodes[node].opr.ctx_type)
    1062  	  {
    1063  	  case LINE_FIRST:
    1064  	  case LINE_LAST:
    1065  	  case BUF_FIRST:
    1066  	  case BUF_LAST:
    1067  	    break;
    1068  	  default:
    1069  	    /* Word anchors etc. cannot be handled.  It's okay to test
    1070  	       opr.ctx_type since constraints (for all DFA nodes) are
    1071  	       created by ORing one or more opr.ctx_type values.  */
    1072  	    return;
    1073  	  }
    1074  	break;
    1075        case OP_PERIOD:
    1076  	has_period = true;
    1077  	break;
    1078        case OP_BACK_REF:
    1079        case OP_ALT:
    1080        case END_OF_RE:
    1081        case OP_DUP_ASTERISK:
    1082        case OP_OPEN_SUBEXP:
    1083        case OP_CLOSE_SUBEXP:
    1084  	break;
    1085        case COMPLEX_BRACKET:
    1086  	return;
    1087        case SIMPLE_BRACKET:
    1088  	/* Just double check.  */
    1089  	{
    1090  	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
    1091  			? 0
    1092  			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
    1093  	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
    1094  	    {
    1095  	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
    1096  		return;
    1097  	      rshift = 0;
    1098  	    }
    1099  	}
    1100  	break;
    1101        default:
    1102  	abort ();
    1103        }
    1104  
    1105    if (mb_chars || has_period)
    1106      for (node = 0; node < dfa->nodes_len; ++node)
    1107        {
    1108  	if (dfa->nodes[node].type == CHARACTER
    1109  	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
    1110  	  dfa->nodes[node].mb_partial = 0;
    1111  	else if (dfa->nodes[node].type == OP_PERIOD)
    1112  	  dfa->nodes[node].type = OP_UTF8_PERIOD;
    1113        }
    1114  
    1115    /* The search can be in single byte locale.  */
    1116    dfa->mb_cur_max = 1;
    1117    dfa->is_utf8 = 0;
    1118    dfa->has_mb_node = dfa->nbackref > 0 || has_period;
    1119  }
    1120  
    1121  /* Analyze the structure tree, and calculate "first", "next", "edest",
    1122     "eclosure", and "inveclosure".  */
    1123  
    1124  static reg_errcode_t
    1125  analyze (regex_t *preg)
    1126  {
    1127    re_dfa_t *dfa = preg->buffer;
    1128    reg_errcode_t ret;
    1129  
    1130    /* Allocate arrays.  */
    1131    dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
    1132    dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
    1133    dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
    1134    dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
    1135    if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
    1136  			|| dfa->edests == NULL || dfa->eclosures == NULL))
    1137      return REG_ESPACE;
    1138  
    1139    dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
    1140    if (dfa->subexp_map != NULL)
    1141      {
    1142        Idx i;
    1143        for (i = 0; i < preg->re_nsub; i++)
    1144  	dfa->subexp_map[i] = i;
    1145        preorder (dfa->str_tree, optimize_subexps, dfa);
    1146        for (i = 0; i < preg->re_nsub; i++)
    1147  	if (dfa->subexp_map[i] != i)
    1148  	  break;
    1149        if (i == preg->re_nsub)
    1150  	{
    1151  	  re_free (dfa->subexp_map);
    1152  	  dfa->subexp_map = NULL;
    1153  	}
    1154      }
    1155  
    1156    ret = postorder (dfa->str_tree, lower_subexps, preg);
    1157    if (__glibc_unlikely (ret != REG_NOERROR))
    1158      return ret;
    1159    ret = postorder (dfa->str_tree, calc_first, dfa);
    1160    if (__glibc_unlikely (ret != REG_NOERROR))
    1161      return ret;
    1162    preorder (dfa->str_tree, calc_next, dfa);
    1163    ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
    1164    if (__glibc_unlikely (ret != REG_NOERROR))
    1165      return ret;
    1166    ret = calc_eclosure (dfa);
    1167    if (__glibc_unlikely (ret != REG_NOERROR))
    1168      return ret;
    1169  
    1170    /* We only need this during the prune_impossible_nodes pass in regexec.c;
    1171       skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
    1172    if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
    1173        || dfa->nbackref)
    1174      {
    1175        dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
    1176        if (__glibc_unlikely (dfa->inveclosures == NULL))
    1177  	return REG_ESPACE;
    1178        ret = calc_inveclosure (dfa);
    1179      }
    1180  
    1181    return ret;
    1182  }
    1183  
    1184  /* Our parse trees are very unbalanced, so we cannot use a stack to
    1185     implement parse tree visits.  Instead, we use parent pointers and
    1186     some hairy code in these two functions.  */
    1187  static reg_errcode_t
    1188  postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
    1189  	   void *extra)
    1190  {
    1191    bin_tree_t *node, *prev;
    1192  
    1193    for (node = root; ; )
    1194      {
    1195        /* Descend down the tree, preferably to the left (or to the right
    1196  	 if that's the only child).  */
    1197        while (node->left || node->right)
    1198  	if (node->left)
    1199  	  node = node->left;
    1200  	else
    1201  	  node = node->right;
    1202  
    1203        do
    1204  	{
    1205  	  reg_errcode_t err = fn (extra, node);
    1206  	  if (__glibc_unlikely (err != REG_NOERROR))
    1207  	    return err;
    1208  	  if (node->parent == NULL)
    1209  	    return REG_NOERROR;
    1210  	  prev = node;
    1211  	  node = node->parent;
    1212  	}
    1213        /* Go up while we have a node that is reached from the right.  */
    1214        while (node->right == prev || node->right == NULL);
    1215        node = node->right;
    1216      }
    1217  }
    1218  
    1219  static reg_errcode_t
    1220  preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
    1221  	  void *extra)
    1222  {
    1223    bin_tree_t *node;
    1224  
    1225    for (node = root; ; )
    1226      {
    1227        reg_errcode_t err = fn (extra, node);
    1228        if (__glibc_unlikely (err != REG_NOERROR))
    1229  	return err;
    1230  
    1231        /* Go to the left node, or up and to the right.  */
    1232        if (node->left)
    1233  	node = node->left;
    1234        else
    1235  	{
    1236  	  bin_tree_t *prev = NULL;
    1237  	  while (node->right == prev || node->right == NULL)
    1238  	    {
    1239  	      prev = node;
    1240  	      node = node->parent;
    1241  	      if (!node)
    1242  		return REG_NOERROR;
    1243  	    }
    1244  	  node = node->right;
    1245  	}
    1246      }
    1247  }
    1248  
    1249  /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
    1250     re_search_internal to map the inner one's opr.idx to this one's.  Adjust
    1251     backreferences as well.  Requires a preorder visit.  */
    1252  static reg_errcode_t
    1253  optimize_subexps (void *extra, bin_tree_t *node)
    1254  {
    1255    re_dfa_t *dfa = (re_dfa_t *) extra;
    1256  
    1257    if (node->token.type == OP_BACK_REF && dfa->subexp_map)
    1258      {
    1259        int idx = node->token.opr.idx;
    1260        node->token.opr.idx = dfa->subexp_map[idx];
    1261        dfa->used_bkref_map |= 1 << node->token.opr.idx;
    1262      }
    1263  
    1264    else if (node->token.type == SUBEXP
    1265  	   && node->left && node->left->token.type == SUBEXP)
    1266      {
    1267        Idx other_idx = node->left->token.opr.idx;
    1268  
    1269        node->left = node->left->left;
    1270        if (node->left)
    1271  	node->left->parent = node;
    1272  
    1273        dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
    1274        if (other_idx < BITSET_WORD_BITS)
    1275  	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
    1276      }
    1277  
    1278    return REG_NOERROR;
    1279  }
    1280  
    1281  /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
    1282     of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
    1283  static reg_errcode_t
    1284  lower_subexps (void *extra, bin_tree_t *node)
    1285  {
    1286    regex_t *preg = (regex_t *) extra;
    1287    reg_errcode_t err = REG_NOERROR;
    1288  
    1289    if (node->left && node->left->token.type == SUBEXP)
    1290      {
    1291        node->left = lower_subexp (&err, preg, node->left);
    1292        if (node->left)
    1293  	node->left->parent = node;
    1294      }
    1295    if (node->right && node->right->token.type == SUBEXP)
    1296      {
    1297        node->right = lower_subexp (&err, preg, node->right);
    1298        if (node->right)
    1299  	node->right->parent = node;
    1300      }
    1301  
    1302    return err;
    1303  }
    1304  
    1305  static bin_tree_t *
    1306  lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
    1307  {
    1308    re_dfa_t *dfa = preg->buffer;
    1309    bin_tree_t *body = node->left;
    1310    bin_tree_t *op, *cls, *tree1, *tree;
    1311  
    1312    if (preg->no_sub
    1313        /* We do not optimize empty subexpressions, because otherwise we may
    1314  	 have bad CONCAT nodes with NULL children.  This is obviously not
    1315  	 very common, so we do not lose much.  An example that triggers
    1316  	 this case is the sed "script" /\(\)/x.  */
    1317        && node->left != NULL
    1318        && (node->token.opr.idx >= BITSET_WORD_BITS
    1319  	  || !(dfa->used_bkref_map
    1320  	       & ((bitset_word_t) 1 << node->token.opr.idx))))
    1321      return node->left;
    1322  
    1323    /* Convert the SUBEXP node to the concatenation of an
    1324       OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
    1325    op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
    1326    cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
    1327    tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
    1328    tree = create_tree (dfa, op, tree1, CONCAT);
    1329    if (__glibc_unlikely (tree == NULL || tree1 == NULL
    1330  			|| op == NULL || cls == NULL))
    1331      {
    1332        *err = REG_ESPACE;
    1333        return NULL;
    1334      }
    1335  
    1336    op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
    1337    op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
    1338    return tree;
    1339  }
    1340  
    1341  /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
    1342     nodes.  Requires a postorder visit.  */
    1343  static reg_errcode_t
    1344  calc_first (void *extra, bin_tree_t *node)
    1345  {
    1346    re_dfa_t *dfa = (re_dfa_t *) extra;
    1347    if (node->token.type == CONCAT)
    1348      {
    1349        node->first = node->left->first;
    1350        node->node_idx = node->left->node_idx;
    1351      }
    1352    else
    1353      {
    1354        node->first = node;
    1355        node->node_idx = re_dfa_add_node (dfa, node->token);
    1356        if (__glibc_unlikely (node->node_idx == -1))
    1357  	return REG_ESPACE;
    1358        if (node->token.type == ANCHOR)
    1359  	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
    1360      }
    1361    return REG_NOERROR;
    1362  }
    1363  
    1364  /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
    1365  static reg_errcode_t
    1366  calc_next (void *extra, bin_tree_t *node)
    1367  {
    1368    switch (node->token.type)
    1369      {
    1370      case OP_DUP_ASTERISK:
    1371        node->left->next = node;
    1372        break;
    1373      case CONCAT:
    1374        node->left->next = node->right->first;
    1375        node->right->next = node->next;
    1376        break;
    1377      default:
    1378        if (node->left)
    1379  	node->left->next = node->next;
    1380        if (node->right)
    1381  	node->right->next = node->next;
    1382        break;
    1383      }
    1384    return REG_NOERROR;
    1385  }
    1386  
    1387  /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
    1388  static reg_errcode_t
    1389  link_nfa_nodes (void *extra, bin_tree_t *node)
    1390  {
    1391    re_dfa_t *dfa = (re_dfa_t *) extra;
    1392    Idx idx = node->node_idx;
    1393    reg_errcode_t err = REG_NOERROR;
    1394  
    1395    switch (node->token.type)
    1396      {
    1397      case CONCAT:
    1398        break;
    1399  
    1400      case END_OF_RE:
    1401        DEBUG_ASSERT (node->next == NULL);
    1402        break;
    1403  
    1404      case OP_DUP_ASTERISK:
    1405      case OP_ALT:
    1406        {
    1407  	Idx left, right;
    1408  	dfa->has_plural_match = 1;
    1409  	if (node->left != NULL)
    1410  	  left = node->left->first->node_idx;
    1411  	else
    1412  	  left = node->next->node_idx;
    1413  	if (node->right != NULL)
    1414  	  right = node->right->first->node_idx;
    1415  	else
    1416  	  right = node->next->node_idx;
    1417  	DEBUG_ASSERT (left > -1);
    1418  	DEBUG_ASSERT (right > -1);
    1419  	err = re_node_set_init_2 (dfa->edests + idx, left, right);
    1420        }
    1421        break;
    1422  
    1423      case ANCHOR:
    1424      case OP_OPEN_SUBEXP:
    1425      case OP_CLOSE_SUBEXP:
    1426        err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
    1427        break;
    1428  
    1429      case OP_BACK_REF:
    1430        dfa->nexts[idx] = node->next->node_idx;
    1431        if (node->token.type == OP_BACK_REF)
    1432  	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
    1433        break;
    1434  
    1435      default:
    1436        DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
    1437        dfa->nexts[idx] = node->next->node_idx;
    1438        break;
    1439      }
    1440  
    1441    return err;
    1442  }
    1443  
/* Duplicate the epsilon closure of the node ROOT_NODE.
   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
   to their own constraint.

   The walk starts with the pair TOP_ORG_NODE (an existing node) and
   TOP_CLONE_NODE (its duplicate) and, at each step, gives the clone
   destinations that mirror those of the original, duplicating the
   destination nodes as needed.  It then continues from the destination
   it has just handled until it reaches a node that cannot
   epsilon-transit or detects that it has looped back to ROOT_NODE.

   Returns REG_NOERROR on success and REG_ESPACE when a node cannot be
   duplicated or inserted into a destination set; an error from a
   recursive call is passed through unchanged.  */

static reg_errcode_t
duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
			Idx root_node, unsigned int init_constraint)
{
  Idx org_node, clone_node;
  bool ok;
  /* Constraint applied to the duplicates; it can only grow during the
     walk (see the single-destination case below).  */
  unsigned int constraint = init_constraint;
  for (org_node = top_org_node, clone_node = top_clone_node;;)
    {
      Idx org_dest, clone_dest;
      if (dfa->nodes[org_node].type == OP_BACK_REF)
	{
	  /* If the back reference epsilon-transit, its destination must
	     also have the constraint.  Then duplicate the epsilon closure
	     of the destination of the back reference, and store it in
	     edests of the back reference.  */
	  org_dest = dfa->nexts[org_node];
	  re_node_set_empty (dfa->edests + clone_node);
	  clone_dest = duplicate_node (dfa, org_dest, constraint);
	  if (__glibc_unlikely (clone_dest == -1))
	    return REG_ESPACE;
	  /* The clone keeps the original's NEXT; only its epsilon
	     destination points at the duplicate.  */
	  dfa->nexts[clone_node] = dfa->nexts[org_node];
	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	  if (__glibc_unlikely (! ok))
	    return REG_ESPACE;
	}
      else if (dfa->edests[org_node].nelem == 0)
	{
	  /* In case of the node can't epsilon-transit, don't duplicate the
	     destination and store the original destination as the
	     destination of the node.  */
	  dfa->nexts[clone_node] = dfa->nexts[org_node];
	  break;
	}
      else if (dfa->edests[org_node].nelem == 1)
	{
	  /* In case of the node can epsilon-transit, and it has only one
	     destination.  */
	  org_dest = dfa->edests[org_node].elems[0];
	  re_node_set_empty (dfa->edests + clone_node);
	  /* If the node is root_node itself, it means the epsilon closure
	     has a loop.  Then tie it to the destination of the root_node.  */
	  if (org_node == root_node && clone_node != org_node)
	    {
	      ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
	      if (__glibc_unlikely (! ok))
	        return REG_ESPACE;
	      break;
	    }
	  /* In case the node has another constraint, append it.  */
	  constraint |= dfa->nodes[org_node].constraint;
	  clone_dest = duplicate_node (dfa, org_dest, constraint);
	  if (__glibc_unlikely (clone_dest == -1))
	    return REG_ESPACE;
	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	  if (__glibc_unlikely (! ok))
	    return REG_ESPACE;
	}
      else /* dfa->edests[org_node].nelem == 2 */
	{
	  /* In case of the node can epsilon-transit, and it has two
	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
	  org_dest = dfa->edests[org_node].elems[0];
	  re_node_set_empty (dfa->edests + clone_node);
	  /* Search for a duplicated node which satisfies the constraint.  */
	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
	  if (clone_dest == -1)
	    {
	      /* There is no such duplicated node, create a new one.  */
	      reg_errcode_t err;
	      clone_dest = duplicate_node (dfa, org_dest, constraint);
	      if (__glibc_unlikely (clone_dest == -1))
		return REG_ESPACE;
	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	      if (__glibc_unlikely (! ok))
		return REG_ESPACE;
	      /* The first destination is handled by recursion; the loop
		 itself carries on along the second destination below.  */
	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
					    root_node, constraint);
	      if (__glibc_unlikely (err != REG_NOERROR))
		return err;
	    }
	  else
	    {
	      /* There is a duplicated node which satisfies the constraint,
		 use it to avoid infinite loop.  */
	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	      if (__glibc_unlikely (! ok))
		return REG_ESPACE;
	    }

	  /* The second destination is always duplicated afresh.  */
	  org_dest = dfa->edests[org_node].elems[1];
	  clone_dest = duplicate_node (dfa, org_dest, constraint);
	  if (__glibc_unlikely (clone_dest == -1))
	    return REG_ESPACE;
	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	  if (__glibc_unlikely (! ok))
	    return REG_ESPACE;
	}
      /* Advance: the destination just processed becomes the current
	 original/clone pair.  */
      org_node = org_dest;
      clone_node = clone_dest;
    }
  return REG_NOERROR;
}
    1551  
    1552  /* Search for a node which is duplicated from the node ORG_NODE, and
    1553     satisfies the constraint CONSTRAINT.  */
    1554  
    1555  static Idx
    1556  search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
    1557  			unsigned int constraint)
    1558  {
    1559    Idx idx;
    1560    for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
    1561      {
    1562        if (org_node == dfa->org_indices[idx]
    1563  	  && constraint == dfa->nodes[idx].constraint)
    1564  	return idx; /* Found.  */
    1565      }
    1566    return -1; /* Not found.  */
    1567  }
    1568  
    1569  /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
    1570     Return the index of the new node, or -1 if insufficient storage is
    1571     available.  */
    1572  
    1573  static Idx
    1574  duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
    1575  {
    1576    Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
    1577    if (__glibc_likely (dup_idx != -1))
    1578      {
    1579        dfa->nodes[dup_idx].constraint = constraint;
    1580        dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
    1581        dfa->nodes[dup_idx].duplicated = 1;
    1582  
    1583        /* Store the index of the original node.  */
    1584        dfa->org_indices[dup_idx] = org_idx;
    1585      }
    1586    return dup_idx;
    1587  }
    1588  
    1589  static reg_errcode_t
    1590  calc_inveclosure (re_dfa_t *dfa)
    1591  {
    1592    Idx src, idx;
    1593    bool ok;
    1594    for (idx = 0; idx < dfa->nodes_len; ++idx)
    1595      re_node_set_init_empty (dfa->inveclosures + idx);
    1596  
    1597    for (src = 0; src < dfa->nodes_len; ++src)
    1598      {
    1599        Idx *elems = dfa->eclosures[src].elems;
    1600        for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
    1601  	{
    1602  	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
    1603  	  if (__glibc_unlikely (! ok))
    1604  	    return REG_ESPACE;
    1605  	}
    1606      }
    1607  
    1608    return REG_NOERROR;
    1609  }
    1610  
    1611  /* Calculate "eclosure" for all the node in DFA.  */
    1612  
    1613  static reg_errcode_t
    1614  calc_eclosure (re_dfa_t *dfa)
    1615  {
    1616    Idx node_idx;
    1617    bool incomplete;
    1618    DEBUG_ASSERT (dfa->nodes_len > 0);
    1619    incomplete = false;
    1620    /* For each nodes, calculate epsilon closure.  */
    1621    for (node_idx = 0; ; ++node_idx)
    1622      {
    1623        reg_errcode_t err;
    1624        re_node_set eclosure_elem;
    1625        if (node_idx == dfa->nodes_len)
    1626  	{
    1627  	  if (!incomplete)
    1628  	    break;
    1629  	  incomplete = false;
    1630  	  node_idx = 0;
    1631  	}
    1632  
    1633        DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
    1634  
    1635        /* If we have already calculated, skip it.  */
    1636        if (dfa->eclosures[node_idx].nelem != 0)
    1637  	continue;
    1638        /* Calculate epsilon closure of 'node_idx'.  */
    1639        err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
    1640        if (__glibc_unlikely (err != REG_NOERROR))
    1641  	return err;
    1642  
    1643        if (dfa->eclosures[node_idx].nelem == 0)
    1644  	{
    1645  	  incomplete = true;
    1646  	  re_node_set_free (&eclosure_elem);
    1647  	}
    1648      }
    1649    return REG_NOERROR;
    1650  }
    1651  
/* Calculate epsilon closure of NODE.

   The closure is built in a local set, which is copied to *NEW_SET on
   success.  It is also stored in DFA->eclosures[NODE], unless the result
   is incomplete and ROOT is false; in that case the stored element count
   is reset to 0 so that the node is treated as not yet calculated.
   While the calculation is running, DFA->eclosures[NODE].nelem is -1.

   Return REG_NOERROR on success, otherwise the error code reported by
   the failing helper.  */

static reg_errcode_t
calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
{
  reg_errcode_t err;
  Idx i;
  re_node_set eclosure;
  bool incomplete = false;
  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
  if (__glibc_unlikely (err != REG_NOERROR))
    return err;

  /* NOTE(review): the error returns further down leave ECLOSURE as it is,
     without calling re_node_set_free on it; confirm whether that is
     intended.  */

  /* An epsilon closure includes itself.  */
  eclosure.elems[eclosure.nelem++] = node;

  /* This indicates that we are calculating this node now.
     We reference this value to avoid infinite loop.  */
  dfa->eclosures[node].nelem = -1;

  /* If the current node has constraints, duplicate all nodes
     since they must inherit the constraints.  This is skipped when the
     first epsilon destination is already a duplicated node.  */
  if (dfa->nodes[node].constraint
      && dfa->edests[node].nelem
      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
    {
      err = duplicate_node_closure (dfa, node, node, node,
				    dfa->nodes[node].constraint);
      if (__glibc_unlikely (err != REG_NOERROR))
	return err;
    }

  /* Expand each epsilon destination nodes.  */
  if (IS_EPSILON_NODE(dfa->nodes[node].type))
    for (i = 0; i < dfa->edests[node].nelem; ++i)
      {
	re_node_set eclosure_elem;
	Idx edest = dfa->edests[node].elems[i];
	/* If calculating the epsilon closure of 'edest' is in progress,
	   return intermediate result.  */
	if (dfa->eclosures[edest].nelem == -1)
	  {
	    incomplete = true;
	    continue;
	  }
	/* If we haven't calculated the epsilon closure of 'edest' yet,
	   calculate now. Otherwise use calculated epsilon closure.  */
	if (dfa->eclosures[edest].nelem == 0)
	  {
	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
	    if (__glibc_unlikely (err != REG_NOERROR))
	      return err;
	  }
	else
	  eclosure_elem = dfa->eclosures[edest];
	/* Merge the epsilon closure of 'edest'.  */
	err = re_node_set_merge (&eclosure, &eclosure_elem);
	if (__glibc_unlikely (err != REG_NOERROR))
	  return err;
	/* If the epsilon closure of 'edest' is incomplete,
	   the epsilon closure of this node is also incomplete.
	   The set just merged was not stored in DFA->eclosures in that
	   case, so it is released here.  */
	if (dfa->eclosures[edest].nelem == 0)
	  {
	    incomplete = true;
	    re_node_set_free (&eclosure_elem);
	  }
      }

  /* An incomplete result is stored only for the root of the calculation;
     otherwise the node is marked as not calculated (nelem == 0).  */
  if (incomplete && !root)
    dfa->eclosures[node].nelem = 0;
  else
    dfa->eclosures[node] = eclosure;
  /* The caller receives the computed set in either case.  */
  *new_set = eclosure;
  return REG_NOERROR;
}
    1727  
    1728  /* Functions for token which are used in the parser.  */
    1729  
    1730  /* Fetch a token from INPUT.
    1731     We must not use this function inside bracket expressions.  */
    1732  
    1733  static void
    1734  fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
    1735  {
    1736    re_string_skip_bytes (input, peek_token (result, input, syntax));
    1737  }
    1738  
/* Peek a token from INPUT, and return the length of the token.
   We must not use this function inside bracket expressions.

   TOKEN receives the type of the token at the current position of INPUT
   and, depending on the type, the byte, back-reference index or anchor
   context in TOKEN->opr, plus the word_char and mb_partial flags.
   SYNTAX selects which characters and backslash sequences act as
   operators.

   The return value is the number of bytes the token occupies: 0 at the
   end of INPUT (type END_OF_RE), 2 for a backslash followed by another
   byte, and 1 otherwise.  The current position of INPUT is the same on
   return as on entry.  */

static int
peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
{
  unsigned char c;

  if (re_string_eoi (input))
    {
      token->type = END_OF_RE;
      return 0;
    }

  c = re_string_peek_byte (input, 0);
  token->opr.c = c;

  token->word_char = 0;
  token->mb_partial = 0;
  /* A byte that is not the first byte of a character is reported as a
     plain CHARACTER flagged as partial.  */
  if (input->mb_cur_max > 1
      && !re_string_first_byte (input, re_string_cur_idx (input)))
    {
      token->type = CHARACTER;
      token->mb_partial = 1;
      return 1;
    }
  if (c == '\\')
    {
      unsigned char c2;
      /* A backslash that is the last byte of INPUT has nothing to
	 escape.  */
      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
	{
	  token->type = BACK_SLASH;
	  return 1;
	}

      /* Unless the switch below recognizes an operator, the escaped byte
	 stays a plain CHARACTER.  */
      c2 = re_string_peek_byte_case (input, 1);
      token->opr.c = c2;
      token->type = CHARACTER;
      if (input->mb_cur_max > 1)
	{
	  wint_t wc = re_string_wchar_at (input,
					  re_string_cur_idx (input) + 1);
	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
	}
      else
	token->word_char = IS_WORD_CHAR (c2) != 0;

      /* Backslash sequences that act as operators under SYNTAX.  */
      switch (c2)
	{
	case '|':
	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
	    token->type = OP_ALT;
	  break;
	case '1': case '2': case '3': case '4': case '5':
	case '6': case '7': case '8': case '9':
	  if (!(syntax & RE_NO_BK_REFS))
	    {
	      token->type = OP_BACK_REF;
	      /* The stored index is zero-based: "\1" gives 0.  */
	      token->opr.idx = c2 - '1';
	    }
	  break;
	case '<':
	  if (!(syntax & RE_NO_GNU_OPS))
	    {
	      token->type = ANCHOR;
	      token->opr.ctx_type = WORD_FIRST;
	    }
	  break;
	case '>':
	  if (!(syntax & RE_NO_GNU_OPS))
	    {
	      token->type = ANCHOR;
	      token->opr.ctx_type = WORD_LAST;
	    }
	  break;
	case 'b':
	  if (!(syntax & RE_NO_GNU_OPS))
	    {
	      token->type = ANCHOR;
	      token->opr.ctx_type = WORD_DELIM;
	    }
	  break;
	case 'B':
	  if (!(syntax & RE_NO_GNU_OPS))
	    {
	      token->type = ANCHOR;
	      token->opr.ctx_type = NOT_WORD_DELIM;
	    }
	  break;
	case 'w':
	  if (!(syntax & RE_NO_GNU_OPS))
	    token->type = OP_WORD;
	  break;
	case 'W':
	  if (!(syntax & RE_NO_GNU_OPS))
	    token->type = OP_NOTWORD;
	  break;
	case 's':
	  if (!(syntax & RE_NO_GNU_OPS))
	    token->type = OP_SPACE;
	  break;
	case 'S':
	  if (!(syntax & RE_NO_GNU_OPS))
	    token->type = OP_NOTSPACE;
	  break;
	case '`':
	  if (!(syntax & RE_NO_GNU_OPS))
	    {
	      token->type = ANCHOR;
	      token->opr.ctx_type = BUF_FIRST;
	    }
	  break;
	case '\'':
	  if (!(syntax & RE_NO_GNU_OPS))
	    {
	      token->type = ANCHOR;
	      token->opr.ctx_type = BUF_LAST;
	    }
	  break;
	case '(':
	  if (!(syntax & RE_NO_BK_PARENS))
	    token->type = OP_OPEN_SUBEXP;
	  break;
	case ')':
	  if (!(syntax & RE_NO_BK_PARENS))
	    token->type = OP_CLOSE_SUBEXP;
	  break;
	case '+':
	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
	    token->type = OP_DUP_PLUS;
	  break;
	case '?':
	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
	    token->type = OP_DUP_QUESTION;
	  break;
	case '{':
	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
	    token->type = OP_OPEN_DUP_NUM;
	  break;
	case '}':
	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
	    token->type = OP_CLOSE_DUP_NUM;
	  break;
	default:
	  break;
	}
      return 2;
    }

  /* Not a backslash: a plain CHARACTER unless the switch below
     recognizes an operator.  */
  token->type = CHARACTER;
  if (input->mb_cur_max > 1)
    {
      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
    }
  else
    token->word_char = IS_WORD_CHAR (token->opr.c);

  switch (c)
    {
    case '\n':
      if (syntax & RE_NEWLINE_ALT)
	token->type = OP_ALT;
      break;
    case '|':
      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
	token->type = OP_ALT;
      break;
    case '*':
      token->type = OP_DUP_ASTERISK;
      break;
    case '+':
      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
	token->type = OP_DUP_PLUS;
      break;
    case '?':
      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
	token->type = OP_DUP_QUESTION;
      break;
    case '{':
      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
	token->type = OP_OPEN_DUP_NUM;
      break;
    case '}':
      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
	token->type = OP_CLOSE_DUP_NUM;
      break;
    case '(':
      if (syntax & RE_NO_BK_PARENS)
	token->type = OP_OPEN_SUBEXP;
      break;
    case ')':
      if (syntax & RE_NO_BK_PARENS)
	token->type = OP_CLOSE_SUBEXP;
      break;
    case '[':
      token->type = OP_OPEN_BRACKET;
      break;
    case '.':
      token->type = OP_PERIOD;
      break;
    case '^':
      /* Without either of these syntax bits, '^' away from the start of
	 INPUT is an anchor only when RE_NEWLINE_ALT is set and the
	 previous byte is a newline; otherwise it stays a CHARACTER.  */
      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
	  && re_string_cur_idx (input) != 0)
	{
	  char prev = re_string_peek_byte (input, -1);
	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
	    break;
	}
      token->type = ANCHOR;
      token->opr.ctx_type = LINE_FIRST;
      break;
    case '$':
      /* Without RE_CONTEXT_INDEP_ANCHORS, '$' that is not the last byte
	 of INPUT is an anchor only when the token after it is an
	 alternation or a closing parenthesis.  The look-ahead steps over
	 the '$', peeks, and steps back again.  */
      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
	  && re_string_cur_idx (input) + 1 != re_string_length (input))
	{
	  re_token_t next;
	  re_string_skip_bytes (input, 1);
	  peek_token (&next, input, syntax);
	  re_string_skip_bytes (input, -1);
	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
	    break;
	}
      token->type = ANCHOR;
      token->opr.ctx_type = LINE_LAST;
      break;
    default:
      break;
    }
  return 1;
}
    1970  
    1971  /* Peek a token from INPUT, and return the length of the token.
    1972     We must not use this function out of bracket expressions.  */
    1973  
    1974  static int
    1975  peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
    1976  {
    1977    unsigned char c;
    1978    if (re_string_eoi (input))
    1979      {
    1980        token->type = END_OF_RE;
    1981        return 0;
    1982      }
    1983    c = re_string_peek_byte (input, 0);
    1984    token->opr.c = c;
    1985  
    1986    if (input->mb_cur_max > 1
    1987        && !re_string_first_byte (input, re_string_cur_idx (input)))
    1988      {
    1989        token->type = CHARACTER;
    1990        return 1;
    1991      }
    1992  
    1993    if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
    1994        && re_string_cur_idx (input) + 1 < re_string_length (input))
    1995      {
    1996        /* In this case, '\' escape a character.  */
    1997        unsigned char c2;
    1998        re_string_skip_bytes (input, 1);
    1999        c2 = re_string_peek_byte (input, 0);
    2000        token->opr.c = c2;
    2001        token->type = CHARACTER;
    2002        return 1;
    2003      }
    2004    if (c == '[') /* '[' is a special char in a bracket exps.  */
    2005      {
    2006        unsigned char c2;
    2007        int token_len;
    2008        if (re_string_cur_idx (input) + 1 < re_string_length (input))
    2009  	c2 = re_string_peek_byte (input, 1);
    2010        else
    2011  	c2 = 0;
    2012        token->opr.c = c2;
    2013        token_len = 2;
    2014        switch (c2)
    2015  	{
    2016  	case '.':
    2017  	  token->type = OP_OPEN_COLL_ELEM;
    2018  	  break;
    2019  
    2020  	case '=':
    2021  	  token->type = OP_OPEN_EQUIV_CLASS;
    2022  	  break;
    2023  
    2024  	case ':':
    2025  	  if (syntax & RE_CHAR_CLASSES)
    2026  	    {
    2027  	      token->type = OP_OPEN_CHAR_CLASS;
    2028  	      break;
    2029  	    }
    2030  	  FALLTHROUGH;
    2031  	default:
    2032  	  token->type = CHARACTER;
    2033  	  token->opr.c = c;
    2034  	  token_len = 1;
    2035  	  break;
    2036  	}
    2037        return token_len;
    2038      }
    2039    switch (c)
    2040      {
    2041      case ']':
    2042        token->type = OP_CLOSE_BRACKET;
    2043        break;
    2044      case '^':
    2045        token->type = OP_NON_MATCH_LIST;
    2046        break;
    2047      case '-':
    2048        /* In V7 Unix grep and Unix awk and mawk, [...---...]
    2049           (3 adjacent minus signs) stands for a single minus sign.
    2050           Support that without breaking anything else.  */
    2051        if (! (re_string_cur_idx (input) + 2 < re_string_length (input)
    2052               && re_string_peek_byte (input, 1) == '-'
    2053               && re_string_peek_byte (input, 2) == '-'))
    2054          {
    2055            token->type = OP_CHARSET_RANGE;
    2056            break;
    2057          }
    2058        re_string_skip_bytes (input, 2);
    2059        FALLTHROUGH;
    2060      default:
    2061        token->type = CHARACTER;
    2062      }
    2063    return 1;
    2064  }
    2065  
    2066  /* Functions for parser.  */
    2067  
    2068  /* Entry point of the parser.
    2069     Parse the regular expression REGEXP and return the structure tree.
    2070     If an error occurs, ERR is set by error code, and return NULL.
    2071     This function build the following tree, from regular expression <reg_exp>:
    2072  	   CAT
    2073  	   / \
    2074  	  /   \
    2075     <reg_exp>  EOR
    2076  
    2077     CAT means concatenation.
    2078     EOR means end of regular expression.  */
    2079  
    2080  static bin_tree_t *
    2081  parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
    2082         reg_errcode_t *err)
    2083  {
    2084    re_dfa_t *dfa = preg->buffer;
    2085    bin_tree_t *tree, *eor, *root;
    2086    re_token_t current_token;
    2087    dfa->syntax = syntax;
    2088    fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
    2089    tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
    2090    if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
    2091      return NULL;
    2092    eor = create_tree (dfa, NULL, NULL, END_OF_RE);
    2093    if (tree != NULL)
    2094      root = create_tree (dfa, tree, eor, CONCAT);
    2095    else
    2096      root = eor;
    2097    if (__glibc_unlikely (eor == NULL || root == NULL))
    2098      {
    2099        *err = REG_ESPACE;
    2100        return NULL;
    2101      }
    2102    return root;
    2103  }
    2104  
    2105  /* This function build the following tree, from regular expression
    2106     <branch1>|<branch2>:
    2107  	   ALT
    2108  	   / \
    2109  	  /   \
    2110     <branch1> <branch2>
    2111  
    2112     ALT means alternative, which represents the operator '|'.  */
    2113  
    2114  static bin_tree_t *
    2115  parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
    2116  	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
    2117  {
    2118    re_dfa_t *dfa = preg->buffer;
    2119    bin_tree_t *tree, *branch = NULL;
    2120    bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
    2121    tree = parse_branch (regexp, preg, token, syntax, nest, err);
    2122    if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
    2123      return NULL;
    2124  
    2125    while (token->type == OP_ALT)
    2126      {
    2127        fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
    2128        if (token->type != OP_ALT && token->type != END_OF_RE
    2129  	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
    2130  	{
    2131  	  bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
    2132  	  dfa->completed_bkref_map = initial_bkref_map;
    2133  	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
    2134  	  if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
    2135  	    {
    2136  	      if (tree != NULL)
    2137  		postorder (tree, free_tree, NULL);
    2138  	      return NULL;
    2139  	    }
    2140  	  dfa->completed_bkref_map |= accumulated_bkref_map;
    2141  	}
    2142        else
    2143  	branch = NULL;
    2144        tree = create_tree (dfa, tree, branch, OP_ALT);
    2145        if (__glibc_unlikely (tree == NULL))
    2146  	{
    2147  	  *err = REG_ESPACE;
    2148  	  return NULL;
    2149  	}
    2150      }
    2151    return tree;
    2152  }
    2153  
    2154  /* This function build the following tree, from regular expression
    2155     <exp1><exp2>:
    2156  	CAT
    2157  	/ \
    2158         /   \
    2159     <exp1> <exp2>
    2160  
    2161     CAT means concatenation.  */
    2162  
    2163  static bin_tree_t *
    2164  parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
    2165  	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
    2166  {
    2167    bin_tree_t *tree, *expr;
    2168    re_dfa_t *dfa = preg->buffer;
    2169    tree = parse_expression (regexp, preg, token, syntax, nest, err);
    2170    if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
    2171      return NULL;
    2172  
    2173    while (token->type != OP_ALT && token->type != END_OF_RE
    2174  	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
    2175      {
    2176        expr = parse_expression (regexp, preg, token, syntax, nest, err);
    2177        if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
    2178  	{
    2179  	  if (tree != NULL)
    2180  	    postorder (tree, free_tree, NULL);
    2181  	  return NULL;
    2182  	}
    2183        if (tree != NULL && expr != NULL)
    2184  	{
    2185  	  bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
    2186  	  if (newtree == NULL)
    2187  	    {
    2188  	      postorder (expr, free_tree, NULL);
    2189  	      postorder (tree, free_tree, NULL);
    2190  	      *err = REG_ESPACE;
    2191  	      return NULL;
    2192  	    }
    2193  	  tree = newtree;
    2194  	}
    2195        else if (tree == NULL)
    2196  	tree = expr;
    2197        /* Otherwise expr == NULL, we don't need to create new tree.  */
    2198      }
    2199    return tree;
    2200  }
    2201  
/* This function build the following tree, from regular expression a*:
	 *
	 |
	 a

   It parses one expression: a single element selected by the type of
   TOKEN, followed by any repetition operators that apply to it.  TOKEN
   is the current token on entry and the first unconsumed token on
   return.
   On failure NULL is returned and *ERR holds the error code.  NULL is
   also returned, with *ERR left untouched, when TOKEN is OP_ALT or
   END_OF_RE.  */

static bin_tree_t *
parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
{
  re_dfa_t *dfa = preg->buffer;
  bin_tree_t *tree;
  switch (token->type)
    {
    case CHARACTER:
      tree = create_token_tree (dfa, NULL, NULL, token);
      if (__glibc_unlikely (tree == NULL))
	{
	  *err = REG_ESPACE;
	  return NULL;
	}
      if (dfa->mb_cur_max > 1)
	{
	  /* Append the remaining bytes of a multibyte character, one
	     CONCAT node per byte.  */
	  while (!re_string_eoi (regexp)
		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
	    {
	      bin_tree_t *mbc_remain;
	      fetch_token (token, regexp, syntax);
	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
	      if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
		{
		  *err = REG_ESPACE;
		  return NULL;
		}
	    }
	}
      break;

    case OP_OPEN_SUBEXP:
      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
      if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
	return NULL;
      break;

    case OP_OPEN_BRACKET:
      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
      if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
	return NULL;
      break;

    case OP_BACK_REF:
      /* A back-reference is accepted only if its bit is set in
	 completed_bkref_map.  */
      if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
	{
	  *err = REG_ESUBREG;
	  return NULL;
	}
      dfa->used_bkref_map |= 1 << token->opr.idx;
      tree = create_token_tree (dfa, NULL, NULL, token);
      if (__glibc_unlikely (tree == NULL))
	{
	  *err = REG_ESPACE;
	  return NULL;
	}
      ++dfa->nbackref;
      dfa->has_mb_node = 1;
      break;

    /* The cases below handle a repetition operator, or a closing
       parenthesis or brace, found where an expression should start.  */
    case OP_OPEN_DUP_NUM:
      if (syntax & RE_CONTEXT_INVALID_DUP)
	{
	  *err = REG_BADRPT;
	  return NULL;
	}
      FALLTHROUGH;
    case OP_DUP_ASTERISK:
    case OP_DUP_PLUS:
    case OP_DUP_QUESTION:
      if (syntax & RE_CONTEXT_INVALID_OPS)
	{
	  *err = REG_BADRPT;
	  return NULL;
	}
      else if (syntax & RE_CONTEXT_INDEP_OPS)
	{
	  /* Skip the operator and parse whatever follows it.  */
	  fetch_token (token, regexp, syntax);
	  return parse_expression (regexp, preg, token, syntax, nest, err);
	}
      FALLTHROUGH;
    case OP_CLOSE_SUBEXP:
      /* The type is tested again because the operator cases above fall
	 through to this point.  */
      if ((token->type == OP_CLOSE_SUBEXP)
	  && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
	{
	  *err = REG_ERPAREN;
	  return NULL;
	}
      FALLTHROUGH;
    case OP_CLOSE_DUP_NUM:
      /* We treat it as a normal character.  */

      /* Then we can treat these characters as normal characters.  */
      token->type = CHARACTER;
      /* mb_partial and word_char bits should be initialized already
	 by peek_token.  */
      tree = create_token_tree (dfa, NULL, NULL, token);
      if (__glibc_unlikely (tree == NULL))
	{
	  *err = REG_ESPACE;
	  return NULL;
	}
      break;

    case ANCHOR:
      /* The first word-related anchor triggers init_word_char.  */
      if ((token->opr.ctx_type
	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
	  && dfa->word_ops_used == 0)
	init_word_char (dfa);
      if (token->opr.ctx_type == WORD_DELIM
	  || token->opr.ctx_type == NOT_WORD_DELIM)
	{
	  /* These two anchors are each built as an alternation of two
	     anchors: WORD_FIRST | WORD_LAST for WORD_DELIM, and
	     INSIDE_WORD | INSIDE_NOTWORD for NOT_WORD_DELIM.  */
	  bin_tree_t *tree_first, *tree_last;
	  if (token->opr.ctx_type == WORD_DELIM)
	    {
	      token->opr.ctx_type = WORD_FIRST;
	      tree_first = create_token_tree (dfa, NULL, NULL, token);
	      token->opr.ctx_type = WORD_LAST;
	    }
	  else
	    {
	      token->opr.ctx_type = INSIDE_WORD;
	      tree_first = create_token_tree (dfa, NULL, NULL, token);
	      token->opr.ctx_type = INSIDE_NOTWORD;
	    }
	  tree_last = create_token_tree (dfa, NULL, NULL, token);
	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
	  if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
				|| tree == NULL))
	    {
	      *err = REG_ESPACE;
	      return NULL;
	    }
	}
      else
	{
	  tree = create_token_tree (dfa, NULL, NULL, token);
	  if (__glibc_unlikely (tree == NULL))
	    {
	      *err = REG_ESPACE;
	      return NULL;
	    }
	}
      /* We must return here, since ANCHORs can't be followed
	 by repetition operators.
	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
      fetch_token (token, regexp, syntax);
      return tree;

    case OP_PERIOD:
      tree = create_token_tree (dfa, NULL, NULL, token);
      if (__glibc_unlikely (tree == NULL))
	{
	  *err = REG_ESPACE;
	  return NULL;
	}
      if (dfa->mb_cur_max > 1)
	dfa->has_mb_node = 1;
      break;

    case OP_WORD:
    case OP_NOTWORD:
      /* Built from the class "alnum" plus '_'; the flag argument is set
	 for the negated form.  */
      tree = build_charclass_op (dfa, regexp->trans,
				 "alnum",
				 "_",
				 token->type == OP_NOTWORD, err);
      if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
	return NULL;
      break;

    case OP_SPACE:
    case OP_NOTSPACE:
      /* Built from the class "space" with no extra characters; the flag
	 argument is set for the negated form.  */
      tree = build_charclass_op (dfa, regexp->trans,
				 "space",
				 "",
				 token->type == OP_NOTSPACE, err);
      if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
	return NULL;
      break;

    case OP_ALT:
    case END_OF_RE:
      /* No expression here; *ERR is left as it is.  */
      return NULL;

    case BACK_SLASH:
      *err = REG_EESCAPE;
      return NULL;

    default:
      /* Must not happen?  */
      DEBUG_ASSERT (false);
      return NULL;
    }
  /* Move past the element just parsed.  */
  fetch_token (token, regexp, syntax);

  /* Apply every repetition operator that follows the element.  */
  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
    {
      bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
					   syntax, err);
      if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
	{
	  if (tree != NULL)
	    postorder (tree, free_tree, NULL);
	  return NULL;
	}
      tree = dup_tree;
      /* In BRE consecutive duplications are not allowed.  */
      if ((syntax & RE_CONTEXT_INVALID_DUP)
	  && (token->type == OP_DUP_ASTERISK
	      || token->type == OP_OPEN_DUP_NUM))
	{
	  if (tree != NULL)
	    postorder (tree, free_tree, NULL);
	  *err = REG_BADRPT;
	  return NULL;
	}
    }

  return tree;
}
    2432  
    2433  /* This function build the following tree, from regular expression
    2434     (<reg_exp>):
    2435  	 SUBEXP
    2436  	    |
    2437  	<reg_exp>
    2438  */
    2439  
    2440  static bin_tree_t *
    2441  parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
    2442  	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
    2443  {
    2444    re_dfa_t *dfa = preg->buffer;
    2445    bin_tree_t *tree;
    2446    size_t cur_nsub;
    2447    cur_nsub = preg->re_nsub++;
    2448  
    2449    fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
    2450  
    2451    /* The subexpression may be a null string.  */
    2452    if (token->type == OP_CLOSE_SUBEXP)
    2453      tree = NULL;
    2454    else
    2455      {
    2456        tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
    2457        if (__glibc_unlikely (*err == REG_NOERROR
    2458  			    && token->type != OP_CLOSE_SUBEXP))
    2459  	{
    2460  	  if (tree != NULL)
    2461  	    postorder (tree, free_tree, NULL);
    2462  	  *err = REG_EPAREN;
    2463  	}
    2464        if (__glibc_unlikely (*err != REG_NOERROR))
    2465  	return NULL;
    2466      }
    2467  
    2468    if (cur_nsub <= '9' - '1')
    2469      dfa->completed_bkref_map |= 1 << cur_nsub;
    2470  
    2471    tree = create_tree (dfa, tree, NULL, SUBEXP);
    2472    if (__glibc_unlikely (tree == NULL))
    2473      {
    2474        *err = REG_ESPACE;
    2475        return NULL;
    2476      }
    2477    tree->token.opr.idx = cur_nsub;
    2478    return tree;
    2479  }
    2480  
    2481  /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
    2482  
    2483  static bin_tree_t *
    2484  parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
    2485  	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
    2486  {
    2487    bin_tree_t *tree = NULL, *old_tree = NULL;
    2488    Idx i, start, end, start_idx = re_string_cur_idx (regexp);
    2489    re_token_t start_token = *token;
    2490  
    2491    if (token->type == OP_OPEN_DUP_NUM)
    2492      {
    2493        end = 0;
    2494        start = fetch_number (regexp, token, syntax);
    2495        if (start == -1)
    2496  	{
    2497  	  if (token->type == CHARACTER && token->opr.c == ',')
    2498  	    start = 0; /* We treat "{,m}" as "{0,m}".  */
    2499  	  else
    2500  	    {
    2501  	      *err = REG_BADBR; /* <re>{} is invalid.  */
    2502  	      return NULL;
    2503  	    }
    2504  	}
    2505        if (__glibc_likely (start != -2))
    2506  	{
    2507  	  /* We treat "{n}" as "{n,n}".  */
    2508  	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
    2509  		 : ((token->type == CHARACTER && token->opr.c == ',')
    2510  		    ? fetch_number (regexp, token, syntax) : -2));
    2511  	}
    2512        if (__glibc_unlikely (start == -2 || end == -2))
    2513  	{
    2514  	  /* Invalid sequence.  */
    2515  	  if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
    2516  	    {
    2517  	      if (token->type == END_OF_RE)
    2518  		*err = REG_EBRACE;
    2519  	      else
    2520  		*err = REG_BADBR;
    2521  
    2522  	      return NULL;
    2523  	    }
    2524  
    2525  	  /* If the syntax bit is set, rollback.  */
    2526  	  re_string_set_index (regexp, start_idx);
    2527  	  *token = start_token;
    2528  	  token->type = CHARACTER;
    2529  	  /* mb_partial and word_char bits should be already initialized by
    2530  	     peek_token.  */
    2531  	  return elem;
    2532  	}
    2533  
    2534        if (__glibc_unlikely ((end != -1 && start > end)
    2535  			    || token->type != OP_CLOSE_DUP_NUM))
    2536  	{
    2537  	  /* First number greater than second.  */
    2538  	  *err = REG_BADBR;
    2539  	  return NULL;
    2540  	}
    2541  
    2542        if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
    2543  	{
    2544  	  *err = REG_ESIZE;
    2545  	  return NULL;
    2546  	}
    2547      }
    2548    else
    2549      {
    2550        start = (token->type == OP_DUP_PLUS) ? 1 : 0;
    2551        end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
    2552      }
    2553  
    2554    fetch_token (token, regexp, syntax);
    2555  
    2556    if (__glibc_unlikely (elem == NULL))
    2557      return NULL;
    2558    if (__glibc_unlikely (start == 0 && end == 0))
    2559      {
    2560        postorder (elem, free_tree, NULL);
    2561        return NULL;
    2562      }
    2563  
    2564    /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
    2565    if (__glibc_unlikely (start > 0))
    2566      {
    2567        tree = elem;
    2568        for (i = 2; i <= start; ++i)
    2569  	{
    2570  	  elem = duplicate_tree (elem, dfa);
    2571  	  tree = create_tree (dfa, tree, elem, CONCAT);
    2572  	  if (__glibc_unlikely (elem == NULL || tree == NULL))
    2573  	    goto parse_dup_op_espace;
    2574  	}
    2575  
    2576        if (start == end)
    2577  	return tree;
    2578  
    2579        /* Duplicate ELEM before it is marked optional.  */
    2580        elem = duplicate_tree (elem, dfa);
    2581        if (__glibc_unlikely (elem == NULL))
    2582          goto parse_dup_op_espace;
    2583        old_tree = tree;
    2584      }
    2585    else
    2586      old_tree = NULL;
    2587  
    2588    if (elem->token.type == SUBEXP)
    2589      {
    2590        uintptr_t subidx = elem->token.opr.idx;
    2591        postorder (elem, mark_opt_subexp, (void *) subidx);
    2592      }
    2593  
    2594    tree = create_tree (dfa, elem, NULL,
    2595  		      (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
    2596    if (__glibc_unlikely (tree == NULL))
    2597      goto parse_dup_op_espace;
    2598  
    2599    /* This loop is actually executed only when end != -1,
    2600       to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
    2601       already created the start+1-th copy.  */
    2602    if (TYPE_SIGNED (Idx) || end != -1)
    2603      for (i = start + 2; i <= end; ++i)
    2604        {
    2605  	elem = duplicate_tree (elem, dfa);
    2606  	tree = create_tree (dfa, tree, elem, CONCAT);
    2607  	if (__glibc_unlikely (elem == NULL || tree == NULL))
    2608  	  goto parse_dup_op_espace;
    2609  
    2610  	tree = create_tree (dfa, tree, NULL, OP_ALT);
    2611  	if (__glibc_unlikely (tree == NULL))
    2612  	  goto parse_dup_op_espace;
    2613        }
    2614  
    2615    if (old_tree)
    2616      tree = create_tree (dfa, old_tree, tree, CONCAT);
    2617  
    2618    return tree;
    2619  
    2620   parse_dup_op_espace:
    2621    *err = REG_ESPACE;
    2622    return NULL;
    2623  }
    2624  
    2625  /* Size of the names for collating symbol/equivalence_class/character_class.
    2626     I'm not sure, but maybe enough.  */
    2627  #define BRACKET_NAME_BUF_SIZE 32
    2628  
    2629  #ifndef _LIBC
    2630  
    2631  /* Convert the byte B to the corresponding wide character.  In a
    2632     unibyte locale, treat B as itself.  In a multibyte locale, return
    2633     WEOF if B is an encoding error.  */
    2634  static wint_t
    2635  parse_byte (unsigned char b, re_dfa_t const *dfa)
    2636  {
    2637    return dfa->mb_cur_max > 1 ? __btowc (b) : b;
    2638  }
    2639  
    2640  /* Local function for parse_bracket_exp used in _LIBC environment.
    2641     Build the range expression which starts from START_ELEM, and ends
    2642     at END_ELEM.  The result are written to MBCSET and SBCSET.
    2643     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
    2644     mbcset->range_ends, is a pointer argument since we may
    2645     update it.  */
    2646  
    2647  static reg_errcode_t
    2648  build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
    2649  		 bracket_elem_t *start_elem, bracket_elem_t *end_elem,
    2650  		 re_dfa_t *dfa, reg_syntax_t syntax, uint_fast32_t nrules,
    2651  		 const unsigned char *collseqmb, const char *collseqwc,
    2652  		 int_fast32_t table_size, const void *symb_table,
    2653  		 const unsigned char *extra)
    2654  {
    2655    /* Equivalence Classes and Character Classes can't be a range start/end.  */
    2656    if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
    2657  			|| start_elem->type == CHAR_CLASS
    2658  			|| end_elem->type == EQUIV_CLASS
    2659  			|| end_elem->type == CHAR_CLASS))
    2660      return REG_ERANGE;
    2661  
    2662    /* We can handle no multi character collating elements without libc
    2663       support.  */
    2664    if (__glibc_unlikely ((start_elem->type == COLL_SYM
    2665  			 && strlen ((char *) start_elem->opr.name) > 1)
    2666  			|| (end_elem->type == COLL_SYM
    2667  			    && strlen ((char *) end_elem->opr.name) > 1)))
    2668      return REG_ECOLLATE;
    2669  
    2670    unsigned int
    2671      start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
    2672  		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
    2673  		   : 0)),
    2674      end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
    2675  	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
    2676  		 : 0));
    2677    wint_t
    2678      start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
    2679  		? parse_byte (start_ch, dfa) : start_elem->opr.wch),
    2680      end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
    2681  	      ? parse_byte (end_ch, dfa) : end_elem->opr.wch);
    2682  
    2683    if (start_wc == WEOF || end_wc == WEOF)
    2684      return REG_ECOLLATE;
    2685    else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
    2686                               && start_wc > end_wc))
    2687      return REG_ERANGE;
    2688  
    2689    /* Got valid collation sequence values, add them as a new entry.
    2690       However, for !_LIBC we have no collation elements: if the
    2691       character set is single byte, the single byte character set
    2692       that we build below suffices.  parse_bracket_exp passes
    2693       no MBCSET if dfa->mb_cur_max == 1.  */
    2694    if (dfa->mb_cur_max > 1)
    2695      {
    2696        /* Check the space of the arrays.  */
    2697        if (__glibc_unlikely (*range_alloc == mbcset->nranges))
    2698          {
    2699            /* There is not enough space, need realloc.  */
    2700            wchar_t *new_array_start, *new_array_end;
    2701            Idx new_nranges;
    2702  
    2703            /* +1 in case of mbcset->nranges is 0.  */
    2704            new_nranges = 2 * mbcset->nranges + 1;
    2705            /* Use realloc since mbcset->range_starts and mbcset->range_ends
    2706               are NULL if *range_alloc == 0.  */
    2707            new_array_start = re_realloc (mbcset->range_starts, wchar_t,
    2708                                          new_nranges);
    2709            new_array_end = re_realloc (mbcset->range_ends, wchar_t,
    2710                                        new_nranges);
    2711  
    2712            if (__glibc_unlikely (new_array_start == NULL
    2713                                  || new_array_end == NULL))
    2714              {
    2715                re_free (new_array_start);
    2716                re_free (new_array_end);
    2717                return REG_ESPACE;
    2718              }
    2719  
    2720            mbcset->range_starts = new_array_start;
    2721            mbcset->range_ends = new_array_end;
    2722            *range_alloc = new_nranges;
    2723          }
    2724  
    2725        mbcset->range_starts[mbcset->nranges] = start_wc;
    2726        mbcset->range_ends[mbcset->nranges++] = end_wc;
    2727      }
    2728  
    2729    /* Build the table for single byte characters.  */
    2730    for (wchar_t wc = 0; wc < SBC_MAX; ++wc)
    2731      {
    2732        if (start_wc <= wc && wc <= end_wc)
    2733          bitset_set (sbcset, wc);
    2734      }
    2735  
    2736    return REG_NOERROR;
    2737  }
    2738  #endif /* not _LIBC */
    2739  
    2740  #ifndef _LIBC
    2741  /* Helper function for parse_bracket_exp only used in case of NOT _LIBC.
    2742     Build the collating element which is represented by NAME.
    2743     The result are written to MBCSET and SBCSET.
    2744     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
    2745     pointer argument since we may update it.  */
    2746  
    2747  static reg_errcode_t
    2748  build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
    2749  			Idx *coll_sym_alloc, const unsigned char *name,
    2750  			uint_fast32_t nrules, int_fast32_t table_size,
    2751  			const void *symb_table, const unsigned char *extra)
    2752  {
    2753    size_t name_len = strlen ((const char *) name);
    2754    if (__glibc_unlikely (name_len != 1))
    2755      return REG_ECOLLATE;
    2756    else
    2757      {
    2758        bitset_set (sbcset, name[0]);
    2759        return REG_NOERROR;
    2760      }
    2761  }
    2762  #endif /* not _LIBC */
    2763  
    2764  #ifdef _LIBC
    2765  /* Local function for parse_bracket_exp used in _LIBC environment.
    2766     Seek the collating symbol entry corresponding to NAME.
    2767     Return the index of the symbol in the SYMB_TABLE,
    2768     or -1 if not found.  */
    2769  
    2770  static __always_inline int32_t
    2771  seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
    2772  			     const int32_t *symb_table,
    2773  			     int_fast32_t table_size,
    2774  			     const unsigned char *extra)
    2775  {
    2776    int_fast32_t elem;
    2777  
    2778    for (elem = 0; elem < table_size; elem++)
    2779      if (symb_table[2 * elem] != 0)
    2780        {
    2781  	int32_t idx = symb_table[2 * elem + 1];
    2782  	/* Skip the name of collating element name.  */
    2783  	idx += 1 + extra[idx];
    2784  	if (/* Compare the length of the name.  */
    2785  	    name_len == extra[idx]
    2786  	    /* Compare the name.  */
    2787  	    && memcmp (name, &extra[idx + 1], name_len) == 0)
    2788  	  /* Yep, this is the entry.  */
    2789  	  return elem;
    2790        }
    2791    return -1;
    2792  }
    2793  
    2794  /* Local function for parse_bracket_exp used in _LIBC environment.
    2795     Look up the collation sequence value of BR_ELEM.
    2796     Return the value if succeeded, UINT_MAX otherwise.  */
    2797  
    2798  static __always_inline unsigned int
    2799  lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
    2800  				 const unsigned char *collseqmb,
    2801  				 const char *collseqwc,
    2802  				 int_fast32_t table_size,
    2803  				 const int32_t *symb_table,
    2804  				 const unsigned char *extra)
    2805  {
    2806    if (br_elem->type == SB_CHAR)
    2807      {
    2808        /* if (MB_CUR_MAX == 1) */
    2809        if (nrules == 0)
    2810  	return collseqmb[br_elem->opr.ch];
    2811        else
    2812  	{
    2813  	  wint_t wc = __btowc (br_elem->opr.ch);
    2814  	  return __collseq_table_lookup (collseqwc, wc);
    2815  	}
    2816      }
    2817    else if (br_elem->type == MB_CHAR)
    2818      {
    2819        if (nrules != 0)
    2820  	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
    2821      }
    2822    else if (br_elem->type == COLL_SYM)
    2823      {
    2824        size_t sym_name_len = strlen ((char *) br_elem->opr.name);
    2825        if (nrules != 0)
    2826  	{
    2827  	  int32_t elem, idx;
    2828  	  elem = seek_collating_symbol_entry (br_elem->opr.name,
    2829  					      sym_name_len,
    2830  					      symb_table, table_size,
    2831  					      extra);
    2832  	  if (elem != -1)
    2833  	    {
    2834  	      /* We found the entry.  */
    2835  	      idx = symb_table[2 * elem + 1];
    2836  	      /* Skip the name of collating element name.  */
    2837  	      idx += 1 + extra[idx];
    2838  	      /* Skip the byte sequence of the collating element.  */
    2839  	      idx += 1 + extra[idx];
    2840  	      /* Adjust for the alignment.  */
    2841  	      idx = (idx + 3) & ~3;
    2842  	      /* Skip the multibyte collation sequence value.  */
    2843  	      idx += sizeof (unsigned int);
    2844  	      /* Skip the wide char sequence of the collating element.  */
    2845  	      idx += sizeof (unsigned int) *
    2846  		(1 + *(unsigned int *) (extra + idx));
    2847  	      /* Return the collation sequence value.  */
    2848  	      return *(unsigned int *) (extra + idx);
    2849  	    }
    2850  	  else if (sym_name_len == 1)
    2851  	    {
    2852  	      /* No valid character.  Match it as a single byte
    2853  		 character.  */
    2854  	      return collseqmb[br_elem->opr.name[0]];
    2855  	    }
    2856  	}
    2857        else if (sym_name_len == 1)
    2858  	return collseqmb[br_elem->opr.name[0]];
    2859      }
    2860    return UINT_MAX;
    2861  }
    2862  
    2863  /* Local function for parse_bracket_exp used in _LIBC environment.
    2864     Build the range expression which starts from START_ELEM, and ends
    2865     at END_ELEM.  The result are written to MBCSET and SBCSET.
    2866     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
    2867     mbcset->range_ends, is a pointer argument since we may
    2868     update it.  */
    2869  
    2870  static __always_inline reg_errcode_t
    2871  build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
    2872  		 bracket_elem_t *start_elem, bracket_elem_t *end_elem,
    2873  		 re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
    2874  		 const unsigned char *collseqmb, const char *collseqwc,
    2875  		 int_fast32_t table_size, const int32_t *symb_table,
    2876  		 const unsigned char *extra)
    2877  {
    2878    unsigned int ch;
    2879    uint32_t start_collseq;
    2880    uint32_t end_collseq;
    2881  
    2882    /* Equivalence Classes and Character Classes can't be a range
    2883       start/end.  */
    2884    if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
    2885                          || start_elem->type == CHAR_CLASS
    2886                          || end_elem->type == EQUIV_CLASS
    2887                          || end_elem->type == CHAR_CLASS))
    2888      return REG_ERANGE;
    2889  
    2890    /* FIXME: Implement rational ranges here, too.  */
    2891    start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
    2892  						   table_size, symb_table, extra);
    2893    end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
    2894  						 table_size, symb_table, extra);
    2895    /* Check start/end collation sequence values.  */
    2896    if (__glibc_unlikely (start_collseq == UINT_MAX
    2897                          || end_collseq == UINT_MAX))
    2898      return REG_ECOLLATE;
    2899    if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
    2900                          && start_collseq > end_collseq))
    2901      return REG_ERANGE;
    2902  
    2903    /* Got valid collation sequence values, add them as a new entry.
    2904       However, if we have no collation elements, and the character set
    2905       is single byte, the single byte character set that we
    2906       build below suffices. */
    2907    if (nrules > 0 || dfa->mb_cur_max > 1)
    2908      {
    2909        /* Check the space of the arrays.  */
    2910        if (__glibc_unlikely (*range_alloc == mbcset->nranges))
    2911  	{
    2912  	  /* There is not enough space, need realloc.  */
    2913  	  uint32_t *new_array_start;
    2914  	  uint32_t *new_array_end;
    2915  	  int new_nranges;
    2916  
    2917  	  /* +1 in case of mbcset->nranges is 0.  */
    2918  	  new_nranges = 2 * mbcset->nranges + 1;
    2919  	  new_array_start = re_realloc (mbcset->range_starts, uint32_t,
    2920  					new_nranges);
    2921  	  new_array_end = re_realloc (mbcset->range_ends, uint32_t,
    2922  				      new_nranges);
    2923  
    2924            if (__glibc_unlikely (new_array_start == NULL
    2925                                  || new_array_end == NULL))
    2926  	    return REG_ESPACE;
    2927  
    2928  	  mbcset->range_starts = new_array_start;
    2929  	  mbcset->range_ends = new_array_end;
    2930  	  *range_alloc = new_nranges;
    2931  	}
    2932  
    2933        mbcset->range_starts[mbcset->nranges] = start_collseq;
    2934        mbcset->range_ends[mbcset->nranges++] = end_collseq;
    2935      }
    2936  
    2937    /* Build the table for single byte characters.  */
    2938    for (ch = 0; ch < SBC_MAX; ch++)
    2939      {
    2940        uint32_t ch_collseq;
    2941        /* if (MB_CUR_MAX == 1) */
    2942        if (nrules == 0)
    2943  	ch_collseq = collseqmb[ch];
    2944        else
    2945  	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
    2946        if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
    2947  	bitset_set (sbcset, ch);
    2948      }
    2949    return REG_NOERROR;
    2950  }
    2951  
    2952  /* Local function for parse_bracket_exp used in _LIBC environment.
    2953     Build the collating element which is represented by NAME.
    2954     The result are written to MBCSET and SBCSET.
    2955     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
    2956     pointer argument since we may update it.  */
    2957  
    2958  static __always_inline reg_errcode_t
    2959  build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
    2960  			Idx *coll_sym_alloc, const unsigned char *name,
    2961  			uint_fast32_t nrules, int_fast32_t table_size,
    2962  			const int32_t *symb_table, const unsigned char *extra)
    2963  {
    2964    int32_t elem, idx;
    2965    size_t name_len = strlen ((const char *) name);
    2966    if (nrules != 0)
    2967      {
    2968        elem = seek_collating_symbol_entry (name, name_len, symb_table,
    2969  					  table_size, extra);
    2970        if (elem != -1)
    2971  	{
    2972  	  /* We found the entry.  */
    2973  	  idx = symb_table[2 * elem + 1];
    2974  	  /* Skip the name of collating element name.  */
    2975  	  idx += 1 + extra[idx];
    2976  	}
    2977        else if (name_len == 1)
    2978  	{
    2979  	  /* No valid character, treat it as a normal
    2980  	     character.  */
    2981  	  bitset_set (sbcset, name[0]);
    2982  	  return REG_NOERROR;
    2983  	}
    2984        else
    2985  	return REG_ECOLLATE;
    2986  
    2987        /* Got valid collation sequence, add it as a new entry.  */
    2988        /* Check the space of the arrays.  */
    2989        if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
    2990  	{
    2991  	  /* Not enough, realloc it.  */
    2992  	  /* +1 in case of mbcset->ncoll_syms is 0.  */
    2993  	  int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
    2994  	  /* Use realloc since mbcset->coll_syms is NULL
    2995  	     if *alloc == 0.  */
    2996  	  int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
    2997  					       new_coll_sym_alloc);
    2998            if (__glibc_unlikely (new_coll_syms == NULL))
    2999  	    return REG_ESPACE;
    3000  	  mbcset->coll_syms = new_coll_syms;
    3001  	  *coll_sym_alloc = new_coll_sym_alloc;
    3002  	}
    3003        mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
    3004        return REG_NOERROR;
    3005      }
    3006    else
    3007      {
    3008        if (__glibc_unlikely (name_len != 1))
    3009  	return REG_ECOLLATE;
    3010        else
    3011  	{
    3012  	  bitset_set (sbcset, name[0]);
    3013  	  return REG_NOERROR;
    3014  	}
    3015      }
    3016  }
    3017  #endif /* _LIBC */
    3018  
    3019  /* This function parse bracket expression like "[abc]", "[a-c]",
    3020     "[[.a-a.]]" etc.  */
    3021  
    3022  static bin_tree_t *
    3023  parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
    3024  		   reg_syntax_t syntax, reg_errcode_t *err)
    3025  {
    3026    const unsigned char *collseqmb = NULL;
    3027    const char *collseqwc = NULL;
    3028    uint_fast32_t nrules = 0;
    3029    int_fast32_t table_size = 0;
    3030    const void *symb_table = NULL;
    3031    const unsigned char *extra = NULL;
    3032  
    3033    re_token_t br_token;
    3034    re_bitset_ptr_t sbcset;
    3035    re_charset_t *mbcset;
    3036    Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
    3037    Idx equiv_class_alloc = 0, char_class_alloc = 0;
    3038    bool non_match = false;
    3039    bin_tree_t *work_tree;
    3040    int token_len;
    3041    bool first_round = true;
    3042  #ifdef _LIBC
    3043    collseqmb = (const unsigned char *)
    3044      _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
    3045    nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
    3046    if (nrules)
    3047      {
    3048        /*
    3049        if (MB_CUR_MAX > 1)
    3050        */
    3051        collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
    3052        table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
    3053        symb_table = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_TABLEMB);
    3054        extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
    3055  						   _NL_COLLATE_SYMB_EXTRAMB);
    3056      }
    3057  #endif
    3058    sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
    3059    mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
    3060    if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
    3061      {
    3062        re_free (sbcset);
    3063        re_free (mbcset);
    3064        *err = REG_ESPACE;
    3065        return NULL;
    3066      }
    3067  
    3068    token_len = peek_token_bracket (token, regexp, syntax);
    3069    if (__glibc_unlikely (token->type == END_OF_RE))
    3070      {
    3071        *err = REG_BADPAT;
    3072        goto parse_bracket_exp_free_return;
    3073      }
    3074    if (token->type == OP_NON_MATCH_LIST)
    3075      {
    3076        mbcset->non_match = 1;
    3077        non_match = true;
    3078        if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
    3079  	bitset_set (sbcset, '\n');
    3080        re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
    3081        token_len = peek_token_bracket (token, regexp, syntax);
    3082        if (__glibc_unlikely (token->type == END_OF_RE))
    3083  	{
    3084  	  *err = REG_BADPAT;
    3085  	  goto parse_bracket_exp_free_return;
    3086  	}
    3087      }
    3088  
    3089    /* We treat the first ']' as a normal character.  */
    3090    if (token->type == OP_CLOSE_BRACKET)
    3091      token->type = CHARACTER;
    3092  
    3093    while (1)
    3094      {
    3095        bracket_elem_t start_elem, end_elem;
    3096        unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
    3097        unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
    3098        reg_errcode_t ret;
    3099        int token_len2 = 0;
    3100        bool is_range_exp = false;
    3101        re_token_t token2;
    3102  
    3103        start_elem.opr.name = start_name_buf;
    3104        start_elem.type = COLL_SYM;
    3105        ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
    3106  				   syntax, first_round);
    3107        if (__glibc_unlikely (ret != REG_NOERROR))
    3108  	{
    3109  	  *err = ret;
    3110  	  goto parse_bracket_exp_free_return;
    3111  	}
    3112        first_round = false;
    3113  
    3114        /* Get information about the next token.  We need it in any case.  */
    3115        token_len = peek_token_bracket (token, regexp, syntax);
    3116  
    3117        /* Do not check for ranges if we know they are not allowed.  */
    3118        if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
    3119  	{
    3120  	  if (__glibc_unlikely (token->type == END_OF_RE))
    3121  	    {
    3122  	      *err = REG_EBRACK;
    3123  	      goto parse_bracket_exp_free_return;
    3124  	    }
    3125  	  if (token->type == OP_CHARSET_RANGE)
    3126  	    {
    3127  	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
    3128  	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
    3129  	      if (__glibc_unlikely (token2.type == END_OF_RE))
    3130  		{
    3131  		  *err = REG_EBRACK;
    3132  		  goto parse_bracket_exp_free_return;
    3133  		}
    3134  	      if (token2.type == OP_CLOSE_BRACKET)
    3135  		{
    3136  		  /* We treat the last '-' as a normal character.  */
    3137  		  re_string_skip_bytes (regexp, -token_len);
    3138  		  token->type = CHARACTER;
    3139  		}
    3140  	      else
    3141  		is_range_exp = true;
    3142  	    }
    3143  	}
    3144  
    3145        if (is_range_exp == true)
    3146  	{
    3147  	  end_elem.opr.name = end_name_buf;
    3148  	  end_elem.type = COLL_SYM;
    3149  	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
    3150  				       dfa, syntax, true);
    3151  	  if (__glibc_unlikely (ret != REG_NOERROR))
    3152  	    {
    3153  	      *err = ret;
    3154  	      goto parse_bracket_exp_free_return;
    3155  	    }
    3156  
    3157  	  token_len = peek_token_bracket (token, regexp, syntax);
    3158  
    3159  	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
    3160  				  &start_elem, &end_elem,
    3161  				  dfa, syntax, nrules, collseqmb, collseqwc,
    3162  				  table_size, symb_table, extra);
    3163  	  if (__glibc_unlikely (*err != REG_NOERROR))
    3164  	    goto parse_bracket_exp_free_return;
    3165  	}
    3166        else
    3167  	{
    3168  	  switch (start_elem.type)
    3169  	    {
    3170  	    case SB_CHAR:
    3171  	      bitset_set (sbcset, start_elem.opr.ch);
    3172  	      break;
    3173  	    case MB_CHAR:
    3174  	      /* Check whether the array has enough space.  */
    3175  	      if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
    3176  		{
    3177  		  wchar_t *new_mbchars;
    3178  		  /* Not enough, realloc it.  */
    3179  		  /* +1 in case of mbcset->nmbchars is 0.  */
    3180  		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
    3181  		  /* Use realloc since array is NULL if *alloc == 0.  */
    3182  		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
    3183  					    mbchar_alloc);
    3184  		  if (__glibc_unlikely (new_mbchars == NULL))
    3185  		    goto parse_bracket_exp_espace;
    3186  		  mbcset->mbchars = new_mbchars;
    3187  		}
    3188  	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
    3189  	      break;
    3190  	    case EQUIV_CLASS:
    3191  	      *err = build_equiv_class (sbcset,
    3192  					mbcset, &equiv_class_alloc,
    3193  					start_elem.opr.name);
    3194  	      if (__glibc_unlikely (*err != REG_NOERROR))
    3195  		goto parse_bracket_exp_free_return;
    3196  	      break;
    3197  	    case COLL_SYM:
    3198  	      *err = build_collating_symbol (sbcset,
    3199  					     mbcset, &coll_sym_alloc,
    3200  					     start_elem.opr.name,
    3201  					     nrules, table_size, symb_table, extra);
    3202  	      if (__glibc_unlikely (*err != REG_NOERROR))
    3203  		goto parse_bracket_exp_free_return;
    3204  	      break;
    3205  	    case CHAR_CLASS:
    3206  	      *err = build_charclass (regexp->trans, sbcset,
    3207  				      mbcset, &char_class_alloc,
    3208  				      (const char *) start_elem.opr.name,
    3209  				      syntax);
    3210  	      if (__glibc_unlikely (*err != REG_NOERROR))
    3211  	       goto parse_bracket_exp_free_return;
    3212  	      break;
    3213  	    default:
    3214  	      DEBUG_ASSERT (false);
    3215  	      break;
    3216  	    }
    3217  	}
    3218        if (__glibc_unlikely (token->type == END_OF_RE))
    3219  	{
    3220  	  *err = REG_EBRACK;
    3221  	  goto parse_bracket_exp_free_return;
    3222  	}
    3223        if (token->type == OP_CLOSE_BRACKET)
    3224  	break;
    3225      }
    3226  
    3227    re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
    3228  
    3229    /* If it is non-matching list.  */
    3230    if (non_match)
    3231      bitset_not (sbcset);
    3232  
    3233    /* Ensure only single byte characters are set.  */
    3234    if (dfa->mb_cur_max > 1)
    3235      bitset_mask (sbcset, dfa->sb_char);
    3236  
    3237    if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
    3238        || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
    3239  						     || mbcset->non_match)))
    3240      {
    3241        bin_tree_t *mbc_tree;
    3242        int sbc_idx;
    3243        /* Build a tree for complex bracket.  */
    3244        dfa->has_mb_node = 1;
    3245        br_token.type = COMPLEX_BRACKET;
    3246        br_token.opr.mbcset = mbcset;
    3247        mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
    3248        if (__glibc_unlikely (mbc_tree == NULL))
    3249  	goto parse_bracket_exp_espace;
    3250        for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
    3251  	if (sbcset[sbc_idx])
    3252  	  break;
    3253        /* If there are no bits set in sbcset, there is no point
    3254  	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
    3255        if (sbc_idx < BITSET_WORDS)
    3256  	{
    3257  	  /* Build a tree for simple bracket.  */
    3258  	  br_token.type = SIMPLE_BRACKET;
    3259  	  br_token.opr.sbcset = sbcset;
    3260  	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
    3261  	  if (__glibc_unlikely (work_tree == NULL))
    3262  	    goto parse_bracket_exp_espace;
    3263  
    3264  	  /* Then join them by ALT node.  */
    3265  	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
    3266  	  if (__glibc_unlikely (work_tree == NULL))
    3267  	    goto parse_bracket_exp_espace;
    3268  	}
    3269        else
    3270  	{
    3271  	  re_free (sbcset);
    3272  	  work_tree = mbc_tree;
    3273  	}
    3274      }
    3275    else
    3276      {
    3277        free_charset (mbcset);
    3278        /* Build a tree for simple bracket.  */
    3279        br_token.type = SIMPLE_BRACKET;
    3280        br_token.opr.sbcset = sbcset;
    3281        work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
    3282        if (__glibc_unlikely (work_tree == NULL))
    3283  	goto parse_bracket_exp_espace;
    3284      }
    3285    return work_tree;
    3286  
    3287   parse_bracket_exp_espace:
    3288    *err = REG_ESPACE;
    3289   parse_bracket_exp_free_return:
    3290    re_free (sbcset);
    3291    free_charset (mbcset);
    3292    return NULL;
    3293  }
    3294  
    3295  /* Parse an element in the bracket expression.  */
    3296  
    3297  static reg_errcode_t
    3298  parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
    3299  		       re_token_t *token, int token_len, re_dfa_t *dfa,
    3300  		       reg_syntax_t syntax, bool accept_hyphen)
    3301  {
    3302    int cur_char_size;
    3303    cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
    3304    if (cur_char_size > 1)
    3305      {
    3306        elem->type = MB_CHAR;
    3307        elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
    3308        re_string_skip_bytes (regexp, cur_char_size);
    3309        return REG_NOERROR;
    3310      }
    3311    re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
    3312    if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
    3313        || token->type == OP_OPEN_EQUIV_CLASS)
    3314      return parse_bracket_symbol (elem, regexp, token);
    3315    if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
    3316      {
    3317        /* A '-' must only appear as anything but a range indicator before
    3318  	 the closing bracket.  Everything else is an error.  */
    3319        re_token_t token2;
    3320        (void) peek_token_bracket (&token2, regexp, syntax);
    3321        if (token2.type != OP_CLOSE_BRACKET)
    3322  	/* The actual error value is not standardized since this whole
    3323  	   case is undefined.  But ERANGE makes good sense.  */
    3324  	return REG_ERANGE;
    3325      }
    3326    elem->type = SB_CHAR;
    3327    elem->opr.ch = token->opr.c;
    3328    return REG_NOERROR;
    3329  }
    3330  
    3331  /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
    3332     such as [:<character_class>:], [.<collating_element>.], and
    3333     [=<equivalent_class>=].  */
    3334  
    3335  static reg_errcode_t
    3336  parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
    3337  		      re_token_t *token)
    3338  {
    3339    unsigned char ch, delim = token->opr.c;
    3340    int i = 0;
    3341    if (re_string_eoi(regexp))
    3342      return REG_EBRACK;
    3343    for (;; ++i)
    3344      {
    3345        if (i >= BRACKET_NAME_BUF_SIZE)
    3346  	return REG_EBRACK;
    3347        if (token->type == OP_OPEN_CHAR_CLASS)
    3348  	ch = re_string_fetch_byte_case (regexp);
    3349        else
    3350  	ch = re_string_fetch_byte (regexp);
    3351        if (re_string_eoi(regexp))
    3352  	return REG_EBRACK;
    3353        if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
    3354  	break;
    3355        elem->opr.name[i] = ch;
    3356      }
    3357    re_string_skip_bytes (regexp, 1);
    3358    elem->opr.name[i] = '\0';
    3359    switch (token->type)
    3360      {
    3361      case OP_OPEN_COLL_ELEM:
    3362        elem->type = COLL_SYM;
    3363        break;
    3364      case OP_OPEN_EQUIV_CLASS:
    3365        elem->type = EQUIV_CLASS;
    3366        break;
    3367      case OP_OPEN_CHAR_CLASS:
    3368        elem->type = CHAR_CLASS;
    3369        break;
    3370      default:
    3371        break;
    3372      }
    3373    return REG_NOERROR;
    3374  }
    3375  
    3376    /* Helper function for parse_bracket_exp.
    3377       Build the equivalence class which is represented by NAME.
    3378       The result are written to MBCSET and SBCSET.
    3379       EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
    3380       is a pointer argument since we may update it.  */
    3381  
    3382  static reg_errcode_t
    3383  build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
    3384  		   Idx *equiv_class_alloc, const unsigned char *name)
    3385  {
    3386  #ifdef _LIBC
    3387    uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
    3388    if (nrules != 0)
    3389      {
    3390        const int32_t *table, *indirect;
    3391        const unsigned char *weights, *extra, *cp;
    3392        unsigned char char_buf[2];
    3393        int32_t idx1, idx2;
    3394        unsigned int ch;
    3395        size_t len;
    3396        /* Calculate the index for equivalence class.  */
    3397        cp = name;
    3398        table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
    3399        weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
    3400  					       _NL_COLLATE_WEIGHTMB);
    3401        extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
    3402  						   _NL_COLLATE_EXTRAMB);
    3403        indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
    3404  						_NL_COLLATE_INDIRECTMB);
    3405        idx1 = findidx (table, indirect, extra, &cp, -1);
    3406        if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
    3407  	/* This isn't a valid character.  */
    3408  	return REG_ECOLLATE;
    3409  
    3410        /* Build single byte matching table for this equivalence class.  */
    3411        len = weights[idx1 & 0xffffff];
    3412        for (ch = 0; ch < SBC_MAX; ++ch)
    3413  	{
    3414  	  char_buf[0] = ch;
    3415  	  cp = char_buf;
    3416  	  idx2 = findidx (table, indirect, extra, &cp, 1);
    3417  /*
    3418  	  idx2 = table[ch];
    3419  */
    3420  	  if (idx2 == 0)
    3421  	    /* This isn't a valid character.  */
    3422  	    continue;
    3423  	  /* Compare only if the length matches and the collation rule
    3424  	     index is the same.  */
    3425  	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
    3426  	      && memcmp (weights + (idx1 & 0xffffff) + 1,
    3427  			 weights + (idx2 & 0xffffff) + 1, len) == 0)
    3428  	    bitset_set (sbcset, ch);
    3429  	}
    3430        /* Check whether the array has enough space.  */
    3431        if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
    3432  	{
    3433  	  /* Not enough, realloc it.  */
    3434  	  /* +1 in case of mbcset->nequiv_classes is 0.  */
    3435  	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
    3436  	  /* Use realloc since the array is NULL if *alloc == 0.  */
    3437  	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
    3438  						   int32_t,
    3439  						   new_equiv_class_alloc);
    3440  	  if (__glibc_unlikely (new_equiv_classes == NULL))
    3441  	    return REG_ESPACE;
    3442  	  mbcset->equiv_classes = new_equiv_classes;
    3443  	  *equiv_class_alloc = new_equiv_class_alloc;
    3444  	}
    3445        mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
    3446      }
    3447    else
    3448  #endif /* _LIBC */
    3449      {
    3450        if (__glibc_unlikely (strlen ((const char *) name) != 1))
    3451  	return REG_ECOLLATE;
    3452        bitset_set (sbcset, *name);
    3453      }
    3454    return REG_NOERROR;
    3455  }
    3456  
    3457    /* Helper function for parse_bracket_exp.
    3458       Build the character class which is represented by NAME.
    3459       The result are written to MBCSET and SBCSET.
    3460       CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
    3461       is a pointer argument since we may update it.  */
    3462  
    3463  static reg_errcode_t
    3464  build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
    3465  		 re_charset_t *mbcset, Idx *char_class_alloc,
    3466  		 const char *class_name, reg_syntax_t syntax)
    3467  {
    3468    int i;
    3469    const char *name = class_name;
    3470  
    3471    /* In case of REG_ICASE "upper" and "lower" match the both of
    3472       upper and lower cases.  */
    3473    if ((syntax & RE_ICASE)
    3474        && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
    3475      name = "alpha";
    3476  
    3477    /* Check the space of the arrays.  */
    3478    if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
    3479      {
    3480        /* Not enough, realloc it.  */
    3481        /* +1 in case of mbcset->nchar_classes is 0.  */
    3482        Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
    3483        /* Use realloc since array is NULL if *alloc == 0.  */
    3484        wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
    3485  					       new_char_class_alloc);
    3486        if (__glibc_unlikely (new_char_classes == NULL))
    3487  	return REG_ESPACE;
    3488        mbcset->char_classes = new_char_classes;
    3489        *char_class_alloc = new_char_class_alloc;
    3490      }
    3491    mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
    3492  
    3493  #define BUILD_CHARCLASS_LOOP(ctype_func)	\
    3494    do {						\
    3495      if (__glibc_unlikely (trans != NULL))			\
    3496        {						\
    3497  	for (i = 0; i < SBC_MAX; ++i)		\
    3498  	  if (ctype_func (i))			\
    3499  	    bitset_set (sbcset, trans[i]);	\
    3500        }						\
    3501      else					\
    3502        {						\
    3503  	for (i = 0; i < SBC_MAX; ++i)		\
    3504  	  if (ctype_func (i))			\
    3505  	    bitset_set (sbcset, i);		\
    3506        }						\
    3507    } while (0)
    3508  
    3509    if (strcmp (name, "alnum") == 0)
    3510      BUILD_CHARCLASS_LOOP (isalnum);
    3511    else if (strcmp (name, "cntrl") == 0)
    3512      BUILD_CHARCLASS_LOOP (iscntrl);
    3513    else if (strcmp (name, "lower") == 0)
    3514      BUILD_CHARCLASS_LOOP (islower);
    3515    else if (strcmp (name, "space") == 0)
    3516      BUILD_CHARCLASS_LOOP (isspace);
    3517    else if (strcmp (name, "alpha") == 0)
    3518      BUILD_CHARCLASS_LOOP (isalpha);
    3519    else if (strcmp (name, "digit") == 0)
    3520      BUILD_CHARCLASS_LOOP (isdigit);
    3521    else if (strcmp (name, "print") == 0)
    3522      BUILD_CHARCLASS_LOOP (isprint);
    3523    else if (strcmp (name, "upper") == 0)
    3524      BUILD_CHARCLASS_LOOP (isupper);
    3525    else if (strcmp (name, "blank") == 0)
    3526      BUILD_CHARCLASS_LOOP (isblank);
    3527    else if (strcmp (name, "graph") == 0)
    3528      BUILD_CHARCLASS_LOOP (isgraph);
    3529    else if (strcmp (name, "punct") == 0)
    3530      BUILD_CHARCLASS_LOOP (ispunct);
    3531    else if (strcmp (name, "xdigit") == 0)
    3532      BUILD_CHARCLASS_LOOP (isxdigit);
    3533    else
    3534      return REG_ECTYPE;
    3535  
    3536    return REG_NOERROR;
    3537  }
    3538  
    3539  static bin_tree_t *
    3540  build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
    3541  		    const char *class_name,
    3542  		    const char *extra, bool non_match,
    3543  		    reg_errcode_t *err)
    3544  {
    3545    re_bitset_ptr_t sbcset;
    3546    re_charset_t *mbcset;
    3547    Idx alloc = 0;
    3548    reg_errcode_t ret;
    3549    bin_tree_t *tree;
    3550  
    3551    sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
    3552    if (__glibc_unlikely (sbcset == NULL))
    3553      {
    3554        *err = REG_ESPACE;
    3555        return NULL;
    3556      }
    3557    mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
    3558    if (__glibc_unlikely (mbcset == NULL))
    3559      {
    3560        re_free (sbcset);
    3561        *err = REG_ESPACE;
    3562        return NULL;
    3563      }
    3564    mbcset->non_match = non_match;
    3565  
    3566    /* We don't care the syntax in this case.  */
    3567    ret = build_charclass (trans, sbcset, mbcset, &alloc, class_name, 0);
    3568  
    3569    if (__glibc_unlikely (ret != REG_NOERROR))
    3570      {
    3571        re_free (sbcset);
    3572        free_charset (mbcset);
    3573        *err = ret;
    3574        return NULL;
    3575      }
    3576    /* \w match '_' also.  */
    3577    for (; *extra; extra++)
    3578      bitset_set (sbcset, *extra);
    3579  
    3580    /* If it is non-matching list.  */
    3581    if (non_match)
    3582      bitset_not (sbcset);
    3583  
    3584    /* Ensure only single byte characters are set.  */
    3585    if (dfa->mb_cur_max > 1)
    3586      bitset_mask (sbcset, dfa->sb_char);
    3587  
    3588    /* Build a tree for simple bracket.  */
    3589    re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
    3590    tree = create_token_tree (dfa, NULL, NULL, &br_token);
    3591    if (__glibc_unlikely (tree == NULL))
    3592      goto build_word_op_espace;
    3593  
    3594    if (dfa->mb_cur_max > 1)
    3595      {
    3596        bin_tree_t *mbc_tree;
    3597        /* Build a tree for complex bracket.  */
    3598        br_token.type = COMPLEX_BRACKET;
    3599        br_token.opr.mbcset = mbcset;
    3600        dfa->has_mb_node = 1;
    3601        mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
    3602        if (__glibc_unlikely (mbc_tree == NULL))
    3603  	goto build_word_op_espace;
    3604        /* Then join them by ALT node.  */
    3605        tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
    3606        if (__glibc_likely (mbc_tree != NULL))
    3607  	return tree;
    3608      }
    3609    else
    3610      {
    3611        free_charset (mbcset);
    3612        return tree;
    3613      }
    3614  
    3615   build_word_op_espace:
    3616    re_free (sbcset);
    3617    free_charset (mbcset);
    3618    *err = REG_ESPACE;
    3619    return NULL;
    3620  }
    3621  
    3622  /* This is intended for the expressions like "a{1,3}".
    3623     Fetch a number from 'input', and return the number.
    3624     Return -1 if the number field is empty like "{,1}".
    3625     Return RE_DUP_MAX + 1 if the number field is too large.
    3626     Return -2 if an error occurred.  */
    3627  
    3628  static Idx
    3629  fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
    3630  {
    3631    Idx num = -1;
    3632    unsigned char c;
    3633    while (1)
    3634      {
    3635        fetch_token (token, input, syntax);
    3636        c = token->opr.c;
    3637        if (__glibc_unlikely (token->type == END_OF_RE))
    3638  	return -2;
    3639        if (token->type == OP_CLOSE_DUP_NUM || c == ',')
    3640  	break;
    3641        num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
    3642  	     ? -2
    3643  	     : num == -1
    3644  	     ? c - '0'
    3645  	     : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
    3646      }
    3647    return num;
    3648  }
    3649  
    3650  static void
    3651  free_charset (re_charset_t *cset)
    3652  {
    3653    re_free (cset->mbchars);
    3654  #ifdef _LIBC
    3655    re_free (cset->coll_syms);
    3656    re_free (cset->equiv_classes);
    3657  #endif
    3658    re_free (cset->range_starts);
    3659    re_free (cset->range_ends);
    3660    re_free (cset->char_classes);
    3661    re_free (cset);
    3662  }
    3663  
    3664  /* Functions for binary tree operation.  */
    3665  
    3666  /* Create a tree node.  */
    3667  
    3668  static bin_tree_t *
    3669  create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
    3670  	     re_token_type_t type)
    3671  {
    3672    re_token_t t = { .type = type };
    3673    return create_token_tree (dfa, left, right, &t);
    3674  }
    3675  
    3676  static bin_tree_t *
    3677  create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
    3678  		   const re_token_t *token)
    3679  {
    3680    bin_tree_t *tree;
    3681    if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
    3682      {
    3683        bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
    3684  
    3685        if (storage == NULL)
    3686  	return NULL;
    3687        storage->next = dfa->str_tree_storage;
    3688        dfa->str_tree_storage = storage;
    3689        dfa->str_tree_storage_idx = 0;
    3690      }
    3691    tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
    3692  
    3693    tree->parent = NULL;
    3694    tree->left = left;
    3695    tree->right = right;
    3696    tree->token = *token;
    3697    tree->token.duplicated = 0;
    3698    tree->token.opt_subexp = 0;
    3699    tree->first = NULL;
    3700    tree->next = NULL;
    3701    tree->node_idx = -1;
    3702  
    3703    if (left != NULL)
    3704      left->parent = tree;
    3705    if (right != NULL)
    3706      right->parent = tree;
    3707    return tree;
    3708  }
    3709  
    3710  /* Mark the tree SRC as an optional subexpression.
    3711     To be called from preorder or postorder.  */
    3712  
    3713  static reg_errcode_t
    3714  mark_opt_subexp (void *extra, bin_tree_t *node)
    3715  {
    3716    Idx idx = (uintptr_t) extra;
    3717    if (node->token.type == SUBEXP && node->token.opr.idx == idx)
    3718      node->token.opt_subexp = 1;
    3719  
    3720    return REG_NOERROR;
    3721  }
    3722  
    3723  /* Free the allocated memory inside NODE. */
    3724  
    3725  static void
    3726  free_token (re_token_t *node)
    3727  {
    3728    if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
    3729      free_charset (node->opr.mbcset);
    3730    else if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
    3731      re_free (node->opr.sbcset);
    3732  }
    3733  
    3734  /* Worker function for tree walking.  Free the allocated memory inside NODE
    3735     and its children. */
    3736  
    3737  static reg_errcode_t
    3738  free_tree (void *extra, bin_tree_t *node)
    3739  {
    3740    free_token (&node->token);
    3741    return REG_NOERROR;
    3742  }
    3743  
    3744  
    3745  /* Duplicate the node SRC, and return new node.  This is a preorder
    3746     visit similar to the one implemented by the generic visitor, but
    3747     we need more infrastructure to maintain two parallel trees --- so,
    3748     it's easier to duplicate.  */
    3749  
    3750  static bin_tree_t *
    3751  duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
    3752  {
    3753    const bin_tree_t *node;
    3754    bin_tree_t *dup_root;
    3755    bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
    3756  
    3757    for (node = root; ; )
    3758      {
    3759        /* Create a new tree and link it back to the current parent.  */
    3760        *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
    3761        if (*p_new == NULL)
    3762  	return NULL;
    3763        (*p_new)->parent = dup_node;
    3764        (*p_new)->token.duplicated = 1;
    3765        dup_node = *p_new;
    3766  
    3767        /* Go to the left node, or up and to the right.  */
    3768        if (node->left)
    3769  	{
    3770  	  node = node->left;
    3771  	  p_new = &dup_node->left;
    3772  	}
    3773        else
    3774  	{
    3775  	  const bin_tree_t *prev = NULL;
    3776  	  while (node->right == prev || node->right == NULL)
    3777  	    {
    3778  	      prev = node;
    3779  	      node = node->parent;
    3780  	      dup_node = dup_node->parent;
    3781  	      if (!node)
    3782  		return dup_root;
    3783  	    }
    3784  	  node = node->right;
    3785  	  p_new = &dup_node->right;
    3786  	}
    3787      }
    3788  }