1  /* Extended regular expression matching and search library.
       2     Copyright (C) 2002-2021 Free Software Foundation, Inc.
       3     This file is part of the GNU C Library.
       4     Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
       5  
       6     The GNU C Library is free software; you can redistribute it and/or
       7     modify it under the terms of the GNU General Public
       8     License as published by the Free Software Foundation; either
       9     version 3 of the License, or (at your option) any later version.
      10  
      11     The GNU C Library is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      14     General Public License for more details.
      15  
      16     You should have received a copy of the GNU General Public
      17     License along with the GNU C Library; if not, see
      18     <https://www.gnu.org/licenses/>.  */
      19  
      20  static void re_string_construct_common (const char *str, Idx len,
      21  					re_string_t *pstr,
      22  					RE_TRANSLATE_TYPE trans, bool icase,
      23  					const re_dfa_t *dfa);
      24  static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
      25  					  const re_node_set *nodes,
      26  					  re_hashval_t hash);
      27  static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
      28  					  const re_node_set *nodes,
      29  					  unsigned int context,
      30  					  re_hashval_t hash);
      31  static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
      32  						Idx new_buf_len);
      33  #ifdef RE_ENABLE_I18N
      34  static void build_wcs_buffer (re_string_t *pstr);
      35  static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
      36  #endif /* RE_ENABLE_I18N */
      37  static void build_upper_buffer (re_string_t *pstr);
      38  static void re_string_translate_buffer (re_string_t *pstr);
      39  static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
      40  					  int eflags) __attribute__ ((pure));
      41  
      42  /* Functions for string operation.  */
      43  
      44  /* This function allocate the buffers.  It is necessary to call
      45     re_string_reconstruct before using the object.  */
      46  
      47  static reg_errcode_t
      48  __attribute_warn_unused_result__
      49  re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
      50  		    RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
      51  {
      52    reg_errcode_t ret;
      53    Idx init_buf_len;
      54  
      55    /* Ensure at least one character fits into the buffers.  */
      56    if (init_len < dfa->mb_cur_max)
      57      init_len = dfa->mb_cur_max;
      58    init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
      59    re_string_construct_common (str, len, pstr, trans, icase, dfa);
      60  
      61    ret = re_string_realloc_buffers (pstr, init_buf_len);
      62    if (__glibc_unlikely (ret != REG_NOERROR))
      63      return ret;
      64  
      65    pstr->word_char = dfa->word_char;
      66    pstr->word_ops_used = dfa->word_ops_used;
      67    pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
      68    pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
      69    pstr->valid_raw_len = pstr->valid_len;
      70    return REG_NOERROR;
      71  }
      72  
      73  /* This function allocate the buffers, and initialize them.  */
      74  
      75  static reg_errcode_t
      76  __attribute_warn_unused_result__
      77  re_string_construct (re_string_t *pstr, const char *str, Idx len,
      78  		     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
      79  {
      80    reg_errcode_t ret;
      81    memset (pstr, '\0', sizeof (re_string_t));
      82    re_string_construct_common (str, len, pstr, trans, icase, dfa);
      83  
      84    if (len > 0)
      85      {
      86        ret = re_string_realloc_buffers (pstr, len + 1);
      87        if (__glibc_unlikely (ret != REG_NOERROR))
      88  	return ret;
      89      }
      90    pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
      91  
      92    if (icase)
      93      {
      94  #ifdef RE_ENABLE_I18N
      95        if (dfa->mb_cur_max > 1)
      96  	{
      97  	  while (1)
      98  	    {
      99  	      ret = build_wcs_upper_buffer (pstr);
     100  	      if (__glibc_unlikely (ret != REG_NOERROR))
     101  		return ret;
     102  	      if (pstr->valid_raw_len >= len)
     103  		break;
     104  	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
     105  		break;
     106  	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
     107  	      if (__glibc_unlikely (ret != REG_NOERROR))
     108  		return ret;
     109  	    }
     110  	}
     111        else
     112  #endif /* RE_ENABLE_I18N  */
     113  	build_upper_buffer (pstr);
     114      }
     115    else
     116      {
     117  #ifdef RE_ENABLE_I18N
     118        if (dfa->mb_cur_max > 1)
     119  	build_wcs_buffer (pstr);
     120        else
     121  #endif /* RE_ENABLE_I18N  */
     122  	{
     123  	  if (trans != NULL)
     124  	    re_string_translate_buffer (pstr);
     125  	  else
     126  	    {
     127  	      pstr->valid_len = pstr->bufs_len;
     128  	      pstr->valid_raw_len = pstr->bufs_len;
     129  	    }
     130  	}
     131      }
     132  
     133    return REG_NOERROR;
     134  }
     135  
     136  /* Helper functions for re_string_allocate, and re_string_construct.  */
     137  
     138  static reg_errcode_t
     139  __attribute_warn_unused_result__
     140  re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
     141  {
     142  #ifdef RE_ENABLE_I18N
     143    if (pstr->mb_cur_max > 1)
     144      {
     145        wint_t *new_wcs;
     146  
     147        /* Avoid overflow in realloc.  */
     148        const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
     149        if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
     150  			    < new_buf_len))
     151  	return REG_ESPACE;
     152  
     153        new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
     154        if (__glibc_unlikely (new_wcs == NULL))
     155  	return REG_ESPACE;
     156        pstr->wcs = new_wcs;
     157        if (pstr->offsets != NULL)
     158  	{
     159  	  Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
     160  	  if (__glibc_unlikely (new_offsets == NULL))
     161  	    return REG_ESPACE;
     162  	  pstr->offsets = new_offsets;
     163  	}
     164      }
     165  #endif /* RE_ENABLE_I18N  */
     166    if (pstr->mbs_allocated)
     167      {
     168        unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
     169  					   new_buf_len);
     170        if (__glibc_unlikely (new_mbs == NULL))
     171  	return REG_ESPACE;
     172        pstr->mbs = new_mbs;
     173      }
     174    pstr->bufs_len = new_buf_len;
     175    return REG_NOERROR;
     176  }
     177  
     178  
     179  static void
     180  re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
     181  			    RE_TRANSLATE_TYPE trans, bool icase,
     182  			    const re_dfa_t *dfa)
     183  {
     184    pstr->raw_mbs = (const unsigned char *) str;
     185    pstr->len = len;
     186    pstr->raw_len = len;
     187    pstr->trans = trans;
     188    pstr->icase = icase;
     189    pstr->mbs_allocated = (trans != NULL || icase);
     190    pstr->mb_cur_max = dfa->mb_cur_max;
     191    pstr->is_utf8 = dfa->is_utf8;
     192    pstr->map_notascii = dfa->map_notascii;
     193    pstr->stop = pstr->len;
     194    pstr->raw_stop = pstr->stop;
     195  }
     196  
     197  #ifdef RE_ENABLE_I18N
     198  
     199  /* Build wide character buffer PSTR->WCS.
     200     If the byte sequence of the string are:
     201       <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
     202     Then wide character buffer will be:
     203       <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
     204     We use WEOF for padding, they indicate that the position isn't
     205     a first byte of a multibyte character.
     206  
     207     Note that this function assumes PSTR->VALID_LEN elements are already
     208     built and starts from PSTR->VALID_LEN.  */
     209  
     210  static void
     211  build_wcs_buffer (re_string_t *pstr)
     212  {
     213  #ifdef _LIBC
     214    unsigned char buf[MB_LEN_MAX];
     215    DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
     216  #else
     217    unsigned char buf[64];
     218  #endif
     219    mbstate_t prev_st;
     220    Idx byte_idx, end_idx, remain_len;
     221    size_t mbclen;
     222  
     223    /* Build the buffers from pstr->valid_len to either pstr->len or
     224       pstr->bufs_len.  */
     225    end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
     226    for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
     227      {
     228        wchar_t wc;
     229        const char *p;
     230  
     231        remain_len = end_idx - byte_idx;
     232        prev_st = pstr->cur_state;
     233        /* Apply the translation if we need.  */
     234        if (__glibc_unlikely (pstr->trans != NULL))
     235  	{
     236  	  int i, ch;
     237  
     238  	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
     239  	    {
     240  	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
     241  	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
     242  	    }
     243  	  p = (const char *) buf;
     244  	}
     245        else
     246  	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
     247        mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
     248        if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
     249  			    || (mbclen == (size_t) -2
     250  				&& pstr->bufs_len >= pstr->len)))
     251  	{
     252  	  /* We treat these cases as a singlebyte character.  */
     253  	  mbclen = 1;
     254  	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
     255  	  if (__glibc_unlikely (pstr->trans != NULL))
     256  	    wc = pstr->trans[wc];
     257  	  pstr->cur_state = prev_st;
     258  	}
     259        else if (__glibc_unlikely (mbclen == (size_t) -2))
     260  	{
     261  	  /* The buffer doesn't have enough space, finish to build.  */
     262  	  pstr->cur_state = prev_st;
     263  	  break;
     264  	}
     265  
     266        /* Write wide character and padding.  */
     267        pstr->wcs[byte_idx++] = wc;
     268        /* Write paddings.  */
     269        for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
     270  	pstr->wcs[byte_idx++] = WEOF;
     271      }
     272    pstr->valid_len = byte_idx;
     273    pstr->valid_raw_len = byte_idx;
     274  }
     275  
     276  /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
     277     but for REG_ICASE.  */
     278  
     279  static reg_errcode_t
     280  __attribute_warn_unused_result__
     281  build_wcs_upper_buffer (re_string_t *pstr)
     282  {
     283    mbstate_t prev_st;
     284    Idx src_idx, byte_idx, end_idx, remain_len;
     285    size_t mbclen;
     286  #ifdef _LIBC
     287    char buf[MB_LEN_MAX];
     288    DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
     289  #else
     290    char buf[64];
     291  #endif
     292  
     293    byte_idx = pstr->valid_len;
     294    end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
     295  
     296    /* The following optimization assumes that ASCII characters can be
     297       mapped to wide characters with a simple cast.  */
     298    if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
     299      {
     300        while (byte_idx < end_idx)
     301  	{
     302  	  wchar_t wc;
     303  	  unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
     304  
     305  	  if (isascii (ch) && mbsinit (&pstr->cur_state))
     306  	    {
     307  	      /* The next step uses the assumption that wchar_t is encoded
     308  		 ASCII-safe: all ASCII values can be converted like this.  */
     309  	      wchar_t wcu = __towupper (ch);
     310  	      if (isascii (wcu))
     311  		{
     312  		  pstr->mbs[byte_idx] = wcu;
     313  		  pstr->wcs[byte_idx] = wcu;
     314  		  byte_idx++;
     315  		  continue;
     316  		}
     317  	    }
     318  
     319  	  remain_len = end_idx - byte_idx;
     320  	  prev_st = pstr->cur_state;
     321  	  mbclen = __mbrtowc (&wc,
     322  			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
     323  			       + byte_idx), remain_len, &pstr->cur_state);
     324  	  if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
     325  	    {
     326  	      wchar_t wcu = __towupper (wc);
     327  	      if (wcu != wc)
     328  		{
     329  		  size_t mbcdlen;
     330  
     331  		  mbcdlen = __wcrtomb (buf, wcu, &prev_st);
     332  		  if (__glibc_likely (mbclen == mbcdlen))
     333  		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
     334  		  else
     335  		    {
     336  		      src_idx = byte_idx;
     337  		      goto offsets_needed;
     338  		    }
     339  		}
     340  	      else
     341  		memcpy (pstr->mbs + byte_idx,
     342  			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
     343  	      pstr->wcs[byte_idx++] = wcu;
     344  	      /* Write paddings.  */
     345  	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
     346  		pstr->wcs[byte_idx++] = WEOF;
     347  	    }
     348  	  else if (mbclen == (size_t) -1 || mbclen == 0
     349  		   || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
     350  	    {
     351  	      /* It is an invalid character, an incomplete character
     352  		 at the end of the string, or '\0'.  Just use the byte.  */
     353  	      pstr->mbs[byte_idx] = ch;
     354  	      /* And also cast it to wide char.  */
     355  	      pstr->wcs[byte_idx++] = (wchar_t) ch;
     356  	      if (__glibc_unlikely (mbclen == (size_t) -1))
     357  		pstr->cur_state = prev_st;
     358  	    }
     359  	  else
     360  	    {
     361  	      /* The buffer doesn't have enough space, finish to build.  */
     362  	      pstr->cur_state = prev_st;
     363  	      break;
     364  	    }
     365  	}
     366        pstr->valid_len = byte_idx;
     367        pstr->valid_raw_len = byte_idx;
     368        return REG_NOERROR;
     369      }
     370    else
     371      for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
     372        {
     373  	wchar_t wc;
     374  	const char *p;
     375        offsets_needed:
     376  	remain_len = end_idx - byte_idx;
     377  	prev_st = pstr->cur_state;
     378  	if (__glibc_unlikely (pstr->trans != NULL))
     379  	  {
     380  	    int i, ch;
     381  
     382  	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
     383  	      {
     384  		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
     385  		buf[i] = pstr->trans[ch];
     386  	      }
     387  	    p = (const char *) buf;
     388  	  }
     389  	else
     390  	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
     391  	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
     392  	if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
     393  	  {
     394  	    wchar_t wcu = __towupper (wc);
     395  	    if (wcu != wc)
     396  	      {
     397  		size_t mbcdlen;
     398  
     399  		mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
     400  		if (__glibc_likely (mbclen == mbcdlen))
     401  		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
     402  		else if (mbcdlen != (size_t) -1)
     403  		  {
     404  		    size_t i;
     405  
     406  		    if (byte_idx + mbcdlen > pstr->bufs_len)
     407  		      {
     408  			pstr->cur_state = prev_st;
     409  			break;
     410  		      }
     411  
     412  		    if (pstr->offsets == NULL)
     413  		      {
     414  			pstr->offsets = re_malloc (Idx, pstr->bufs_len);
     415  
     416  			if (pstr->offsets == NULL)
     417  			  return REG_ESPACE;
     418  		      }
     419  		    if (!pstr->offsets_needed)
     420  		      {
     421  			for (i = 0; i < (size_t) byte_idx; ++i)
     422  			  pstr->offsets[i] = i;
     423  			pstr->offsets_needed = 1;
     424  		      }
     425  
     426  		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
     427  		    pstr->wcs[byte_idx] = wcu;
     428  		    pstr->offsets[byte_idx] = src_idx;
     429  		    for (i = 1; i < mbcdlen; ++i)
     430  		      {
     431  			pstr->offsets[byte_idx + i]
     432  			  = src_idx + (i < mbclen ? i : mbclen - 1);
     433  			pstr->wcs[byte_idx + i] = WEOF;
     434  		      }
     435  		    pstr->len += mbcdlen - mbclen;
     436  		    if (pstr->raw_stop > src_idx)
     437  		      pstr->stop += mbcdlen - mbclen;
     438  		    end_idx = (pstr->bufs_len > pstr->len)
     439  			      ? pstr->len : pstr->bufs_len;
     440  		    byte_idx += mbcdlen;
     441  		    src_idx += mbclen;
     442  		    continue;
     443  		  }
     444  		else
     445  		  memcpy (pstr->mbs + byte_idx, p, mbclen);
     446  	      }
     447  	    else
     448  	      memcpy (pstr->mbs + byte_idx, p, mbclen);
     449  
     450  	    if (__glibc_unlikely (pstr->offsets_needed != 0))
     451  	      {
     452  		size_t i;
     453  		for (i = 0; i < mbclen; ++i)
     454  		  pstr->offsets[byte_idx + i] = src_idx + i;
     455  	      }
     456  	    src_idx += mbclen;
     457  
     458  	    pstr->wcs[byte_idx++] = wcu;
     459  	    /* Write paddings.  */
     460  	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
     461  	      pstr->wcs[byte_idx++] = WEOF;
     462  	  }
     463  	else if (mbclen == (size_t) -1 || mbclen == 0
     464  		 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
     465  	  {
     466  	    /* It is an invalid character or '\0'.  Just use the byte.  */
     467  	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
     468  
     469  	    if (__glibc_unlikely (pstr->trans != NULL))
     470  	      ch = pstr->trans [ch];
     471  	    pstr->mbs[byte_idx] = ch;
     472  
     473  	    if (__glibc_unlikely (pstr->offsets_needed != 0))
     474  	      pstr->offsets[byte_idx] = src_idx;
     475  	    ++src_idx;
     476  
     477  	    /* And also cast it to wide char.  */
     478  	    pstr->wcs[byte_idx++] = (wchar_t) ch;
     479  	    if (__glibc_unlikely (mbclen == (size_t) -1))
     480  	      pstr->cur_state = prev_st;
     481  	  }
     482  	else
     483  	  {
     484  	    /* The buffer doesn't have enough space, finish to build.  */
     485  	    pstr->cur_state = prev_st;
     486  	    break;
     487  	  }
     488        }
     489    pstr->valid_len = byte_idx;
     490    pstr->valid_raw_len = src_idx;
     491    return REG_NOERROR;
     492  }
     493  
     494  /* Skip characters until the index becomes greater than NEW_RAW_IDX.
     495     Return the index.  */
     496  
     497  static Idx
     498  re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
     499  {
     500    mbstate_t prev_st;
     501    Idx rawbuf_idx;
     502    size_t mbclen;
     503    wint_t wc = WEOF;
     504  
     505    /* Skip the characters which are not necessary to check.  */
     506    for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
     507         rawbuf_idx < new_raw_idx;)
     508      {
     509        wchar_t wc2;
     510        Idx remain_len = pstr->raw_len - rawbuf_idx;
     511        prev_st = pstr->cur_state;
     512        mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
     513  			  remain_len, &pstr->cur_state);
     514        if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
     515  			    || mbclen == 0))
     516  	{
     517  	  /* We treat these cases as a single byte character.  */
     518  	  if (mbclen == 0 || remain_len == 0)
     519  	    wc = L'\0';
     520  	  else
     521  	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
     522  	  mbclen = 1;
     523  	  pstr->cur_state = prev_st;
     524  	}
     525        else
     526  	wc = wc2;
     527        /* Then proceed the next character.  */
     528        rawbuf_idx += mbclen;
     529      }
     530    *last_wc = wc;
     531    return rawbuf_idx;
     532  }
     533  #endif /* RE_ENABLE_I18N  */
     534  
     535  /* Build the buffer PSTR->MBS, and apply the translation if we need.
     536     This function is used in case of REG_ICASE.  */
     537  
     538  static void
     539  build_upper_buffer (re_string_t *pstr)
     540  {
     541    Idx char_idx, end_idx;
     542    end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
     543  
     544    for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
     545      {
     546        int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
     547        if (__glibc_unlikely (pstr->trans != NULL))
     548  	ch = pstr->trans[ch];
     549        pstr->mbs[char_idx] = toupper (ch);
     550      }
     551    pstr->valid_len = char_idx;
     552    pstr->valid_raw_len = char_idx;
     553  }
     554  
     555  /* Apply TRANS to the buffer in PSTR.  */
     556  
     557  static void
     558  re_string_translate_buffer (re_string_t *pstr)
     559  {
     560    Idx buf_idx, end_idx;
     561    end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
     562  
     563    for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
     564      {
     565        int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
     566        pstr->mbs[buf_idx] = pstr->trans[ch];
     567      }
     568  
     569    pstr->valid_len = buf_idx;
     570    pstr->valid_raw_len = buf_idx;
     571  }
     572  
     573  /* This function re-construct the buffers.
     574     Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
     575     convert to upper case in case of REG_ICASE, apply translation.  */
     576  
     577  static reg_errcode_t
     578  __attribute_warn_unused_result__
     579  re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
     580  {
     581    Idx offset;
     582  
     583    if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
     584      offset = idx - pstr->raw_mbs_idx;
     585    else
     586      {
     587        /* Reset buffer.  */
     588  #ifdef RE_ENABLE_I18N
     589        if (pstr->mb_cur_max > 1)
     590  	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
     591  #endif /* RE_ENABLE_I18N */
     592        pstr->len = pstr->raw_len;
     593        pstr->stop = pstr->raw_stop;
     594        pstr->valid_len = 0;
     595        pstr->raw_mbs_idx = 0;
     596        pstr->valid_raw_len = 0;
     597        pstr->offsets_needed = 0;
     598        pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
     599  			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
     600        if (!pstr->mbs_allocated)
     601  	pstr->mbs = (unsigned char *) pstr->raw_mbs;
     602        offset = idx;
     603      }
     604  
     605    if (__glibc_likely (offset != 0))
     606      {
     607        /* Should the already checked characters be kept?  */
     608        if (__glibc_likely (offset < pstr->valid_raw_len))
     609  	{
     610  	  /* Yes, move them to the front of the buffer.  */
     611  #ifdef RE_ENABLE_I18N
     612  	  if (__glibc_unlikely (pstr->offsets_needed))
     613  	    {
     614  	      Idx low = 0, high = pstr->valid_len, mid;
     615  	      do
     616  		{
     617  		  mid = (high + low) / 2;
     618  		  if (pstr->offsets[mid] > offset)
     619  		    high = mid;
     620  		  else if (pstr->offsets[mid] < offset)
     621  		    low = mid + 1;
     622  		  else
     623  		    break;
     624  		}
     625  	      while (low < high);
     626  	      if (pstr->offsets[mid] < offset)
     627  		++mid;
     628  	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
     629  							eflags);
     630  	      /* This can be quite complicated, so handle specially
     631  		 only the common and easy case where the character with
     632  		 different length representation of lower and upper
     633  		 case is present at or after offset.  */
     634  	      if (pstr->valid_len > offset
     635  		  && mid == offset && pstr->offsets[mid] == offset)
     636  		{
     637  		  memmove (pstr->wcs, pstr->wcs + offset,
     638  			   (pstr->valid_len - offset) * sizeof (wint_t));
     639  		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
     640  		  pstr->valid_len -= offset;
     641  		  pstr->valid_raw_len -= offset;
     642  		  for (low = 0; low < pstr->valid_len; low++)
     643  		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
     644  		}
     645  	      else
     646  		{
     647  		  /* Otherwise, just find out how long the partial multibyte
     648  		     character at offset is and fill it with WEOF/255.  */
     649  		  pstr->len = pstr->raw_len - idx + offset;
     650  		  pstr->stop = pstr->raw_stop - idx + offset;
     651  		  pstr->offsets_needed = 0;
     652  		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
     653  		    --mid;
     654  		  while (mid < pstr->valid_len)
     655  		    if (pstr->wcs[mid] != WEOF)
     656  		      break;
     657  		    else
     658  		      ++mid;
     659  		  if (mid == pstr->valid_len)
     660  		    pstr->valid_len = 0;
     661  		  else
     662  		    {
     663  		      pstr->valid_len = pstr->offsets[mid] - offset;
     664  		      if (pstr->valid_len)
     665  			{
     666  			  for (low = 0; low < pstr->valid_len; ++low)
     667  			    pstr->wcs[low] = WEOF;
     668  			  memset (pstr->mbs, 255, pstr->valid_len);
     669  			}
     670  		    }
     671  		  pstr->valid_raw_len = pstr->valid_len;
     672  		}
     673  	    }
     674  	  else
     675  #endif
     676  	    {
     677  	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
     678  							eflags);
     679  #ifdef RE_ENABLE_I18N
     680  	      if (pstr->mb_cur_max > 1)
     681  		memmove (pstr->wcs, pstr->wcs + offset,
     682  			 (pstr->valid_len - offset) * sizeof (wint_t));
     683  #endif /* RE_ENABLE_I18N */
     684  	      if (__glibc_unlikely (pstr->mbs_allocated))
     685  		memmove (pstr->mbs, pstr->mbs + offset,
     686  			 pstr->valid_len - offset);
     687  	      pstr->valid_len -= offset;
     688  	      pstr->valid_raw_len -= offset;
     689  	      DEBUG_ASSERT (pstr->valid_len > 0);
     690  	    }
     691  	}
     692        else
     693  	{
     694  #ifdef RE_ENABLE_I18N
     695  	  /* No, skip all characters until IDX.  */
     696  	  Idx prev_valid_len = pstr->valid_len;
     697  
     698  	  if (__glibc_unlikely (pstr->offsets_needed))
     699  	    {
     700  	      pstr->len = pstr->raw_len - idx + offset;
     701  	      pstr->stop = pstr->raw_stop - idx + offset;
     702  	      pstr->offsets_needed = 0;
     703  	    }
     704  #endif
     705  	  pstr->valid_len = 0;
     706  #ifdef RE_ENABLE_I18N
     707  	  if (pstr->mb_cur_max > 1)
     708  	    {
     709  	      Idx wcs_idx;
     710  	      wint_t wc = WEOF;
     711  
     712  	      if (pstr->is_utf8)
     713  		{
     714  		  const unsigned char *raw, *p, *end;
     715  
     716  		  /* Special case UTF-8.  Multi-byte chars start with any
     717  		     byte other than 0x80 - 0xbf.  */
     718  		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
     719  		  end = raw + (offset - pstr->mb_cur_max);
     720  		  if (end < pstr->raw_mbs)
     721  		    end = pstr->raw_mbs;
     722  		  p = raw + offset - 1;
     723  #ifdef _LIBC
     724  		  /* We know the wchar_t encoding is UCS4, so for the simple
     725  		     case, ASCII characters, skip the conversion step.  */
     726  		  if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
     727  		    {
     728  		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
     729  		      /* pstr->valid_len = 0; */
     730  		      wc = (wchar_t) *p;
     731  		    }
     732  		  else
     733  #endif
     734  		    for (; p >= end; --p)
     735  		      if ((*p & 0xc0) != 0x80)
     736  			{
     737  			  mbstate_t cur_state;
     738  			  wchar_t wc2;
     739  			  Idx mlen = raw + pstr->len - p;
     740  			  unsigned char buf[6];
     741  			  size_t mbclen;
     742  
     743  			  const unsigned char *pp = p;
     744  			  if (__glibc_unlikely (pstr->trans != NULL))
     745  			    {
     746  			      int i = mlen < 6 ? mlen : 6;
     747  			      while (--i >= 0)
     748  				buf[i] = pstr->trans[p[i]];
     749  			      pp = buf;
     750  			    }
     751  			  /* XXX Don't use mbrtowc, we know which conversion
     752  			     to use (UTF-8 -> UCS4).  */
     753  			  memset (&cur_state, 0, sizeof (cur_state));
     754  			  mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
     755  					      &cur_state);
     756  			  if (raw + offset - p <= mbclen
     757  			      && mbclen < (size_t) -2)
     758  			    {
     759  			      memset (&pstr->cur_state, '\0',
     760  				      sizeof (mbstate_t));
     761  			      pstr->valid_len = mbclen - (raw + offset - p);
     762  			      wc = wc2;
     763  			    }
     764  			  break;
     765  			}
     766  		}
     767  
     768  	      if (wc == WEOF)
     769  		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
     770  	      if (wc == WEOF)
     771  		pstr->tip_context
     772  		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
     773  	      else
     774  		pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
     775  				      && IS_WIDE_WORD_CHAR (wc))
     776  				     ? CONTEXT_WORD
     777  				     : ((IS_WIDE_NEWLINE (wc)
     778  					 && pstr->newline_anchor)
     779  					? CONTEXT_NEWLINE : 0));
     780  	      if (__glibc_unlikely (pstr->valid_len))
     781  		{
     782  		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
     783  		    pstr->wcs[wcs_idx] = WEOF;
     784  		  if (pstr->mbs_allocated)
     785  		    memset (pstr->mbs, 255, pstr->valid_len);
     786  		}
     787  	      pstr->valid_raw_len = pstr->valid_len;
     788  	    }
     789  	  else
     790  #endif /* RE_ENABLE_I18N */
     791  	    {
     792  	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
     793  	      pstr->valid_raw_len = 0;
     794  	      if (pstr->trans)
     795  		c = pstr->trans[c];
     796  	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
     797  				   ? CONTEXT_WORD
     798  				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
     799  				      ? CONTEXT_NEWLINE : 0));
     800  	    }
     801  	}
     802        if (!__glibc_unlikely (pstr->mbs_allocated))
     803  	pstr->mbs += offset;
     804      }
     805    pstr->raw_mbs_idx = idx;
     806    pstr->len -= offset;
     807    pstr->stop -= offset;
     808  
     809    /* Then build the buffers.  */
     810  #ifdef RE_ENABLE_I18N
     811    if (pstr->mb_cur_max > 1)
     812      {
     813        if (pstr->icase)
     814  	{
     815  	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
     816  	  if (__glibc_unlikely (ret != REG_NOERROR))
     817  	    return ret;
     818  	}
     819        else
     820  	build_wcs_buffer (pstr);
     821      }
     822    else
     823  #endif /* RE_ENABLE_I18N */
     824      if (__glibc_unlikely (pstr->mbs_allocated))
     825        {
     826  	if (pstr->icase)
     827  	  build_upper_buffer (pstr);
     828  	else if (pstr->trans != NULL)
     829  	  re_string_translate_buffer (pstr);
     830        }
     831      else
     832        pstr->valid_len = pstr->len;
     833  
     834    pstr->cur_idx = 0;
     835    return REG_NOERROR;
     836  }
     837  
     838  static unsigned char
     839  __attribute__ ((pure))
     840  re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
     841  {
     842    int ch;
     843    Idx off;
     844  
     845    /* Handle the common (easiest) cases first.  */
     846    if (__glibc_likely (!pstr->mbs_allocated))
     847      return re_string_peek_byte (pstr, idx);
     848  
     849  #ifdef RE_ENABLE_I18N
     850    if (pstr->mb_cur_max > 1
     851        && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
     852      return re_string_peek_byte (pstr, idx);
     853  #endif
     854  
     855    off = pstr->cur_idx + idx;
     856  #ifdef RE_ENABLE_I18N
     857    if (pstr->offsets_needed)
     858      off = pstr->offsets[off];
     859  #endif
     860  
     861    ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
     862  
     863  #ifdef RE_ENABLE_I18N
     864    /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
     865       this function returns CAPITAL LETTER I instead of first byte of
     866       DOTLESS SMALL LETTER I.  The latter would confuse the parser,
     867       since peek_byte_case doesn't advance cur_idx in any way.  */
     868    if (pstr->offsets_needed && !isascii (ch))
     869      return re_string_peek_byte (pstr, idx);
     870  #endif
     871  
     872    return ch;
     873  }
     874  
     875  static unsigned char
     876  re_string_fetch_byte_case (re_string_t *pstr)
     877  {
     878    if (__glibc_likely (!pstr->mbs_allocated))
     879      return re_string_fetch_byte (pstr);
     880  
     881  #ifdef RE_ENABLE_I18N
     882    if (pstr->offsets_needed)
     883      {
     884        Idx off;
     885        int ch;
     886  
     887        /* For tr_TR.UTF-8 [[:islower:]] there is
     888  	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
     889  	 in that case the whole multi-byte character and return
     890  	 the original letter.  On the other side, with
     891  	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
     892  	 anything else would complicate things too much.  */
     893  
     894        if (!re_string_first_byte (pstr, pstr->cur_idx))
     895  	return re_string_fetch_byte (pstr);
     896  
     897        off = pstr->offsets[pstr->cur_idx];
     898        ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
     899  
     900        if (! isascii (ch))
     901  	return re_string_fetch_byte (pstr);
     902  
     903        re_string_skip_bytes (pstr,
     904  			    re_string_char_size_at (pstr, pstr->cur_idx));
     905        return ch;
     906      }
     907  #endif
     908  
     909    return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
     910  }
     911  
     912  static void
     913  re_string_destruct (re_string_t *pstr)
     914  {
     915  #ifdef RE_ENABLE_I18N
     916    re_free (pstr->wcs);
     917    re_free (pstr->offsets);
     918  #endif /* RE_ENABLE_I18N  */
     919    if (pstr->mbs_allocated)
     920      re_free (pstr->mbs);
     921  }
     922  
     923  /* Return the context at IDX in INPUT.  */
     924  
     925  static unsigned int
     926  re_string_context_at (const re_string_t *input, Idx idx, int eflags)
     927  {
     928    int c;
     929    if (__glibc_unlikely (idx < 0))
     930      /* In this case, we use the value stored in input->tip_context,
     931         since we can't know the character in input->mbs[-1] here.  */
     932      return input->tip_context;
     933    if (__glibc_unlikely (idx == input->len))
     934      return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
     935  	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
     936  #ifdef RE_ENABLE_I18N
     937    if (input->mb_cur_max > 1)
     938      {
     939        wint_t wc;
     940        Idx wc_idx = idx;
     941        while(input->wcs[wc_idx] == WEOF)
     942  	{
     943  	  DEBUG_ASSERT (wc_idx >= 0);
     944  	  --wc_idx;
     945  	  if (wc_idx < 0)
     946  	    return input->tip_context;
     947  	}
     948        wc = input->wcs[wc_idx];
     949        if (__glibc_unlikely (input->word_ops_used != 0)
     950  	  && IS_WIDE_WORD_CHAR (wc))
     951  	return CONTEXT_WORD;
     952        return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
     953  	      ? CONTEXT_NEWLINE : 0);
     954      }
     955    else
     956  #endif
     957      {
     958        c = re_string_byte_at (input, idx);
     959        if (bitset_contain (input->word_char, c))
     960  	return CONTEXT_WORD;
     961        return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
     962      }
     963  }
     964  
     965  /* Functions for set operation.  */
     966  
     967  static reg_errcode_t
     968  __attribute_warn_unused_result__
     969  re_node_set_alloc (re_node_set *set, Idx size)
     970  {
     971    set->alloc = size;
     972    set->nelem = 0;
     973    set->elems = re_malloc (Idx, size);
     974    if (__glibc_unlikely (set->elems == NULL)
     975        && (MALLOC_0_IS_NONNULL || size != 0))
     976      return REG_ESPACE;
     977    return REG_NOERROR;
     978  }
     979  
     980  static reg_errcode_t
     981  __attribute_warn_unused_result__
     982  re_node_set_init_1 (re_node_set *set, Idx elem)
     983  {
     984    set->alloc = 1;
     985    set->nelem = 1;
     986    set->elems = re_malloc (Idx, 1);
     987    if (__glibc_unlikely (set->elems == NULL))
     988      {
     989        set->alloc = set->nelem = 0;
     990        return REG_ESPACE;
     991      }
     992    set->elems[0] = elem;
     993    return REG_NOERROR;
     994  }
     995  
     996  static reg_errcode_t
     997  __attribute_warn_unused_result__
     998  re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
     999  {
    1000    set->alloc = 2;
    1001    set->elems = re_malloc (Idx, 2);
    1002    if (__glibc_unlikely (set->elems == NULL))
    1003      return REG_ESPACE;
    1004    if (elem1 == elem2)
    1005      {
    1006        set->nelem = 1;
    1007        set->elems[0] = elem1;
    1008      }
    1009    else
    1010      {
    1011        set->nelem = 2;
    1012        if (elem1 < elem2)
    1013  	{
    1014  	  set->elems[0] = elem1;
    1015  	  set->elems[1] = elem2;
    1016  	}
    1017        else
    1018  	{
    1019  	  set->elems[0] = elem2;
    1020  	  set->elems[1] = elem1;
    1021  	}
    1022      }
    1023    return REG_NOERROR;
    1024  }
    1025  
    1026  static reg_errcode_t
    1027  __attribute_warn_unused_result__
    1028  re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
    1029  {
    1030    dest->nelem = src->nelem;
    1031    if (src->nelem > 0)
    1032      {
    1033        dest->alloc = dest->nelem;
    1034        dest->elems = re_malloc (Idx, dest->alloc);
    1035        if (__glibc_unlikely (dest->elems == NULL))
    1036  	{
    1037  	  dest->alloc = dest->nelem = 0;
    1038  	  return REG_ESPACE;
    1039  	}
    1040        memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
    1041      }
    1042    else
    1043      re_node_set_init_empty (dest);
    1044    return REG_NOERROR;
    1045  }
    1046  
    1047  /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
    1048     DEST. Return value indicate the error code or REG_NOERROR if succeeded.
    1049     Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
    1050  
    1051  static reg_errcode_t
    1052  __attribute_warn_unused_result__
    1053  re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
    1054  			   const re_node_set *src2)
    1055  {
    1056    Idx i1, i2, is, id, delta, sbase;
    1057    if (src1->nelem == 0 || src2->nelem == 0)
    1058      return REG_NOERROR;
    1059  
    1060    /* We need dest->nelem + 2 * elems_in_intersection; this is a
    1061       conservative estimate.  */
    1062    if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
    1063      {
    1064        Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
    1065        Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
    1066        if (__glibc_unlikely (new_elems == NULL))
    1067  	return REG_ESPACE;
    1068        dest->elems = new_elems;
    1069        dest->alloc = new_alloc;
    1070      }
    1071  
    1072    /* Find the items in the intersection of SRC1 and SRC2, and copy
    1073       into the top of DEST those that are not already in DEST itself.  */
    1074    sbase = dest->nelem + src1->nelem + src2->nelem;
    1075    i1 = src1->nelem - 1;
    1076    i2 = src2->nelem - 1;
    1077    id = dest->nelem - 1;
    1078    for (;;)
    1079      {
    1080        if (src1->elems[i1] == src2->elems[i2])
    1081  	{
    1082  	  /* Try to find the item in DEST.  Maybe we could binary search?  */
    1083  	  while (id >= 0 && dest->elems[id] > src1->elems[i1])
    1084  	    --id;
    1085  
    1086  	  if (id < 0 || dest->elems[id] != src1->elems[i1])
    1087              dest->elems[--sbase] = src1->elems[i1];
    1088  
    1089  	  if (--i1 < 0 || --i2 < 0)
    1090  	    break;
    1091  	}
    1092  
    1093        /* Lower the highest of the two items.  */
    1094        else if (src1->elems[i1] < src2->elems[i2])
    1095  	{
    1096  	  if (--i2 < 0)
    1097  	    break;
    1098  	}
    1099        else
    1100  	{
    1101  	  if (--i1 < 0)
    1102  	    break;
    1103  	}
    1104      }
    1105  
    1106    id = dest->nelem - 1;
    1107    is = dest->nelem + src1->nelem + src2->nelem - 1;
    1108    delta = is - sbase + 1;
    1109  
    1110    /* Now copy.  When DELTA becomes zero, the remaining
    1111       DEST elements are already in place; this is more or
    1112       less the same loop that is in re_node_set_merge.  */
    1113    dest->nelem += delta;
    1114    if (delta > 0 && id >= 0)
    1115      for (;;)
    1116        {
    1117  	if (dest->elems[is] > dest->elems[id])
    1118  	  {
    1119  	    /* Copy from the top.  */
    1120  	    dest->elems[id + delta--] = dest->elems[is--];
    1121  	    if (delta == 0)
    1122  	      break;
    1123  	  }
    1124  	else
    1125  	  {
    1126  	    /* Slide from the bottom.  */
    1127  	    dest->elems[id + delta] = dest->elems[id];
    1128  	    if (--id < 0)
    1129  	      break;
    1130  	  }
    1131        }
    1132  
    1133    /* Copy remaining SRC elements.  */
    1134    memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
    1135  
    1136    return REG_NOERROR;
    1137  }
    1138  
    1139  /* Calculate the union set of the sets SRC1 and SRC2. And store it to
    1140     DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
    1141  
    1142  static reg_errcode_t
    1143  __attribute_warn_unused_result__
    1144  re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
    1145  			const re_node_set *src2)
    1146  {
    1147    Idx i1, i2, id;
    1148    if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
    1149      {
    1150        dest->alloc = src1->nelem + src2->nelem;
    1151        dest->elems = re_malloc (Idx, dest->alloc);
    1152        if (__glibc_unlikely (dest->elems == NULL))
    1153  	return REG_ESPACE;
    1154      }
    1155    else
    1156      {
    1157        if (src1 != NULL && src1->nelem > 0)
    1158  	return re_node_set_init_copy (dest, src1);
    1159        else if (src2 != NULL && src2->nelem > 0)
    1160  	return re_node_set_init_copy (dest, src2);
    1161        else
    1162  	re_node_set_init_empty (dest);
    1163        return REG_NOERROR;
    1164      }
    1165    for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
    1166      {
    1167        if (src1->elems[i1] > src2->elems[i2])
    1168  	{
    1169  	  dest->elems[id++] = src2->elems[i2++];
    1170  	  continue;
    1171  	}
    1172        if (src1->elems[i1] == src2->elems[i2])
    1173  	++i2;
    1174        dest->elems[id++] = src1->elems[i1++];
    1175      }
    1176    if (i1 < src1->nelem)
    1177      {
    1178        memcpy (dest->elems + id, src1->elems + i1,
    1179  	     (src1->nelem - i1) * sizeof (Idx));
    1180        id += src1->nelem - i1;
    1181      }
    1182    else if (i2 < src2->nelem)
    1183      {
    1184        memcpy (dest->elems + id, src2->elems + i2,
    1185  	     (src2->nelem - i2) * sizeof (Idx));
    1186        id += src2->nelem - i2;
    1187      }
    1188    dest->nelem = id;
    1189    return REG_NOERROR;
    1190  }
    1191  
    1192  /* Calculate the union set of the sets DEST and SRC. And store it to
    1193     DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
    1194  
    1195  static reg_errcode_t
    1196  __attribute_warn_unused_result__
    1197  re_node_set_merge (re_node_set *dest, const re_node_set *src)
    1198  {
    1199    Idx is, id, sbase, delta;
    1200    if (src == NULL || src->nelem == 0)
    1201      return REG_NOERROR;
    1202    if (dest->alloc < 2 * src->nelem + dest->nelem)
    1203      {
    1204        Idx new_alloc = 2 * (src->nelem + dest->alloc);
    1205        Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
    1206        if (__glibc_unlikely (new_buffer == NULL))
    1207  	return REG_ESPACE;
    1208        dest->elems = new_buffer;
    1209        dest->alloc = new_alloc;
    1210      }
    1211  
    1212    if (__glibc_unlikely (dest->nelem == 0))
    1213      {
    1214        dest->nelem = src->nelem;
    1215        memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
    1216        return REG_NOERROR;
    1217      }
    1218  
    1219    /* Copy into the top of DEST the items of SRC that are not
    1220       found in DEST.  Maybe we could binary search in DEST?  */
    1221    for (sbase = dest->nelem + 2 * src->nelem,
    1222         is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
    1223      {
    1224        if (dest->elems[id] == src->elems[is])
    1225  	is--, id--;
    1226        else if (dest->elems[id] < src->elems[is])
    1227  	dest->elems[--sbase] = src->elems[is--];
    1228        else /* if (dest->elems[id] > src->elems[is]) */
    1229  	--id;
    1230      }
    1231  
    1232    if (is >= 0)
    1233      {
    1234        /* If DEST is exhausted, the remaining items of SRC must be unique.  */
    1235        sbase -= is + 1;
    1236        memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
    1237      }
    1238  
    1239    id = dest->nelem - 1;
    1240    is = dest->nelem + 2 * src->nelem - 1;
    1241    delta = is - sbase + 1;
    1242    if (delta == 0)
    1243      return REG_NOERROR;
    1244  
    1245    /* Now copy.  When DELTA becomes zero, the remaining
    1246       DEST elements are already in place.  */
    1247    dest->nelem += delta;
    1248    for (;;)
    1249      {
    1250        if (dest->elems[is] > dest->elems[id])
    1251  	{
    1252  	  /* Copy from the top.  */
    1253  	  dest->elems[id + delta--] = dest->elems[is--];
    1254  	  if (delta == 0)
    1255  	    break;
    1256  	}
    1257        else
    1258  	{
    1259  	  /* Slide from the bottom.  */
    1260  	  dest->elems[id + delta] = dest->elems[id];
    1261  	  if (--id < 0)
    1262  	    {
    1263  	      /* Copy remaining SRC elements.  */
    1264  	      memcpy (dest->elems, dest->elems + sbase,
    1265  		      delta * sizeof (Idx));
    1266  	      break;
    1267  	    }
    1268  	}
    1269      }
    1270  
    1271    return REG_NOERROR;
    1272  }
    1273  
    1274  /* Insert the new element ELEM to the re_node_set* SET.
    1275     SET should not already have ELEM.
    1276     Return true if successful.  */
    1277  
    1278  static bool
    1279  __attribute_warn_unused_result__
    1280  re_node_set_insert (re_node_set *set, Idx elem)
    1281  {
    1282    Idx idx;
    1283    /* In case the set is empty.  */
    1284    if (set->alloc == 0)
    1285      return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
    1286  
    1287    if (__glibc_unlikely (set->nelem) == 0)
    1288      {
    1289        /* We already guaranteed above that set->alloc != 0.  */
    1290        set->elems[0] = elem;
    1291        ++set->nelem;
    1292        return true;
    1293      }
    1294  
    1295    /* Realloc if we need.  */
    1296    if (set->alloc == set->nelem)
    1297      {
    1298        Idx *new_elems;
    1299        set->alloc = set->alloc * 2;
    1300        new_elems = re_realloc (set->elems, Idx, set->alloc);
    1301        if (__glibc_unlikely (new_elems == NULL))
    1302  	return false;
    1303        set->elems = new_elems;
    1304      }
    1305  
    1306    /* Move the elements which follows the new element.  Test the
    1307       first element separately to skip a check in the inner loop.  */
    1308    if (elem < set->elems[0])
    1309      {
    1310        for (idx = set->nelem; idx > 0; idx--)
    1311  	set->elems[idx] = set->elems[idx - 1];
    1312      }
    1313    else
    1314      {
    1315        for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
    1316  	set->elems[idx] = set->elems[idx - 1];
    1317        DEBUG_ASSERT (set->elems[idx - 1] < elem);
    1318      }
    1319  
    1320    /* Insert the new element.  */
    1321    set->elems[idx] = elem;
    1322    ++set->nelem;
    1323    return true;
    1324  }
    1325  
    1326  /* Insert the new element ELEM to the re_node_set* SET.
    1327     SET should not already have any element greater than or equal to ELEM.
    1328     Return true if successful.  */
    1329  
    1330  static bool
    1331  __attribute_warn_unused_result__
    1332  re_node_set_insert_last (re_node_set *set, Idx elem)
    1333  {
    1334    /* Realloc if we need.  */
    1335    if (set->alloc == set->nelem)
    1336      {
    1337        Idx *new_elems;
    1338        set->alloc = (set->alloc + 1) * 2;
    1339        new_elems = re_realloc (set->elems, Idx, set->alloc);
    1340        if (__glibc_unlikely (new_elems == NULL))
    1341  	return false;
    1342        set->elems = new_elems;
    1343      }
    1344  
    1345    /* Insert the new element.  */
    1346    set->elems[set->nelem++] = elem;
    1347    return true;
    1348  }
    1349  
    1350  /* Compare two node sets SET1 and SET2.
    1351     Return true if SET1 and SET2 are equivalent.  */
    1352  
    1353  static bool
    1354  __attribute__ ((pure))
    1355  re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
    1356  {
    1357    Idx i;
    1358    if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
    1359      return false;
    1360    for (i = set1->nelem ; --i >= 0 ; )
    1361      if (set1->elems[i] != set2->elems[i])
    1362        return false;
    1363    return true;
    1364  }
    1365  
    1366  /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
    1367  
    1368  static Idx
    1369  __attribute__ ((pure))
    1370  re_node_set_contains (const re_node_set *set, Idx elem)
    1371  {
    1372    __re_size_t idx, right, mid;
    1373    if (set->nelem <= 0)
    1374      return 0;
    1375  
    1376    /* Binary search the element.  */
    1377    idx = 0;
    1378    right = set->nelem - 1;
    1379    while (idx < right)
    1380      {
    1381        mid = (idx + right) / 2;
    1382        if (set->elems[mid] < elem)
    1383  	idx = mid + 1;
    1384        else
    1385  	right = mid;
    1386      }
    1387    return set->elems[idx] == elem ? idx + 1 : 0;
    1388  }
    1389  
    1390  static void
    1391  re_node_set_remove_at (re_node_set *set, Idx idx)
    1392  {
    1393    if (idx < 0 || idx >= set->nelem)
    1394      return;
    1395    --set->nelem;
    1396    for (; idx < set->nelem; idx++)
    1397      set->elems[idx] = set->elems[idx + 1];
    1398  }
    1399  
    1400  
    1401  /* Add the token TOKEN to dfa->nodes, and return the index of the token.
    1402     Or return -1 if an error occurred.  */
    1403  
    1404  static Idx
    1405  re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
    1406  {
    1407    if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
    1408      {
    1409        size_t new_nodes_alloc = dfa->nodes_alloc * 2;
    1410        Idx *new_nexts, *new_indices;
    1411        re_node_set *new_edests, *new_eclosures;
    1412        re_token_t *new_nodes;
    1413  
    1414        /* Avoid overflows in realloc.  */
    1415        const size_t max_object_size = MAX (sizeof (re_token_t),
    1416  					  MAX (sizeof (re_node_set),
    1417  					       sizeof (Idx)));
    1418        if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
    1419  			    < new_nodes_alloc))
    1420  	return -1;
    1421  
    1422        new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
    1423        if (__glibc_unlikely (new_nodes == NULL))
    1424  	return -1;
    1425        dfa->nodes = new_nodes;
    1426        new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
    1427        new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
    1428        new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
    1429        new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
    1430        if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
    1431  			    || new_edests == NULL || new_eclosures == NULL))
    1432  	{
    1433  	   re_free (new_nexts);
    1434  	   re_free (new_indices);
    1435  	   re_free (new_edests);
    1436  	   re_free (new_eclosures);
    1437  	   return -1;
    1438  	}
    1439        dfa->nexts = new_nexts;
    1440        dfa->org_indices = new_indices;
    1441        dfa->edests = new_edests;
    1442        dfa->eclosures = new_eclosures;
    1443        dfa->nodes_alloc = new_nodes_alloc;
    1444      }
    1445    dfa->nodes[dfa->nodes_len] = token;
    1446    dfa->nodes[dfa->nodes_len].constraint = 0;
    1447  #ifdef RE_ENABLE_I18N
    1448    dfa->nodes[dfa->nodes_len].accept_mb =
    1449      ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
    1450       || token.type == COMPLEX_BRACKET);
    1451  #endif
    1452    dfa->nexts[dfa->nodes_len] = -1;
    1453    re_node_set_init_empty (dfa->edests + dfa->nodes_len);
    1454    re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
    1455    return dfa->nodes_len++;
    1456  }
    1457  
    1458  static re_hashval_t
    1459  calc_state_hash (const re_node_set *nodes, unsigned int context)
    1460  {
    1461    re_hashval_t hash = nodes->nelem + context;
    1462    Idx i;
    1463    for (i = 0 ; i < nodes->nelem ; i++)
    1464      hash += nodes->elems[i];
    1465    return hash;
    1466  }
    1467  
    1468  /* Search for the state whose node_set is equivalent to NODES.
    1469     Return the pointer to the state, if we found it in the DFA.
    1470     Otherwise create the new one and return it.  In case of an error
    1471     return NULL and set the error code in ERR.
    1472     Note: - We assume NULL as the invalid state, then it is possible that
    1473  	   return value is NULL and ERR is REG_NOERROR.
    1474  	 - We never return non-NULL value in case of any errors, it is for
    1475  	   optimization.  */
    1476  
    1477  static re_dfastate_t *
    1478  __attribute_warn_unused_result__
    1479  re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
    1480  		  const re_node_set *nodes)
    1481  {
    1482    re_hashval_t hash;
    1483    re_dfastate_t *new_state;
    1484    struct re_state_table_entry *spot;
    1485    Idx i;
    1486  #if defined GCC_LINT || defined lint
    1487    /* Suppress bogus uninitialized-variable warnings.  */
    1488    *err = REG_NOERROR;
    1489  #endif
    1490    if (__glibc_unlikely (nodes->nelem == 0))
    1491      {
    1492        *err = REG_NOERROR;
    1493        return NULL;
    1494      }
    1495    hash = calc_state_hash (nodes, 0);
    1496    spot = dfa->state_table + (hash & dfa->state_hash_mask);
    1497  
    1498    for (i = 0 ; i < spot->num ; i++)
    1499      {
    1500        re_dfastate_t *state = spot->array[i];
    1501        if (hash != state->hash)
    1502  	continue;
    1503        if (re_node_set_compare (&state->nodes, nodes))
    1504  	return state;
    1505      }
    1506  
    1507    /* There are no appropriate state in the dfa, create the new one.  */
    1508    new_state = create_ci_newstate (dfa, nodes, hash);
    1509    if (__glibc_unlikely (new_state == NULL))
    1510      *err = REG_ESPACE;
    1511  
    1512    return new_state;
    1513  }
    1514  
    1515  /* Search for the state whose node_set is equivalent to NODES and
    1516     whose context is equivalent to CONTEXT.
    1517     Return the pointer to the state, if we found it in the DFA.
    1518     Otherwise create the new one and return it.  In case of an error
    1519     return NULL and set the error code in ERR.
    1520     Note: - We assume NULL as the invalid state, then it is possible that
    1521  	   return value is NULL and ERR is REG_NOERROR.
    1522  	 - We never return non-NULL value in case of any errors, it is for
    1523  	   optimization.  */
    1524  
    1525  static re_dfastate_t *
    1526  __attribute_warn_unused_result__
    1527  re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
    1528  			  const re_node_set *nodes, unsigned int context)
    1529  {
    1530    re_hashval_t hash;
    1531    re_dfastate_t *new_state;
    1532    struct re_state_table_entry *spot;
    1533    Idx i;
    1534  #if defined GCC_LINT || defined lint
    1535    /* Suppress bogus uninitialized-variable warnings.  */
    1536    *err = REG_NOERROR;
    1537  #endif
    1538    if (nodes->nelem == 0)
    1539      {
    1540        *err = REG_NOERROR;
    1541        return NULL;
    1542      }
    1543    hash = calc_state_hash (nodes, context);
    1544    spot = dfa->state_table + (hash & dfa->state_hash_mask);
    1545  
    1546    for (i = 0 ; i < spot->num ; i++)
    1547      {
    1548        re_dfastate_t *state = spot->array[i];
    1549        if (state->hash == hash
    1550  	  && state->context == context
    1551  	  && re_node_set_compare (state->entrance_nodes, nodes))
    1552  	return state;
    1553      }
    1554    /* There are no appropriate state in 'dfa', create the new one.  */
    1555    new_state = create_cd_newstate (dfa, nodes, context, hash);
    1556    if (__glibc_unlikely (new_state == NULL))
    1557      *err = REG_ESPACE;
    1558  
    1559    return new_state;
    1560  }
    1561  
    1562  /* Finish initialization of the new state NEWSTATE, and using its hash value
    1563     HASH put in the appropriate bucket of DFA's state table.  Return value
    1564     indicates the error code if failed.  */
    1565  
    1566  static reg_errcode_t
    1567  __attribute_warn_unused_result__
    1568  register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
    1569  		re_hashval_t hash)
    1570  {
    1571    struct re_state_table_entry *spot;
    1572    reg_errcode_t err;
    1573    Idx i;
    1574  
    1575    newstate->hash = hash;
    1576    err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
    1577    if (__glibc_unlikely (err != REG_NOERROR))
    1578      return REG_ESPACE;
    1579    for (i = 0; i < newstate->nodes.nelem; i++)
    1580      {
    1581        Idx elem = newstate->nodes.elems[i];
    1582        if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
    1583  	if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
    1584  	  return REG_ESPACE;
    1585      }
    1586  
    1587    spot = dfa->state_table + (hash & dfa->state_hash_mask);
    1588    if (__glibc_unlikely (spot->alloc <= spot->num))
    1589      {
    1590        Idx new_alloc = 2 * spot->num + 2;
    1591        re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
    1592  					      new_alloc);
    1593        if (__glibc_unlikely (new_array == NULL))
    1594  	return REG_ESPACE;
    1595        spot->array = new_array;
    1596        spot->alloc = new_alloc;
    1597      }
    1598    spot->array[spot->num++] = newstate;
    1599    return REG_NOERROR;
    1600  }
    1601  
    1602  static void
    1603  free_state (re_dfastate_t *state)
    1604  {
    1605    re_node_set_free (&state->non_eps_nodes);
    1606    re_node_set_free (&state->inveclosure);
    1607    if (state->entrance_nodes != &state->nodes)
    1608      {
    1609        re_node_set_free (state->entrance_nodes);
    1610        re_free (state->entrance_nodes);
    1611      }
    1612    re_node_set_free (&state->nodes);
    1613    re_free (state->word_trtable);
    1614    re_free (state->trtable);
    1615    re_free (state);
    1616  }
    1617  
    1618  /* Create the new state which is independent of contexts.
    1619     Return the new state if succeeded, otherwise return NULL.  */
    1620  
    1621  static re_dfastate_t *
    1622  __attribute_warn_unused_result__
    1623  create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
    1624  		    re_hashval_t hash)
    1625  {
    1626    Idx i;
    1627    reg_errcode_t err;
    1628    re_dfastate_t *newstate;
    1629  
    1630    newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
    1631    if (__glibc_unlikely (newstate == NULL))
    1632      return NULL;
    1633    err = re_node_set_init_copy (&newstate->nodes, nodes);
    1634    if (__glibc_unlikely (err != REG_NOERROR))
    1635      {
    1636        re_free (newstate);
    1637        return NULL;
    1638      }
    1639  
    1640    newstate->entrance_nodes = &newstate->nodes;
    1641    for (i = 0 ; i < nodes->nelem ; i++)
    1642      {
    1643        re_token_t *node = dfa->nodes + nodes->elems[i];
    1644        re_token_type_t type = node->type;
    1645        if (type == CHARACTER && !node->constraint)
    1646  	continue;
    1647  #ifdef RE_ENABLE_I18N
    1648        newstate->accept_mb |= node->accept_mb;
    1649  #endif /* RE_ENABLE_I18N */
    1650  
    1651        /* If the state has the halt node, the state is a halt state.  */
    1652        if (type == END_OF_RE)
    1653  	newstate->halt = 1;
    1654        else if (type == OP_BACK_REF)
    1655  	newstate->has_backref = 1;
    1656        else if (type == ANCHOR || node->constraint)
    1657  	newstate->has_constraint = 1;
    1658      }
    1659    err = register_state (dfa, newstate, hash);
    1660    if (__glibc_unlikely (err != REG_NOERROR))
    1661      {
    1662        free_state (newstate);
    1663        newstate = NULL;
    1664      }
    1665    return newstate;
    1666  }
    1667  
    1668  /* Create the new state which is depend on the context CONTEXT.
    1669     Return the new state if succeeded, otherwise return NULL.  */
    1670  
    1671  static re_dfastate_t *
    1672  __attribute_warn_unused_result__
    1673  create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
    1674  		    unsigned int context, re_hashval_t hash)
    1675  {
    1676    Idx i, nctx_nodes = 0;
    1677    reg_errcode_t err;
    1678    re_dfastate_t *newstate;
    1679  
    1680    newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
    1681    if (__glibc_unlikely (newstate == NULL))
    1682      return NULL;
    1683    err = re_node_set_init_copy (&newstate->nodes, nodes);
    1684    if (__glibc_unlikely (err != REG_NOERROR))
    1685      {
    1686        re_free (newstate);
    1687        return NULL;
    1688      }
    1689  
    1690    newstate->context = context;
    1691    newstate->entrance_nodes = &newstate->nodes;
    1692  
    1693    for (i = 0 ; i < nodes->nelem ; i++)
    1694      {
    1695        re_token_t *node = dfa->nodes + nodes->elems[i];
    1696        re_token_type_t type = node->type;
    1697        unsigned int constraint = node->constraint;
    1698  
    1699        if (type == CHARACTER && !constraint)
    1700  	continue;
    1701  #ifdef RE_ENABLE_I18N
    1702        newstate->accept_mb |= node->accept_mb;
    1703  #endif /* RE_ENABLE_I18N */
    1704  
    1705        /* If the state has the halt node, the state is a halt state.  */
    1706        if (type == END_OF_RE)
    1707  	newstate->halt = 1;
    1708        else if (type == OP_BACK_REF)
    1709  	newstate->has_backref = 1;
    1710  
    1711        if (constraint)
    1712  	{
    1713  	  if (newstate->entrance_nodes == &newstate->nodes)
    1714  	    {
    1715  	      re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
    1716  	      if (__glibc_unlikely (entrance_nodes == NULL))
    1717  		{
    1718  		  free_state (newstate);
    1719  		  return NULL;
    1720  		}
    1721  	      newstate->entrance_nodes = entrance_nodes;
    1722  	      if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
    1723  		  != REG_NOERROR)
    1724  		{
    1725  		  free_state (newstate);
    1726  		  return NULL;
    1727  		}
    1728  	      nctx_nodes = 0;
    1729  	      newstate->has_constraint = 1;
    1730  	    }
    1731  
    1732  	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
    1733  	    {
    1734  	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
    1735  	      ++nctx_nodes;
    1736  	    }
    1737  	}
    1738      }
    1739    err = register_state (dfa, newstate, hash);
    1740    if (__glibc_unlikely (err != REG_NOERROR))
    1741      {
    1742        free_state (newstate);
    1743        newstate = NULL;
    1744      }
    1745    return  newstate;
    1746  }