1  /* Extended regular expression matching and search library.
       2     Copyright (C) 2002-2022 Free Software Foundation, Inc.
       3     This file is part of the GNU C Library.
       4     Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
       5  
       6     The GNU C Library is free software; you can redistribute it and/or
       7     modify it under the terms of the GNU Lesser General Public
       8     License as published by the Free Software Foundation; either
       9     version 2.1 of the License, or (at your option) any later version.
      10  
      11     The GNU C Library is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      14     Lesser General Public License for more details.
      15  
      16     You should have received a copy of the GNU Lesser General Public
      17     License along with the GNU C Library; if not, see
      18     <https://www.gnu.org/licenses/>.  */
      19  
      20  static void re_string_construct_common (const char *str, Idx len,
      21  					re_string_t *pstr,
      22  					RE_TRANSLATE_TYPE trans, bool icase,
      23  					const re_dfa_t *dfa);
      24  static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
      25  					  const re_node_set *nodes,
      26  					  re_hashval_t hash);
      27  static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
      28  					  const re_node_set *nodes,
      29  					  unsigned int context,
      30  					  re_hashval_t hash);
      31  static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
      32  						Idx new_buf_len);
      33  static void build_wcs_buffer (re_string_t *pstr);
      34  static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
      35  static void build_upper_buffer (re_string_t *pstr);
      36  static void re_string_translate_buffer (re_string_t *pstr);
      37  static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
      38  					  int eflags) __attribute__ ((pure));
      39  
      40  /* Functions for string operation.  */
      41  
      42  /* This function allocate the buffers.  It is necessary to call
      43     re_string_reconstruct before using the object.  */
      44  
      45  static reg_errcode_t
      46  __attribute_warn_unused_result__
      47  re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
      48  		    RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
      49  {
      50    reg_errcode_t ret;
      51    Idx init_buf_len;
      52  
      53    /* Ensure at least one character fits into the buffers.  */
      54    if (init_len < dfa->mb_cur_max)
      55      init_len = dfa->mb_cur_max;
      56    init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
      57    re_string_construct_common (str, len, pstr, trans, icase, dfa);
      58  
      59    ret = re_string_realloc_buffers (pstr, init_buf_len);
      60    if (__glibc_unlikely (ret != REG_NOERROR))
      61      return ret;
      62  
      63    pstr->word_char = dfa->word_char;
      64    pstr->word_ops_used = dfa->word_ops_used;
      65    pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
      66    pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
      67    pstr->valid_raw_len = pstr->valid_len;
      68    return REG_NOERROR;
      69  }
      70  
      71  /* This function allocate the buffers, and initialize them.  */
      72  
      73  static reg_errcode_t
      74  __attribute_warn_unused_result__
      75  re_string_construct (re_string_t *pstr, const char *str, Idx len,
      76  		     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
      77  {
      78    reg_errcode_t ret;
      79    memset (pstr, '\0', sizeof (re_string_t));
      80    re_string_construct_common (str, len, pstr, trans, icase, dfa);
      81  
      82    if (len > 0)
      83      {
      84        ret = re_string_realloc_buffers (pstr, len + 1);
      85        if (__glibc_unlikely (ret != REG_NOERROR))
      86  	return ret;
      87      }
      88    pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
      89  
      90    if (icase)
      91      {
      92        if (dfa->mb_cur_max > 1)
      93  	{
      94  	  while (1)
      95  	    {
      96  	      ret = build_wcs_upper_buffer (pstr);
      97  	      if (__glibc_unlikely (ret != REG_NOERROR))
      98  		return ret;
      99  	      if (pstr->valid_raw_len >= len)
     100  		break;
     101  	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
     102  		break;
     103  	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
     104  	      if (__glibc_unlikely (ret != REG_NOERROR))
     105  		return ret;
     106  	    }
     107  	}
     108        else
     109  	build_upper_buffer (pstr);
     110      }
     111    else
     112      {
     113        if (dfa->mb_cur_max > 1)
     114  	build_wcs_buffer (pstr);
     115        else
     116  	{
     117  	  if (trans != NULL)
     118  	    re_string_translate_buffer (pstr);
     119  	  else
     120  	    {
     121  	      pstr->valid_len = pstr->bufs_len;
     122  	      pstr->valid_raw_len = pstr->bufs_len;
     123  	    }
     124  	}
     125      }
     126  
     127    return REG_NOERROR;
     128  }
     129  
     130  /* Helper functions for re_string_allocate, and re_string_construct.  */
     131  
     132  static reg_errcode_t
     133  __attribute_warn_unused_result__
     134  re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
     135  {
     136    if (pstr->mb_cur_max > 1)
     137      {
     138        wint_t *new_wcs;
     139  
     140        /* Avoid overflow in realloc.  */
     141        const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
     142        if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
     143  			    < new_buf_len))
     144  	return REG_ESPACE;
     145  
     146        new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
     147        if (__glibc_unlikely (new_wcs == NULL))
     148  	return REG_ESPACE;
     149        pstr->wcs = new_wcs;
     150        if (pstr->offsets != NULL)
     151  	{
     152  	  Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
     153  	  if (__glibc_unlikely (new_offsets == NULL))
     154  	    return REG_ESPACE;
     155  	  pstr->offsets = new_offsets;
     156  	}
     157      }
     158    if (pstr->mbs_allocated)
     159      {
     160        unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
     161  					   new_buf_len);
     162        if (__glibc_unlikely (new_mbs == NULL))
     163  	return REG_ESPACE;
     164        pstr->mbs = new_mbs;
     165      }
     166    pstr->bufs_len = new_buf_len;
     167    return REG_NOERROR;
     168  }
     169  
     170  
     171  static void
     172  re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
     173  			    RE_TRANSLATE_TYPE trans, bool icase,
     174  			    const re_dfa_t *dfa)
     175  {
     176    pstr->raw_mbs = (const unsigned char *) str;
     177    pstr->len = len;
     178    pstr->raw_len = len;
     179    pstr->trans = trans;
     180    pstr->icase = icase;
     181    pstr->mbs_allocated = (trans != NULL || icase);
     182    pstr->mb_cur_max = dfa->mb_cur_max;
     183    pstr->is_utf8 = dfa->is_utf8;
     184    pstr->map_notascii = dfa->map_notascii;
     185    pstr->stop = pstr->len;
     186    pstr->raw_stop = pstr->stop;
     187  }
     188  
     189  
     190  /* Build wide character buffer PSTR->WCS.
     191     If the byte sequence of the string are:
     192       <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
     193     Then wide character buffer will be:
     194       <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
     195     We use WEOF for padding, they indicate that the position isn't
     196     a first byte of a multibyte character.
     197  
     198     Note that this function assumes PSTR->VALID_LEN elements are already
     199     built and starts from PSTR->VALID_LEN.  */
     200  
     201  static void
     202  build_wcs_buffer (re_string_t *pstr)
     203  {
     204  #ifdef _LIBC
     205    unsigned char buf[MB_LEN_MAX];
     206    DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
     207  #else
     208    unsigned char buf[64];
     209  #endif
     210    mbstate_t prev_st;
     211    Idx byte_idx, end_idx, remain_len;
     212    size_t mbclen;
     213  
     214    /* Build the buffers from pstr->valid_len to either pstr->len or
     215       pstr->bufs_len.  */
     216    end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
     217    for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
     218      {
     219        wchar_t wc;
     220        const char *p;
     221  
     222        remain_len = end_idx - byte_idx;
     223        prev_st = pstr->cur_state;
     224        /* Apply the translation if we need.  */
     225        if (__glibc_unlikely (pstr->trans != NULL))
     226  	{
     227  	  int i, ch;
     228  
     229  	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
     230  	    {
     231  	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
     232  	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
     233  	    }
     234  	  p = (const char *) buf;
     235  	}
     236        else
     237  	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
     238        mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
     239        if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
     240  			    || (mbclen == (size_t) -2
     241  				&& pstr->bufs_len >= pstr->len)))
     242  	{
     243  	  /* We treat these cases as a singlebyte character.  */
     244  	  mbclen = 1;
     245  	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
     246  	  if (__glibc_unlikely (pstr->trans != NULL))
     247  	    wc = pstr->trans[wc];
     248  	  pstr->cur_state = prev_st;
     249  	}
     250        else if (__glibc_unlikely (mbclen == (size_t) -2))
     251  	{
     252  	  /* The buffer doesn't have enough space, finish to build.  */
     253  	  pstr->cur_state = prev_st;
     254  	  break;
     255  	}
     256  
     257        /* Write wide character and padding.  */
     258        pstr->wcs[byte_idx++] = wc;
     259        /* Write paddings.  */
     260        for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
     261  	pstr->wcs[byte_idx++] = WEOF;
     262      }
     263    pstr->valid_len = byte_idx;
     264    pstr->valid_raw_len = byte_idx;
     265  }
     266  
     267  /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
     268     but for REG_ICASE.  */
     269  
     270  static reg_errcode_t
     271  __attribute_warn_unused_result__
     272  build_wcs_upper_buffer (re_string_t *pstr)
     273  {
     274    mbstate_t prev_st;
     275    Idx src_idx, byte_idx, end_idx, remain_len;
     276    size_t mbclen;
     277  #ifdef _LIBC
     278    char buf[MB_LEN_MAX];
     279    DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
     280  #else
     281    char buf[64];
     282  #endif
     283  
     284    byte_idx = pstr->valid_len;
     285    end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
     286  
     287    /* The following optimization assumes that ASCII characters can be
     288       mapped to wide characters with a simple cast.  */
     289    if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
     290      {
     291        while (byte_idx < end_idx)
     292  	{
     293  	  wchar_t wc;
     294  	  unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
     295  
     296  	  if (isascii (ch) && mbsinit (&pstr->cur_state))
     297  	    {
     298  	      /* The next step uses the assumption that wchar_t is encoded
     299  		 ASCII-safe: all ASCII values can be converted like this.  */
     300  	      wchar_t wcu = __towupper (ch);
     301  	      if (isascii (wcu))
     302  		{
     303  		  pstr->mbs[byte_idx] = wcu;
     304  		  pstr->wcs[byte_idx] = wcu;
     305  		  byte_idx++;
     306  		  continue;
     307  		}
     308  	    }
     309  
     310  	  remain_len = end_idx - byte_idx;
     311  	  prev_st = pstr->cur_state;
     312  	  mbclen = __mbrtowc (&wc,
     313  			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
     314  			       + byte_idx), remain_len, &pstr->cur_state);
     315  	  if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
     316  	    {
     317  	      wchar_t wcu = __towupper (wc);
     318  	      if (wcu != wc)
     319  		{
     320  		  size_t mbcdlen;
     321  
     322  		  mbcdlen = __wcrtomb (buf, wcu, &prev_st);
     323  		  if (__glibc_likely (mbclen == mbcdlen))
     324  		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
     325  		  else
     326  		    {
     327  		      src_idx = byte_idx;
     328  		      goto offsets_needed;
     329  		    }
     330  		}
     331  	      else
     332  		memcpy (pstr->mbs + byte_idx,
     333  			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
     334  	      pstr->wcs[byte_idx++] = wcu;
     335  	      /* Write paddings.  */
     336  	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
     337  		pstr->wcs[byte_idx++] = WEOF;
     338  	    }
     339  	  else if (mbclen == (size_t) -1 || mbclen == 0
     340  		   || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
     341  	    {
     342  	      /* It is an invalid character, an incomplete character
     343  		 at the end of the string, or '\0'.  Just use the byte.  */
     344  	      pstr->mbs[byte_idx] = ch;
     345  	      /* And also cast it to wide char.  */
     346  	      pstr->wcs[byte_idx++] = (wchar_t) ch;
     347  	      if (__glibc_unlikely (mbclen == (size_t) -1))
     348  		pstr->cur_state = prev_st;
     349  	    }
     350  	  else
     351  	    {
     352  	      /* The buffer doesn't have enough space, finish to build.  */
     353  	      pstr->cur_state = prev_st;
     354  	      break;
     355  	    }
     356  	}
     357        pstr->valid_len = byte_idx;
     358        pstr->valid_raw_len = byte_idx;
     359        return REG_NOERROR;
     360      }
     361    else
     362      for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
     363        {
     364  	wchar_t wc;
     365  	const char *p;
     366        offsets_needed:
     367  	remain_len = end_idx - byte_idx;
     368  	prev_st = pstr->cur_state;
     369  	if (__glibc_unlikely (pstr->trans != NULL))
     370  	  {
     371  	    int i, ch;
     372  
     373  	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
     374  	      {
     375  		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
     376  		buf[i] = pstr->trans[ch];
     377  	      }
     378  	    p = (const char *) buf;
     379  	  }
     380  	else
     381  	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
     382  	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
     383  	if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
     384  	  {
     385  	    wchar_t wcu = __towupper (wc);
     386  	    if (wcu != wc)
     387  	      {
     388  		size_t mbcdlen;
     389  
     390  		mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
     391  		if (__glibc_likely (mbclen == mbcdlen))
     392  		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
     393  		else if (mbcdlen != (size_t) -1)
     394  		  {
     395  		    size_t i;
     396  
     397  		    if (byte_idx + mbcdlen > pstr->bufs_len)
     398  		      {
     399  			pstr->cur_state = prev_st;
     400  			break;
     401  		      }
     402  
     403  		    if (pstr->offsets == NULL)
     404  		      {
     405  			pstr->offsets = re_malloc (Idx, pstr->bufs_len);
     406  
     407  			if (pstr->offsets == NULL)
     408  			  return REG_ESPACE;
     409  		      }
     410  		    if (!pstr->offsets_needed)
     411  		      {
     412  			for (i = 0; i < (size_t) byte_idx; ++i)
     413  			  pstr->offsets[i] = i;
     414  			pstr->offsets_needed = 1;
     415  		      }
     416  
     417  		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
     418  		    pstr->wcs[byte_idx] = wcu;
     419  		    pstr->offsets[byte_idx] = src_idx;
     420  		    for (i = 1; i < mbcdlen; ++i)
     421  		      {
     422  			pstr->offsets[byte_idx + i]
     423  			  = src_idx + (i < mbclen ? i : mbclen - 1);
     424  			pstr->wcs[byte_idx + i] = WEOF;
     425  		      }
     426  		    pstr->len += mbcdlen - mbclen;
     427  		    if (pstr->raw_stop > src_idx)
     428  		      pstr->stop += mbcdlen - mbclen;
     429  		    end_idx = (pstr->bufs_len > pstr->len)
     430  			      ? pstr->len : pstr->bufs_len;
     431  		    byte_idx += mbcdlen;
     432  		    src_idx += mbclen;
     433  		    continue;
     434  		  }
     435  		else
     436  		  memcpy (pstr->mbs + byte_idx, p, mbclen);
     437  	      }
     438  	    else
     439  	      memcpy (pstr->mbs + byte_idx, p, mbclen);
     440  
     441  	    if (__glibc_unlikely (pstr->offsets_needed != 0))
     442  	      {
     443  		size_t i;
     444  		for (i = 0; i < mbclen; ++i)
     445  		  pstr->offsets[byte_idx + i] = src_idx + i;
     446  	      }
     447  	    src_idx += mbclen;
     448  
     449  	    pstr->wcs[byte_idx++] = wcu;
     450  	    /* Write paddings.  */
     451  	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
     452  	      pstr->wcs[byte_idx++] = WEOF;
     453  	  }
     454  	else if (mbclen == (size_t) -1 || mbclen == 0
     455  		 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
     456  	  {
     457  	    /* It is an invalid character or '\0'.  Just use the byte.  */
     458  	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
     459  
     460  	    if (__glibc_unlikely (pstr->trans != NULL))
     461  	      ch = pstr->trans [ch];
     462  	    pstr->mbs[byte_idx] = ch;
     463  
     464  	    if (__glibc_unlikely (pstr->offsets_needed != 0))
     465  	      pstr->offsets[byte_idx] = src_idx;
     466  	    ++src_idx;
     467  
     468  	    /* And also cast it to wide char.  */
     469  	    pstr->wcs[byte_idx++] = (wchar_t) ch;
     470  	    if (__glibc_unlikely (mbclen == (size_t) -1))
     471  	      pstr->cur_state = prev_st;
     472  	  }
     473  	else
     474  	  {
     475  	    /* The buffer doesn't have enough space, finish to build.  */
     476  	    pstr->cur_state = prev_st;
     477  	    break;
     478  	  }
     479        }
     480    pstr->valid_len = byte_idx;
     481    pstr->valid_raw_len = src_idx;
     482    return REG_NOERROR;
     483  }
     484  
     485  /* Skip characters until the index becomes greater than NEW_RAW_IDX.
     486     Return the index.  */
     487  
     488  static Idx
     489  re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
     490  {
     491    mbstate_t prev_st;
     492    Idx rawbuf_idx;
     493    size_t mbclen;
     494    wint_t wc = WEOF;
     495  
     496    /* Skip the characters which are not necessary to check.  */
     497    for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
     498         rawbuf_idx < new_raw_idx;)
     499      {
     500        wchar_t wc2;
     501        Idx remain_len = pstr->raw_len - rawbuf_idx;
     502        prev_st = pstr->cur_state;
     503        mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
     504  			  remain_len, &pstr->cur_state);
     505        if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
     506  			    || mbclen == 0))
     507  	{
     508  	  /* We treat these cases as a single byte character.  */
     509  	  if (mbclen == 0 || remain_len == 0)
     510  	    wc = L'\0';
     511  	  else
     512  	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
     513  	  mbclen = 1;
     514  	  pstr->cur_state = prev_st;
     515  	}
     516        else
     517  	wc = wc2;
     518        /* Then proceed the next character.  */
     519        rawbuf_idx += mbclen;
     520      }
     521    *last_wc = wc;
     522    return rawbuf_idx;
     523  }
     524  
     525  /* Build the buffer PSTR->MBS, and apply the translation if we need.
     526     This function is used in case of REG_ICASE.  */
     527  
     528  static void
     529  build_upper_buffer (re_string_t *pstr)
     530  {
     531    Idx char_idx, end_idx;
     532    end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
     533  
     534    for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
     535      {
     536        int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
     537        if (__glibc_unlikely (pstr->trans != NULL))
     538  	ch = pstr->trans[ch];
     539        pstr->mbs[char_idx] = toupper (ch);
     540      }
     541    pstr->valid_len = char_idx;
     542    pstr->valid_raw_len = char_idx;
     543  }
     544  
     545  /* Apply TRANS to the buffer in PSTR.  */
     546  
     547  static void
     548  re_string_translate_buffer (re_string_t *pstr)
     549  {
     550    Idx buf_idx, end_idx;
     551    end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
     552  
     553    for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
     554      {
     555        int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
     556        pstr->mbs[buf_idx] = pstr->trans[ch];
     557      }
     558  
     559    pstr->valid_len = buf_idx;
     560    pstr->valid_raw_len = buf_idx;
     561  }
     562  
     563  /* This function re-construct the buffers.
     564     Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
     565     convert to upper case in case of REG_ICASE, apply translation.  */
     566  
     567  static reg_errcode_t
     568  __attribute_warn_unused_result__
     569  re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
     570  {
     571    Idx offset;
     572  
     573    if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
     574      offset = idx - pstr->raw_mbs_idx;
     575    else
     576      {
     577        /* Reset buffer.  */
     578        if (pstr->mb_cur_max > 1)
     579  	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
     580        pstr->len = pstr->raw_len;
     581        pstr->stop = pstr->raw_stop;
     582        pstr->valid_len = 0;
     583        pstr->raw_mbs_idx = 0;
     584        pstr->valid_raw_len = 0;
     585        pstr->offsets_needed = 0;
     586        pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
     587  			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
     588        if (!pstr->mbs_allocated)
     589  	pstr->mbs = (unsigned char *) pstr->raw_mbs;
     590        offset = idx;
     591      }
     592  
     593    if (__glibc_likely (offset != 0))
     594      {
     595        /* Should the already checked characters be kept?  */
     596        if (__glibc_likely (offset < pstr->valid_raw_len))
     597  	{
     598  	  /* Yes, move them to the front of the buffer.  */
     599  	  if (__glibc_unlikely (pstr->offsets_needed))
     600  	    {
     601  	      Idx low = 0, high = pstr->valid_len, mid;
     602  	      do
     603  		{
     604  		  mid = (high + low) / 2;
     605  		  if (pstr->offsets[mid] > offset)
     606  		    high = mid;
     607  		  else if (pstr->offsets[mid] < offset)
     608  		    low = mid + 1;
     609  		  else
     610  		    break;
     611  		}
     612  	      while (low < high);
     613  	      if (pstr->offsets[mid] < offset)
     614  		++mid;
     615  	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
     616  							eflags);
     617  	      /* This can be quite complicated, so handle specially
     618  		 only the common and easy case where the character with
     619  		 different length representation of lower and upper
     620  		 case is present at or after offset.  */
     621  	      if (pstr->valid_len > offset
     622  		  && mid == offset && pstr->offsets[mid] == offset)
     623  		{
     624  		  memmove (pstr->wcs, pstr->wcs + offset,
     625  			   (pstr->valid_len - offset) * sizeof (wint_t));
     626  		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
     627  		  pstr->valid_len -= offset;
     628  		  pstr->valid_raw_len -= offset;
     629  		  for (low = 0; low < pstr->valid_len; low++)
     630  		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
     631  		}
     632  	      else
     633  		{
     634  		  /* Otherwise, just find out how long the partial multibyte
     635  		     character at offset is and fill it with WEOF/255.  */
     636  		  pstr->len = pstr->raw_len - idx + offset;
     637  		  pstr->stop = pstr->raw_stop - idx + offset;
     638  		  pstr->offsets_needed = 0;
     639  		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
     640  		    --mid;
     641  		  while (mid < pstr->valid_len)
     642  		    if (pstr->wcs[mid] != WEOF)
     643  		      break;
     644  		    else
     645  		      ++mid;
     646  		  if (mid == pstr->valid_len)
     647  		    pstr->valid_len = 0;
     648  		  else
     649  		    {
     650  		      pstr->valid_len = pstr->offsets[mid] - offset;
     651  		      if (pstr->valid_len)
     652  			{
     653  			  for (low = 0; low < pstr->valid_len; ++low)
     654  			    pstr->wcs[low] = WEOF;
     655  			  memset (pstr->mbs, 255, pstr->valid_len);
     656  			}
     657  		    }
     658  		  pstr->valid_raw_len = pstr->valid_len;
     659  		}
     660  	    }
     661  	  else
     662  	    {
     663  	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
     664  							eflags);
     665  	      if (pstr->mb_cur_max > 1)
     666  		memmove (pstr->wcs, pstr->wcs + offset,
     667  			 (pstr->valid_len - offset) * sizeof (wint_t));
     668  	      if (__glibc_unlikely (pstr->mbs_allocated))
     669  		memmove (pstr->mbs, pstr->mbs + offset,
     670  			 pstr->valid_len - offset);
     671  	      pstr->valid_len -= offset;
     672  	      pstr->valid_raw_len -= offset;
     673  	      DEBUG_ASSERT (pstr->valid_len > 0);
     674  	    }
     675  	}
     676        else
     677  	{
     678  	  /* No, skip all characters until IDX.  */
     679  	  Idx prev_valid_len = pstr->valid_len;
     680  
     681  	  if (__glibc_unlikely (pstr->offsets_needed))
     682  	    {
     683  	      pstr->len = pstr->raw_len - idx + offset;
     684  	      pstr->stop = pstr->raw_stop - idx + offset;
     685  	      pstr->offsets_needed = 0;
     686  	    }
     687  	  pstr->valid_len = 0;
     688  	  if (pstr->mb_cur_max > 1)
     689  	    {
     690  	      Idx wcs_idx;
     691  	      wint_t wc = WEOF;
     692  
     693  	      if (pstr->is_utf8)
     694  		{
     695  		  const unsigned char *raw, *p, *end;
     696  
     697  		  /* Special case UTF-8.  Multi-byte chars start with any
     698  		     byte other than 0x80 - 0xbf.  */
     699  		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
     700  		  end = raw + (offset - pstr->mb_cur_max);
     701  		  if (end < pstr->raw_mbs)
     702  		    end = pstr->raw_mbs;
     703  		  p = raw + offset - 1;
     704  #ifdef _LIBC
     705  		  /* We know the wchar_t encoding is UCS4, so for the simple
     706  		     case, ASCII characters, skip the conversion step.  */
     707  		  if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
     708  		    {
     709  		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
     710  		      /* pstr->valid_len = 0; */
     711  		      wc = (wchar_t) *p;
     712  		    }
     713  		  else
     714  #endif
     715  		    for (; p >= end; --p)
     716  		      if ((*p & 0xc0) != 0x80)
     717  			{
     718  			  mbstate_t cur_state;
     719  			  wchar_t wc2;
     720  			  Idx mlen = raw + pstr->len - p;
     721  			  unsigned char buf[6];
     722  			  size_t mbclen;
     723  
     724  			  const unsigned char *pp = p;
     725  			  if (__glibc_unlikely (pstr->trans != NULL))
     726  			    {
     727  			      int i = mlen < 6 ? mlen : 6;
     728  			      while (--i >= 0)
     729  				buf[i] = pstr->trans[p[i]];
     730  			      pp = buf;
     731  			    }
     732  			  /* XXX Don't use mbrtowc, we know which conversion
     733  			     to use (UTF-8 -> UCS4).  */
     734  			  memset (&cur_state, 0, sizeof (cur_state));
     735  			  mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
     736  					      &cur_state);
     737  			  if (raw + offset - p <= mbclen
     738  			      && mbclen < (size_t) -2)
     739  			    {
     740  			      memset (&pstr->cur_state, '\0',
     741  				      sizeof (mbstate_t));
     742  			      pstr->valid_len = mbclen - (raw + offset - p);
     743  			      wc = wc2;
     744  			    }
     745  			  break;
     746  			}
     747  		}
     748  
     749  	      if (wc == WEOF)
     750  		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
     751  	      if (wc == WEOF)
     752  		pstr->tip_context
     753  		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
     754  	      else
     755  		pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
     756  				      && IS_WIDE_WORD_CHAR (wc))
     757  				     ? CONTEXT_WORD
     758  				     : ((IS_WIDE_NEWLINE (wc)
     759  					 && pstr->newline_anchor)
     760  					? CONTEXT_NEWLINE : 0));
     761  	      if (__glibc_unlikely (pstr->valid_len))
     762  		{
     763  		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
     764  		    pstr->wcs[wcs_idx] = WEOF;
     765  		  if (pstr->mbs_allocated)
     766  		    memset (pstr->mbs, 255, pstr->valid_len);
     767  		}
     768  	      pstr->valid_raw_len = pstr->valid_len;
     769  	    }
     770  	  else
     771  	    {
     772  	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
     773  	      pstr->valid_raw_len = 0;
     774  	      if (pstr->trans)
     775  		c = pstr->trans[c];
     776  	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
     777  				   ? CONTEXT_WORD
     778  				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
     779  				      ? CONTEXT_NEWLINE : 0));
     780  	    }
     781  	}
     782        if (!__glibc_unlikely (pstr->mbs_allocated))
     783  	pstr->mbs += offset;
     784      }
     785    pstr->raw_mbs_idx = idx;
     786    pstr->len -= offset;
     787    pstr->stop -= offset;
     788  
     789    /* Then build the buffers.  */
     790    if (pstr->mb_cur_max > 1)
     791      {
     792        if (pstr->icase)
     793  	{
     794  	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
     795  	  if (__glibc_unlikely (ret != REG_NOERROR))
     796  	    return ret;
     797  	}
     798        else
     799  	build_wcs_buffer (pstr);
     800      }
     801    else
     802      if (__glibc_unlikely (pstr->mbs_allocated))
     803        {
     804  	if (pstr->icase)
     805  	  build_upper_buffer (pstr);
     806  	else if (pstr->trans != NULL)
     807  	  re_string_translate_buffer (pstr);
     808        }
     809      else
     810        pstr->valid_len = pstr->len;
     811  
     812    pstr->cur_idx = 0;
     813    return REG_NOERROR;
     814  }
     815  
     816  static unsigned char
     817  __attribute__ ((pure))
     818  re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
     819  {
     820    int ch;
     821    Idx off;
     822  
     823    /* Handle the common (easiest) cases first.  */
     824    if (__glibc_likely (!pstr->mbs_allocated))
     825      return re_string_peek_byte (pstr, idx);
     826  
     827    if (pstr->mb_cur_max > 1
     828        && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
     829      return re_string_peek_byte (pstr, idx);
     830  
     831    off = pstr->cur_idx + idx;
     832    if (pstr->offsets_needed)
     833      off = pstr->offsets[off];
     834  
     835    ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
     836  
     837    /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
     838       this function returns CAPITAL LETTER I instead of first byte of
     839       DOTLESS SMALL LETTER I.  The latter would confuse the parser,
     840       since peek_byte_case doesn't advance cur_idx in any way.  */
     841    if (pstr->offsets_needed && !isascii (ch))
     842      return re_string_peek_byte (pstr, idx);
     843  
     844    return ch;
     845  }
     846  
     847  static unsigned char
     848  re_string_fetch_byte_case (re_string_t *pstr)
     849  {
     850    if (__glibc_likely (!pstr->mbs_allocated))
     851      return re_string_fetch_byte (pstr);
     852  
     853    if (pstr->offsets_needed)
     854      {
     855        Idx off;
     856        int ch;
     857  
     858        /* For tr_TR.UTF-8 [[:islower:]] there is
     859  	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
     860  	 in that case the whole multi-byte character and return
     861  	 the original letter.  On the other side, with
     862  	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
     863  	 anything else would complicate things too much.  */
     864  
     865        if (!re_string_first_byte (pstr, pstr->cur_idx))
     866  	return re_string_fetch_byte (pstr);
     867  
     868        off = pstr->offsets[pstr->cur_idx];
     869        ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
     870  
     871        if (! isascii (ch))
     872  	return re_string_fetch_byte (pstr);
     873  
     874        re_string_skip_bytes (pstr,
     875  			    re_string_char_size_at (pstr, pstr->cur_idx));
     876        return ch;
     877      }
     878  
     879    return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
     880  }
     881  
     882  static void
     883  re_string_destruct (re_string_t *pstr)
     884  {
     885    re_free (pstr->wcs);
     886    re_free (pstr->offsets);
     887    if (pstr->mbs_allocated)
     888      re_free (pstr->mbs);
     889  }
     890  
     891  /* Return the context at IDX in INPUT.  */
     892  
     893  static unsigned int
     894  re_string_context_at (const re_string_t *input, Idx idx, int eflags)
     895  {
     896    int c;
     897    if (__glibc_unlikely (idx < 0))
     898      /* In this case, we use the value stored in input->tip_context,
     899         since we can't know the character in input->mbs[-1] here.  */
     900      return input->tip_context;
     901    if (__glibc_unlikely (idx == input->len))
     902      return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
     903  	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
     904    if (input->mb_cur_max > 1)
     905      {
     906        wint_t wc;
     907        Idx wc_idx = idx;
     908        while(input->wcs[wc_idx] == WEOF)
     909  	{
     910  	  DEBUG_ASSERT (wc_idx >= 0);
     911  	  --wc_idx;
     912  	  if (wc_idx < 0)
     913  	    return input->tip_context;
     914  	}
     915        wc = input->wcs[wc_idx];
     916        if (__glibc_unlikely (input->word_ops_used != 0)
     917  	  && IS_WIDE_WORD_CHAR (wc))
     918  	return CONTEXT_WORD;
     919        return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
     920  	      ? CONTEXT_NEWLINE : 0);
     921      }
     922    else
     923      {
     924        c = re_string_byte_at (input, idx);
     925        if (bitset_contain (input->word_char, c))
     926  	return CONTEXT_WORD;
     927        return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
     928      }
     929  }
     930  
     931  /* Functions for set operation.  */
     932  
     933  static reg_errcode_t
     934  __attribute_warn_unused_result__
     935  re_node_set_alloc (re_node_set *set, Idx size)
     936  {
     937    set->alloc = size;
     938    set->nelem = 0;
     939    set->elems = re_malloc (Idx, size);
     940    if (__glibc_unlikely (set->elems == NULL)
     941        && (MALLOC_0_IS_NONNULL || size != 0))
     942      return REG_ESPACE;
     943    return REG_NOERROR;
     944  }
     945  
     946  static reg_errcode_t
     947  __attribute_warn_unused_result__
     948  re_node_set_init_1 (re_node_set *set, Idx elem)
     949  {
     950    set->alloc = 1;
     951    set->nelem = 1;
     952    set->elems = re_malloc (Idx, 1);
     953    if (__glibc_unlikely (set->elems == NULL))
     954      {
     955        set->alloc = set->nelem = 0;
     956        return REG_ESPACE;
     957      }
     958    set->elems[0] = elem;
     959    return REG_NOERROR;
     960  }
     961  
     962  static reg_errcode_t
     963  __attribute_warn_unused_result__
     964  re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
     965  {
     966    set->alloc = 2;
     967    set->elems = re_malloc (Idx, 2);
     968    if (__glibc_unlikely (set->elems == NULL))
     969      return REG_ESPACE;
     970    if (elem1 == elem2)
     971      {
     972        set->nelem = 1;
     973        set->elems[0] = elem1;
     974      }
     975    else
     976      {
     977        set->nelem = 2;
     978        if (elem1 < elem2)
     979  	{
     980  	  set->elems[0] = elem1;
     981  	  set->elems[1] = elem2;
     982  	}
     983        else
     984  	{
     985  	  set->elems[0] = elem2;
     986  	  set->elems[1] = elem1;
     987  	}
     988      }
     989    return REG_NOERROR;
     990  }
     991  
     992  static reg_errcode_t
     993  __attribute_warn_unused_result__
     994  re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
     995  {
     996    dest->nelem = src->nelem;
     997    if (src->nelem > 0)
     998      {
     999        dest->alloc = dest->nelem;
    1000        dest->elems = re_malloc (Idx, dest->alloc);
    1001        if (__glibc_unlikely (dest->elems == NULL))
    1002  	{
    1003  	  dest->alloc = dest->nelem = 0;
    1004  	  return REG_ESPACE;
    1005  	}
    1006        memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
    1007      }
    1008    else
    1009      re_node_set_init_empty (dest);
    1010    return REG_NOERROR;
    1011  }
    1012  
    1013  /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
    1014     DEST. Return value indicate the error code or REG_NOERROR if succeeded.
    1015     Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
    1016  
    1017  static reg_errcode_t
    1018  __attribute_warn_unused_result__
    1019  re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
    1020  			   const re_node_set *src2)
    1021  {
    1022    Idx i1, i2, is, id, delta, sbase;
    1023    if (src1->nelem == 0 || src2->nelem == 0)
    1024      return REG_NOERROR;
    1025  
    1026    /* We need dest->nelem + 2 * elems_in_intersection; this is a
    1027       conservative estimate.  */
    1028    if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
    1029      {
    1030        Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
    1031        Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
    1032        if (__glibc_unlikely (new_elems == NULL))
    1033  	return REG_ESPACE;
    1034        dest->elems = new_elems;
    1035        dest->alloc = new_alloc;
    1036      }
    1037  
    1038    /* Find the items in the intersection of SRC1 and SRC2, and copy
    1039       into the top of DEST those that are not already in DEST itself.  */
    1040    sbase = dest->nelem + src1->nelem + src2->nelem;
    1041    i1 = src1->nelem - 1;
    1042    i2 = src2->nelem - 1;
    1043    id = dest->nelem - 1;
    1044    for (;;)
    1045      {
    1046        if (src1->elems[i1] == src2->elems[i2])
    1047  	{
    1048  	  /* Try to find the item in DEST.  Maybe we could binary search?  */
    1049  	  while (id >= 0 && dest->elems[id] > src1->elems[i1])
    1050  	    --id;
    1051  
    1052  	  if (id < 0 || dest->elems[id] != src1->elems[i1])
    1053              dest->elems[--sbase] = src1->elems[i1];
    1054  
    1055  	  if (--i1 < 0 || --i2 < 0)
    1056  	    break;
    1057  	}
    1058  
    1059        /* Lower the highest of the two items.  */
    1060        else if (src1->elems[i1] < src2->elems[i2])
    1061  	{
    1062  	  if (--i2 < 0)
    1063  	    break;
    1064  	}
    1065        else
    1066  	{
    1067  	  if (--i1 < 0)
    1068  	    break;
    1069  	}
    1070      }
    1071  
    1072    id = dest->nelem - 1;
    1073    is = dest->nelem + src1->nelem + src2->nelem - 1;
    1074    delta = is - sbase + 1;
    1075  
    1076    /* Now copy.  When DELTA becomes zero, the remaining
    1077       DEST elements are already in place; this is more or
    1078       less the same loop that is in re_node_set_merge.  */
    1079    dest->nelem += delta;
    1080    if (delta > 0 && id >= 0)
    1081      for (;;)
    1082        {
    1083  	if (dest->elems[is] > dest->elems[id])
    1084  	  {
    1085  	    /* Copy from the top.  */
    1086  	    dest->elems[id + delta--] = dest->elems[is--];
    1087  	    if (delta == 0)
    1088  	      break;
    1089  	  }
    1090  	else
    1091  	  {
    1092  	    /* Slide from the bottom.  */
    1093  	    dest->elems[id + delta] = dest->elems[id];
    1094  	    if (--id < 0)
    1095  	      break;
    1096  	  }
    1097        }
    1098  
    1099    /* Copy remaining SRC elements.  */
    1100    memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
    1101  
    1102    return REG_NOERROR;
    1103  }
    1104  
    1105  /* Calculate the union set of the sets SRC1 and SRC2. And store it to
    1106     DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
    1107  
    1108  static reg_errcode_t
    1109  __attribute_warn_unused_result__
    1110  re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
    1111  			const re_node_set *src2)
    1112  {
    1113    Idx i1, i2, id;
    1114    if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
    1115      {
    1116        dest->alloc = src1->nelem + src2->nelem;
    1117        dest->elems = re_malloc (Idx, dest->alloc);
    1118        if (__glibc_unlikely (dest->elems == NULL))
    1119  	return REG_ESPACE;
    1120      }
    1121    else
    1122      {
    1123        if (src1 != NULL && src1->nelem > 0)
    1124  	return re_node_set_init_copy (dest, src1);
    1125        else if (src2 != NULL && src2->nelem > 0)
    1126  	return re_node_set_init_copy (dest, src2);
    1127        else
    1128  	re_node_set_init_empty (dest);
    1129        return REG_NOERROR;
    1130      }
    1131    for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
    1132      {
    1133        if (src1->elems[i1] > src2->elems[i2])
    1134  	{
    1135  	  dest->elems[id++] = src2->elems[i2++];
    1136  	  continue;
    1137  	}
    1138        if (src1->elems[i1] == src2->elems[i2])
    1139  	++i2;
    1140        dest->elems[id++] = src1->elems[i1++];
    1141      }
    1142    if (i1 < src1->nelem)
    1143      {
    1144        memcpy (dest->elems + id, src1->elems + i1,
    1145  	     (src1->nelem - i1) * sizeof (Idx));
    1146        id += src1->nelem - i1;
    1147      }
    1148    else if (i2 < src2->nelem)
    1149      {
    1150        memcpy (dest->elems + id, src2->elems + i2,
    1151  	     (src2->nelem - i2) * sizeof (Idx));
    1152        id += src2->nelem - i2;
    1153      }
    1154    dest->nelem = id;
    1155    return REG_NOERROR;
    1156  }
    1157  
    1158  /* Calculate the union set of the sets DEST and SRC. And store it to
    1159     DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
    1160  
    1161  static reg_errcode_t
    1162  __attribute_warn_unused_result__
    1163  re_node_set_merge (re_node_set *dest, const re_node_set *src)
    1164  {
    1165    Idx is, id, sbase, delta;
    1166    if (src == NULL || src->nelem == 0)
    1167      return REG_NOERROR;
    1168    if (dest->alloc < 2 * src->nelem + dest->nelem)
    1169      {
    1170        Idx new_alloc = 2 * (src->nelem + dest->alloc);
    1171        Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
    1172        if (__glibc_unlikely (new_buffer == NULL))
    1173  	return REG_ESPACE;
    1174        dest->elems = new_buffer;
    1175        dest->alloc = new_alloc;
    1176      }
    1177  
    1178    if (__glibc_unlikely (dest->nelem == 0))
    1179      {
    1180        /* Although we already guaranteed above that dest->alloc != 0 and
    1181           therefore dest->elems != NULL, add a debug assertion to pacify
    1182           GCC 11.2.1's -fanalyzer.  */
    1183        DEBUG_ASSERT (dest->elems);
    1184        dest->nelem = src->nelem;
    1185        memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
    1186        return REG_NOERROR;
    1187      }
    1188  
    1189    /* Copy into the top of DEST the items of SRC that are not
    1190       found in DEST.  Maybe we could binary search in DEST?  */
    1191    for (sbase = dest->nelem + 2 * src->nelem,
    1192         is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
    1193      {
    1194        if (dest->elems[id] == src->elems[is])
    1195  	is--, id--;
    1196        else if (dest->elems[id] < src->elems[is])
    1197  	dest->elems[--sbase] = src->elems[is--];
    1198        else /* if (dest->elems[id] > src->elems[is]) */
    1199  	--id;
    1200      }
    1201  
    1202    if (is >= 0)
    1203      {
    1204        /* If DEST is exhausted, the remaining items of SRC must be unique.  */
    1205        sbase -= is + 1;
    1206        memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
    1207      }
    1208  
    1209    id = dest->nelem - 1;
    1210    is = dest->nelem + 2 * src->nelem - 1;
    1211    delta = is - sbase + 1;
    1212    if (delta == 0)
    1213      return REG_NOERROR;
    1214  
    1215    /* Now copy.  When DELTA becomes zero, the remaining
    1216       DEST elements are already in place.  */
    1217    dest->nelem += delta;
    1218    for (;;)
    1219      {
    1220        if (dest->elems[is] > dest->elems[id])
    1221  	{
    1222  	  /* Copy from the top.  */
    1223  	  dest->elems[id + delta--] = dest->elems[is--];
    1224  	  if (delta == 0)
    1225  	    break;
    1226  	}
    1227        else
    1228  	{
    1229  	  /* Slide from the bottom.  */
    1230  	  dest->elems[id + delta] = dest->elems[id];
    1231  	  if (--id < 0)
    1232  	    {
    1233  	      /* Copy remaining SRC elements.  */
    1234  	      memcpy (dest->elems, dest->elems + sbase,
    1235  		      delta * sizeof (Idx));
    1236  	      break;
    1237  	    }
    1238  	}
    1239      }
    1240  
    1241    return REG_NOERROR;
    1242  }
    1243  
    1244  /* Insert the new element ELEM to the re_node_set* SET.
    1245     SET should not already have ELEM.
    1246     Return true if successful.  */
    1247  
    1248  static bool
    1249  __attribute_warn_unused_result__
    1250  re_node_set_insert (re_node_set *set, Idx elem)
    1251  {
    1252    Idx idx;
    1253    /* In case the set is empty.  */
    1254    if (set->alloc == 0)
    1255      return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
    1256  
    1257    if (__glibc_unlikely (set->nelem) == 0)
    1258      {
    1259        /* Although we already guaranteed above that set->alloc != 0 and
    1260           therefore set->elems != NULL, add a debug assertion to pacify
    1261           GCC 11.2 -fanalyzer.  */
    1262        DEBUG_ASSERT (set->elems);
    1263        set->elems[0] = elem;
    1264        ++set->nelem;
    1265        return true;
    1266      }
    1267  
    1268    /* Realloc if we need.  */
    1269    if (set->alloc == set->nelem)
    1270      {
    1271        Idx *new_elems;
    1272        set->alloc = set->alloc * 2;
    1273        new_elems = re_realloc (set->elems, Idx, set->alloc);
    1274        if (__glibc_unlikely (new_elems == NULL))
    1275  	return false;
    1276        set->elems = new_elems;
    1277      }
    1278  
    1279    /* Move the elements which follows the new element.  Test the
    1280       first element separately to skip a check in the inner loop.  */
    1281    if (elem < set->elems[0])
    1282      {
    1283        for (idx = set->nelem; idx > 0; idx--)
    1284  	set->elems[idx] = set->elems[idx - 1];
    1285      }
    1286    else
    1287      {
    1288        for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
    1289  	set->elems[idx] = set->elems[idx - 1];
    1290        DEBUG_ASSERT (set->elems[idx - 1] < elem);
    1291      }
    1292  
    1293    /* Insert the new element.  */
    1294    set->elems[idx] = elem;
    1295    ++set->nelem;
    1296    return true;
    1297  }
    1298  
    1299  /* Insert the new element ELEM to the re_node_set* SET.
    1300     SET should not already have any element greater than or equal to ELEM.
    1301     Return true if successful.  */
    1302  
    1303  static bool
    1304  __attribute_warn_unused_result__
    1305  re_node_set_insert_last (re_node_set *set, Idx elem)
    1306  {
    1307    /* Realloc if we need.  */
    1308    if (set->alloc == set->nelem)
    1309      {
    1310        Idx *new_elems;
    1311        set->alloc = (set->alloc + 1) * 2;
    1312        new_elems = re_realloc (set->elems, Idx, set->alloc);
    1313        if (__glibc_unlikely (new_elems == NULL))
    1314  	return false;
    1315        set->elems = new_elems;
    1316      }
    1317  
    1318    /* Insert the new element.  */
    1319    set->elems[set->nelem++] = elem;
    1320    return true;
    1321  }
    1322  
    1323  /* Compare two node sets SET1 and SET2.
    1324     Return true if SET1 and SET2 are equivalent.  */
    1325  
    1326  static bool
    1327  __attribute__ ((pure))
    1328  re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
    1329  {
    1330    Idx i;
    1331    if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
    1332      return false;
    1333    for (i = set1->nelem ; --i >= 0 ; )
    1334      if (set1->elems[i] != set2->elems[i])
    1335        return false;
    1336    return true;
    1337  }
    1338  
    1339  /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
    1340  
    1341  static Idx
    1342  __attribute__ ((pure))
    1343  re_node_set_contains (const re_node_set *set, Idx elem)
    1344  {
    1345    __re_size_t idx, right, mid;
    1346    if (set->nelem <= 0)
    1347      return 0;
    1348  
    1349    /* Binary search the element.  */
    1350    idx = 0;
    1351    right = set->nelem - 1;
    1352    while (idx < right)
    1353      {
    1354        mid = (idx + right) / 2;
    1355        if (set->elems[mid] < elem)
    1356  	idx = mid + 1;
    1357        else
    1358  	right = mid;
    1359      }
    1360    return set->elems[idx] == elem ? idx + 1 : 0;
    1361  }
    1362  
    1363  static void
    1364  re_node_set_remove_at (re_node_set *set, Idx idx)
    1365  {
    1366    if (idx < 0 || idx >= set->nelem)
    1367      return;
    1368    --set->nelem;
    1369    for (; idx < set->nelem; idx++)
    1370      set->elems[idx] = set->elems[idx + 1];
    1371  }
    1372  
    1373  
    1374  /* Add the token TOKEN to dfa->nodes, and return the index of the token.
    1375     Or return -1 if an error occurred.  */
    1376  
    1377  static Idx
    1378  re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
    1379  {
    1380    if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
    1381      {
    1382        size_t new_nodes_alloc = dfa->nodes_alloc * 2;
    1383        Idx *new_nexts, *new_indices;
    1384        re_node_set *new_edests, *new_eclosures;
    1385        re_token_t *new_nodes;
    1386  
    1387        /* Avoid overflows in realloc.  */
    1388        const size_t max_object_size = MAX (sizeof (re_token_t),
    1389  					  MAX (sizeof (re_node_set),
    1390  					       sizeof (Idx)));
    1391        if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
    1392  			    < new_nodes_alloc))
    1393  	return -1;
    1394  
    1395        new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
    1396        if (__glibc_unlikely (new_nodes == NULL))
    1397  	return -1;
    1398        dfa->nodes = new_nodes;
    1399        new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
    1400        new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
    1401        new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
    1402        new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
    1403        if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
    1404  			    || new_edests == NULL || new_eclosures == NULL))
    1405  	{
    1406  	   re_free (new_nexts);
    1407  	   re_free (new_indices);
    1408  	   re_free (new_edests);
    1409  	   re_free (new_eclosures);
    1410  	   return -1;
    1411  	}
    1412        dfa->nexts = new_nexts;
    1413        dfa->org_indices = new_indices;
    1414        dfa->edests = new_edests;
    1415        dfa->eclosures = new_eclosures;
    1416        dfa->nodes_alloc = new_nodes_alloc;
    1417      }
    1418    dfa->nodes[dfa->nodes_len] = token;
    1419    dfa->nodes[dfa->nodes_len].constraint = 0;
    1420    dfa->nodes[dfa->nodes_len].accept_mb =
    1421      ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
    1422       || token.type == COMPLEX_BRACKET);
    1423    dfa->nexts[dfa->nodes_len] = -1;
    1424    re_node_set_init_empty (dfa->edests + dfa->nodes_len);
    1425    re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
    1426    return dfa->nodes_len++;
    1427  }
    1428  
    1429  static re_hashval_t
    1430  calc_state_hash (const re_node_set *nodes, unsigned int context)
    1431  {
    1432    re_hashval_t hash = nodes->nelem + context;
    1433    Idx i;
    1434    for (i = 0 ; i < nodes->nelem ; i++)
    1435      hash += nodes->elems[i];
    1436    return hash;
    1437  }
    1438  
    1439  /* Search for the state whose node_set is equivalent to NODES.
    1440     Return the pointer to the state, if we found it in the DFA.
    1441     Otherwise create the new one and return it.  In case of an error
    1442     return NULL and set the error code in ERR.
    1443     Note: - We assume NULL as the invalid state, then it is possible that
    1444  	   return value is NULL and ERR is REG_NOERROR.
    1445  	 - We never return non-NULL value in case of any errors, it is for
    1446  	   optimization.  */
    1447  
    1448  static re_dfastate_t *
    1449  __attribute_warn_unused_result__
    1450  re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
    1451  		  const re_node_set *nodes)
    1452  {
    1453    re_hashval_t hash;
    1454    re_dfastate_t *new_state;
    1455    struct re_state_table_entry *spot;
    1456    Idx i;
    1457  #if defined GCC_LINT || defined lint
    1458    /* Suppress bogus uninitialized-variable warnings.  */
    1459    *err = REG_NOERROR;
    1460  #endif
    1461    if (__glibc_unlikely (nodes->nelem == 0))
    1462      {
    1463        *err = REG_NOERROR;
    1464        return NULL;
    1465      }
    1466    hash = calc_state_hash (nodes, 0);
    1467    spot = dfa->state_table + (hash & dfa->state_hash_mask);
    1468  
    1469    for (i = 0 ; i < spot->num ; i++)
    1470      {
    1471        re_dfastate_t *state = spot->array[i];
    1472        if (hash != state->hash)
    1473  	continue;
    1474        if (re_node_set_compare (&state->nodes, nodes))
    1475  	return state;
    1476      }
    1477  
    1478    /* There are no appropriate state in the dfa, create the new one.  */
    1479    new_state = create_ci_newstate (dfa, nodes, hash);
    1480    if (__glibc_unlikely (new_state == NULL))
    1481      *err = REG_ESPACE;
    1482  
    1483    return new_state;
    1484  }
    1485  
    1486  /* Search for the state whose node_set is equivalent to NODES and
    1487     whose context is equivalent to CONTEXT.
    1488     Return the pointer to the state, if we found it in the DFA.
    1489     Otherwise create the new one and return it.  In case of an error
    1490     return NULL and set the error code in ERR.
    1491     Note: - We assume NULL as the invalid state, then it is possible that
    1492  	   return value is NULL and ERR is REG_NOERROR.
    1493  	 - We never return non-NULL value in case of any errors, it is for
    1494  	   optimization.  */
    1495  
    1496  static re_dfastate_t *
    1497  __attribute_warn_unused_result__
    1498  re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
    1499  			  const re_node_set *nodes, unsigned int context)
    1500  {
    1501    re_hashval_t hash;
    1502    re_dfastate_t *new_state;
    1503    struct re_state_table_entry *spot;
    1504    Idx i;
    1505  #if defined GCC_LINT || defined lint
    1506    /* Suppress bogus uninitialized-variable warnings.  */
    1507    *err = REG_NOERROR;
    1508  #endif
    1509    if (nodes->nelem == 0)
    1510      {
    1511        *err = REG_NOERROR;
    1512        return NULL;
    1513      }
    1514    hash = calc_state_hash (nodes, context);
    1515    spot = dfa->state_table + (hash & dfa->state_hash_mask);
    1516  
    1517    for (i = 0 ; i < spot->num ; i++)
    1518      {
    1519        re_dfastate_t *state = spot->array[i];
    1520        if (state->hash == hash
    1521  	  && state->context == context
    1522  	  && re_node_set_compare (state->entrance_nodes, nodes))
    1523  	return state;
    1524      }
    1525    /* There are no appropriate state in 'dfa', create the new one.  */
    1526    new_state = create_cd_newstate (dfa, nodes, context, hash);
    1527    if (__glibc_unlikely (new_state == NULL))
    1528      *err = REG_ESPACE;
    1529  
    1530    return new_state;
    1531  }
    1532  
    1533  /* Finish initialization of the new state NEWSTATE, and using its hash value
    1534     HASH put in the appropriate bucket of DFA's state table.  Return value
    1535     indicates the error code if failed.  */
    1536  
    1537  static reg_errcode_t
    1538  __attribute_warn_unused_result__
    1539  register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
    1540  		re_hashval_t hash)
    1541  {
    1542    struct re_state_table_entry *spot;
    1543    reg_errcode_t err;
    1544    Idx i;
    1545  
    1546    newstate->hash = hash;
    1547    err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
    1548    if (__glibc_unlikely (err != REG_NOERROR))
    1549      return REG_ESPACE;
    1550    for (i = 0; i < newstate->nodes.nelem; i++)
    1551      {
    1552        Idx elem = newstate->nodes.elems[i];
    1553        if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
    1554  	if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
    1555  	  return REG_ESPACE;
    1556      }
    1557  
    1558    spot = dfa->state_table + (hash & dfa->state_hash_mask);
    1559    if (__glibc_unlikely (spot->alloc <= spot->num))
    1560      {
    1561        Idx new_alloc = 2 * spot->num + 2;
    1562        re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
    1563  					      new_alloc);
    1564        if (__glibc_unlikely (new_array == NULL))
    1565  	return REG_ESPACE;
    1566        spot->array = new_array;
    1567        spot->alloc = new_alloc;
    1568      }
    1569    spot->array[spot->num++] = newstate;
    1570    return REG_NOERROR;
    1571  }
    1572  
    1573  static void
    1574  free_state (re_dfastate_t *state)
    1575  {
    1576    re_node_set_free (&state->non_eps_nodes);
    1577    re_node_set_free (&state->inveclosure);
    1578    if (state->entrance_nodes != &state->nodes)
    1579      {
    1580        re_node_set_free (state->entrance_nodes);
    1581        re_free (state->entrance_nodes);
    1582      }
    1583    re_node_set_free (&state->nodes);
    1584    re_free (state->word_trtable);
    1585    re_free (state->trtable);
    1586    re_free (state);
    1587  }
    1588  
    1589  /* Create the new state which is independent of contexts.
    1590     Return the new state if succeeded, otherwise return NULL.  */
    1591  
    1592  static re_dfastate_t *
    1593  __attribute_warn_unused_result__
    1594  create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
    1595  		    re_hashval_t hash)
    1596  {
    1597    Idx i;
    1598    reg_errcode_t err;
    1599    re_dfastate_t *newstate;
    1600  
    1601    newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
    1602    if (__glibc_unlikely (newstate == NULL))
    1603      return NULL;
    1604    err = re_node_set_init_copy (&newstate->nodes, nodes);
    1605    if (__glibc_unlikely (err != REG_NOERROR))
    1606      {
    1607        re_free (newstate);
    1608        return NULL;
    1609      }
    1610  
    1611    newstate->entrance_nodes = &newstate->nodes;
    1612    for (i = 0 ; i < nodes->nelem ; i++)
    1613      {
    1614        re_token_t *node = dfa->nodes + nodes->elems[i];
    1615        re_token_type_t type = node->type;
    1616        if (type == CHARACTER && !node->constraint)
    1617  	continue;
    1618        newstate->accept_mb |= node->accept_mb;
    1619  
    1620        /* If the state has the halt node, the state is a halt state.  */
    1621        if (type == END_OF_RE)
    1622  	newstate->halt = 1;
    1623        else if (type == OP_BACK_REF)
    1624  	newstate->has_backref = 1;
    1625        else if (type == ANCHOR || node->constraint)
    1626  	newstate->has_constraint = 1;
    1627      }
    1628    err = register_state (dfa, newstate, hash);
    1629    if (__glibc_unlikely (err != REG_NOERROR))
    1630      {
    1631        free_state (newstate);
    1632        newstate = NULL;
    1633      }
    1634    return newstate;
    1635  }
    1636  
    1637  /* Create the new state which is depend on the context CONTEXT.
    1638     Return the new state if succeeded, otherwise return NULL.  */
    1639  
    1640  static re_dfastate_t *
    1641  __attribute_warn_unused_result__
    1642  create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
    1643  		    unsigned int context, re_hashval_t hash)
    1644  {
    1645    Idx i, nctx_nodes = 0;
    1646    reg_errcode_t err;
    1647    re_dfastate_t *newstate;
    1648  
    1649    newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
    1650    if (__glibc_unlikely (newstate == NULL))
    1651      return NULL;
    1652    err = re_node_set_init_copy (&newstate->nodes, nodes);
    1653    if (__glibc_unlikely (err != REG_NOERROR))
    1654      {
    1655        re_free (newstate);
    1656        return NULL;
    1657      }
    1658  
    1659    newstate->context = context;
    1660    newstate->entrance_nodes = &newstate->nodes;
    1661  
    1662    for (i = 0 ; i < nodes->nelem ; i++)
    1663      {
    1664        re_token_t *node = dfa->nodes + nodes->elems[i];
    1665        re_token_type_t type = node->type;
    1666        unsigned int constraint = node->constraint;
    1667  
    1668        if (type == CHARACTER && !constraint)
    1669  	continue;
    1670        newstate->accept_mb |= node->accept_mb;
    1671  
    1672        /* If the state has the halt node, the state is a halt state.  */
    1673        if (type == END_OF_RE)
    1674  	newstate->halt = 1;
    1675        else if (type == OP_BACK_REF)
    1676  	newstate->has_backref = 1;
    1677  
    1678        if (constraint)
    1679  	{
    1680  	  if (newstate->entrance_nodes == &newstate->nodes)
    1681  	    {
    1682  	      re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
    1683  	      if (__glibc_unlikely (entrance_nodes == NULL))
    1684  		{
    1685  		  free_state (newstate);
    1686  		  return NULL;
    1687  		}
    1688  	      newstate->entrance_nodes = entrance_nodes;
    1689  	      if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
    1690  		  != REG_NOERROR)
    1691  		{
    1692  		  free_state (newstate);
    1693  		  return NULL;
    1694  		}
    1695  	      nctx_nodes = 0;
    1696  	      newstate->has_constraint = 1;
    1697  	    }
    1698  
    1699  	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
    1700  	    {
    1701  	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
    1702  	      ++nctx_nodes;
    1703  	    }
    1704  	}
    1705      }
    1706    err = register_state (dfa, newstate, hash);
    1707    if (__glibc_unlikely (err != REG_NOERROR))
    1708      {
    1709        free_state (newstate);
    1710        newstate = NULL;
    1711      }
    1712    return  newstate;
    1713  }