(root)/
tar-1.35/
gnu/
regex_internal.c
       1  /* Extended regular expression matching and search library.
       2     Copyright (C) 2002-2023 Free Software Foundation, Inc.
       3     This file is part of the GNU C Library.
       4     Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
       5  
       6     The GNU C Library is free software; you can redistribute it and/or
       7     modify it under the terms of the GNU Lesser General Public
       8     License as published by the Free Software Foundation; either
       9     version 2.1 of the License, or (at your option) any later version.
      10  
      11     The GNU C Library is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      14     Lesser General Public License for more details.
      15  
      16     You should have received a copy of the GNU Lesser General Public
      17     License along with the GNU C Library; if not, see
      18     <https://www.gnu.org/licenses/>.  */
      19  
      20  static void re_string_construct_common (const char *str, Idx len,
      21  					re_string_t *pstr,
      22  					RE_TRANSLATE_TYPE trans, bool icase,
      23  					const re_dfa_t *dfa);
      24  static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
      25  					  const re_node_set *nodes,
      26  					  re_hashval_t hash);
      27  static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
      28  					  const re_node_set *nodes,
      29  					  unsigned int context,
      30  					  re_hashval_t hash);
      31  static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
      32  						Idx new_buf_len);
      33  static void build_wcs_buffer (re_string_t *pstr);
      34  static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
      35  static void build_upper_buffer (re_string_t *pstr);
      36  static void re_string_translate_buffer (re_string_t *pstr);
      37  static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
      38  					  int eflags) __attribute__ ((pure));
      39  
      40  /* Functions for string operation.  */
      41  
      42  /* This function allocate the buffers.  It is necessary to call
      43     re_string_reconstruct before using the object.  */
      44  
      45  static reg_errcode_t
      46  __attribute_warn_unused_result__
      47  re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
      48  		    RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
      49  {
      50    reg_errcode_t ret;
      51    Idx init_buf_len;
      52  
      53    /* Ensure at least one character fits into the buffers.  */
      54    if (init_len < dfa->mb_cur_max)
      55      init_len = dfa->mb_cur_max;
      56    init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
      57    re_string_construct_common (str, len, pstr, trans, icase, dfa);
      58  
      59    ret = re_string_realloc_buffers (pstr, init_buf_len);
      60    if (__glibc_unlikely (ret != REG_NOERROR))
      61      return ret;
      62  
      63    pstr->word_char = dfa->word_char;
      64    pstr->word_ops_used = dfa->word_ops_used;
      65    pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
      66    pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
      67    pstr->valid_raw_len = pstr->valid_len;
      68    return REG_NOERROR;
      69  }
      70  
      71  /* This function allocate the buffers, and initialize them.  */
      72  
      73  static reg_errcode_t
      74  __attribute_warn_unused_result__
      75  re_string_construct (re_string_t *pstr, const char *str, Idx len,
      76  		     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
      77  {
      78    reg_errcode_t ret;
      79    memset (pstr, '\0', sizeof (re_string_t));
      80    re_string_construct_common (str, len, pstr, trans, icase, dfa);
      81  
      82    if (len > 0)
      83      {
      84        ret = re_string_realloc_buffers (pstr, len + 1);
      85        if (__glibc_unlikely (ret != REG_NOERROR))
      86  	return ret;
      87      }
      88    pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
      89  
      90    if (icase)
      91      {
      92        if (dfa->mb_cur_max > 1)
      93  	{
      94  	  while (1)
      95  	    {
      96  	      ret = build_wcs_upper_buffer (pstr);
      97  	      if (__glibc_unlikely (ret != REG_NOERROR))
      98  		return ret;
      99  	      if (pstr->valid_raw_len >= len)
     100  		break;
     101  	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
     102  		break;
     103  	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
     104  	      if (__glibc_unlikely (ret != REG_NOERROR))
     105  		return ret;
     106  	    }
     107  	}
     108        else
     109  	build_upper_buffer (pstr);
     110      }
     111    else
     112      {
     113        if (dfa->mb_cur_max > 1)
     114  	build_wcs_buffer (pstr);
     115        else
     116  	{
     117  	  if (trans != NULL)
     118  	    re_string_translate_buffer (pstr);
     119  	  else
     120  	    {
     121  	      pstr->valid_len = pstr->bufs_len;
     122  	      pstr->valid_raw_len = pstr->bufs_len;
     123  	    }
     124  	}
     125      }
     126  
     127    return REG_NOERROR;
     128  }
     129  
     130  /* Helper functions for re_string_allocate, and re_string_construct.  */
     131  
     132  static reg_errcode_t
     133  __attribute_warn_unused_result__
     134  re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
     135  {
     136    if (pstr->mb_cur_max > 1)
     137      {
     138        wint_t *new_wcs;
     139  
     140        /* Avoid overflow in realloc.  */
     141        const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
     142        if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
     143  			    < new_buf_len))
     144  	return REG_ESPACE;
     145  
     146        new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
     147        if (__glibc_unlikely (new_wcs == NULL))
     148  	return REG_ESPACE;
     149        pstr->wcs = new_wcs;
     150        if (pstr->offsets != NULL)
     151  	{
     152  	  Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
     153  	  if (__glibc_unlikely (new_offsets == NULL))
     154  	    return REG_ESPACE;
     155  	  pstr->offsets = new_offsets;
     156  	}
     157      }
     158    if (pstr->mbs_allocated)
     159      {
     160        unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
     161  					   new_buf_len);
     162        if (__glibc_unlikely (new_mbs == NULL))
     163  	return REG_ESPACE;
     164        pstr->mbs = new_mbs;
     165      }
     166    pstr->bufs_len = new_buf_len;
     167    return REG_NOERROR;
     168  }
     169  
     170  
     171  static void
     172  re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
     173  			    RE_TRANSLATE_TYPE trans, bool icase,
     174  			    const re_dfa_t *dfa)
     175  {
     176    pstr->raw_mbs = (const unsigned char *) str;
     177    pstr->len = len;
     178    pstr->raw_len = len;
     179    pstr->trans = trans;
     180    pstr->icase = icase;
     181    pstr->mbs_allocated = (trans != NULL || icase);
     182    pstr->mb_cur_max = dfa->mb_cur_max;
     183    pstr->is_utf8 = dfa->is_utf8;
     184    pstr->map_notascii = dfa->map_notascii;
     185    pstr->stop = pstr->len;
     186    pstr->raw_stop = pstr->stop;
     187  }
     188  
     189  
     190  /* Build wide character buffer PSTR->WCS.
     191     If the byte sequence of the string are:
     192       <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
     193     Then wide character buffer will be:
     194       <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
     195     We use WEOF for padding, they indicate that the position isn't
     196     a first byte of a multibyte character.
     197  
     198     Note that this function assumes PSTR->VALID_LEN elements are already
     199     built and starts from PSTR->VALID_LEN.  */
     200  
     201  static void
     202  build_wcs_buffer (re_string_t *pstr)
     203  {
     204  #ifdef _LIBC
     205    unsigned char buf[MB_LEN_MAX];
     206    DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
     207  #else
     208    unsigned char buf[64];
     209  #endif
     210    mbstate_t prev_st;
     211    Idx byte_idx, end_idx, remain_len;
     212    size_t mbclen;
     213  
     214    /* Build the buffers from pstr->valid_len to either pstr->len or
     215       pstr->bufs_len.  */
     216    end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
     217    for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
     218      {
     219        wchar_t wc;
     220        const char *p;
     221  
     222        remain_len = end_idx - byte_idx;
     223        prev_st = pstr->cur_state;
     224        /* Apply the translation if we need.  */
     225        if (__glibc_unlikely (pstr->trans != NULL))
     226  	{
     227  	  int i, ch;
     228  
     229  	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
     230  	    {
     231  	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
     232  	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
     233  	    }
     234  	  p = (const char *) buf;
     235  	}
     236        else
     237  	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
     238        mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
     239        if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
     240  			    || (mbclen == (size_t) -2
     241  				&& pstr->bufs_len >= pstr->len)))
     242  	{
     243  	  /* We treat these cases as a singlebyte character.  */
     244  	  mbclen = 1;
     245  	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
     246  	  if (__glibc_unlikely (pstr->trans != NULL))
     247  	    wc = pstr->trans[wc];
     248  	  pstr->cur_state = prev_st;
     249  	}
     250        else if (__glibc_unlikely (mbclen == (size_t) -2))
     251  	{
     252  	  /* The buffer doesn't have enough space, finish to build.  */
     253  	  pstr->cur_state = prev_st;
     254  	  break;
     255  	}
     256  
     257        /* Write wide character and padding.  */
     258        pstr->wcs[byte_idx++] = wc;
     259        /* Write paddings.  */
     260        for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
     261  	pstr->wcs[byte_idx++] = WEOF;
     262      }
     263    pstr->valid_len = byte_idx;
     264    pstr->valid_raw_len = byte_idx;
     265  }
     266  
     267  /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
     268     but for REG_ICASE.  */
     269  
     270  static reg_errcode_t
     271  __attribute_warn_unused_result__
     272  build_wcs_upper_buffer (re_string_t *pstr)
     273  {
     274    mbstate_t prev_st;
     275    Idx src_idx, byte_idx, end_idx, remain_len;
     276    size_t mbclen;
     277  #ifdef _LIBC
     278    char buf[MB_LEN_MAX];
     279    DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
     280  #else
     281    char buf[64];
     282  #endif
     283  
     284    byte_idx = pstr->valid_len;
     285    end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
     286  
     287    /* The following optimization assumes that ASCII characters can be
     288       mapped to wide characters with a simple cast.  */
     289    if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
     290      {
     291        while (byte_idx < end_idx)
     292  	{
     293  	  wchar_t wc;
     294  	  unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
     295  
     296  	  if (isascii (ch) && mbsinit (&pstr->cur_state))
     297  	    {
     298  	      /* The next step uses the assumption that wchar_t is encoded
     299  		 ASCII-safe: all ASCII values can be converted like this.  */
     300  	      wchar_t wcu = __towupper (ch);
     301  	      if (isascii (wcu))
     302  		{
     303  		  pstr->mbs[byte_idx] = wcu;
     304  		  pstr->wcs[byte_idx] = wcu;
     305  		  byte_idx++;
     306  		  continue;
     307  		}
     308  	    }
     309  
     310  	  remain_len = end_idx - byte_idx;
     311  	  prev_st = pstr->cur_state;
     312  	  mbclen = __mbrtowc (&wc,
     313  			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
     314  			       + byte_idx), remain_len, &pstr->cur_state);
     315  	  if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
     316  	    {
     317  	      wchar_t wcu = __towupper (wc);
     318  	      if (wcu != wc)
     319  		{
     320  		  size_t mbcdlen;
     321  
     322  		  mbcdlen = __wcrtomb (buf, wcu, &prev_st);
     323  		  if (__glibc_likely (mbclen == mbcdlen))
     324  		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
     325  		  else
     326  		    {
     327  		      src_idx = byte_idx;
     328  		      goto offsets_needed;
     329  		    }
     330  		}
     331  	      else
     332  		memcpy (pstr->mbs + byte_idx,
     333  			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
     334  	      pstr->wcs[byte_idx++] = wcu;
     335  	      /* Write paddings.  */
     336  	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
     337  		pstr->wcs[byte_idx++] = WEOF;
     338  	    }
     339  	  else if (mbclen == (size_t) -1 || mbclen == 0
     340  		   || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
     341  	    {
     342  	      /* It is an invalid character, an incomplete character
     343  		 at the end of the string, or '\0'.  Just use the byte.  */
     344  	      pstr->mbs[byte_idx] = ch;
     345  	      /* And also cast it to wide char.  */
     346  	      pstr->wcs[byte_idx++] = (wchar_t) ch;
     347  	      if (__glibc_unlikely (mbclen == (size_t) -1))
     348  		pstr->cur_state = prev_st;
     349  	    }
     350  	  else
     351  	    {
     352  	      /* The buffer doesn't have enough space, finish to build.  */
     353  	      pstr->cur_state = prev_st;
     354  	      break;
     355  	    }
     356  	}
     357        pstr->valid_len = byte_idx;
     358        pstr->valid_raw_len = byte_idx;
     359        return REG_NOERROR;
     360      }
     361    else
     362      for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
     363        {
     364  	wchar_t wc;
     365  	const char *p;
     366        offsets_needed:
     367  	remain_len = end_idx - byte_idx;
     368  	prev_st = pstr->cur_state;
     369  	if (__glibc_unlikely (pstr->trans != NULL))
     370  	  {
     371  	    int i, ch;
     372  
     373  	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
     374  	      {
     375  		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
     376  		buf[i] = pstr->trans[ch];
     377  	      }
     378  	    p = (const char *) buf;
     379  	  }
     380  	else
     381  	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
     382  	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
     383  	if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
     384  	  {
     385  	    wchar_t wcu = __towupper (wc);
     386  	    if (wcu != wc)
     387  	      {
     388  		size_t mbcdlen;
     389  
     390  		mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
     391  		if (__glibc_likely (mbclen == mbcdlen))
     392  		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
     393  		else if (mbcdlen != (size_t) -1)
     394  		  {
     395  		    size_t i;
     396  
     397  		    if (byte_idx + mbcdlen > pstr->bufs_len)
     398  		      {
     399  			pstr->cur_state = prev_st;
     400  			break;
     401  		      }
     402  
     403  		    if (pstr->offsets == NULL)
     404  		      {
     405  			pstr->offsets = re_malloc (Idx, pstr->bufs_len);
     406  
     407  			if (pstr->offsets == NULL)
     408  			  return REG_ESPACE;
     409  		      }
     410  		    if (!pstr->offsets_needed)
     411  		      {
     412  			for (i = 0; i < (size_t) byte_idx; ++i)
     413  			  pstr->offsets[i] = i;
     414  			pstr->offsets_needed = 1;
     415  		      }
     416  
     417  		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
     418  		    pstr->wcs[byte_idx] = wcu;
     419  		    pstr->offsets[byte_idx] = src_idx;
     420  		    for (i = 1; i < mbcdlen; ++i)
     421  		      {
     422  			pstr->offsets[byte_idx + i]
     423  			  = src_idx + (i < mbclen ? i : mbclen - 1);
     424  			pstr->wcs[byte_idx + i] = WEOF;
     425  		      }
     426  		    pstr->len += mbcdlen - mbclen;
     427  		    if (pstr->raw_stop > src_idx)
     428  		      pstr->stop += mbcdlen - mbclen;
     429  		    end_idx = (pstr->bufs_len > pstr->len)
     430  			      ? pstr->len : pstr->bufs_len;
     431  		    byte_idx += mbcdlen;
     432  		    src_idx += mbclen;
     433  		    continue;
     434  		  }
     435  		else
     436  		  memcpy (pstr->mbs + byte_idx, p, mbclen);
     437  	      }
     438  	    else
     439  	      memcpy (pstr->mbs + byte_idx, p, mbclen);
     440  
     441  	    if (__glibc_unlikely (pstr->offsets_needed != 0))
     442  	      {
     443  		size_t i;
     444  		for (i = 0; i < mbclen; ++i)
     445  		  pstr->offsets[byte_idx + i] = src_idx + i;
     446  	      }
     447  	    src_idx += mbclen;
     448  
     449  	    pstr->wcs[byte_idx++] = wcu;
     450  	    /* Write paddings.  */
     451  	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
     452  	      pstr->wcs[byte_idx++] = WEOF;
     453  	  }
     454  	else if (mbclen == (size_t) -1 || mbclen == 0
     455  		 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
     456  	  {
     457  	    /* It is an invalid character or '\0'.  Just use the byte.  */
     458  	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
     459  
     460  	    if (__glibc_unlikely (pstr->trans != NULL))
     461  	      ch = pstr->trans [ch];
     462  	    pstr->mbs[byte_idx] = ch;
     463  
     464  	    if (__glibc_unlikely (pstr->offsets_needed != 0))
     465  	      pstr->offsets[byte_idx] = src_idx;
     466  	    ++src_idx;
     467  
     468  	    /* And also cast it to wide char.  */
     469  	    pstr->wcs[byte_idx++] = (wchar_t) ch;
     470  	    if (__glibc_unlikely (mbclen == (size_t) -1))
     471  	      pstr->cur_state = prev_st;
     472  	  }
     473  	else
     474  	  {
     475  	    /* The buffer doesn't have enough space, finish to build.  */
     476  	    pstr->cur_state = prev_st;
     477  	    break;
     478  	  }
     479        }
     480    pstr->valid_len = byte_idx;
     481    pstr->valid_raw_len = src_idx;
     482    return REG_NOERROR;
     483  }
     484  
     485  /* Skip characters until the index becomes greater than NEW_RAW_IDX.
     486     Return the index.  */
     487  
     488  static Idx
     489  re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
     490  {
     491    mbstate_t prev_st;
     492    Idx rawbuf_idx;
     493    size_t mbclen;
     494    wint_t wc = WEOF;
     495  
     496    /* Skip the characters which are not necessary to check.  */
     497    for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
     498         rawbuf_idx < new_raw_idx;)
     499      {
     500        wchar_t wc2;
     501        Idx remain_len = pstr->raw_len - rawbuf_idx;
     502        prev_st = pstr->cur_state;
     503        mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
     504  			  remain_len, &pstr->cur_state);
     505        if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
     506  			    || mbclen == 0))
     507  	{
     508  	  /* We treat these cases as a single byte character.  */
     509  	  if (mbclen == 0 || remain_len == 0)
     510  	    wc = L'\0';
     511  	  else
     512  	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
     513  	  mbclen = 1;
     514  	  pstr->cur_state = prev_st;
     515  	}
     516        else
     517  	wc = wc2;
     518        /* Then proceed the next character.  */
     519        rawbuf_idx += mbclen;
     520      }
     521    *last_wc = wc;
     522    return rawbuf_idx;
     523  }
     524  
     525  /* Build the buffer PSTR->MBS, and apply the translation if we need.
     526     This function is used in case of REG_ICASE.  */
     527  
     528  static void
     529  build_upper_buffer (re_string_t *pstr)
     530  {
     531    Idx char_idx, end_idx;
     532    end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
     533  
     534    for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
     535      {
     536        int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
     537        if (__glibc_unlikely (pstr->trans != NULL))
     538  	ch = pstr->trans[ch];
     539        pstr->mbs[char_idx] = toupper (ch);
     540      }
     541    pstr->valid_len = char_idx;
     542    pstr->valid_raw_len = char_idx;
     543  }
     544  
     545  /* Apply TRANS to the buffer in PSTR.  */
     546  
     547  static void
     548  re_string_translate_buffer (re_string_t *pstr)
     549  {
     550    Idx buf_idx, end_idx;
     551    end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
     552  
     553    for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
     554      {
     555        int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
     556        pstr->mbs[buf_idx] = pstr->trans[ch];
     557      }
     558  
     559    pstr->valid_len = buf_idx;
     560    pstr->valid_raw_len = buf_idx;
     561  }
     562  
     563  /* This function re-construct the buffers.
     564     Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
     565     convert to upper case in case of REG_ICASE, apply translation.  */
     566  
     567  static reg_errcode_t
     568  __attribute_warn_unused_result__
     569  re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
     570  {
     571    Idx offset;
     572  
     573    if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
     574      offset = idx - pstr->raw_mbs_idx;
     575    else
     576      {
     577        /* Reset buffer.  */
     578        if (pstr->mb_cur_max > 1)
     579  	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
     580        pstr->len = pstr->raw_len;
     581        pstr->stop = pstr->raw_stop;
     582        pstr->valid_len = 0;
     583        pstr->raw_mbs_idx = 0;
     584        pstr->valid_raw_len = 0;
     585        pstr->offsets_needed = 0;
     586        pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
     587  			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
     588        if (!pstr->mbs_allocated)
     589  	pstr->mbs = (unsigned char *) pstr->raw_mbs;
     590        offset = idx;
     591      }
     592  
     593    if (__glibc_likely (offset != 0))
     594      {
     595        /* Should the already checked characters be kept?  */
     596        if (__glibc_likely (offset < pstr->valid_raw_len))
     597  	{
     598  	  /* Yes, move them to the front of the buffer.  */
     599  	  if (__glibc_unlikely (pstr->offsets_needed))
     600  	    {
     601  	      Idx low = 0, high = pstr->valid_len, mid;
     602  	      do
     603  		{
     604  		  mid = (high + low) / 2;
     605  		  if (pstr->offsets[mid] > offset)
     606  		    high = mid;
     607  		  else if (pstr->offsets[mid] < offset)
     608  		    low = mid + 1;
     609  		  else
     610  		    break;
     611  		}
     612  	      while (low < high);
     613  	      if (pstr->offsets[mid] < offset)
     614  		++mid;
     615  	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
     616  							eflags);
     617  	      /* This can be quite complicated, so handle specially
     618  		 only the common and easy case where the character with
     619  		 different length representation of lower and upper
     620  		 case is present at or after offset.  */
     621  	      if (pstr->valid_len > offset
     622  		  && mid == offset && pstr->offsets[mid] == offset)
     623  		{
     624  		  memmove (pstr->wcs, pstr->wcs + offset,
     625  			   (pstr->valid_len - offset) * sizeof (wint_t));
     626  		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
     627  		  pstr->valid_len -= offset;
     628  		  pstr->valid_raw_len -= offset;
     629  		  for (low = 0; low < pstr->valid_len; low++)
     630  		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
     631  		}
     632  	      else
     633  		{
     634  		  /* Otherwise, just find out how long the partial multibyte
     635  		     character at offset is and fill it with WEOF/255.  */
     636  		  pstr->len = pstr->raw_len - idx + offset;
     637  		  pstr->stop = pstr->raw_stop - idx + offset;
     638  		  pstr->offsets_needed = 0;
     639  		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
     640  		    --mid;
     641  		  while (mid < pstr->valid_len)
     642  		    if (pstr->wcs[mid] != WEOF)
     643  		      break;
     644  		    else
     645  		      ++mid;
     646  		  if (mid == pstr->valid_len)
     647  		    pstr->valid_len = 0;
     648  		  else
     649  		    {
     650  		      pstr->valid_len = pstr->offsets[mid] - offset;
     651  		      if (pstr->valid_len)
     652  			{
     653  			  for (low = 0; low < pstr->valid_len; ++low)
     654  			    pstr->wcs[low] = WEOF;
     655  			  memset (pstr->mbs, 255, pstr->valid_len);
     656  			}
     657  		    }
     658  		  pstr->valid_raw_len = pstr->valid_len;
     659  		}
     660  	    }
     661  	  else
     662  	    {
     663  	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
     664  							eflags);
     665  	      if (pstr->mb_cur_max > 1)
     666  		memmove (pstr->wcs, pstr->wcs + offset,
     667  			 (pstr->valid_len - offset) * sizeof (wint_t));
     668  	      if (__glibc_unlikely (pstr->mbs_allocated))
     669  		memmove (pstr->mbs, pstr->mbs + offset,
     670  			 pstr->valid_len - offset);
     671  	      pstr->valid_len -= offset;
     672  	      pstr->valid_raw_len -= offset;
     673  	      DEBUG_ASSERT (pstr->valid_len > 0);
     674  	    }
     675  	}
     676        else
     677  	{
     678  	  /* No, skip all characters until IDX.  */
     679  	  Idx prev_valid_len = pstr->valid_len;
     680  
     681  	  if (__glibc_unlikely (pstr->offsets_needed))
     682  	    {
     683  	      pstr->len = pstr->raw_len - idx + offset;
     684  	      pstr->stop = pstr->raw_stop - idx + offset;
     685  	      pstr->offsets_needed = 0;
     686  	    }
     687  	  pstr->valid_len = 0;
     688  	  if (pstr->mb_cur_max > 1)
     689  	    {
     690  	      Idx wcs_idx;
     691  	      wint_t wc = WEOF;
     692  
     693  	      if (pstr->is_utf8)
     694  		{
     695  		  const unsigned char *raw, *p, *end;
     696  
     697  		  /* Special case UTF-8.  Multi-byte chars start with any
     698  		     byte other than 0x80 - 0xbf.  */
     699  		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
     700  		  end = raw + (offset - pstr->mb_cur_max);
     701  		  if (end < pstr->raw_mbs)
     702  		    end = pstr->raw_mbs;
     703  		  p = raw + offset - 1;
     704  #ifdef _LIBC
     705  		  /* We know the wchar_t encoding is UCS4, so for the simple
     706  		     case, ASCII characters, skip the conversion step.  */
     707  		  if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
     708  		    {
     709  		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
     710  		      /* pstr->valid_len = 0; */
     711  		      wc = (wchar_t) *p;
     712  		    }
     713  		  else
     714  #endif
     715  		    for (; p >= end; --p)
     716  		      if ((*p & 0xc0) != 0x80)
     717  			{
     718  			  mbstate_t cur_state;
     719  			  wchar_t wc2;
     720  			  Idx mlen = raw + pstr->len - p;
     721  			  unsigned char buf[6];
     722  			  size_t mbclen;
     723  
     724  			  const unsigned char *pp = p;
     725  			  if (__glibc_unlikely (pstr->trans != NULL))
     726  			    {
     727  			      int i = mlen < 6 ? mlen : 6;
     728  			      while (--i >= 0)
     729  				buf[i] = pstr->trans[p[i]];
     730  			      pp = buf;
     731  			    }
     732  			  /* XXX Don't use mbrtowc, we know which conversion
     733  			     to use (UTF-8 -> UCS4).  */
     734  			  memset (&cur_state, 0, sizeof (cur_state));
     735  			  mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
     736  					      &cur_state);
     737  			  if (raw + offset - p <= mbclen
     738  			      && mbclen < (size_t) -2)
     739  			    {
     740  			      memset (&pstr->cur_state, '\0',
     741  				      sizeof (mbstate_t));
     742  			      pstr->valid_len = mbclen - (raw + offset - p);
     743  			      wc = wc2;
     744  			    }
     745  			  break;
     746  			}
     747  		}
     748  
     749  	      if (wc == WEOF)
     750  		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
     751  	      if (wc == WEOF)
     752  		pstr->tip_context
     753  		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
     754  	      else
     755  		pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
     756  				      && IS_WIDE_WORD_CHAR (wc))
     757  				     ? CONTEXT_WORD
     758  				     : ((IS_WIDE_NEWLINE (wc)
     759  					 && pstr->newline_anchor)
     760  					? CONTEXT_NEWLINE : 0));
     761  	      if (__glibc_unlikely (pstr->valid_len))
     762  		{
     763  		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
     764  		    pstr->wcs[wcs_idx] = WEOF;
     765  		  if (pstr->mbs_allocated)
     766  		    memset (pstr->mbs, 255, pstr->valid_len);
     767  		}
     768  	      pstr->valid_raw_len = pstr->valid_len;
     769  	    }
     770  	  else
     771  	    {
     772  	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
     773  	      pstr->valid_raw_len = 0;
     774  	      if (pstr->trans)
     775  		c = pstr->trans[c];
     776  	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
     777  				   ? CONTEXT_WORD
     778  				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
     779  				      ? CONTEXT_NEWLINE : 0));
     780  	    }
     781  	}
     782        if (!__glibc_unlikely (pstr->mbs_allocated))
     783  	pstr->mbs += offset;
     784      }
     785    pstr->raw_mbs_idx = idx;
     786    pstr->len -= offset;
     787    pstr->stop -= offset;
     788  
     789    /* Then build the buffers.  */
     790    if (pstr->mb_cur_max > 1)
     791      {
     792        if (pstr->icase)
     793  	{
     794  	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
     795  	  if (__glibc_unlikely (ret != REG_NOERROR))
     796  	    return ret;
     797  	}
     798        else
     799  	build_wcs_buffer (pstr);
     800      }
     801    else
     802      if (__glibc_unlikely (pstr->mbs_allocated))
     803        {
     804  	if (pstr->icase)
     805  	  build_upper_buffer (pstr);
     806  	else if (pstr->trans != NULL)
     807  	  re_string_translate_buffer (pstr);
     808        }
     809      else
     810        pstr->valid_len = pstr->len;
     811  
     812    pstr->cur_idx = 0;
     813    return REG_NOERROR;
     814  }
     815  
     816  static unsigned char
     817  __attribute__ ((pure))
     818  re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
     819  {
     820    int ch;
     821    Idx off;
     822  
     823    /* Handle the common (easiest) cases first.  */
     824    if (__glibc_likely (!pstr->mbs_allocated))
     825      return re_string_peek_byte (pstr, idx);
     826  
     827    if (pstr->mb_cur_max > 1
     828        && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
     829      return re_string_peek_byte (pstr, idx);
     830  
     831    off = pstr->cur_idx + idx;
     832    if (pstr->offsets_needed)
     833      off = pstr->offsets[off];
     834  
     835    ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
     836  
     837    /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
     838       this function returns CAPITAL LETTER I instead of first byte of
     839       DOTLESS SMALL LETTER I.  The latter would confuse the parser,
     840       since peek_byte_case doesn't advance cur_idx in any way.  */
     841    if (pstr->offsets_needed && !isascii (ch))
     842      return re_string_peek_byte (pstr, idx);
     843  
     844    return ch;
     845  }
     846  
     847  static unsigned char
     848  re_string_fetch_byte_case (re_string_t *pstr)
     849  {
     850    if (__glibc_likely (!pstr->mbs_allocated))
     851      return re_string_fetch_byte (pstr);
     852  
     853    if (pstr->offsets_needed)
     854      {
     855        Idx off;
     856        int ch;
     857  
     858        /* For tr_TR.UTF-8 [[:islower:]] there is
     859  	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
     860  	 in that case the whole multi-byte character and return
     861  	 the original letter.  On the other side, with
     862  	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
     863  	 anything else would complicate things too much.  */
     864  
     865        if (!re_string_first_byte (pstr, pstr->cur_idx))
     866  	return re_string_fetch_byte (pstr);
     867  
     868        off = pstr->offsets[pstr->cur_idx];
     869        ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
     870  
     871        if (! isascii (ch))
     872  	return re_string_fetch_byte (pstr);
     873  
     874        re_string_skip_bytes (pstr,
     875  			    re_string_char_size_at (pstr, pstr->cur_idx));
     876        return ch;
     877      }
     878  
     879    return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
     880  }
     881  
     882  static void
     883  re_string_destruct (re_string_t *pstr)
     884  {
     885    re_free (pstr->wcs);
     886    re_free (pstr->offsets);
     887    if (pstr->mbs_allocated)
     888      re_free (pstr->mbs);
     889  }
     890  
     891  /* Return the context at IDX in INPUT.  */
     892  
     893  static unsigned int
     894  re_string_context_at (const re_string_t *input, Idx idx, int eflags)
     895  {
     896    int c;
     897    if (__glibc_unlikely (idx < 0))
     898      /* In this case, we use the value stored in input->tip_context,
     899         since we can't know the character in input->mbs[-1] here.  */
     900      return input->tip_context;
     901    if (__glibc_unlikely (idx == input->len))
     902      return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
     903  	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
     904    if (input->mb_cur_max > 1)
     905      {
     906        wint_t wc;
     907        Idx wc_idx = idx;
     908        while(input->wcs[wc_idx] == WEOF)
     909  	{
     910  	  DEBUG_ASSERT (wc_idx >= 0);
     911  	  --wc_idx;
     912  	  if (wc_idx < 0)
     913  	    return input->tip_context;
     914  	}
     915        wc = input->wcs[wc_idx];
     916        if (__glibc_unlikely (input->word_ops_used != 0)
     917  	  && IS_WIDE_WORD_CHAR (wc))
     918  	return CONTEXT_WORD;
     919        return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
     920  	      ? CONTEXT_NEWLINE : 0);
     921      }
     922    else
     923      {
     924        c = re_string_byte_at (input, idx);
     925        if (bitset_contain (input->word_char, c))
     926  	return CONTEXT_WORD;
     927        return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
     928      }
     929  }
     930  
     931  /* Functions for set operation.  */
     932  
     933  static reg_errcode_t
     934  __attribute_warn_unused_result__
     935  re_node_set_alloc (re_node_set *set, Idx size)
     936  {
     937    set->alloc = size;
     938    set->nelem = 0;
     939    set->elems = re_malloc (Idx, size);
     940    if (__glibc_unlikely (set->elems == NULL)
     941        && (MALLOC_0_IS_NONNULL || size != 0))
     942      return REG_ESPACE;
     943    return REG_NOERROR;
     944  }
     945  
     946  static reg_errcode_t
     947  __attribute_warn_unused_result__
     948  re_node_set_init_1 (re_node_set *set, Idx elem)
     949  {
     950    set->alloc = 1;
     951    set->nelem = 1;
     952    set->elems = re_malloc (Idx, 1);
     953    if (__glibc_unlikely (set->elems == NULL))
     954      {
     955        set->alloc = set->nelem = 0;
     956        return REG_ESPACE;
     957      }
     958    set->elems[0] = elem;
     959    return REG_NOERROR;
     960  }
     961  
     962  static reg_errcode_t
     963  __attribute_warn_unused_result__
     964  re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
     965  {
     966    set->alloc = 2;
     967    set->elems = re_malloc (Idx, 2);
     968    if (__glibc_unlikely (set->elems == NULL))
     969      return REG_ESPACE;
     970    if (elem1 == elem2)
     971      {
     972        set->nelem = 1;
     973        set->elems[0] = elem1;
     974      }
     975    else
     976      {
     977        set->nelem = 2;
     978        if (elem1 < elem2)
     979  	{
     980  	  set->elems[0] = elem1;
     981  	  set->elems[1] = elem2;
     982  	}
     983        else
     984  	{
     985  	  set->elems[0] = elem2;
     986  	  set->elems[1] = elem1;
     987  	}
     988      }
     989    return REG_NOERROR;
     990  }
     991  
     992  static reg_errcode_t
     993  __attribute_warn_unused_result__
     994  re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
     995  {
     996    dest->nelem = src->nelem;
     997    if (src->nelem > 0)
     998      {
     999        dest->alloc = dest->nelem;
    1000        dest->elems = re_malloc (Idx, dest->alloc);
    1001        if (__glibc_unlikely (dest->elems == NULL))
    1002  	{
    1003  	  dest->alloc = dest->nelem = 0;
    1004  	  return REG_ESPACE;
    1005  	}
    1006        memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
    1007      }
    1008    else
    1009      re_node_set_init_empty (dest);
    1010    return REG_NOERROR;
    1011  }
    1012  
    1013  /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
    1014     DEST. Return value indicate the error code or REG_NOERROR if succeeded.
    1015     Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
    1016  
    1017  static reg_errcode_t
    1018  __attribute_warn_unused_result__
    1019  re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
    1020  			   const re_node_set *src2)
    1021  {
    1022    Idx i1, i2, is, id, delta, sbase;
    1023    if (src1->nelem == 0 || src2->nelem == 0)
    1024      return REG_NOERROR;
    1025  
    1026    /* We need dest->nelem + 2 * elems_in_intersection; this is a
    1027       conservative estimate.  */
    1028    if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
    1029      {
    1030        Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
    1031        Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
    1032        if (__glibc_unlikely (new_elems == NULL))
    1033  	return REG_ESPACE;
    1034        dest->elems = new_elems;
    1035        dest->alloc = new_alloc;
    1036      }
    1037  
    1038    /* Find the items in the intersection of SRC1 and SRC2, and copy
    1039       into the top of DEST those that are not already in DEST itself.  */
    1040    sbase = dest->nelem + src1->nelem + src2->nelem;
    1041    i1 = src1->nelem - 1;
    1042    i2 = src2->nelem - 1;
    1043    id = dest->nelem - 1;
    1044    for (;;)
    1045      {
    1046        if (src1->elems[i1] == src2->elems[i2])
    1047  	{
    1048  	  /* Try to find the item in DEST.  Maybe we could binary search?  */
    1049  	  while (id >= 0 && dest->elems[id] > src1->elems[i1])
    1050  	    --id;
    1051  
    1052  	  if (id < 0 || dest->elems[id] != src1->elems[i1])
    1053              dest->elems[--sbase] = src1->elems[i1];
    1054  
    1055  	  if (--i1 < 0 || --i2 < 0)
    1056  	    break;
    1057  	}
    1058  
    1059        /* Lower the highest of the two items.  */
    1060        else if (src1->elems[i1] < src2->elems[i2])
    1061  	{
    1062  	  if (--i2 < 0)
    1063  	    break;
    1064  	}
    1065        else
    1066  	{
    1067  	  if (--i1 < 0)
    1068  	    break;
    1069  	}
    1070      }
    1071  
    1072    id = dest->nelem - 1;
    1073    is = dest->nelem + src1->nelem + src2->nelem - 1;
    1074    delta = is - sbase + 1;
    1075  
    1076    /* Now copy.  When DELTA becomes zero, the remaining
    1077       DEST elements are already in place; this is more or
    1078       less the same loop that is in re_node_set_merge.  */
    1079    dest->nelem += delta;
    1080    if (delta > 0 && id >= 0)
    1081      for (;;)
    1082        {
    1083  	if (dest->elems[is] > dest->elems[id])
    1084  	  {
    1085  	    /* Copy from the top.  */
    1086  	    dest->elems[id + delta--] = dest->elems[is--];
    1087  	    if (delta == 0)
    1088  	      break;
    1089  	  }
    1090  	else
    1091  	  {
    1092  	    /* Slide from the bottom.  */
    1093  	    dest->elems[id + delta] = dest->elems[id];
    1094  	    if (--id < 0)
    1095  	      break;
    1096  	  }
    1097        }
    1098  
    1099    /* Copy remaining SRC elements.  */
    1100    memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
    1101  
    1102    return REG_NOERROR;
    1103  }
    1104  
    1105  /* Calculate the union set of the sets SRC1 and SRC2. And store it to
    1106     DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
    1107  
    1108  static reg_errcode_t
    1109  __attribute_warn_unused_result__
    1110  re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
    1111  			const re_node_set *src2)
    1112  {
    1113    Idx i1, i2, id;
    1114    if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
    1115      {
    1116        dest->alloc = src1->nelem + src2->nelem;
    1117        dest->elems = re_malloc (Idx, dest->alloc);
    1118        if (__glibc_unlikely (dest->elems == NULL))
    1119  	return REG_ESPACE;
    1120      }
    1121    else
    1122      {
    1123        if (src1 != NULL && src1->nelem > 0)
    1124  	return re_node_set_init_copy (dest, src1);
    1125        else if (src2 != NULL && src2->nelem > 0)
    1126  	return re_node_set_init_copy (dest, src2);
    1127        else
    1128  	re_node_set_init_empty (dest);
    1129        return REG_NOERROR;
    1130      }
    1131    for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
    1132      {
    1133        if (src1->elems[i1] > src2->elems[i2])
    1134  	{
    1135  	  dest->elems[id++] = src2->elems[i2++];
    1136  	  continue;
    1137  	}
    1138        if (src1->elems[i1] == src2->elems[i2])
    1139  	++i2;
    1140        dest->elems[id++] = src1->elems[i1++];
    1141      }
    1142    if (i1 < src1->nelem)
    1143      {
    1144        memcpy (dest->elems + id, src1->elems + i1,
    1145  	     (src1->nelem - i1) * sizeof (Idx));
    1146        id += src1->nelem - i1;
    1147      }
    1148    else if (i2 < src2->nelem)
    1149      {
    1150        memcpy (dest->elems + id, src2->elems + i2,
    1151  	     (src2->nelem - i2) * sizeof (Idx));
    1152        id += src2->nelem - i2;
    1153      }
    1154    dest->nelem = id;
    1155    return REG_NOERROR;
    1156  }
    1157  
    1158  /* Calculate the union set of the sets DEST and SRC. And store it to
    1159     DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
    1160  
    1161  static reg_errcode_t
    1162  __attribute_warn_unused_result__
    1163  re_node_set_merge (re_node_set *dest, const re_node_set *src)
    1164  {
    1165    Idx is, id, sbase, delta;
    1166    if (src == NULL || src->nelem == 0)
    1167      return REG_NOERROR;
    1168    if (dest->alloc < 2 * src->nelem + dest->nelem)
    1169      {
    1170        Idx new_alloc = 2 * (src->nelem + dest->alloc);
    1171        Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
    1172        if (__glibc_unlikely (new_buffer == NULL))
    1173  	return REG_ESPACE;
    1174        dest->elems = new_buffer;
    1175        dest->alloc = new_alloc;
    1176      }
    1177  
    1178    if (__glibc_unlikely (dest->nelem == 0))
    1179      {
    1180        /* Although we already guaranteed above that dest->alloc != 0 and
    1181           therefore dest->elems != NULL, add a debug assertion to pacify
    1182           GCC 11.2.1's -fanalyzer.  */
    1183        DEBUG_ASSERT (dest->elems);
    1184        dest->nelem = src->nelem;
    1185        memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
    1186        return REG_NOERROR;
    1187      }
    1188  
    1189    /* Copy into the top of DEST the items of SRC that are not
    1190       found in DEST.  Maybe we could binary search in DEST?  */
    1191    for (sbase = dest->nelem + 2 * src->nelem,
    1192         is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
    1193      {
    1194        if (dest->elems[id] == src->elems[is])
    1195  	is--, id--;
    1196        else if (dest->elems[id] < src->elems[is])
    1197  	dest->elems[--sbase] = src->elems[is--];
    1198        else /* if (dest->elems[id] > src->elems[is]) */
    1199  	--id;
    1200      }
    1201  
    1202    if (is >= 0)
    1203      {
    1204        /* If DEST is exhausted, the remaining items of SRC must be unique.  */
    1205        sbase -= is + 1;
    1206        memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
    1207      }
    1208  
    1209    id = dest->nelem - 1;
    1210    is = dest->nelem + 2 * src->nelem - 1;
    1211    delta = is - sbase + 1;
    1212    if (delta == 0)
    1213      return REG_NOERROR;
    1214  
    1215    /* Now copy.  When DELTA becomes zero, the remaining
    1216       DEST elements are already in place.  */
    1217    dest->nelem += delta;
    1218    for (;;)
    1219      {
    1220        if (dest->elems[is] > dest->elems[id])
    1221  	{
    1222  	  /* Copy from the top.  */
    1223  	  dest->elems[id + delta--] = dest->elems[is--];
    1224  	  if (delta == 0)
    1225  	    break;
    1226  	}
    1227        else
    1228  	{
    1229  	  /* Slide from the bottom.  */
    1230  	  dest->elems[id + delta] = dest->elems[id];
    1231  	  if (--id < 0)
    1232  	    {
    1233  	      /* Copy remaining SRC elements.  */
    1234  	      memcpy (dest->elems, dest->elems + sbase,
    1235  		      delta * sizeof (Idx));
    1236  	      break;
    1237  	    }
    1238  	}
    1239      }
    1240  
    1241    return REG_NOERROR;
    1242  }
    1243  
    1244  /* Insert the new element ELEM to the re_node_set* SET.
    1245     SET should not already have ELEM.
    1246     Return true if successful.  */
    1247  
    1248  static bool
    1249  __attribute_warn_unused_result__
    1250  re_node_set_insert (re_node_set *set, Idx elem)
    1251  {
    1252    Idx idx;
    1253    /* In case the set is empty.  */
    1254    if (set->alloc == 0)
    1255      return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
    1256  
    1257    if (__glibc_unlikely (set->nelem) == 0)
    1258      {
    1259        /* Although we already guaranteed above that set->alloc != 0 and
    1260           therefore set->elems != NULL, add a debug assertion to pacify
    1261           GCC 11.2 -fanalyzer.  */
    1262        DEBUG_ASSERT (set->elems);
    1263        set->elems[0] = elem;
    1264        ++set->nelem;
    1265        return true;
    1266      }
    1267  
    1268    /* Realloc if we need.  */
    1269    if (set->alloc == set->nelem)
    1270      {
    1271        Idx *new_elems;
    1272        set->alloc = set->alloc * 2;
    1273        new_elems = re_realloc (set->elems, Idx, set->alloc);
    1274        if (__glibc_unlikely (new_elems == NULL))
    1275  	return false;
    1276        set->elems = new_elems;
    1277      }
    1278  
    1279    /* Move the elements which follows the new element.  Test the
    1280       first element separately to skip a check in the inner loop.  */
    1281    if (elem < set->elems[0])
    1282      {
    1283        for (idx = set->nelem; idx > 0; idx--)
    1284  	set->elems[idx] = set->elems[idx - 1];
    1285      }
    1286    else
    1287      {
    1288        for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
    1289  	set->elems[idx] = set->elems[idx - 1];
    1290        DEBUG_ASSERT (set->elems[idx - 1] < elem);
    1291      }
    1292  
    1293    /* Insert the new element.  */
    1294    set->elems[idx] = elem;
    1295    ++set->nelem;
    1296    return true;
    1297  }
    1298  
    1299  /* Insert the new element ELEM to the re_node_set* SET.
    1300     SET should not already have any element greater than or equal to ELEM.
    1301     Return true if successful.  */
    1302  
    1303  static bool
    1304  __attribute_warn_unused_result__
    1305  re_node_set_insert_last (re_node_set *set, Idx elem)
    1306  {
    1307    /* Realloc if we need.  */
    1308    if (set->alloc == set->nelem)
    1309      {
    1310        Idx *new_elems;
    1311        set->alloc = (set->alloc + 1) * 2;
    1312        new_elems = re_realloc (set->elems, Idx, set->alloc);
    1313        if (__glibc_unlikely (new_elems == NULL))
    1314  	return false;
    1315        set->elems = new_elems;
    1316      }
    1317  
    1318    /* Insert the new element.  */
    1319    set->elems[set->nelem++] = elem;
    1320    return true;
    1321  }
    1322  
    1323  /* Compare two node sets SET1 and SET2.
    1324     Return true if SET1 and SET2 are equivalent.  */
    1325  
    1326  static bool
    1327  __attribute__ ((pure))
    1328  re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
    1329  {
    1330    Idx i;
    1331    if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
    1332      return false;
    1333    for (i = set1->nelem ; --i >= 0 ; )
    1334      if (set1->elems[i] != set2->elems[i])
    1335        return false;
    1336    return true;
    1337  }
    1338  
    1339  /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
    1340  
    1341  static Idx
    1342  __attribute__ ((pure))
    1343  re_node_set_contains (const re_node_set *set, Idx elem)
    1344  {
    1345    __re_size_t idx, right, mid;
    1346    if (set->nelem <= 0)
    1347      return 0;
    1348  
    1349    /* Binary search the element.  */
    1350    idx = 0;
    1351    right = set->nelem - 1;
    1352    while (idx < right)
    1353      {
    1354        mid = (idx + right) / 2;
    1355        if (set->elems[mid] < elem)
    1356  	idx = mid + 1;
    1357        else
    1358  	right = mid;
    1359      }
    1360    return set->elems[idx] == elem ? idx + 1 : 0;
    1361  }
    1362  
    1363  static void
    1364  re_node_set_remove_at (re_node_set *set, Idx idx)
    1365  {
    1366    if (idx < 0 || idx >= set->nelem)
    1367      return;
    1368    --set->nelem;
    1369    for (; idx < set->nelem; idx++)
    1370      set->elems[idx] = set->elems[idx + 1];
    1371  }
    1372  
    1373  
    1374  /* Add the token TOKEN to dfa->nodes, and return the index of the token.
    1375     Or return -1 if an error occurred.  */
    1376  
    1377  static Idx
    1378  re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
    1379  {
    1380    if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
    1381      {
    1382        size_t new_nodes_alloc = dfa->nodes_alloc * 2;
    1383        Idx *new_nexts, *new_indices;
    1384        re_node_set *new_edests, *new_eclosures;
    1385        re_token_t *new_nodes;
    1386  
    1387        /* Avoid overflows in realloc.  */
    1388        const size_t max_object_size = MAX (sizeof (re_token_t),
    1389  					  MAX (sizeof (re_node_set),
    1390  					       sizeof (Idx)));
    1391        if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
    1392  			    < new_nodes_alloc))
    1393  	return -1;
    1394  
    1395        new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
    1396        if (__glibc_unlikely (new_nodes == NULL))
    1397  	return -1;
    1398        dfa->nodes = new_nodes;
    1399        dfa->nodes_alloc = new_nodes_alloc;
    1400        new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
    1401        if (new_nexts != NULL)
    1402  	dfa->nexts = new_nexts;
    1403        new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
    1404        if (new_indices != NULL)
    1405  	dfa->org_indices = new_indices;
    1406        new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
    1407        if (new_edests != NULL)
    1408  	dfa->edests = new_edests;
    1409        new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
    1410        if (new_eclosures != NULL)
    1411  	dfa->eclosures = new_eclosures;
    1412        if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
    1413  			    || new_edests == NULL || new_eclosures == NULL))
    1414  	return -1;
    1415      }
    1416    dfa->nodes[dfa->nodes_len] = token;
    1417    dfa->nodes[dfa->nodes_len].constraint = 0;
    1418    dfa->nodes[dfa->nodes_len].accept_mb =
    1419      ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
    1420       || token.type == COMPLEX_BRACKET);
    1421    dfa->nexts[dfa->nodes_len] = -1;
    1422    re_node_set_init_empty (dfa->edests + dfa->nodes_len);
    1423    re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
    1424    return dfa->nodes_len++;
    1425  }
    1426  
    1427  static re_hashval_t
    1428  calc_state_hash (const re_node_set *nodes, unsigned int context)
    1429  {
    1430    re_hashval_t hash = nodes->nelem + context;
    1431    Idx i;
    1432    for (i = 0 ; i < nodes->nelem ; i++)
    1433      hash += nodes->elems[i];
    1434    return hash;
    1435  }
    1436  
    1437  /* Search for the state whose node_set is equivalent to NODES.
    1438     Return the pointer to the state, if we found it in the DFA.
    1439     Otherwise create the new one and return it.  In case of an error
    1440     return NULL and set the error code in ERR.
    1441     Note: - We assume NULL as the invalid state, then it is possible that
    1442  	   return value is NULL and ERR is REG_NOERROR.
    1443  	 - We never return non-NULL value in case of any errors, it is for
    1444  	   optimization.  */
    1445  
    1446  static re_dfastate_t *
    1447  __attribute_warn_unused_result__
    1448  re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
    1449  		  const re_node_set *nodes)
    1450  {
    1451    re_hashval_t hash;
    1452    re_dfastate_t *new_state;
    1453    struct re_state_table_entry *spot;
    1454    Idx i;
    1455  #if defined GCC_LINT || defined lint
    1456    /* Suppress bogus uninitialized-variable warnings.  */
    1457    *err = REG_NOERROR;
    1458  #endif
    1459    if (__glibc_unlikely (nodes->nelem == 0))
    1460      {
    1461        *err = REG_NOERROR;
    1462        return NULL;
    1463      }
    1464    hash = calc_state_hash (nodes, 0);
    1465    spot = dfa->state_table + (hash & dfa->state_hash_mask);
    1466  
    1467    for (i = 0 ; i < spot->num ; i++)
    1468      {
    1469        re_dfastate_t *state = spot->array[i];
    1470        if (hash != state->hash)
    1471  	continue;
    1472        if (re_node_set_compare (&state->nodes, nodes))
    1473  	return state;
    1474      }
    1475  
    1476    /* There are no appropriate state in the dfa, create the new one.  */
    1477    new_state = create_ci_newstate (dfa, nodes, hash);
    1478    if (__glibc_unlikely (new_state == NULL))
    1479      *err = REG_ESPACE;
    1480  
    1481    return new_state;
    1482  }
    1483  
    1484  /* Search for the state whose node_set is equivalent to NODES and
    1485     whose context is equivalent to CONTEXT.
    1486     Return the pointer to the state, if we found it in the DFA.
    1487     Otherwise create the new one and return it.  In case of an error
    1488     return NULL and set the error code in ERR.
    1489     Note: - We assume NULL as the invalid state, then it is possible that
    1490  	   return value is NULL and ERR is REG_NOERROR.
    1491  	 - We never return non-NULL value in case of any errors, it is for
    1492  	   optimization.  */
    1493  
    1494  static re_dfastate_t *
    1495  __attribute_warn_unused_result__
    1496  re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
    1497  			  const re_node_set *nodes, unsigned int context)
    1498  {
    1499    re_hashval_t hash;
    1500    re_dfastate_t *new_state;
    1501    struct re_state_table_entry *spot;
    1502    Idx i;
    1503  #if defined GCC_LINT || defined lint
    1504    /* Suppress bogus uninitialized-variable warnings.  */
    1505    *err = REG_NOERROR;
    1506  #endif
    1507    if (nodes->nelem == 0)
    1508      {
    1509        *err = REG_NOERROR;
    1510        return NULL;
    1511      }
    1512    hash = calc_state_hash (nodes, context);
    1513    spot = dfa->state_table + (hash & dfa->state_hash_mask);
    1514  
    1515    for (i = 0 ; i < spot->num ; i++)
    1516      {
    1517        re_dfastate_t *state = spot->array[i];
    1518        if (state->hash == hash
    1519  	  && state->context == context
    1520  	  && re_node_set_compare (state->entrance_nodes, nodes))
    1521  	return state;
    1522      }
    1523    /* There are no appropriate state in 'dfa', create the new one.  */
    1524    new_state = create_cd_newstate (dfa, nodes, context, hash);
    1525    if (__glibc_unlikely (new_state == NULL))
    1526      *err = REG_ESPACE;
    1527  
    1528    return new_state;
    1529  }
    1530  
    1531  /* Finish initialization of the new state NEWSTATE, and using its hash value
    1532     HASH put in the appropriate bucket of DFA's state table.  Return value
    1533     indicates the error code if failed.  */
    1534  
    1535  static reg_errcode_t
    1536  __attribute_warn_unused_result__
    1537  register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
    1538  		re_hashval_t hash)
    1539  {
    1540    struct re_state_table_entry *spot;
    1541    reg_errcode_t err;
    1542    Idx i;
    1543  
    1544    newstate->hash = hash;
    1545    err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
    1546    if (__glibc_unlikely (err != REG_NOERROR))
    1547      return REG_ESPACE;
    1548    for (i = 0; i < newstate->nodes.nelem; i++)
    1549      {
    1550        Idx elem = newstate->nodes.elems[i];
    1551        if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
    1552  	if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
    1553  	  return REG_ESPACE;
    1554      }
    1555  
    1556    spot = dfa->state_table + (hash & dfa->state_hash_mask);
    1557    if (__glibc_unlikely (spot->alloc <= spot->num))
    1558      {
    1559        Idx new_alloc = 2 * spot->num + 2;
    1560        re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
    1561  					      new_alloc);
    1562        if (__glibc_unlikely (new_array == NULL))
    1563  	return REG_ESPACE;
    1564        spot->array = new_array;
    1565        spot->alloc = new_alloc;
    1566      }
    1567    spot->array[spot->num++] = newstate;
    1568    return REG_NOERROR;
    1569  }
    1570  
    1571  static void
    1572  free_state (re_dfastate_t *state)
    1573  {
    1574    re_node_set_free (&state->non_eps_nodes);
    1575    re_node_set_free (&state->inveclosure);
    1576    if (state->entrance_nodes != &state->nodes)
    1577      {
    1578        re_node_set_free (state->entrance_nodes);
    1579        re_free (state->entrance_nodes);
    1580      }
    1581    re_node_set_free (&state->nodes);
    1582    re_free (state->word_trtable);
    1583    re_free (state->trtable);
    1584    re_free (state);
    1585  }
    1586  
    1587  /* Create the new state which is independent of contexts.
    1588     Return the new state if succeeded, otherwise return NULL.  */
    1589  
    1590  static re_dfastate_t *
    1591  __attribute_warn_unused_result__
    1592  create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
    1593  		    re_hashval_t hash)
    1594  {
    1595    Idx i;
    1596    reg_errcode_t err;
    1597    re_dfastate_t *newstate;
    1598  
    1599    newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
    1600    if (__glibc_unlikely (newstate == NULL))
    1601      return NULL;
    1602    err = re_node_set_init_copy (&newstate->nodes, nodes);
    1603    if (__glibc_unlikely (err != REG_NOERROR))
    1604      {
    1605        re_free (newstate);
    1606        return NULL;
    1607      }
    1608  
    1609    newstate->entrance_nodes = &newstate->nodes;
    1610    for (i = 0 ; i < nodes->nelem ; i++)
    1611      {
    1612        re_token_t *node = dfa->nodes + nodes->elems[i];
    1613        re_token_type_t type = node->type;
    1614        if (type == CHARACTER && !node->constraint)
    1615  	continue;
    1616        newstate->accept_mb |= node->accept_mb;
    1617  
    1618        /* If the state has the halt node, the state is a halt state.  */
    1619        if (type == END_OF_RE)
    1620  	newstate->halt = 1;
    1621        else if (type == OP_BACK_REF)
    1622  	newstate->has_backref = 1;
    1623        else if (type == ANCHOR || node->constraint)
    1624  	newstate->has_constraint = 1;
    1625      }
    1626    err = register_state (dfa, newstate, hash);
    1627    if (__glibc_unlikely (err != REG_NOERROR))
    1628      {
    1629        free_state (newstate);
    1630        newstate = NULL;
    1631      }
    1632    return newstate;
    1633  }
    1634  
    1635  /* Create the new state which is depend on the context CONTEXT.
    1636     Return the new state if succeeded, otherwise return NULL.  */
    1637  
    1638  static re_dfastate_t *
    1639  __attribute_warn_unused_result__
    1640  create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
    1641  		    unsigned int context, re_hashval_t hash)
    1642  {
    1643    Idx i, nctx_nodes = 0;
    1644    reg_errcode_t err;
    1645    re_dfastate_t *newstate;
    1646  
    1647    newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
    1648    if (__glibc_unlikely (newstate == NULL))
    1649      return NULL;
    1650    err = re_node_set_init_copy (&newstate->nodes, nodes);
    1651    if (__glibc_unlikely (err != REG_NOERROR))
    1652      {
    1653        re_free (newstate);
    1654        return NULL;
    1655      }
    1656  
    1657    newstate->context = context;
    1658    newstate->entrance_nodes = &newstate->nodes;
    1659  
    1660    for (i = 0 ; i < nodes->nelem ; i++)
    1661      {
    1662        re_token_t *node = dfa->nodes + nodes->elems[i];
    1663        re_token_type_t type = node->type;
    1664        unsigned int constraint = node->constraint;
    1665  
    1666        if (type == CHARACTER && !constraint)
    1667  	continue;
    1668        newstate->accept_mb |= node->accept_mb;
    1669  
    1670        /* If the state has the halt node, the state is a halt state.  */
    1671        if (type == END_OF_RE)
    1672  	newstate->halt = 1;
    1673        else if (type == OP_BACK_REF)
    1674  	newstate->has_backref = 1;
    1675  
    1676        if (constraint)
    1677  	{
    1678  	  if (newstate->entrance_nodes == &newstate->nodes)
    1679  	    {
    1680  	      re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
    1681  	      if (__glibc_unlikely (entrance_nodes == NULL))
    1682  		{
    1683  		  free_state (newstate);
    1684  		  return NULL;
    1685  		}
    1686  	      newstate->entrance_nodes = entrance_nodes;
    1687  	      if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
    1688  		  != REG_NOERROR)
    1689  		{
    1690  		  free_state (newstate);
    1691  		  return NULL;
    1692  		}
    1693  	      nctx_nodes = 0;
    1694  	      newstate->has_constraint = 1;
    1695  	    }
    1696  
    1697  	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
    1698  	    {
    1699  	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
    1700  	      ++nctx_nodes;
    1701  	    }
    1702  	}
    1703      }
    1704    err = register_state (dfa, newstate, hash);
    1705    if (__glibc_unlikely (err != REG_NOERROR))
    1706      {
    1707        free_state (newstate);
    1708        newstate = NULL;
    1709      }
    1710    return  newstate;
    1711  }