1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20 static void re_string_construct_common (const char *str, Idx len,
21 re_string_t *pstr,
22 RE_TRANSLATE_TYPE trans, bool icase,
23 const re_dfa_t *dfa);
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 re_hashval_t hash);
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
29 unsigned int context,
30 re_hashval_t hash);
31 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32 Idx new_buf_len);
33 static void build_wcs_buffer (re_string_t *pstr);
34 static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
35 static void build_upper_buffer (re_string_t *pstr);
36 static void re_string_translate_buffer (re_string_t *pstr);
37 static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
38 int eflags) __attribute__ ((pure));
39
40 /* Functions for string operation. */
41
42 /* This function allocate the buffers. It is necessary to call
43 re_string_reconstruct before using the object. */
44
45 static reg_errcode_t
46 __attribute_warn_unused_result__
47 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
48 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
49 {
50 reg_errcode_t ret;
51 Idx init_buf_len;
52
53 /* Ensure at least one character fits into the buffers. */
54 if (init_len < dfa->mb_cur_max)
55 init_len = dfa->mb_cur_max;
56 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
57 re_string_construct_common (str, len, pstr, trans, icase, dfa);
58
59 ret = re_string_realloc_buffers (pstr, init_buf_len);
60 if (__glibc_unlikely (ret != REG_NOERROR))
61 return ret;
62
63 pstr->word_char = dfa->word_char;
64 pstr->word_ops_used = dfa->word_ops_used;
65 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
66 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
67 pstr->valid_raw_len = pstr->valid_len;
68 return REG_NOERROR;
69 }
70
71 /* This function allocate the buffers, and initialize them. */
72
73 static reg_errcode_t
74 __attribute_warn_unused_result__
75 re_string_construct (re_string_t *pstr, const char *str, Idx len,
76 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
77 {
78 reg_errcode_t ret;
79 memset (pstr, '\0', sizeof (re_string_t));
80 re_string_construct_common (str, len, pstr, trans, icase, dfa);
81
82 if (len > 0)
83 {
84 ret = re_string_realloc_buffers (pstr, len + 1);
85 if (__glibc_unlikely (ret != REG_NOERROR))
86 return ret;
87 }
88 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
89
90 if (icase)
91 {
92 if (dfa->mb_cur_max > 1)
93 {
94 while (1)
95 {
96 ret = build_wcs_upper_buffer (pstr);
97 if (__glibc_unlikely (ret != REG_NOERROR))
98 return ret;
99 if (pstr->valid_raw_len >= len)
100 break;
101 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
102 break;
103 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
104 if (__glibc_unlikely (ret != REG_NOERROR))
105 return ret;
106 }
107 }
108 else
109 build_upper_buffer (pstr);
110 }
111 else
112 {
113 if (dfa->mb_cur_max > 1)
114 build_wcs_buffer (pstr);
115 else
116 {
117 if (trans != NULL)
118 re_string_translate_buffer (pstr);
119 else
120 {
121 pstr->valid_len = pstr->bufs_len;
122 pstr->valid_raw_len = pstr->bufs_len;
123 }
124 }
125 }
126
127 return REG_NOERROR;
128 }
129
130 /* Helper functions for re_string_allocate, and re_string_construct. */
131
132 static reg_errcode_t
133 __attribute_warn_unused_result__
134 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
135 {
136 if (pstr->mb_cur_max > 1)
137 {
138 wint_t *new_wcs;
139
140 /* Avoid overflow in realloc. */
141 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
142 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
143 < new_buf_len))
144 return REG_ESPACE;
145
146 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
147 if (__glibc_unlikely (new_wcs == NULL))
148 return REG_ESPACE;
149 pstr->wcs = new_wcs;
150 if (pstr->offsets != NULL)
151 {
152 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
153 if (__glibc_unlikely (new_offsets == NULL))
154 return REG_ESPACE;
155 pstr->offsets = new_offsets;
156 }
157 }
158 if (pstr->mbs_allocated)
159 {
160 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
161 new_buf_len);
162 if (__glibc_unlikely (new_mbs == NULL))
163 return REG_ESPACE;
164 pstr->mbs = new_mbs;
165 }
166 pstr->bufs_len = new_buf_len;
167 return REG_NOERROR;
168 }
169
170
171 static void
172 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
173 RE_TRANSLATE_TYPE trans, bool icase,
174 const re_dfa_t *dfa)
175 {
176 pstr->raw_mbs = (const unsigned char *) str;
177 pstr->len = len;
178 pstr->raw_len = len;
179 pstr->trans = trans;
180 pstr->icase = icase;
181 pstr->mbs_allocated = (trans != NULL || icase);
182 pstr->mb_cur_max = dfa->mb_cur_max;
183 pstr->is_utf8 = dfa->is_utf8;
184 pstr->map_notascii = dfa->map_notascii;
185 pstr->stop = pstr->len;
186 pstr->raw_stop = pstr->stop;
187 }
188
189
190 /* Build wide character buffer PSTR->WCS.
191 If the byte sequence of the string are:
192 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
193 Then wide character buffer will be:
194 <wc1> , WEOF , <wc2> , WEOF , <wc3>
195 We use WEOF for padding, they indicate that the position isn't
196 a first byte of a multibyte character.
197
198 Note that this function assumes PSTR->VALID_LEN elements are already
199 built and starts from PSTR->VALID_LEN. */
200
201 static void
202 build_wcs_buffer (re_string_t *pstr)
203 {
204 #ifdef _LIBC
205 unsigned char buf[MB_LEN_MAX];
206 DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
207 #else
208 unsigned char buf[64];
209 #endif
210 mbstate_t prev_st;
211 Idx byte_idx, end_idx, remain_len;
212 size_t mbclen;
213
214 /* Build the buffers from pstr->valid_len to either pstr->len or
215 pstr->bufs_len. */
216 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
217 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
218 {
219 wchar_t wc;
220 const char *p;
221
222 remain_len = end_idx - byte_idx;
223 prev_st = pstr->cur_state;
224 /* Apply the translation if we need. */
225 if (__glibc_unlikely (pstr->trans != NULL))
226 {
227 int i, ch;
228
229 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
230 {
231 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
232 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
233 }
234 p = (const char *) buf;
235 }
236 else
237 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
238 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
239 if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
240 || (mbclen == (size_t) -2
241 && pstr->bufs_len >= pstr->len)))
242 {
243 /* We treat these cases as a singlebyte character. */
244 mbclen = 1;
245 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
246 if (__glibc_unlikely (pstr->trans != NULL))
247 wc = pstr->trans[wc];
248 pstr->cur_state = prev_st;
249 }
250 else if (__glibc_unlikely (mbclen == (size_t) -2))
251 {
252 /* The buffer doesn't have enough space, finish to build. */
253 pstr->cur_state = prev_st;
254 break;
255 }
256
257 /* Write wide character and padding. */
258 pstr->wcs[byte_idx++] = wc;
259 /* Write paddings. */
260 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
261 pstr->wcs[byte_idx++] = WEOF;
262 }
263 pstr->valid_len = byte_idx;
264 pstr->valid_raw_len = byte_idx;
265 }
266
267 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
268 but for REG_ICASE. */
269
270 static reg_errcode_t
271 __attribute_warn_unused_result__
272 build_wcs_upper_buffer (re_string_t *pstr)
273 {
274 mbstate_t prev_st;
275 Idx src_idx, byte_idx, end_idx, remain_len;
276 size_t mbclen;
277 #ifdef _LIBC
278 char buf[MB_LEN_MAX];
279 DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
280 #else
281 char buf[64];
282 #endif
283
284 byte_idx = pstr->valid_len;
285 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
286
287 /* The following optimization assumes that ASCII characters can be
288 mapped to wide characters with a simple cast. */
289 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
290 {
291 while (byte_idx < end_idx)
292 {
293 wchar_t wc;
294 unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
295
296 if (isascii (ch) && mbsinit (&pstr->cur_state))
297 {
298 /* The next step uses the assumption that wchar_t is encoded
299 ASCII-safe: all ASCII values can be converted like this. */
300 wchar_t wcu = __towupper (ch);
301 if (isascii (wcu))
302 {
303 pstr->mbs[byte_idx] = wcu;
304 pstr->wcs[byte_idx] = wcu;
305 byte_idx++;
306 continue;
307 }
308 }
309
310 remain_len = end_idx - byte_idx;
311 prev_st = pstr->cur_state;
312 mbclen = __mbrtowc (&wc,
313 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
314 + byte_idx), remain_len, &pstr->cur_state);
315 if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
316 {
317 wchar_t wcu = __towupper (wc);
318 if (wcu != wc)
319 {
320 size_t mbcdlen;
321
322 mbcdlen = __wcrtomb (buf, wcu, &prev_st);
323 if (__glibc_likely (mbclen == mbcdlen))
324 memcpy (pstr->mbs + byte_idx, buf, mbclen);
325 else
326 {
327 src_idx = byte_idx;
328 goto offsets_needed;
329 }
330 }
331 else
332 memcpy (pstr->mbs + byte_idx,
333 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
334 pstr->wcs[byte_idx++] = wcu;
335 /* Write paddings. */
336 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
337 pstr->wcs[byte_idx++] = WEOF;
338 }
339 else if (mbclen == (size_t) -1 || mbclen == 0
340 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
341 {
342 /* It is an invalid character, an incomplete character
343 at the end of the string, or '\0'. Just use the byte. */
344 pstr->mbs[byte_idx] = ch;
345 /* And also cast it to wide char. */
346 pstr->wcs[byte_idx++] = (wchar_t) ch;
347 if (__glibc_unlikely (mbclen == (size_t) -1))
348 pstr->cur_state = prev_st;
349 }
350 else
351 {
352 /* The buffer doesn't have enough space, finish to build. */
353 pstr->cur_state = prev_st;
354 break;
355 }
356 }
357 pstr->valid_len = byte_idx;
358 pstr->valid_raw_len = byte_idx;
359 return REG_NOERROR;
360 }
361 else
362 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
363 {
364 wchar_t wc;
365 const char *p;
366 offsets_needed:
367 remain_len = end_idx - byte_idx;
368 prev_st = pstr->cur_state;
369 if (__glibc_unlikely (pstr->trans != NULL))
370 {
371 int i, ch;
372
373 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
374 {
375 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
376 buf[i] = pstr->trans[ch];
377 }
378 p = (const char *) buf;
379 }
380 else
381 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
382 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
383 if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
384 {
385 wchar_t wcu = __towupper (wc);
386 if (wcu != wc)
387 {
388 size_t mbcdlen;
389
390 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
391 if (__glibc_likely (mbclen == mbcdlen))
392 memcpy (pstr->mbs + byte_idx, buf, mbclen);
393 else if (mbcdlen != (size_t) -1)
394 {
395 size_t i;
396
397 if (byte_idx + mbcdlen > pstr->bufs_len)
398 {
399 pstr->cur_state = prev_st;
400 break;
401 }
402
403 if (pstr->offsets == NULL)
404 {
405 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
406
407 if (pstr->offsets == NULL)
408 return REG_ESPACE;
409 }
410 if (!pstr->offsets_needed)
411 {
412 for (i = 0; i < (size_t) byte_idx; ++i)
413 pstr->offsets[i] = i;
414 pstr->offsets_needed = 1;
415 }
416
417 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
418 pstr->wcs[byte_idx] = wcu;
419 pstr->offsets[byte_idx] = src_idx;
420 for (i = 1; i < mbcdlen; ++i)
421 {
422 pstr->offsets[byte_idx + i]
423 = src_idx + (i < mbclen ? i : mbclen - 1);
424 pstr->wcs[byte_idx + i] = WEOF;
425 }
426 pstr->len += mbcdlen - mbclen;
427 if (pstr->raw_stop > src_idx)
428 pstr->stop += mbcdlen - mbclen;
429 end_idx = (pstr->bufs_len > pstr->len)
430 ? pstr->len : pstr->bufs_len;
431 byte_idx += mbcdlen;
432 src_idx += mbclen;
433 continue;
434 }
435 else
436 memcpy (pstr->mbs + byte_idx, p, mbclen);
437 }
438 else
439 memcpy (pstr->mbs + byte_idx, p, mbclen);
440
441 if (__glibc_unlikely (pstr->offsets_needed != 0))
442 {
443 size_t i;
444 for (i = 0; i < mbclen; ++i)
445 pstr->offsets[byte_idx + i] = src_idx + i;
446 }
447 src_idx += mbclen;
448
449 pstr->wcs[byte_idx++] = wcu;
450 /* Write paddings. */
451 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
452 pstr->wcs[byte_idx++] = WEOF;
453 }
454 else if (mbclen == (size_t) -1 || mbclen == 0
455 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
456 {
457 /* It is an invalid character or '\0'. Just use the byte. */
458 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
459
460 if (__glibc_unlikely (pstr->trans != NULL))
461 ch = pstr->trans [ch];
462 pstr->mbs[byte_idx] = ch;
463
464 if (__glibc_unlikely (pstr->offsets_needed != 0))
465 pstr->offsets[byte_idx] = src_idx;
466 ++src_idx;
467
468 /* And also cast it to wide char. */
469 pstr->wcs[byte_idx++] = (wchar_t) ch;
470 if (__glibc_unlikely (mbclen == (size_t) -1))
471 pstr->cur_state = prev_st;
472 }
473 else
474 {
475 /* The buffer doesn't have enough space, finish to build. */
476 pstr->cur_state = prev_st;
477 break;
478 }
479 }
480 pstr->valid_len = byte_idx;
481 pstr->valid_raw_len = src_idx;
482 return REG_NOERROR;
483 }
484
485 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
486 Return the index. */
487
488 static Idx
489 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
490 {
491 mbstate_t prev_st;
492 Idx rawbuf_idx;
493 size_t mbclen;
494 wint_t wc = WEOF;
495
496 /* Skip the characters which are not necessary to check. */
497 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
498 rawbuf_idx < new_raw_idx;)
499 {
500 wchar_t wc2;
501 Idx remain_len = pstr->raw_len - rawbuf_idx;
502 prev_st = pstr->cur_state;
503 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
504 remain_len, &pstr->cur_state);
505 if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
506 || mbclen == 0))
507 {
508 /* We treat these cases as a single byte character. */
509 if (mbclen == 0 || remain_len == 0)
510 wc = L'\0';
511 else
512 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
513 mbclen = 1;
514 pstr->cur_state = prev_st;
515 }
516 else
517 wc = wc2;
518 /* Then proceed the next character. */
519 rawbuf_idx += mbclen;
520 }
521 *last_wc = wc;
522 return rawbuf_idx;
523 }
524
525 /* Build the buffer PSTR->MBS, and apply the translation if we need.
526 This function is used in case of REG_ICASE. */
527
528 static void
529 build_upper_buffer (re_string_t *pstr)
530 {
531 Idx char_idx, end_idx;
532 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
533
534 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
535 {
536 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
537 if (__glibc_unlikely (pstr->trans != NULL))
538 ch = pstr->trans[ch];
539 pstr->mbs[char_idx] = toupper (ch);
540 }
541 pstr->valid_len = char_idx;
542 pstr->valid_raw_len = char_idx;
543 }
544
545 /* Apply TRANS to the buffer in PSTR. */
546
547 static void
548 re_string_translate_buffer (re_string_t *pstr)
549 {
550 Idx buf_idx, end_idx;
551 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
552
553 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
554 {
555 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
556 pstr->mbs[buf_idx] = pstr->trans[ch];
557 }
558
559 pstr->valid_len = buf_idx;
560 pstr->valid_raw_len = buf_idx;
561 }
562
563 /* This function re-construct the buffers.
564 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
565 convert to upper case in case of REG_ICASE, apply translation. */
566
567 static reg_errcode_t
568 __attribute_warn_unused_result__
569 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
570 {
571 Idx offset;
572
573 if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
574 offset = idx - pstr->raw_mbs_idx;
575 else
576 {
577 /* Reset buffer. */
578 if (pstr->mb_cur_max > 1)
579 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
580 pstr->len = pstr->raw_len;
581 pstr->stop = pstr->raw_stop;
582 pstr->valid_len = 0;
583 pstr->raw_mbs_idx = 0;
584 pstr->valid_raw_len = 0;
585 pstr->offsets_needed = 0;
586 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
587 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
588 if (!pstr->mbs_allocated)
589 pstr->mbs = (unsigned char *) pstr->raw_mbs;
590 offset = idx;
591 }
592
593 if (__glibc_likely (offset != 0))
594 {
595 /* Should the already checked characters be kept? */
596 if (__glibc_likely (offset < pstr->valid_raw_len))
597 {
598 /* Yes, move them to the front of the buffer. */
599 if (__glibc_unlikely (pstr->offsets_needed))
600 {
601 Idx low = 0, high = pstr->valid_len, mid;
602 do
603 {
604 mid = (high + low) / 2;
605 if (pstr->offsets[mid] > offset)
606 high = mid;
607 else if (pstr->offsets[mid] < offset)
608 low = mid + 1;
609 else
610 break;
611 }
612 while (low < high);
613 if (pstr->offsets[mid] < offset)
614 ++mid;
615 pstr->tip_context = re_string_context_at (pstr, mid - 1,
616 eflags);
617 /* This can be quite complicated, so handle specially
618 only the common and easy case where the character with
619 different length representation of lower and upper
620 case is present at or after offset. */
621 if (pstr->valid_len > offset
622 && mid == offset && pstr->offsets[mid] == offset)
623 {
624 memmove (pstr->wcs, pstr->wcs + offset,
625 (pstr->valid_len - offset) * sizeof (wint_t));
626 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
627 pstr->valid_len -= offset;
628 pstr->valid_raw_len -= offset;
629 for (low = 0; low < pstr->valid_len; low++)
630 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
631 }
632 else
633 {
634 /* Otherwise, just find out how long the partial multibyte
635 character at offset is and fill it with WEOF/255. */
636 pstr->len = pstr->raw_len - idx + offset;
637 pstr->stop = pstr->raw_stop - idx + offset;
638 pstr->offsets_needed = 0;
639 while (mid > 0 && pstr->offsets[mid - 1] == offset)
640 --mid;
641 while (mid < pstr->valid_len)
642 if (pstr->wcs[mid] != WEOF)
643 break;
644 else
645 ++mid;
646 if (mid == pstr->valid_len)
647 pstr->valid_len = 0;
648 else
649 {
650 pstr->valid_len = pstr->offsets[mid] - offset;
651 if (pstr->valid_len)
652 {
653 for (low = 0; low < pstr->valid_len; ++low)
654 pstr->wcs[low] = WEOF;
655 memset (pstr->mbs, 255, pstr->valid_len);
656 }
657 }
658 pstr->valid_raw_len = pstr->valid_len;
659 }
660 }
661 else
662 {
663 pstr->tip_context = re_string_context_at (pstr, offset - 1,
664 eflags);
665 if (pstr->mb_cur_max > 1)
666 memmove (pstr->wcs, pstr->wcs + offset,
667 (pstr->valid_len - offset) * sizeof (wint_t));
668 if (__glibc_unlikely (pstr->mbs_allocated))
669 memmove (pstr->mbs, pstr->mbs + offset,
670 pstr->valid_len - offset);
671 pstr->valid_len -= offset;
672 pstr->valid_raw_len -= offset;
673 DEBUG_ASSERT (pstr->valid_len > 0);
674 }
675 }
676 else
677 {
678 /* No, skip all characters until IDX. */
679 Idx prev_valid_len = pstr->valid_len;
680
681 if (__glibc_unlikely (pstr->offsets_needed))
682 {
683 pstr->len = pstr->raw_len - idx + offset;
684 pstr->stop = pstr->raw_stop - idx + offset;
685 pstr->offsets_needed = 0;
686 }
687 pstr->valid_len = 0;
688 if (pstr->mb_cur_max > 1)
689 {
690 Idx wcs_idx;
691 wint_t wc = WEOF;
692
693 if (pstr->is_utf8)
694 {
695 const unsigned char *raw, *p, *end;
696
697 /* Special case UTF-8. Multi-byte chars start with any
698 byte other than 0x80 - 0xbf. */
699 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
700 end = raw + (offset - pstr->mb_cur_max);
701 if (end < pstr->raw_mbs)
702 end = pstr->raw_mbs;
703 p = raw + offset - 1;
704 #ifdef _LIBC
705 /* We know the wchar_t encoding is UCS4, so for the simple
706 case, ASCII characters, skip the conversion step. */
707 if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
708 {
709 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
710 /* pstr->valid_len = 0; */
711 wc = (wchar_t) *p;
712 }
713 else
714 #endif
715 for (; p >= end; --p)
716 if ((*p & 0xc0) != 0x80)
717 {
718 mbstate_t cur_state;
719 wchar_t wc2;
720 Idx mlen = raw + pstr->len - p;
721 unsigned char buf[6];
722 size_t mbclen;
723
724 const unsigned char *pp = p;
725 if (__glibc_unlikely (pstr->trans != NULL))
726 {
727 int i = mlen < 6 ? mlen : 6;
728 while (--i >= 0)
729 buf[i] = pstr->trans[p[i]];
730 pp = buf;
731 }
732 /* XXX Don't use mbrtowc, we know which conversion
733 to use (UTF-8 -> UCS4). */
734 memset (&cur_state, 0, sizeof (cur_state));
735 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
736 &cur_state);
737 if (raw + offset - p <= mbclen
738 && mbclen < (size_t) -2)
739 {
740 memset (&pstr->cur_state, '\0',
741 sizeof (mbstate_t));
742 pstr->valid_len = mbclen - (raw + offset - p);
743 wc = wc2;
744 }
745 break;
746 }
747 }
748
749 if (wc == WEOF)
750 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
751 if (wc == WEOF)
752 pstr->tip_context
753 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
754 else
755 pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
756 && IS_WIDE_WORD_CHAR (wc))
757 ? CONTEXT_WORD
758 : ((IS_WIDE_NEWLINE (wc)
759 && pstr->newline_anchor)
760 ? CONTEXT_NEWLINE : 0));
761 if (__glibc_unlikely (pstr->valid_len))
762 {
763 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
764 pstr->wcs[wcs_idx] = WEOF;
765 if (pstr->mbs_allocated)
766 memset (pstr->mbs, 255, pstr->valid_len);
767 }
768 pstr->valid_raw_len = pstr->valid_len;
769 }
770 else
771 {
772 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
773 pstr->valid_raw_len = 0;
774 if (pstr->trans)
775 c = pstr->trans[c];
776 pstr->tip_context = (bitset_contain (pstr->word_char, c)
777 ? CONTEXT_WORD
778 : ((IS_NEWLINE (c) && pstr->newline_anchor)
779 ? CONTEXT_NEWLINE : 0));
780 }
781 }
782 if (!__glibc_unlikely (pstr->mbs_allocated))
783 pstr->mbs += offset;
784 }
785 pstr->raw_mbs_idx = idx;
786 pstr->len -= offset;
787 pstr->stop -= offset;
788
789 /* Then build the buffers. */
790 if (pstr->mb_cur_max > 1)
791 {
792 if (pstr->icase)
793 {
794 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
795 if (__glibc_unlikely (ret != REG_NOERROR))
796 return ret;
797 }
798 else
799 build_wcs_buffer (pstr);
800 }
801 else
802 if (__glibc_unlikely (pstr->mbs_allocated))
803 {
804 if (pstr->icase)
805 build_upper_buffer (pstr);
806 else if (pstr->trans != NULL)
807 re_string_translate_buffer (pstr);
808 }
809 else
810 pstr->valid_len = pstr->len;
811
812 pstr->cur_idx = 0;
813 return REG_NOERROR;
814 }
815
816 static unsigned char
817 __attribute__ ((pure))
818 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
819 {
820 int ch;
821 Idx off;
822
823 /* Handle the common (easiest) cases first. */
824 if (__glibc_likely (!pstr->mbs_allocated))
825 return re_string_peek_byte (pstr, idx);
826
827 if (pstr->mb_cur_max > 1
828 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
829 return re_string_peek_byte (pstr, idx);
830
831 off = pstr->cur_idx + idx;
832 if (pstr->offsets_needed)
833 off = pstr->offsets[off];
834
835 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
836
837 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
838 this function returns CAPITAL LETTER I instead of first byte of
839 DOTLESS SMALL LETTER I. The latter would confuse the parser,
840 since peek_byte_case doesn't advance cur_idx in any way. */
841 if (pstr->offsets_needed && !isascii (ch))
842 return re_string_peek_byte (pstr, idx);
843
844 return ch;
845 }
846
847 static unsigned char
848 re_string_fetch_byte_case (re_string_t *pstr)
849 {
850 if (__glibc_likely (!pstr->mbs_allocated))
851 return re_string_fetch_byte (pstr);
852
853 if (pstr->offsets_needed)
854 {
855 Idx off;
856 int ch;
857
858 /* For tr_TR.UTF-8 [[:islower:]] there is
859 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
860 in that case the whole multi-byte character and return
861 the original letter. On the other side, with
862 [[: DOTLESS SMALL LETTER I return [[:I, as doing
863 anything else would complicate things too much. */
864
865 if (!re_string_first_byte (pstr, pstr->cur_idx))
866 return re_string_fetch_byte (pstr);
867
868 off = pstr->offsets[pstr->cur_idx];
869 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
870
871 if (! isascii (ch))
872 return re_string_fetch_byte (pstr);
873
874 re_string_skip_bytes (pstr,
875 re_string_char_size_at (pstr, pstr->cur_idx));
876 return ch;
877 }
878
879 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
880 }
881
882 static void
883 re_string_destruct (re_string_t *pstr)
884 {
885 re_free (pstr->wcs);
886 re_free (pstr->offsets);
887 if (pstr->mbs_allocated)
888 re_free (pstr->mbs);
889 }
890
891 /* Return the context at IDX in INPUT. */
892
893 static unsigned int
894 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
895 {
896 int c;
897 if (__glibc_unlikely (idx < 0))
898 /* In this case, we use the value stored in input->tip_context,
899 since we can't know the character in input->mbs[-1] here. */
900 return input->tip_context;
901 if (__glibc_unlikely (idx == input->len))
902 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
903 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
904 if (input->mb_cur_max > 1)
905 {
906 wint_t wc;
907 Idx wc_idx = idx;
908 while(input->wcs[wc_idx] == WEOF)
909 {
910 DEBUG_ASSERT (wc_idx >= 0);
911 --wc_idx;
912 if (wc_idx < 0)
913 return input->tip_context;
914 }
915 wc = input->wcs[wc_idx];
916 if (__glibc_unlikely (input->word_ops_used != 0)
917 && IS_WIDE_WORD_CHAR (wc))
918 return CONTEXT_WORD;
919 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
920 ? CONTEXT_NEWLINE : 0);
921 }
922 else
923 {
924 c = re_string_byte_at (input, idx);
925 if (bitset_contain (input->word_char, c))
926 return CONTEXT_WORD;
927 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
928 }
929 }
930
931 /* Functions for set operation. */
932
933 static reg_errcode_t
934 __attribute_warn_unused_result__
935 re_node_set_alloc (re_node_set *set, Idx size)
936 {
937 set->alloc = size;
938 set->nelem = 0;
939 set->elems = re_malloc (Idx, size);
940 if (__glibc_unlikely (set->elems == NULL)
941 && (MALLOC_0_IS_NONNULL || size != 0))
942 return REG_ESPACE;
943 return REG_NOERROR;
944 }
945
946 static reg_errcode_t
947 __attribute_warn_unused_result__
948 re_node_set_init_1 (re_node_set *set, Idx elem)
949 {
950 set->alloc = 1;
951 set->nelem = 1;
952 set->elems = re_malloc (Idx, 1);
953 if (__glibc_unlikely (set->elems == NULL))
954 {
955 set->alloc = set->nelem = 0;
956 return REG_ESPACE;
957 }
958 set->elems[0] = elem;
959 return REG_NOERROR;
960 }
961
962 static reg_errcode_t
963 __attribute_warn_unused_result__
964 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
965 {
966 set->alloc = 2;
967 set->elems = re_malloc (Idx, 2);
968 if (__glibc_unlikely (set->elems == NULL))
969 return REG_ESPACE;
970 if (elem1 == elem2)
971 {
972 set->nelem = 1;
973 set->elems[0] = elem1;
974 }
975 else
976 {
977 set->nelem = 2;
978 if (elem1 < elem2)
979 {
980 set->elems[0] = elem1;
981 set->elems[1] = elem2;
982 }
983 else
984 {
985 set->elems[0] = elem2;
986 set->elems[1] = elem1;
987 }
988 }
989 return REG_NOERROR;
990 }
991
992 static reg_errcode_t
993 __attribute_warn_unused_result__
994 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
995 {
996 dest->nelem = src->nelem;
997 if (src->nelem > 0)
998 {
999 dest->alloc = dest->nelem;
1000 dest->elems = re_malloc (Idx, dest->alloc);
1001 if (__glibc_unlikely (dest->elems == NULL))
1002 {
1003 dest->alloc = dest->nelem = 0;
1004 return REG_ESPACE;
1005 }
1006 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1007 }
1008 else
1009 re_node_set_init_empty (dest);
1010 return REG_NOERROR;
1011 }
1012
1013 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1014 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1015 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1016
1017 static reg_errcode_t
1018 __attribute_warn_unused_result__
1019 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1020 const re_node_set *src2)
1021 {
1022 Idx i1, i2, is, id, delta, sbase;
1023 if (src1->nelem == 0 || src2->nelem == 0)
1024 return REG_NOERROR;
1025
1026 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1027 conservative estimate. */
1028 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1029 {
1030 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1031 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1032 if (__glibc_unlikely (new_elems == NULL))
1033 return REG_ESPACE;
1034 dest->elems = new_elems;
1035 dest->alloc = new_alloc;
1036 }
1037
1038 /* Find the items in the intersection of SRC1 and SRC2, and copy
1039 into the top of DEST those that are not already in DEST itself. */
1040 sbase = dest->nelem + src1->nelem + src2->nelem;
1041 i1 = src1->nelem - 1;
1042 i2 = src2->nelem - 1;
1043 id = dest->nelem - 1;
1044 for (;;)
1045 {
1046 if (src1->elems[i1] == src2->elems[i2])
1047 {
1048 /* Try to find the item in DEST. Maybe we could binary search? */
1049 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1050 --id;
1051
1052 if (id < 0 || dest->elems[id] != src1->elems[i1])
1053 dest->elems[--sbase] = src1->elems[i1];
1054
1055 if (--i1 < 0 || --i2 < 0)
1056 break;
1057 }
1058
1059 /* Lower the highest of the two items. */
1060 else if (src1->elems[i1] < src2->elems[i2])
1061 {
1062 if (--i2 < 0)
1063 break;
1064 }
1065 else
1066 {
1067 if (--i1 < 0)
1068 break;
1069 }
1070 }
1071
1072 id = dest->nelem - 1;
1073 is = dest->nelem + src1->nelem + src2->nelem - 1;
1074 delta = is - sbase + 1;
1075
1076 /* Now copy. When DELTA becomes zero, the remaining
1077 DEST elements are already in place; this is more or
1078 less the same loop that is in re_node_set_merge. */
1079 dest->nelem += delta;
1080 if (delta > 0 && id >= 0)
1081 for (;;)
1082 {
1083 if (dest->elems[is] > dest->elems[id])
1084 {
1085 /* Copy from the top. */
1086 dest->elems[id + delta--] = dest->elems[is--];
1087 if (delta == 0)
1088 break;
1089 }
1090 else
1091 {
1092 /* Slide from the bottom. */
1093 dest->elems[id + delta] = dest->elems[id];
1094 if (--id < 0)
1095 break;
1096 }
1097 }
1098
1099 /* Copy remaining SRC elements. */
1100 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1101
1102 return REG_NOERROR;
1103 }
1104
1105 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1106 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1107
1108 static reg_errcode_t
1109 __attribute_warn_unused_result__
1110 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1111 const re_node_set *src2)
1112 {
1113 Idx i1, i2, id;
1114 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1115 {
1116 dest->alloc = src1->nelem + src2->nelem;
1117 dest->elems = re_malloc (Idx, dest->alloc);
1118 if (__glibc_unlikely (dest->elems == NULL))
1119 return REG_ESPACE;
1120 }
1121 else
1122 {
1123 if (src1 != NULL && src1->nelem > 0)
1124 return re_node_set_init_copy (dest, src1);
1125 else if (src2 != NULL && src2->nelem > 0)
1126 return re_node_set_init_copy (dest, src2);
1127 else
1128 re_node_set_init_empty (dest);
1129 return REG_NOERROR;
1130 }
1131 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1132 {
1133 if (src1->elems[i1] > src2->elems[i2])
1134 {
1135 dest->elems[id++] = src2->elems[i2++];
1136 continue;
1137 }
1138 if (src1->elems[i1] == src2->elems[i2])
1139 ++i2;
1140 dest->elems[id++] = src1->elems[i1++];
1141 }
1142 if (i1 < src1->nelem)
1143 {
1144 memcpy (dest->elems + id, src1->elems + i1,
1145 (src1->nelem - i1) * sizeof (Idx));
1146 id += src1->nelem - i1;
1147 }
1148 else if (i2 < src2->nelem)
1149 {
1150 memcpy (dest->elems + id, src2->elems + i2,
1151 (src2->nelem - i2) * sizeof (Idx));
1152 id += src2->nelem - i2;
1153 }
1154 dest->nelem = id;
1155 return REG_NOERROR;
1156 }
1157
1158 /* Calculate the union set of the sets DEST and SRC. And store it to
1159 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1160
1161 static reg_errcode_t
1162 __attribute_warn_unused_result__
1163 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1164 {
1165 Idx is, id, sbase, delta;
1166 if (src == NULL || src->nelem == 0)
1167 return REG_NOERROR;
1168 if (dest->alloc < 2 * src->nelem + dest->nelem)
1169 {
1170 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1171 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1172 if (__glibc_unlikely (new_buffer == NULL))
1173 return REG_ESPACE;
1174 dest->elems = new_buffer;
1175 dest->alloc = new_alloc;
1176 }
1177
1178 if (__glibc_unlikely (dest->nelem == 0))
1179 {
1180 /* Although we already guaranteed above that dest->alloc != 0 and
1181 therefore dest->elems != NULL, add a debug assertion to pacify
1182 GCC 11.2.1's -fanalyzer. */
1183 DEBUG_ASSERT (dest->elems);
1184 dest->nelem = src->nelem;
1185 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1186 return REG_NOERROR;
1187 }
1188
1189 /* Copy into the top of DEST the items of SRC that are not
1190 found in DEST. Maybe we could binary search in DEST? */
1191 for (sbase = dest->nelem + 2 * src->nelem,
1192 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1193 {
1194 if (dest->elems[id] == src->elems[is])
1195 is--, id--;
1196 else if (dest->elems[id] < src->elems[is])
1197 dest->elems[--sbase] = src->elems[is--];
1198 else /* if (dest->elems[id] > src->elems[is]) */
1199 --id;
1200 }
1201
1202 if (is >= 0)
1203 {
1204 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1205 sbase -= is + 1;
1206 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1207 }
1208
1209 id = dest->nelem - 1;
1210 is = dest->nelem + 2 * src->nelem - 1;
1211 delta = is - sbase + 1;
1212 if (delta == 0)
1213 return REG_NOERROR;
1214
1215 /* Now copy. When DELTA becomes zero, the remaining
1216 DEST elements are already in place. */
1217 dest->nelem += delta;
1218 for (;;)
1219 {
1220 if (dest->elems[is] > dest->elems[id])
1221 {
1222 /* Copy from the top. */
1223 dest->elems[id + delta--] = dest->elems[is--];
1224 if (delta == 0)
1225 break;
1226 }
1227 else
1228 {
1229 /* Slide from the bottom. */
1230 dest->elems[id + delta] = dest->elems[id];
1231 if (--id < 0)
1232 {
1233 /* Copy remaining SRC elements. */
1234 memcpy (dest->elems, dest->elems + sbase,
1235 delta * sizeof (Idx));
1236 break;
1237 }
1238 }
1239 }
1240
1241 return REG_NOERROR;
1242 }
1243
1244 /* Insert the new element ELEM to the re_node_set* SET.
1245 SET should not already have ELEM.
1246 Return true if successful. */
1247
1248 static bool
1249 __attribute_warn_unused_result__
1250 re_node_set_insert (re_node_set *set, Idx elem)
1251 {
1252 Idx idx;
1253 /* In case the set is empty. */
1254 if (set->alloc == 0)
1255 return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1256
1257 if (__glibc_unlikely (set->nelem) == 0)
1258 {
1259 /* Although we already guaranteed above that set->alloc != 0 and
1260 therefore set->elems != NULL, add a debug assertion to pacify
1261 GCC 11.2 -fanalyzer. */
1262 DEBUG_ASSERT (set->elems);
1263 set->elems[0] = elem;
1264 ++set->nelem;
1265 return true;
1266 }
1267
1268 /* Realloc if we need. */
1269 if (set->alloc == set->nelem)
1270 {
1271 Idx *new_elems;
1272 set->alloc = set->alloc * 2;
1273 new_elems = re_realloc (set->elems, Idx, set->alloc);
1274 if (__glibc_unlikely (new_elems == NULL))
1275 return false;
1276 set->elems = new_elems;
1277 }
1278
1279 /* Move the elements which follows the new element. Test the
1280 first element separately to skip a check in the inner loop. */
1281 if (elem < set->elems[0])
1282 {
1283 for (idx = set->nelem; idx > 0; idx--)
1284 set->elems[idx] = set->elems[idx - 1];
1285 }
1286 else
1287 {
1288 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1289 set->elems[idx] = set->elems[idx - 1];
1290 DEBUG_ASSERT (set->elems[idx - 1] < elem);
1291 }
1292
1293 /* Insert the new element. */
1294 set->elems[idx] = elem;
1295 ++set->nelem;
1296 return true;
1297 }
1298
1299 /* Insert the new element ELEM to the re_node_set* SET.
1300 SET should not already have any element greater than or equal to ELEM.
1301 Return true if successful. */
1302
1303 static bool
1304 __attribute_warn_unused_result__
1305 re_node_set_insert_last (re_node_set *set, Idx elem)
1306 {
1307 /* Realloc if we need. */
1308 if (set->alloc == set->nelem)
1309 {
1310 Idx *new_elems;
1311 set->alloc = (set->alloc + 1) * 2;
1312 new_elems = re_realloc (set->elems, Idx, set->alloc);
1313 if (__glibc_unlikely (new_elems == NULL))
1314 return false;
1315 set->elems = new_elems;
1316 }
1317
1318 /* Insert the new element. */
1319 set->elems[set->nelem++] = elem;
1320 return true;
1321 }
1322
1323 /* Compare two node sets SET1 and SET2.
1324 Return true if SET1 and SET2 are equivalent. */
1325
1326 static bool
1327 __attribute__ ((pure))
1328 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1329 {
1330 Idx i;
1331 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1332 return false;
1333 for (i = set1->nelem ; --i >= 0 ; )
1334 if (set1->elems[i] != set2->elems[i])
1335 return false;
1336 return true;
1337 }
1338
1339 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1340
1341 static Idx
1342 __attribute__ ((pure))
1343 re_node_set_contains (const re_node_set *set, Idx elem)
1344 {
1345 __re_size_t idx, right, mid;
1346 if (set->nelem <= 0)
1347 return 0;
1348
1349 /* Binary search the element. */
1350 idx = 0;
1351 right = set->nelem - 1;
1352 while (idx < right)
1353 {
1354 mid = (idx + right) / 2;
1355 if (set->elems[mid] < elem)
1356 idx = mid + 1;
1357 else
1358 right = mid;
1359 }
1360 return set->elems[idx] == elem ? idx + 1 : 0;
1361 }
1362
1363 static void
1364 re_node_set_remove_at (re_node_set *set, Idx idx)
1365 {
1366 if (idx < 0 || idx >= set->nelem)
1367 return;
1368 --set->nelem;
1369 for (; idx < set->nelem; idx++)
1370 set->elems[idx] = set->elems[idx + 1];
1371 }
1372
1373
1374 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1375 Or return -1 if an error occurred. */
1376
1377 static Idx
1378 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1379 {
1380 if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1381 {
1382 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1383 Idx *new_nexts, *new_indices;
1384 re_node_set *new_edests, *new_eclosures;
1385 re_token_t *new_nodes;
1386
1387 /* Avoid overflows in realloc. */
1388 const size_t max_object_size = MAX (sizeof (re_token_t),
1389 MAX (sizeof (re_node_set),
1390 sizeof (Idx)));
1391 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1392 < new_nodes_alloc))
1393 return -1;
1394
1395 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1396 if (__glibc_unlikely (new_nodes == NULL))
1397 return -1;
1398 dfa->nodes = new_nodes;
1399 dfa->nodes_alloc = new_nodes_alloc;
1400 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1401 if (new_nexts != NULL)
1402 dfa->nexts = new_nexts;
1403 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1404 if (new_indices != NULL)
1405 dfa->org_indices = new_indices;
1406 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1407 if (new_edests != NULL)
1408 dfa->edests = new_edests;
1409 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1410 if (new_eclosures != NULL)
1411 dfa->eclosures = new_eclosures;
1412 if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1413 || new_edests == NULL || new_eclosures == NULL))
1414 return -1;
1415 }
1416 dfa->nodes[dfa->nodes_len] = token;
1417 dfa->nodes[dfa->nodes_len].constraint = 0;
1418 dfa->nodes[dfa->nodes_len].accept_mb =
1419 ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1420 || token.type == COMPLEX_BRACKET);
1421 dfa->nexts[dfa->nodes_len] = -1;
1422 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1423 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1424 return dfa->nodes_len++;
1425 }
1426
1427 static re_hashval_t
1428 calc_state_hash (const re_node_set *nodes, unsigned int context)
1429 {
1430 re_hashval_t hash = nodes->nelem + context;
1431 Idx i;
1432 for (i = 0 ; i < nodes->nelem ; i++)
1433 hash += nodes->elems[i];
1434 return hash;
1435 }
1436
1437 /* Search for the state whose node_set is equivalent to NODES.
1438 Return the pointer to the state, if we found it in the DFA.
1439 Otherwise create the new one and return it. In case of an error
1440 return NULL and set the error code in ERR.
1441 Note: - We assume NULL as the invalid state, then it is possible that
1442 return value is NULL and ERR is REG_NOERROR.
1443 - We never return non-NULL value in case of any errors, it is for
1444 optimization. */
1445
1446 static re_dfastate_t *
1447 __attribute_warn_unused_result__
1448 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1449 const re_node_set *nodes)
1450 {
1451 re_hashval_t hash;
1452 re_dfastate_t *new_state;
1453 struct re_state_table_entry *spot;
1454 Idx i;
1455 #if defined GCC_LINT || defined lint
1456 /* Suppress bogus uninitialized-variable warnings. */
1457 *err = REG_NOERROR;
1458 #endif
1459 if (__glibc_unlikely (nodes->nelem == 0))
1460 {
1461 *err = REG_NOERROR;
1462 return NULL;
1463 }
1464 hash = calc_state_hash (nodes, 0);
1465 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1466
1467 for (i = 0 ; i < spot->num ; i++)
1468 {
1469 re_dfastate_t *state = spot->array[i];
1470 if (hash != state->hash)
1471 continue;
1472 if (re_node_set_compare (&state->nodes, nodes))
1473 return state;
1474 }
1475
1476 /* There are no appropriate state in the dfa, create the new one. */
1477 new_state = create_ci_newstate (dfa, nodes, hash);
1478 if (__glibc_unlikely (new_state == NULL))
1479 *err = REG_ESPACE;
1480
1481 return new_state;
1482 }
1483
1484 /* Search for the state whose node_set is equivalent to NODES and
1485 whose context is equivalent to CONTEXT.
1486 Return the pointer to the state, if we found it in the DFA.
1487 Otherwise create the new one and return it. In case of an error
1488 return NULL and set the error code in ERR.
1489 Note: - We assume NULL as the invalid state, then it is possible that
1490 return value is NULL and ERR is REG_NOERROR.
1491 - We never return non-NULL value in case of any errors, it is for
1492 optimization. */
1493
1494 static re_dfastate_t *
1495 __attribute_warn_unused_result__
1496 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1497 const re_node_set *nodes, unsigned int context)
1498 {
1499 re_hashval_t hash;
1500 re_dfastate_t *new_state;
1501 struct re_state_table_entry *spot;
1502 Idx i;
1503 #if defined GCC_LINT || defined lint
1504 /* Suppress bogus uninitialized-variable warnings. */
1505 *err = REG_NOERROR;
1506 #endif
1507 if (nodes->nelem == 0)
1508 {
1509 *err = REG_NOERROR;
1510 return NULL;
1511 }
1512 hash = calc_state_hash (nodes, context);
1513 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1514
1515 for (i = 0 ; i < spot->num ; i++)
1516 {
1517 re_dfastate_t *state = spot->array[i];
1518 if (state->hash == hash
1519 && state->context == context
1520 && re_node_set_compare (state->entrance_nodes, nodes))
1521 return state;
1522 }
1523 /* There are no appropriate state in 'dfa', create the new one. */
1524 new_state = create_cd_newstate (dfa, nodes, context, hash);
1525 if (__glibc_unlikely (new_state == NULL))
1526 *err = REG_ESPACE;
1527
1528 return new_state;
1529 }
1530
1531 /* Finish initialization of the new state NEWSTATE, and using its hash value
1532 HASH put in the appropriate bucket of DFA's state table. Return value
1533 indicates the error code if failed. */
1534
1535 static reg_errcode_t
1536 __attribute_warn_unused_result__
1537 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1538 re_hashval_t hash)
1539 {
1540 struct re_state_table_entry *spot;
1541 reg_errcode_t err;
1542 Idx i;
1543
1544 newstate->hash = hash;
1545 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1546 if (__glibc_unlikely (err != REG_NOERROR))
1547 return REG_ESPACE;
1548 for (i = 0; i < newstate->nodes.nelem; i++)
1549 {
1550 Idx elem = newstate->nodes.elems[i];
1551 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1552 if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1553 return REG_ESPACE;
1554 }
1555
1556 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1557 if (__glibc_unlikely (spot->alloc <= spot->num))
1558 {
1559 Idx new_alloc = 2 * spot->num + 2;
1560 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1561 new_alloc);
1562 if (__glibc_unlikely (new_array == NULL))
1563 return REG_ESPACE;
1564 spot->array = new_array;
1565 spot->alloc = new_alloc;
1566 }
1567 spot->array[spot->num++] = newstate;
1568 return REG_NOERROR;
1569 }
1570
1571 static void
1572 free_state (re_dfastate_t *state)
1573 {
1574 re_node_set_free (&state->non_eps_nodes);
1575 re_node_set_free (&state->inveclosure);
1576 if (state->entrance_nodes != &state->nodes)
1577 {
1578 re_node_set_free (state->entrance_nodes);
1579 re_free (state->entrance_nodes);
1580 }
1581 re_node_set_free (&state->nodes);
1582 re_free (state->word_trtable);
1583 re_free (state->trtable);
1584 re_free (state);
1585 }
1586
1587 /* Create the new state which is independent of contexts.
1588 Return the new state if succeeded, otherwise return NULL. */
1589
1590 static re_dfastate_t *
1591 __attribute_warn_unused_result__
1592 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1593 re_hashval_t hash)
1594 {
1595 Idx i;
1596 reg_errcode_t err;
1597 re_dfastate_t *newstate;
1598
1599 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1600 if (__glibc_unlikely (newstate == NULL))
1601 return NULL;
1602 err = re_node_set_init_copy (&newstate->nodes, nodes);
1603 if (__glibc_unlikely (err != REG_NOERROR))
1604 {
1605 re_free (newstate);
1606 return NULL;
1607 }
1608
1609 newstate->entrance_nodes = &newstate->nodes;
1610 for (i = 0 ; i < nodes->nelem ; i++)
1611 {
1612 re_token_t *node = dfa->nodes + nodes->elems[i];
1613 re_token_type_t type = node->type;
1614 if (type == CHARACTER && !node->constraint)
1615 continue;
1616 newstate->accept_mb |= node->accept_mb;
1617
1618 /* If the state has the halt node, the state is a halt state. */
1619 if (type == END_OF_RE)
1620 newstate->halt = 1;
1621 else if (type == OP_BACK_REF)
1622 newstate->has_backref = 1;
1623 else if (type == ANCHOR || node->constraint)
1624 newstate->has_constraint = 1;
1625 }
1626 err = register_state (dfa, newstate, hash);
1627 if (__glibc_unlikely (err != REG_NOERROR))
1628 {
1629 free_state (newstate);
1630 newstate = NULL;
1631 }
1632 return newstate;
1633 }
1634
1635 /* Create the new state which is depend on the context CONTEXT.
1636 Return the new state if succeeded, otherwise return NULL. */
1637
1638 static re_dfastate_t *
1639 __attribute_warn_unused_result__
1640 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1641 unsigned int context, re_hashval_t hash)
1642 {
1643 Idx i, nctx_nodes = 0;
1644 reg_errcode_t err;
1645 re_dfastate_t *newstate;
1646
1647 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1648 if (__glibc_unlikely (newstate == NULL))
1649 return NULL;
1650 err = re_node_set_init_copy (&newstate->nodes, nodes);
1651 if (__glibc_unlikely (err != REG_NOERROR))
1652 {
1653 re_free (newstate);
1654 return NULL;
1655 }
1656
1657 newstate->context = context;
1658 newstate->entrance_nodes = &newstate->nodes;
1659
1660 for (i = 0 ; i < nodes->nelem ; i++)
1661 {
1662 re_token_t *node = dfa->nodes + nodes->elems[i];
1663 re_token_type_t type = node->type;
1664 unsigned int constraint = node->constraint;
1665
1666 if (type == CHARACTER && !constraint)
1667 continue;
1668 newstate->accept_mb |= node->accept_mb;
1669
1670 /* If the state has the halt node, the state is a halt state. */
1671 if (type == END_OF_RE)
1672 newstate->halt = 1;
1673 else if (type == OP_BACK_REF)
1674 newstate->has_backref = 1;
1675
1676 if (constraint)
1677 {
1678 if (newstate->entrance_nodes == &newstate->nodes)
1679 {
1680 re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
1681 if (__glibc_unlikely (entrance_nodes == NULL))
1682 {
1683 free_state (newstate);
1684 return NULL;
1685 }
1686 newstate->entrance_nodes = entrance_nodes;
1687 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1688 != REG_NOERROR)
1689 {
1690 free_state (newstate);
1691 return NULL;
1692 }
1693 nctx_nodes = 0;
1694 newstate->has_constraint = 1;
1695 }
1696
1697 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1698 {
1699 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1700 ++nctx_nodes;
1701 }
1702 }
1703 }
1704 err = register_state (dfa, newstate, hash);
1705 if (__glibc_unlikely (err != REG_NOERROR))
1706 {
1707 free_state (newstate);
1708 newstate = NULL;
1709 }
1710 return newstate;
1711 }