1 /* pcresearch.c - searching subroutines using PCRE for grep.
2 Copyright 2000, 2007, 2009-2023 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17 02110-1301, USA. */
18
19 #include <config.h>
20
21 #include "search.h"
22 #include "die.h"
23
24 #include <stdckdint.h>
25
26 #define PCRE2_CODE_UNIT_WIDTH 8
27 #include <pcre2.h>
28
29 /* For older PCRE2. */
30 #ifndef PCRE2_SIZE_MAX
31 # define PCRE2_SIZE_MAX SIZE_MAX
32 #endif
33 #ifndef PCRE2_CONFIG_DEPTHLIMIT
34 # define PCRE2_CONFIG_DEPTHLIMIT PCRE2_CONFIG_RECURSIONLIMIT
35 # define PCRE2_ERROR_DEPTHLIMIT PCRE2_ERROR_RECURSIONLIMIT
36 # define pcre2_set_depth_limit pcre2_set_recursion_limit
37 #endif
38 #ifndef PCRE2_EXTRA_ASCII_BSD
39 # define PCRE2_EXTRA_ASCII_BSD 0
40 #endif
41
42 /* Use PCRE2_MATCH_INVALID_UTF if supported and not buggy;
43 see <https://github.com/PCRE2Project/pcre2/issues/224>.
44 Assume the bug will be fixed after PCRE2 10.42. */
45 #if defined PCRE2_MATCH_INVALID_UTF && 10 < PCRE2_MAJOR + (42 < PCRE2_MINOR)
46 enum { MATCH_INVALID_UTF = PCRE2_MATCH_INVALID_UTF };
47 #else
48 enum { MATCH_INVALID_UTF = 0 };
49 #endif
50
51 struct pcre_comp
52 {
53 /* General context for PCRE operations. */
54 pcre2_general_context *gcontext;
55
56 /* Compiled internal form of a Perl regular expression. */
57 pcre2_code *cre;
58
59 /* Match context and data block. */
60 pcre2_match_context *mcontext;
61 pcre2_match_data *data;
62
63 /* The JIT stack and its maximum size. */
64 pcre2_jit_stack *jit_stack;
65 idx_t jit_stack_size;
66
67 /* Table, indexed by ! (flag & PCRE2_NOTBOL), of whether the empty
68 string matches when that flag is used. */
69 int empty_match[2];
70 };
71
72 /* Memory allocation functions for PCRE. */
73 static void *
74 private_malloc (PCRE2_SIZE size, _GL_UNUSED void *unused)
75 {
76 if (IDX_MAX < size)
77 xalloc_die ();
78 return ximalloc (size);
79 }
80 static void
81 private_free (void *ptr, _GL_UNUSED void *unused)
82 {
83 free (ptr);
84 }
85
86 void
87 Pprint_version (void)
88 {
89 char buf[128];
90 if (sizeof buf <= pcre2_config (PCRE2_CONFIG_VERSION, buf))
91 abort ();
92 printf (_("\ngrep -P uses PCRE2 %s\n"), buf);
93 }
94
95 /* Match the already-compiled PCRE pattern against the data in SUBJECT,
96 of size SEARCH_BYTES and starting with offset SEARCH_OFFSET, with
97 options OPTIONS.
98 Return the (nonnegative) match count or a (negative) error number. */
99 static int
100 jit_exec (struct pcre_comp *pc, char const *subject, idx_t search_bytes,
101 idx_t search_offset, int options)
102 {
103 while (true)
104 {
105 /* STACK_GROWTH_RATE is taken from PCRE's src/pcre2_jit_compile.c.
106 Going over the jitstack_max limit could trigger an int
107 overflow bug. */
108 int STACK_GROWTH_RATE = 8192;
109 idx_t jitstack_max = MIN (IDX_MAX, SIZE_MAX - (STACK_GROWTH_RATE - 1));
110
111 int e = pcre2_match (pc->cre, (PCRE2_SPTR) subject, search_bytes,
112 search_offset, options, pc->data, pc->mcontext);
113 if (e == PCRE2_ERROR_JIT_STACKLIMIT
114 && pc->jit_stack_size <= jitstack_max / 2)
115 {
116 idx_t old_size = pc->jit_stack_size;
117 idx_t new_size = pc->jit_stack_size = old_size * 2;
118 pcre2_jit_stack_free (pc->jit_stack);
119 pc->jit_stack = pcre2_jit_stack_create (old_size, new_size,
120 pc->gcontext);
121 if (!pc->jit_stack)
122 xalloc_die ();
123 if (!pc->mcontext)
124 pc->mcontext = pcre2_match_context_create (pc->gcontext);
125 pcre2_jit_stack_assign (pc->mcontext, NULL, pc->jit_stack);
126 }
127 else if (e == PCRE2_ERROR_DEPTHLIMIT)
128 {
129 uint32_t lim;
130 pcre2_config (PCRE2_CONFIG_DEPTHLIMIT, &lim);
131 if (ckd_mul (&lim, lim, 2))
132 return e;
133 if (!pc->mcontext)
134 pc->mcontext = pcre2_match_context_create (pc->gcontext);
135 pcre2_set_depth_limit (pc->mcontext, lim);
136 }
137 else
138 return e;
139 }
140 }
141
142 /* Return true if E is an error code for bad UTF-8. */
143 static bool
144 bad_utf8_from_pcre2 (int e)
145 {
146 return PCRE2_ERROR_UTF8_ERR21 <= e && e <= PCRE2_ERROR_UTF8_ERR1;
147 }
148
149 /* Compile the -P style PATTERN, containing SIZE bytes that are
150 followed by '\n'. Return a description of the compiled pattern. */
151
152 void *
153 Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
154 {
155 PCRE2_SIZE e;
156 int ec;
157 int flags = PCRE2_DOLLAR_ENDONLY | (match_icase ? PCRE2_CASELESS : 0);
158 char *patlim = pattern + size;
159 struct pcre_comp *pc = ximalloc (sizeof *pc);
160 pcre2_general_context *gcontext = pc->gcontext
161 = pcre2_general_context_create (private_malloc, private_free, NULL);
162 pcre2_compile_context *ccontext = pcre2_compile_context_create (gcontext);
163
164 if (localeinfo.multibyte)
165 {
166 uint32_t unicode;
167 if (pcre2_config (PCRE2_CONFIG_UNICODE, &unicode) < 0 || !unicode)
168 die (EXIT_TROUBLE, 0,
169 _("-P supports only unibyte locales on this platform"));
170 if (! localeinfo.using_utf8)
171 die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
172
173 flags |= PCRE2_UTF;
174
175 /* If supported, consider invalid UTF-8 as a barrier not an error. */
176 flags |= MATCH_INVALID_UTF;
177
178 /* If PCRE2_EXTRA_ASCII_BSD is available, use PCRE2_UCP
179 so that \d does not have the undesirable effect of matching
180 non-ASCII digits. Otherwise (i.e., with PCRE2 10.42 and earlier),
181 escapes like \w have only their ASCII interpretations,
182 but that's better than the confusion that would ensue if \d
183 matched non-ASCII digits. */
184 flags |= PCRE2_EXTRA_ASCII_BSD ? PCRE2_UCP : 0;
185
186 #if 0
187 /* Do not match individual code units but only UTF-8. */
188 flags |= PCRE2_NEVER_BACKSLASH_C;
189 #endif
190 }
191
192 /* FIXME: Remove this restriction. */
193 if (rawmemchr (pattern, '\n') != patlim)
194 die (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
195
196 #ifdef PCRE2_EXTRA_MATCH_LINE
197 uint32_t extra_options = (PCRE2_EXTRA_ASCII_BSD
198 | (match_lines ? PCRE2_EXTRA_MATCH_LINE : 0));
199 pcre2_set_compile_extra_options (ccontext, extra_options);
200 #endif
201
202 void *re_storage = NULL;
203 if (match_lines)
204 {
205 #ifndef PCRE2_EXTRA_MATCH_LINE
206 static char const /* These sizes omit trailing NUL. */
207 xprefix[4] = "^(?:", xsuffix[2] = ")$";
208 idx_t re_size = size + sizeof xprefix + sizeof xsuffix;
209 char *re = re_storage = ximalloc (re_size);
210 char *rez = mempcpy (re, xprefix, sizeof xprefix);
211 rez = mempcpy (rez, pattern, size);
212 memcpy (rez, xsuffix, sizeof xsuffix);
213 pattern = re;
214 size = re_size;
215 #endif
216 }
217 else if (match_words)
218 {
219 /* PCRE2_EXTRA_MATCH_WORD is incompatible with grep -w;
220 do things the grep way. */
221 static char const /* These sizes omit trailing NUL. */
222 wprefix[10] = "(?<!\\w)(?:", wsuffix[7] = ")(?!\\w)";
223 idx_t re_size = size + sizeof wprefix + sizeof wsuffix;
224 char *re = re_storage = ximalloc (re_size);
225 char *rez = mempcpy (re, wprefix, sizeof wprefix);
226 rez = mempcpy (rez, pattern, size);
227 memcpy (rez, wsuffix, sizeof wsuffix);
228 pattern = re;
229 size = re_size;
230 }
231
232 if (!localeinfo.multibyte)
233 pcre2_set_character_tables (ccontext, pcre2_maketables (gcontext));
234
235 pc->cre = pcre2_compile ((PCRE2_SPTR) pattern, size, flags,
236 &ec, &e, ccontext);
237 if (!pc->cre)
238 {
239 enum { ERRBUFSIZ = 256 }; /* Taken from pcre2grep.c ERRBUFSIZ. */
240 PCRE2_UCHAR8 ep[ERRBUFSIZ];
241 pcre2_get_error_message (ec, ep, sizeof ep);
242 die (EXIT_TROUBLE, 0, "%s", ep);
243 }
244
245 free (re_storage);
246 pcre2_compile_context_free (ccontext);
247
248 pc->mcontext = NULL;
249 pc->data = pcre2_match_data_create_from_pattern (pc->cre, gcontext);
250
251 /* Ignore any failure return from pcre2_jit_compile, as that merely
252 means JIT won't be used during matching. */
253 pcre2_jit_compile (pc->cre, PCRE2_JIT_COMPLETE);
254
255 /* The PCRE documentation says that a 32 KiB stack is the default. */
256 pc->jit_stack = NULL;
257 pc->jit_stack_size = 32 << 10;
258
259 pc->empty_match[false] = jit_exec (pc, "", 0, 0, PCRE2_NOTBOL);
260 pc->empty_match[true] = jit_exec (pc, "", 0, 0, 0);
261
262 return pc;
263 }
264
265 ptrdiff_t
266 Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
267 char const *start_ptr)
268 {
269 char const *p = start_ptr ? start_ptr : buf;
270 bool bol = p[-1] == eolbyte;
271 char const *line_start = buf;
272 int e = PCRE2_ERROR_NOMATCH;
273 char const *line_end;
274 struct pcre_comp *pc = vcp;
275 PCRE2_SIZE *sub = pcre2_get_ovector_pointer (pc->data);
276
277 /* The search address to pass to PCRE. This is the start of
278 the buffer, or just past the most-recently discovered encoding
279 error or line end. */
280 char const *subject = buf;
281
282 do
283 {
284 /* Search line by line. Although this formerly used something like
285 PCRE2_MULTILINE for performance, the performance wasn't always
286 better and the correctness issues were too puzzling. See
287 Bug#22655. */
288 line_end = rawmemchr (p, eolbyte);
289 if (PCRE2_SIZE_MAX < line_end - p)
290 die (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
291
292 for (;;)
293 {
294 /* Skip past bytes that are easily determined to be encoding
295 errors, treating them as data that cannot match. This is
296 faster than having PCRE check them. */
297 while (localeinfo.sbclen[to_uchar (*p)] == -1)
298 {
299 p++;
300 subject = p;
301 bol = false;
302 }
303
304 idx_t search_offset = p - subject;
305
306 /* Check for an empty match; this is faster than letting
307 PCRE do it. */
308 if (p == line_end)
309 {
310 sub[0] = sub[1] = search_offset;
311 e = pc->empty_match[bol];
312 break;
313 }
314
315 int options = 0;
316 if (!bol)
317 options |= PCRE2_NOTBOL;
318
319 e = jit_exec (pc, subject, line_end - subject,
320 search_offset, options);
321 if (MATCH_INVALID_UTF || !bad_utf8_from_pcre2 (e))
322 break;
323
324 idx_t valid_bytes = pcre2_get_startchar (pc->data);
325
326 if (search_offset <= valid_bytes)
327 {
328 /* Try to match the string before the encoding error. */
329 if (valid_bytes == 0)
330 {
331 /* Handle the empty-match case specially, for speed.
332 This optimization is valid if VALID_BYTES is zero,
333 which means SEARCH_OFFSET is also zero. */
334 sub[0] = valid_bytes;
335 sub[1] = 0;
336 e = pc->empty_match[bol];
337 }
338 else
339 e = jit_exec (pc, subject, valid_bytes, search_offset,
340 options | PCRE2_NO_UTF_CHECK | PCRE2_NOTEOL);
341
342 if (e != PCRE2_ERROR_NOMATCH)
343 break;
344
345 /* Treat the encoding error as data that cannot match. */
346 p = subject + valid_bytes + 1;
347 bol = false;
348 }
349
350 subject += valid_bytes + 1;
351 }
352
353 if (e != PCRE2_ERROR_NOMATCH)
354 break;
355 bol = true;
356 p = subject = line_start = line_end + 1;
357 }
358 while (p < buf + size);
359
360 if (e <= 0)
361 {
362 switch (e)
363 {
364 case PCRE2_ERROR_NOMATCH:
365 break;
366
367 case PCRE2_ERROR_NOMEMORY:
368 die (EXIT_TROUBLE, 0, _("%s: memory exhausted"), input_filename ());
369
370 case PCRE2_ERROR_JIT_STACKLIMIT:
371 die (EXIT_TROUBLE, 0, _("%s: exhausted PCRE JIT stack"),
372 input_filename ());
373
374 case PCRE2_ERROR_MATCHLIMIT:
375 die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's backtracking limit"),
376 input_filename ());
377
378 case PCRE2_ERROR_DEPTHLIMIT:
379 die (EXIT_TROUBLE, 0,
380 _("%s: exceeded PCRE's nested backtracking limit"),
381 input_filename ());
382
383 case PCRE2_ERROR_RECURSELOOP:
384 die (EXIT_TROUBLE, 0, _("%s: PCRE detected recurse loop"),
385 input_filename ());
386
387 #ifdef PCRE2_ERROR_HEAPLIMIT
388 case PCRE2_ERROR_HEAPLIMIT:
389 die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's heap limit"),
390 input_filename ());
391 #endif
392
393 default:
394 /* For now, we lump all remaining PCRE failures into this basket.
395 If anyone cares to provide sample grep usage that can trigger
396 particular PCRE errors, we can add to the list (above) of more
397 detailed diagnostics. */
398 die (EXIT_TROUBLE, 0, _("%s: internal PCRE error: %d"),
399 input_filename (), e);
400 }
401
402 return -1;
403 }
404 else
405 {
406 char const *matchbeg = subject + sub[0];
407 char const *matchend = subject + sub[1];
408 char const *beg;
409 char const *end;
410 if (start_ptr)
411 {
412 beg = matchbeg;
413 end = matchend;
414 }
415 else
416 {
417 beg = line_start;
418 end = line_end + 1;
419 }
420 *match_size = end - beg;
421 return beg - buf;
422 }
423 }