1 /* exclude.c -- exclude file names
2
3 Copyright (C) 1992-1994, 1997, 1999-2007, 2009-2023 Free Software
4 Foundation, Inc.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19 /* Written by Paul Eggert <eggert@twinsun.com>
20 and Sergey Poznyakoff <gray@gnu.org>.
21 Thanks to Phil Proudman <phil@proudman51.freeserve.co.uk>
22 for improvement suggestions. */
23
24 #include <config.h>
25
26 #include <ctype.h>
27 #include <errno.h>
28 #include <regex.h>
29 #include <stddef.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <uchar.h>
34
35 #include "exclude.h"
36 #include "filename.h"
37 #include "fnmatch.h"
38 #include "hash.h"
39 #include "mbuiter.h"
40 #include "xalloc.h"
41
42 #if GNULIB_EXCLUDE_SINGLE_THREAD
43 # include "unlocked-io.h"
44 #endif
45
46 /* Non-GNU systems lack these options, so we don't need to check them. */
47 #ifndef FNM_CASEFOLD
48 # define FNM_CASEFOLD 0
49 #endif
50 #ifndef FNM_EXTMATCH
51 # define FNM_EXTMATCH 0
52 #endif
53 #ifndef FNM_LEADING_DIR
54 # define FNM_LEADING_DIR 0
55 #endif
56
57 static_assert (((EXCLUDE_ANCHORED | EXCLUDE_INCLUDE | EXCLUDE_WILDCARDS)
58 & (FNM_PATHNAME | FNM_NOESCAPE | FNM_PERIOD | FNM_LEADING_DIR
59 | FNM_CASEFOLD | FNM_EXTMATCH))
60 == 0);
61
62
63 /* Exclusion patterns are grouped into a singly-linked list of
64 "exclusion segments". Each segment represents a set of patterns
65 that can be matches using the same algorithm. Non-wildcard
66 patterns are kept in hash tables, to speed up searches. Wildcard
67 patterns are stored as arrays of patterns. */
68
69
70 /* An exclude pattern-options pair. The options are fnmatch options
71 ORed with EXCLUDE_* options. */
72
73 struct patopts
74 {
75 int options;
76 union
77 {
78 char const *pattern;
79 regex_t re;
80 } v;
81 };
82
83 /* An array of pattern-options pairs. */
84
85 struct exclude_pattern
86 {
87 struct patopts *exclude;
88 idx_t exclude_alloc;
89 idx_t exclude_count;
90 };
91
92 enum exclude_type
93 {
94 exclude_hash, /* a hash table of excluded names */
95 exclude_pattern /* an array of exclude patterns */
96 };
97
98 struct exclude_segment
99 {
100 struct exclude_segment *next; /* next segment in list */
101 enum exclude_type type; /* type of this segment */
102 int options; /* common options for this segment */
103 union
104 {
105 Hash_table *table; /* for type == exclude_hash */
106 struct exclude_pattern pat; /* for type == exclude_pattern */
107 } v;
108 };
109
110 struct pattern_buffer
111 {
112 struct pattern_buffer *next;
113 char *base;
114 };
115
116 /* The exclude structure keeps a singly-linked list of exclude segments,
117 maintained in reverse order. */
118 struct exclude
119 {
120 struct exclude_segment *head;
121 struct pattern_buffer *patbuf;
122 };
123
124 /* Register BUF in the pattern buffer list of EX. ADD_FUNC (see
125 add_exclude_file and add_exclude_fp below) can use this function
126 if it modifies the pattern, to ensure the allocated memory will be
127 properly reclaimed upon calling free_exclude. */
128 void
129 exclude_add_pattern_buffer (struct exclude *ex, char *buf)
130 {
131 struct pattern_buffer *pbuf = xmalloc (sizeof *pbuf);
132 pbuf->base = buf;
133 pbuf->next = ex->patbuf;
134 ex->patbuf = pbuf;
135 }
136
137 /* Return true if STR has or may have wildcards, when matched with OPTIONS.
138 Return false if STR definitely does not have wildcards. */
139 bool
140 fnmatch_pattern_has_wildcards (const char *str, int options)
141 {
142 while (true)
143 {
144 switch (*str++)
145 {
146 case '.':
147 case '{':
148 case '}':
149 case '(':
150 case ')':
151 if (options & EXCLUDE_REGEX)
152 return true;
153 break;
154
155 case '\\':
156 if (options & EXCLUDE_REGEX)
157 continue;
158 else
159 str += ! (options & FNM_NOESCAPE) && *str;
160 break;
161
162 case '+': case '@': case '!':
163 if (options & FNM_EXTMATCH && *str == '(')
164 return true;
165 break;
166
167 case '?': case '*': case '[':
168 return true;
169
170 case '\0':
171 return false;
172 }
173 }
174 }
175
176 static void
177 unescape_pattern (char *str)
178 {
179 char const *q = str;
180 do
181 q += *q == '\\' && q[1];
182 while ((*str++ = *q++));
183 }
184
185 /* Return a newly allocated and empty exclude list. */
186
187 struct exclude *
188 new_exclude (void)
189 {
190 return xzalloc (sizeof *new_exclude ());
191 }
192
193 /* Calculate the hash of string. */
194 static size_t
195 string_hasher (void const *data, size_t n_buckets)
196 {
197 return hash_string (data, n_buckets);
198 }
199
200 /* Ditto, for case-insensitive hashes */
201 static size_t
202 string_hasher_ci (void const *data, size_t n_buckets)
203 {
204 char const *p = data;
205 size_t value = 0;
206
207 mbui_iterator_t iter;
208 for (mbui_init (iter, p); mbui_avail (iter); mbui_advance (iter))
209 {
210 mbchar_t m = mbui_cur (iter);
211 char32_t wc;
212
213 if (m.wc_valid)
214 wc = c32tolower (m.wc);
215 else
216 wc = *m.ptr;
217
218 value = value * 31 + wc;
219 }
220
221 return value % n_buckets;
222 }
223
224 /* compare two strings for equality */
225 static bool
226 string_compare (void const *data1, void const *data2)
227 {
228 return strcmp (data1, data2) == 0;
229 }
230
231 /* compare two strings for equality, case-insensitive */
232 static bool
233 string_compare_ci (void const *data1, void const *data2)
234 {
235 return mbscasecmp (data1, data2) == 0;
236 }
237
238 /* Create new exclude segment of given TYPE and OPTIONS, and attach it
239 to the head of EX. */
240 static void
241 new_exclude_segment (struct exclude *ex, enum exclude_type type, int options)
242 {
243 struct exclude_segment *sp = xmalloc (sizeof (struct exclude_segment));
244 sp->type = type;
245 sp->options = options;
246 switch (type)
247 {
248 case exclude_pattern:
249 sp->v.pat = (struct exclude_pattern) {0};
250 break;
251
252 case exclude_hash:
253 sp->v.table = hash_initialize (0, nullptr,
254 (options & FNM_CASEFOLD
255 ? string_hasher_ci
256 : string_hasher),
257 (options & FNM_CASEFOLD
258 ? string_compare_ci
259 : string_compare),
260 free);
261 break;
262 }
263 sp->next = ex->head;
264 ex->head = sp;
265 }
266
267 /* Free a single exclude segment */
268 static void
269 free_exclude_segment (struct exclude_segment *seg)
270 {
271 switch (seg->type)
272 {
273 case exclude_pattern:
274 for (idx_t i = 0; i < seg->v.pat.exclude_count; i++)
275 if (seg->v.pat.exclude[i].options & EXCLUDE_REGEX)
276 regfree (&seg->v.pat.exclude[i].v.re);
277 free (seg->v.pat.exclude);
278 break;
279
280 case exclude_hash:
281 hash_free (seg->v.table);
282 break;
283 }
284 free (seg);
285 }
286
287 /* Free the storage associated with an exclude list. */
288 void
289 free_exclude (struct exclude *ex)
290 {
291 for (struct exclude_segment *seg = ex->head; seg; )
292 {
293 struct exclude_segment *next = seg->next;
294 free_exclude_segment (seg);
295 seg = next;
296 }
297
298 for (struct pattern_buffer *pbuf = ex->patbuf; pbuf; )
299 {
300 struct pattern_buffer *next = pbuf->next;
301 free (pbuf->base);
302 free (pbuf);
303 pbuf = next;
304 }
305
306 free (ex);
307 }
308
309 /* Return zero if PATTERN matches F, obeying OPTIONS, except that
310 (unlike fnmatch) wildcards are disabled in PATTERN. */
311
312 static int
313 fnmatch_no_wildcards (char const *pattern, char const *f, int options)
314 {
315 if (! (options & FNM_LEADING_DIR))
316 return ((options & FNM_CASEFOLD)
317 ? mbscasecmp (pattern, f)
318 : strcmp (pattern, f));
319 else if (! (options & FNM_CASEFOLD))
320 {
321 idx_t patlen = strlen (pattern);
322 int r = strncmp (pattern, f, patlen);
323 if (! r)
324 {
325 r = f[patlen];
326 if (r == '/')
327 r = 0;
328 }
329 return r;
330 }
331 else
332 {
333 /* Walk through a copy of F, seeing whether P matches any prefix
334 of F.
335
336 FIXME: This is an O(N**2) algorithm; it should be O(N).
337 Also, the copy should not be necessary. However, fixing this
338 will probably involve a change to the mbs* API. */
339
340 char *fcopy = xstrdup (f);
341 int r;
342 for (char *p = fcopy; ; *p++ = '/')
343 {
344 p = strchr (p, '/');
345 if (p)
346 *p = '\0';
347 r = mbscasecmp (pattern, fcopy);
348 if (!p || r <= 0)
349 break;
350 }
351 free (fcopy);
352 return r;
353 }
354 }
355
356 bool
357 exclude_fnmatch (char const *pattern, char const *f, int options)
358 {
359 int (*matcher) (char const *, char const *, int) =
360 (options & EXCLUDE_WILDCARDS
361 ? fnmatch
362 : fnmatch_no_wildcards);
363 bool matched = matcher (pattern, f, options) == 0;
364
365 if (! (options & EXCLUDE_ANCHORED))
366 for (char const *p = f; *p && ! matched; p++)
367 if (*p == '/' && p[1] != '/')
368 matched = matcher (pattern, p + 1, options) == 0;
369
370 return matched;
371 }
372
373 static bool
374 exclude_patopts (struct patopts const *opts, char const *f)
375 {
376 int options = opts->options;
377
378 return (options & EXCLUDE_REGEX
379 ? regexec (&opts->v.re, f, 0, nullptr, 0) == 0
380 : exclude_fnmatch (opts->v.pattern, f, options));
381 }
382
383 /* Return true if the exclude_pattern segment SEG matches F. */
384
385 static bool
386 file_pattern_matches (struct exclude_segment const *seg, char const *f)
387 {
388 idx_t exclude_count = seg->v.pat.exclude_count;
389 struct patopts const *exclude = seg->v.pat.exclude;
390
391 for (idx_t i = 0; i < exclude_count; i++)
392 if (exclude_patopts (exclude + i, f))
393 return true;
394 return false;
395 }
396
397 /* Return true if the exclude_hash segment SEG matches F.
398 BUFFER is an auxiliary storage of the same length as F (with nul
399 terminator included) */
400 static bool
401 file_name_matches (struct exclude_segment const *seg, char const *f,
402 char *buffer)
403 {
404 int options = seg->options;
405 Hash_table *table = seg->v.table;
406
407 do
408 {
409 /* initialize the pattern */
410 strcpy (buffer, f);
411
412 while (true)
413 {
414 if (hash_lookup (table, buffer))
415 return true;
416 if (options & FNM_LEADING_DIR)
417 {
418 char *p = strrchr (buffer, '/');
419 if (p)
420 {
421 *p = '\0';
422 continue;
423 }
424 }
425 break;
426 }
427
428 if (!(options & EXCLUDE_ANCHORED))
429 {
430 f = strchr (f, '/');
431 if (f)
432 f++;
433 }
434 else
435 break;
436 }
437 while (f);
438
439 return false;
440 }
441
442 /* Return true if EX excludes F. */
443
444 bool
445 excluded_file_name (struct exclude const *ex, char const *f)
446 {
447 /* If no patterns are given, the default is to include. */
448 if (!ex->head)
449 return false;
450
451 bool invert = false;
452 char *filename = nullptr;
453
454 /* Scan through the segments, reporting the status of the first match.
455 The segments are in reverse order, so this reports the status of
456 the last match in the original option list. */
457 struct exclude_segment *seg;
458 for (seg = ex->head; ; seg = seg->next)
459 {
460 if (seg->type == exclude_hash)
461 {
462 if (!filename)
463 filename = xmalloc (strlen (f) + 1);
464 if (file_name_matches (seg, f, filename))
465 break;
466 }
467 else
468 {
469 if (file_pattern_matches (seg, f))
470 break;
471 }
472
473 if (! seg->next)
474 {
475 /* If patterns are given but none match, the default is the
476 opposite of the last segment (i.e., the first in the
477 original option list). For example, in the command
478 'grep -r --exclude="a*" --include="*b" pat dir', the
479 first option is --exclude so any file name matching
480 neither a* nor *b is included. */
481 invert = true;
482 break;
483 }
484 }
485
486 free (filename);
487 return invert ^ ! (seg->options & EXCLUDE_INCLUDE);
488 }
489
490 /* Append to EX the exclusion PATTERN with OPTIONS. */
491
492 void
493 add_exclude (struct exclude *ex, char const *pattern, int options)
494 {
495 if ((options & (EXCLUDE_REGEX|EXCLUDE_WILDCARDS))
496 && fnmatch_pattern_has_wildcards (pattern, options))
497 {
498 if (! (ex->head && ex->head->type == exclude_pattern
499 && ((ex->head->options & EXCLUDE_INCLUDE)
500 == (options & EXCLUDE_INCLUDE))))
501 new_exclude_segment (ex, exclude_pattern, options);
502
503 struct exclude_pattern *pat = &ex->head->v.pat;
504 if (pat->exclude_count == pat->exclude_alloc)
505 pat->exclude = xpalloc (pat->exclude, &pat->exclude_alloc, 1, -1,
506 sizeof *pat->exclude);
507 struct patopts *patopts = &pat->exclude[pat->exclude_count++];
508
509 patopts->options = options;
510 if (options & EXCLUDE_REGEX)
511 {
512 int rc;
513 int cflags = (REG_NOSUB | REG_EXTENDED
514 | (options & FNM_CASEFOLD ? REG_ICASE : 0));
515
516 if (! (options & FNM_LEADING_DIR))
517 rc = regcomp (&patopts->v.re, pattern, cflags);
518 else
519 for (idx_t len = strlen (pattern); ; len--)
520 {
521 if (len == 0)
522 {
523 rc = 1;
524 break;
525 }
526 if (!ISSLASH (pattern[len - 1]))
527 {
528 static char const patsuffix[] = "(/.*)?";
529 char *tmp = ximalloc (len + sizeof patsuffix);
530 memcpy (tmp, pattern, len);
531 strcpy (tmp + len, patsuffix);
532 rc = regcomp (&patopts->v.re, tmp, cflags);
533 free (tmp);
534 break;
535 }
536 }
537
538 if (rc)
539 {
540 pat->exclude_count--;
541 return;
542 }
543 }
544 else
545 {
546 if (options & EXCLUDE_ALLOC)
547 {
548 char *dup = xstrdup (pattern);
549 pattern = dup;
550 exclude_add_pattern_buffer (ex, dup);
551 }
552 patopts->v.pattern = pattern;
553 }
554 }
555 else
556 {
557 int exclude_hash_flags = (EXCLUDE_INCLUDE | EXCLUDE_ANCHORED
558 | FNM_LEADING_DIR | FNM_CASEFOLD);
559 if (! (ex->head && ex->head->type == exclude_hash
560 && ((ex->head->options & exclude_hash_flags)
561 == (options & exclude_hash_flags))))
562 new_exclude_segment (ex, exclude_hash, options);
563
564 char *str = xstrdup (pattern);
565 if ((options & (EXCLUDE_WILDCARDS | FNM_NOESCAPE)) == EXCLUDE_WILDCARDS)
566 unescape_pattern (str);
567 if (hash_insert (ex->head->v.table, str) != str)
568 free (str);
569 }
570 }
571
572 /* Use ADD_FUNC to append to EX the patterns in FILE_NAME, each with
573 OPTIONS. LINE_END terminates each pattern in the file. If
574 LINE_END is a space character, ignore trailing spaces and empty
575 lines in FP. Return -1 (setting errno) on failure, 0 on success. */
576
577 int
578 add_exclude_fp (void (*add_func) (struct exclude *, char const *, int, void *),
579 struct exclude *ex, FILE *fp, int options,
580 char line_end, void *data)
581 {
582 char *buf = nullptr;
583 idx_t buf_alloc = 0;
584 idx_t buf_count = 0;
585
586 for (int c; (c = getc (fp)) != EOF; )
587 {
588 if (buf_count == buf_alloc)
589 buf = xpalloc (buf, &buf_alloc, 1, -1, 1);
590 buf[buf_count++] = c;
591 }
592
593 int e = ferror (fp) ? errno : 0;
594
595 buf = xirealloc (buf, buf_count + 1);
596 buf[buf_count] = line_end;
597 char const *lim = (buf + buf_count
598 + ! (buf_count == 0 || buf[buf_count - 1] == line_end));
599
600 exclude_add_pattern_buffer (ex, buf);
601
602 char *pattern = buf;
603
604 for (char *p = buf; p < lim; p++)
605 if (*p == line_end)
606 {
607 char *pattern_end = p;
608
609 if (isspace ((unsigned char) line_end))
610 {
611 /* Assume that no multi-byte character has a trailing byte
612 that satisfies isspace, and that nobody cares about
613 trailing white space containing non-single-byte characters.
614 If either assumption turns out to be false, presumably
615 the code should be changed to scan forward through the
616 entire pattern, one multi-byte character at a time. */
617 for (; ; pattern_end--)
618 if (pattern_end == pattern)
619 goto next_pattern;
620 else if (! isspace ((unsigned char) pattern_end[-1]))
621 break;
622 }
623
624 *pattern_end = '\0';
625 add_func (ex, pattern, options, data);
626
627 next_pattern:
628 pattern = p + 1;
629 }
630
631 errno = e;
632 return e ? -1 : 0;
633 }
634
635 static void
636 call_addfn (struct exclude *ex, char const *pattern, int options, void *data)
637 {
638 void (**addfnptr) (struct exclude *, char const *, int) = data;
639 (*addfnptr) (ex, pattern, options);
640 }
641
642 int
643 add_exclude_file (void (*add_func) (struct exclude *, char const *, int),
644 struct exclude *ex, char const *file_name, int options,
645 char line_end)
646 {
647 if (strcmp (file_name, "-") == 0)
648 return add_exclude_fp (call_addfn, ex, stdin, options, line_end, &add_func);
649
650 FILE *in = fopen (file_name, "re");
651 if (!in)
652 return -1;
653 int rc = add_exclude_fp (call_addfn, ex, in, options, line_end, &add_func);
654 int e = errno;
655 if (fclose (in) < 0)
656 return -1;
657 errno = e;
658 return rc;
659 }