1 /* GLIB - Library of useful routines for C programming
2 * Copyright (C) 1995-1997, 1999 Peter Mattis, Red Hat, Inc.
3 *
4 * SPDX-License-Identifier: LGPL-2.1-or-later
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "config.h"
21
22 #include <string.h>
23
24 #include "gpattern.h"
25
26 #include "gmacros.h"
27 #include "gmem.h"
28 #include "gmessages.h"
29 #include "gstrfuncs.h"
30 #include "gunicode.h"
31 #include "gutils.h"
32
33 /**
34 * GPatternSpec:
35 *
36 * A `GPatternSpec` struct is the 'compiled' form of a glob-style pattern.
37 *
38 * The [func@GLib.pattern_match_simple] and [method@GLib.PatternSpec.match] functions
39 * match a string against a pattern containing '*' and '?' wildcards with similar
40 * semantics as the standard `glob()` function: '*' matches an arbitrary,
41 * possibly empty, string, '?' matches an arbitrary character.
42 *
43 * Note that in contrast to `glob()`, the '/' character can be matched by
44 * the wildcards, there are no '[...]' character ranges and '*' and '?'
45 * can not be escaped to include them literally in a pattern.
46 *
47 * When multiple strings must be matched against the same pattern, it is better
48 * to compile the pattern to a [struct@GLib.PatternSpec] using
49 * [ctor@GLib.PatternSpec.new] and use [method@GLib.PatternSpec.match_string]
50 * instead of [func@GLib.pattern_match_simple]. This avoids the overhead of repeated
51 * pattern compilation.
52 */
53
54 /* keep enum and structure of gpattern.c and patterntest.c in sync */
55 typedef enum
56 {
57 G_MATCH_ALL, /* "*A?A*" */
58 G_MATCH_ALL_TAIL, /* "*A?AA" */
59 G_MATCH_HEAD, /* "AAAA*" */
60 G_MATCH_TAIL, /* "*AAAA" */
61 G_MATCH_EXACT, /* "AAAAA" */
62 G_MATCH_LAST
63 } GMatchType;
64
65 struct _GPatternSpec
66 {
67 GMatchType match_type;
68 guint pattern_length;
69 guint min_length;
70 guint max_length;
71 gchar *pattern;
72 };
73
74
75 /* --- functions --- */
76 static inline gboolean
77 g_pattern_ph_match (const gchar *match_pattern,
78 const gchar *match_string,
79 gboolean *wildcard_reached_p)
80 {
81 const gchar *pattern, *string;
82 gchar ch;
83
84 pattern = match_pattern;
85 string = match_string;
86
87 ch = *pattern;
88 pattern++;
89 while (ch)
90 {
91 switch (ch)
92 {
93 case '?':
94 if (!*string)
95 return FALSE;
96 string = g_utf8_next_char (string);
97 break;
98
99 case '*':
100 *wildcard_reached_p = TRUE;
101 do
102 {
103 ch = *pattern;
104 pattern++;
105 if (ch == '?')
106 {
107 if (!*string)
108 return FALSE;
109 string = g_utf8_next_char (string);
110 }
111 }
112 while (ch == '*' || ch == '?');
113 if (!ch)
114 return TRUE;
115 do
116 {
117 gboolean next_wildcard_reached = FALSE;
118 while (ch != *string)
119 {
120 if (!*string)
121 return FALSE;
122 string = g_utf8_next_char (string);
123 }
124 string++;
125 if (g_pattern_ph_match (pattern, string, &next_wildcard_reached))
126 return TRUE;
127 if (next_wildcard_reached)
128 /* the forthcoming pattern substring up to the next wildcard has
129 * been matched, but a mismatch occurred for the rest of the
130 * pattern, following the next wildcard.
131 * there's no need to advance the current match position any
132 * further if the rest pattern will not match.
133 */
134 return FALSE;
135 }
136 while (*string);
137 break;
138
139 default:
140 if (ch == *string)
141 string++;
142 else
143 return FALSE;
144 break;
145 }
146
147 ch = *pattern;
148 pattern++;
149 }
150
151 return *string == 0;
152 }
153
154 /**
155 * g_pattern_spec_match:
156 * @pspec: a #GPatternSpec
157 * @string_length: the length of @string (in bytes, i.e. strlen(),
158 * not g_utf8_strlen())
159 * @string: the UTF-8 encoded string to match
160 * @string_reversed: (nullable): the reverse of @string or %NULL
161 *
162 * Matches a string against a compiled pattern. Passing the correct
163 * length of the string given is mandatory. The reversed string can be
164 * omitted by passing %NULL, this is more efficient if the reversed
165 * version of the string to be matched is not at hand, as
166 * g_pattern_match() will only construct it if the compiled pattern
167 * requires reverse matches.
168 *
169 * Note that, if the user code will (possibly) match a string against a
170 * multitude of patterns containing wildcards, chances are high that
171 * some patterns will require a reversed string. In this case, it's
172 * more efficient to provide the reversed string to avoid multiple
173 * constructions thereof in the various calls to g_pattern_match().
174 *
175 * Note also that the reverse of a UTF-8 encoded string can in general
176 * not be obtained by g_strreverse(). This works only if the string
177 * does not contain any multibyte characters. GLib offers the
178 * g_utf8_strreverse() function to reverse UTF-8 encoded strings.
179 *
180 * Returns: %TRUE if @string matches @pspec
181 *
182 * Since: 2.70
183 **/
184 gboolean
185 g_pattern_spec_match (GPatternSpec *pspec,
186 gsize string_length,
187 const gchar *string,
188 const gchar *string_reversed)
189 {
190 g_return_val_if_fail (pspec != NULL, FALSE);
191 g_return_val_if_fail (string != NULL, FALSE);
192
193 if (string_length < pspec->min_length ||
194 string_length > pspec->max_length)
195 return FALSE;
196
197 switch (pspec->match_type)
198 {
199 gboolean dummy;
200 case G_MATCH_ALL:
201 return g_pattern_ph_match (pspec->pattern, string, &dummy);
202 case G_MATCH_ALL_TAIL:
203 if (string_reversed)
204 return g_pattern_ph_match (pspec->pattern, string_reversed, &dummy);
205 else
206 {
207 gboolean result;
208 gchar *tmp;
209 tmp = g_utf8_strreverse (string, string_length);
210 result = g_pattern_ph_match (pspec->pattern, tmp, &dummy);
211 g_free (tmp);
212 return result;
213 }
214 case G_MATCH_HEAD:
215 if (pspec->pattern_length == string_length)
216 return strcmp (pspec->pattern, string) == 0;
217 else if (pspec->pattern_length)
218 return strncmp (pspec->pattern, string, pspec->pattern_length) == 0;
219 else
220 return TRUE;
221 case G_MATCH_TAIL:
222 if (pspec->pattern_length)
223 return strcmp (pspec->pattern, string + (string_length - pspec->pattern_length)) == 0;
224 else
225 return TRUE;
226 case G_MATCH_EXACT:
227 if (pspec->pattern_length != string_length)
228 return FALSE;
229 else
230 return strcmp (pspec->pattern, string) == 0;
231 default:
232 g_return_val_if_fail (pspec->match_type < G_MATCH_LAST, FALSE);
233 return FALSE;
234 }
235 }
236
237 /**
238 * g_pattern_match: (skip)
239 * @pspec: a #GPatternSpec
240 * @string_length: the length of @string (in bytes, i.e. strlen(),
241 * not g_utf8_strlen())
242 * @string: the UTF-8 encoded string to match
243 * @string_reversed: (nullable): the reverse of @string or %NULL
244 *
245 * Matches a string against a compiled pattern. Passing the correct
246 * length of the string given is mandatory. The reversed string can be
247 * omitted by passing %NULL, this is more efficient if the reversed
248 * version of the string to be matched is not at hand, as
249 * g_pattern_match() will only construct it if the compiled pattern
250 * requires reverse matches.
251 *
252 * Note that, if the user code will (possibly) match a string against a
253 * multitude of patterns containing wildcards, chances are high that
254 * some patterns will require a reversed string. In this case, it's
255 * more efficient to provide the reversed string to avoid multiple
256 * constructions thereof in the various calls to g_pattern_match().
257 *
258 * Note also that the reverse of a UTF-8 encoded string can in general
259 * not be obtained by g_strreverse(). This works only if the string
260 * does not contain any multibyte characters. GLib offers the
261 * g_utf8_strreverse() function to reverse UTF-8 encoded strings.
262 *
263 * Returns: %TRUE if @string matches @pspec
264 * Deprecated: 2.70: Use g_pattern_spec_match() instead
265 **/
266 gboolean
267 g_pattern_match (GPatternSpec *pspec,
268 guint string_length,
269 const gchar *string,
270 const gchar *string_reversed)
271 {
272 return g_pattern_spec_match (pspec, string_length, string, string_reversed);
273 }
274
275 /**
276 * g_pattern_spec_new:
277 * @pattern: a zero-terminated UTF-8 encoded string
278 *
279 * Compiles a pattern to a #GPatternSpec.
280 *
281 * Returns: a newly-allocated #GPatternSpec
282 **/
283 GPatternSpec*
284 g_pattern_spec_new (const gchar *pattern)
285 {
286 GPatternSpec *pspec;
287 gboolean seen_joker = FALSE, seen_wildcard = FALSE, more_wildcards = FALSE;
288 gint hw_pos = -1, tw_pos = -1, hj_pos = -1, tj_pos = -1;
289 gboolean follows_wildcard = FALSE;
290 guint pending_jokers = 0;
291 const gchar *s;
292 gchar *d;
293 guint i;
294
295 g_return_val_if_fail (pattern != NULL, NULL);
296
297 /* canonicalize pattern and collect necessary stats */
298 pspec = g_new (GPatternSpec, 1);
299 pspec->pattern_length = strlen (pattern);
300 pspec->min_length = 0;
301 pspec->max_length = 0;
302 pspec->pattern = g_new (gchar, pspec->pattern_length + 1);
303 d = pspec->pattern;
304 for (i = 0, s = pattern; *s != 0; s++)
305 {
306 switch (*s)
307 {
308 case '*':
309 if (follows_wildcard) /* compress multiple wildcards */
310 {
311 pspec->pattern_length--;
312 continue;
313 }
314 follows_wildcard = TRUE;
315 if (hw_pos < 0)
316 hw_pos = i;
317 tw_pos = i;
318 break;
319 case '?':
320 pending_jokers++;
321 pspec->min_length++;
322 pspec->max_length += 4; /* maximum UTF-8 character length */
323 continue;
324 default:
325 for (; pending_jokers; pending_jokers--, i++) {
326 *d++ = '?';
327 if (hj_pos < 0)
328 hj_pos = i;
329 tj_pos = i;
330 }
331 follows_wildcard = FALSE;
332 pspec->min_length++;
333 pspec->max_length++;
334 break;
335 }
336 *d++ = *s;
337 i++;
338 }
339 for (; pending_jokers; pending_jokers--) {
340 *d++ = '?';
341 if (hj_pos < 0)
342 hj_pos = i;
343 tj_pos = i;
344 }
345 *d++ = 0;
346 seen_joker = hj_pos >= 0;
347 seen_wildcard = hw_pos >= 0;
348 more_wildcards = seen_wildcard && hw_pos != tw_pos;
349 if (seen_wildcard)
350 pspec->max_length = G_MAXUINT;
351
352 /* special case sole head/tail wildcard or exact matches */
353 if (!seen_joker && !more_wildcards)
354 {
355 if (pspec->pattern[0] == '*')
356 {
357 pspec->match_type = G_MATCH_TAIL;
358 memmove (pspec->pattern, pspec->pattern + 1, --pspec->pattern_length);
359 pspec->pattern[pspec->pattern_length] = 0;
360 return pspec;
361 }
362 if (pspec->pattern_length > 0 &&
363 pspec->pattern[pspec->pattern_length - 1] == '*')
364 {
365 pspec->match_type = G_MATCH_HEAD;
366 pspec->pattern[--pspec->pattern_length] = 0;
367 return pspec;
368 }
369 if (!seen_wildcard)
370 {
371 pspec->match_type = G_MATCH_EXACT;
372 return pspec;
373 }
374 }
375
376 /* now just need to distinguish between head or tail match start */
377 tw_pos = pspec->pattern_length - 1 - tw_pos; /* last pos to tail distance */
378 tj_pos = pspec->pattern_length - 1 - tj_pos; /* last pos to tail distance */
379 if (seen_wildcard)
380 pspec->match_type = tw_pos > hw_pos ? G_MATCH_ALL_TAIL : G_MATCH_ALL;
381 else /* seen_joker */
382 pspec->match_type = tj_pos > hj_pos ? G_MATCH_ALL_TAIL : G_MATCH_ALL;
383 if (pspec->match_type == G_MATCH_ALL_TAIL) {
384 gchar *tmp = pspec->pattern;
385 pspec->pattern = g_utf8_strreverse (pspec->pattern, pspec->pattern_length);
386 g_free (tmp);
387 }
388 return pspec;
389 }
390
391 /**
392 * g_pattern_spec_copy:
393 * @pspec: a #GPatternSpec
394 *
395 * Copies @pspec in a new #GPatternSpec.
396 *
397 * Returns: (transfer full): a copy of @pspec.
398 *
399 * Since: 2.70
400 **/
401 GPatternSpec *
402 g_pattern_spec_copy (GPatternSpec *pspec)
403 {
404 GPatternSpec *pspec_copy;
405
406 g_return_val_if_fail (pspec != NULL, NULL);
407
408 pspec_copy = g_new (GPatternSpec, 1);
409 *pspec_copy = *pspec;
410 pspec_copy->pattern = g_strndup (pspec->pattern, pspec->pattern_length);
411
412 return pspec_copy;
413 }
414
415 /**
416 * g_pattern_spec_free:
417 * @pspec: a #GPatternSpec
418 *
419 * Frees the memory allocated for the #GPatternSpec.
420 **/
421 void
422 g_pattern_spec_free (GPatternSpec *pspec)
423 {
424 g_return_if_fail (pspec != NULL);
425
426 g_free (pspec->pattern);
427 g_free (pspec);
428 }
429
430 /**
431 * g_pattern_spec_equal:
432 * @pspec1: a #GPatternSpec
433 * @pspec2: another #GPatternSpec
434 *
435 * Compares two compiled pattern specs and returns whether they will
436 * match the same set of strings.
437 *
438 * Returns: Whether the compiled patterns are equal
439 **/
440 gboolean
441 g_pattern_spec_equal (GPatternSpec *pspec1,
442 GPatternSpec *pspec2)
443 {
444 g_return_val_if_fail (pspec1 != NULL, FALSE);
445 g_return_val_if_fail (pspec2 != NULL, FALSE);
446
447 return (pspec1->pattern_length == pspec2->pattern_length &&
448 pspec1->match_type == pspec2->match_type &&
449 strcmp (pspec1->pattern, pspec2->pattern) == 0);
450 }
451
452 /**
453 * g_pattern_spec_match_string:
454 * @pspec: a #GPatternSpec
455 * @string: the UTF-8 encoded string to match
456 *
457 * Matches a string against a compiled pattern. If the string is to be
458 * matched against more than one pattern, consider using
459 * g_pattern_match() instead while supplying the reversed string.
460 *
461 * Returns: %TRUE if @string matches @pspec
462 *
463 * Since: 2.70
464 **/
465 gboolean
466 g_pattern_spec_match_string (GPatternSpec *pspec,
467 const gchar *string)
468 {
469 g_return_val_if_fail (pspec != NULL, FALSE);
470 g_return_val_if_fail (string != NULL, FALSE);
471
472 return g_pattern_spec_match (pspec, strlen (string), string, NULL);
473 }
474
475 /**
476 * g_pattern_match_string: (skip)
477 * @pspec: a #GPatternSpec
478 * @string: the UTF-8 encoded string to match
479 *
480 * Matches a string against a compiled pattern. If the string is to be
481 * matched against more than one pattern, consider using
482 * g_pattern_match() instead while supplying the reversed string.
483 *
484 * Returns: %TRUE if @string matches @pspec
485 * Deprecated: 2.70: Use g_pattern_spec_match_string() instead
486 **/
487 gboolean
488 g_pattern_match_string (GPatternSpec *pspec,
489 const gchar *string)
490 {
491 return g_pattern_spec_match_string (pspec, string);
492 }
493
494 /**
495 * g_pattern_match_simple:
496 * @pattern: the UTF-8 encoded pattern
497 * @string: the UTF-8 encoded string to match
498 *
499 * Matches a string against a pattern given as a string. If this
500 * function is to be called in a loop, it's more efficient to compile
501 * the pattern once with g_pattern_spec_new() and call
502 * g_pattern_match_string() repeatedly.
503 *
504 * Returns: %TRUE if @string matches @pspec
505 **/
506 gboolean
507 g_pattern_match_simple (const gchar *pattern,
508 const gchar *string)
509 {
510 GPatternSpec *pspec;
511 gboolean ergo;
512
513 g_return_val_if_fail (pattern != NULL, FALSE);
514 g_return_val_if_fail (string != NULL, FALSE);
515
516 pspec = g_pattern_spec_new (pattern);
517 ergo = g_pattern_spec_match (pspec, strlen (string), string, NULL);
518 g_pattern_spec_free (pspec);
519
520 return ergo;
521 }