(root)/
glib-2.79.0/
glib/
gpattern.c
       1  /* GLIB - Library of useful routines for C programming
       2   * Copyright (C) 1995-1997, 1999  Peter Mattis, Red Hat, Inc.
       3   *
       4   * SPDX-License-Identifier: LGPL-2.1-or-later
       5   *
       6   * This library is free software; you can redistribute it and/or
       7   * modify it under the terms of the GNU Lesser General Public
       8   * License as published by the Free Software Foundation; either
       9   * version 2.1 of the License, or (at your option) any later version.
      10   *
      11   * This library is distributed in the hope that it will be useful,
      12   * but WITHOUT ANY WARRANTY; without even the implied warranty of
      13   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      14   * Lesser General Public License for more details.
      15   *
      16   * You should have received a copy of the GNU Lesser General Public
      17   * License along with this library; if not, see <http://www.gnu.org/licenses/>.
      18   */
      19  
      20  #include "config.h"
      21  
      22  #include <string.h>
      23  
      24  #include "gpattern.h"
      25  
      26  #include "gmacros.h"
      27  #include "gmem.h"
      28  #include "gmessages.h"
      29  #include "gstrfuncs.h"
      30  #include "gunicode.h"
      31  #include "gutils.h"
      32  
      33  /**
      34   * GPatternSpec:
      35   *
      36   * A `GPatternSpec` struct is the 'compiled' form of a glob-style pattern.
      37   *
      38   * The [func@GLib.pattern_match_simple] and [method@GLib.PatternSpec.match] functions
      39   * match a string against a pattern containing '*' and '?' wildcards with similar
      40   * semantics as the standard `glob()` function: '*' matches an arbitrary,
      41   * possibly empty, string, '?' matches an arbitrary character.
      42   *
      43   * Note that in contrast to `glob()`, the '/' character can be matched by
      44   * the wildcards, there are no '[...]' character ranges and '*' and '?'
      45   * can not be escaped to include them literally in a pattern.
      46   *
      47   * When multiple strings must be matched against the same pattern, it is better
      48   * to compile the pattern to a [struct@GLib.PatternSpec] using
      49   * [ctor@GLib.PatternSpec.new] and use [method@GLib.PatternSpec.match_string]
      50   * instead of [func@GLib.pattern_match_simple]. This avoids the overhead of repeated
      51   * pattern compilation.
      52   */
      53  
      54  /* keep enum and structure of gpattern.c and patterntest.c in sync */
      55  typedef enum
      56  {
      57    G_MATCH_ALL,       /* "*A?A*" */
      58    G_MATCH_ALL_TAIL,  /* "*A?AA" */
      59    G_MATCH_HEAD,      /* "AAAA*" */
      60    G_MATCH_TAIL,      /* "*AAAA" */
      61    G_MATCH_EXACT,     /* "AAAAA" */
      62    G_MATCH_LAST
      63  } GMatchType;
      64  
      65  struct _GPatternSpec
      66  {
      67    GMatchType match_type;
      68    guint      pattern_length;
      69    guint      min_length;
      70    guint      max_length;
      71    gchar     *pattern;
      72  };
      73  
      74  
      75  /* --- functions --- */
      76  static inline gboolean
      77  g_pattern_ph_match (const gchar *match_pattern,
      78  		    const gchar *match_string,
      79  		    gboolean    *wildcard_reached_p)
      80  {
      81    const gchar *pattern, *string;
      82    gchar ch;
      83  
      84    pattern = match_pattern;
      85    string = match_string;
      86  
      87    ch = *pattern;
      88    pattern++;
      89    while (ch)
      90      {
      91        switch (ch)
      92  	{
      93  	case '?':
      94  	  if (!*string)
      95  	    return FALSE;
      96  	  string = g_utf8_next_char (string);
      97  	  break;
      98  
      99  	case '*':
     100  	  *wildcard_reached_p = TRUE;
     101  	  do
     102  	    {
     103  	      ch = *pattern;
     104  	      pattern++;
     105  	      if (ch == '?')
     106  		{
     107  		  if (!*string)
     108  		    return FALSE;
     109  		  string = g_utf8_next_char (string);
     110  		}
     111  	    }
     112  	  while (ch == '*' || ch == '?');
     113  	  if (!ch)
     114  	    return TRUE;
     115  	  do
     116  	    {
     117                gboolean next_wildcard_reached = FALSE;
     118  	      while (ch != *string)
     119  		{
     120  		  if (!*string)
     121  		    return FALSE;
     122  		  string = g_utf8_next_char (string);
     123  		}
     124  	      string++;
     125  	      if (g_pattern_ph_match (pattern, string, &next_wildcard_reached))
     126  		return TRUE;
     127                if (next_wildcard_reached)
     128                  /* the forthcoming pattern substring up to the next wildcard has
     129                   * been matched, but a mismatch occurred for the rest of the
     130                   * pattern, following the next wildcard.
     131                   * there's no need to advance the current match position any
     132                   * further if the rest pattern will not match.
     133                   */
     134  		return FALSE;
     135  	    }
     136  	  while (*string);
     137  	  break;
     138  
     139  	default:
     140  	  if (ch == *string)
     141  	    string++;
     142  	  else
     143  	    return FALSE;
     144  	  break;
     145  	}
     146  
     147        ch = *pattern;
     148        pattern++;
     149      }
     150  
     151    return *string == 0;
     152  }
     153  
     154  /**
     155   * g_pattern_spec_match:
     156   * @pspec: a #GPatternSpec
     157   * @string_length: the length of @string (in bytes, i.e. strlen(),
     158   *     not g_utf8_strlen())
     159   * @string: the UTF-8 encoded string to match
     160   * @string_reversed: (nullable): the reverse of @string or %NULL
     161   *
     162   * Matches a string against a compiled pattern. Passing the correct
     163   * length of the string given is mandatory. The reversed string can be
     164   * omitted by passing %NULL, this is more efficient if the reversed
     165   * version of the string to be matched is not at hand, as
     166   * g_pattern_match() will only construct it if the compiled pattern
     167   * requires reverse matches.
     168   *
     169   * Note that, if the user code will (possibly) match a string against a
     170   * multitude of patterns containing wildcards, chances are high that
     171   * some patterns will require a reversed string. In this case, it's
     172   * more efficient to provide the reversed string to avoid multiple
     173   * constructions thereof in the various calls to g_pattern_match().
     174   *
     175   * Note also that the reverse of a UTF-8 encoded string can in general
     176   * not be obtained by g_strreverse(). This works only if the string
     177   * does not contain any multibyte characters. GLib offers the
     178   * g_utf8_strreverse() function to reverse UTF-8 encoded strings.
     179   *
     180   * Returns: %TRUE if @string matches @pspec
     181   *
     182   * Since: 2.70
     183   **/
     184  gboolean
     185  g_pattern_spec_match (GPatternSpec *pspec,
     186                        gsize string_length,
     187                        const gchar *string,
     188                        const gchar *string_reversed)
     189  {
     190    g_return_val_if_fail (pspec != NULL, FALSE);
     191    g_return_val_if_fail (string != NULL, FALSE);
     192  
     193    if (string_length < pspec->min_length ||
     194        string_length > pspec->max_length)
     195      return FALSE;
     196  
     197    switch (pspec->match_type)
     198      {
     199        gboolean dummy;
     200      case G_MATCH_ALL:
     201        return g_pattern_ph_match (pspec->pattern, string, &dummy);
     202      case G_MATCH_ALL_TAIL:
     203        if (string_reversed)
     204  	return g_pattern_ph_match (pspec->pattern, string_reversed, &dummy);
     205        else
     206  	{
     207            gboolean result;
     208            gchar *tmp;
     209  	  tmp = g_utf8_strreverse (string, string_length);
     210  	  result = g_pattern_ph_match (pspec->pattern, tmp, &dummy);
     211  	  g_free (tmp);
     212  	  return result;
     213  	}
     214      case G_MATCH_HEAD:
     215        if (pspec->pattern_length == string_length)
     216  	return strcmp (pspec->pattern, string) == 0;
     217        else if (pspec->pattern_length)
     218  	return strncmp (pspec->pattern, string, pspec->pattern_length) == 0;
     219        else
     220  	return TRUE;
     221      case G_MATCH_TAIL:
     222        if (pspec->pattern_length)
     223          return strcmp (pspec->pattern, string + (string_length - pspec->pattern_length)) == 0;
     224        else
     225  	return TRUE;
     226      case G_MATCH_EXACT:
     227        if (pspec->pattern_length != string_length)
     228          return FALSE;
     229        else
     230          return strcmp (pspec->pattern, string) == 0;
     231      default:
     232        g_return_val_if_fail (pspec->match_type < G_MATCH_LAST, FALSE);
     233        return FALSE;
     234      }
     235  }
     236  
     237  /**
     238   * g_pattern_match: (skip)
     239   * @pspec: a #GPatternSpec
     240   * @string_length: the length of @string (in bytes, i.e. strlen(),
     241   *     not g_utf8_strlen())
     242   * @string: the UTF-8 encoded string to match
     243   * @string_reversed: (nullable): the reverse of @string or %NULL
     244   *
     245   * Matches a string against a compiled pattern. Passing the correct
     246   * length of the string given is mandatory. The reversed string can be
     247   * omitted by passing %NULL, this is more efficient if the reversed
     248   * version of the string to be matched is not at hand, as
     249   * g_pattern_match() will only construct it if the compiled pattern
     250   * requires reverse matches.
     251   *
     252   * Note that, if the user code will (possibly) match a string against a
     253   * multitude of patterns containing wildcards, chances are high that
     254   * some patterns will require a reversed string. In this case, it's
     255   * more efficient to provide the reversed string to avoid multiple
     256   * constructions thereof in the various calls to g_pattern_match().
     257   *
     258   * Note also that the reverse of a UTF-8 encoded string can in general
     259   * not be obtained by g_strreverse(). This works only if the string
     260   * does not contain any multibyte characters. GLib offers the
     261   * g_utf8_strreverse() function to reverse UTF-8 encoded strings.
     262   *
     263   * Returns: %TRUE if @string matches @pspec
     264   * Deprecated: 2.70: Use g_pattern_spec_match() instead
     265   **/
     266  gboolean
     267  g_pattern_match (GPatternSpec *pspec,
     268                   guint string_length,
     269                   const gchar *string,
     270                   const gchar *string_reversed)
     271  {
     272    return g_pattern_spec_match (pspec, string_length, string, string_reversed);
     273  }
     274  
     275  /**
     276   * g_pattern_spec_new:
     277   * @pattern: a zero-terminated UTF-8 encoded string
     278   *
     279   * Compiles a pattern to a #GPatternSpec.
     280   *
     281   * Returns: a newly-allocated #GPatternSpec
     282   **/
     283  GPatternSpec*
     284  g_pattern_spec_new (const gchar *pattern)
     285  {
     286    GPatternSpec *pspec;
     287    gboolean seen_joker = FALSE, seen_wildcard = FALSE, more_wildcards = FALSE;
     288    gint hw_pos = -1, tw_pos = -1, hj_pos = -1, tj_pos = -1;
     289    gboolean follows_wildcard = FALSE;
     290    guint pending_jokers = 0;
     291    const gchar *s;
     292    gchar *d;
     293    guint i;
     294    
     295    g_return_val_if_fail (pattern != NULL, NULL);
     296  
     297    /* canonicalize pattern and collect necessary stats */
     298    pspec = g_new (GPatternSpec, 1);
     299    pspec->pattern_length = strlen (pattern);
     300    pspec->min_length = 0;
     301    pspec->max_length = 0;
     302    pspec->pattern = g_new (gchar, pspec->pattern_length + 1);
     303    d = pspec->pattern;
     304    for (i = 0, s = pattern; *s != 0; s++)
     305      {
     306        switch (*s)
     307  	{
     308  	case '*':
     309  	  if (follows_wildcard)	/* compress multiple wildcards */
     310  	    {
     311  	      pspec->pattern_length--;
     312  	      continue;
     313  	    }
     314  	  follows_wildcard = TRUE;
     315  	  if (hw_pos < 0)
     316  	    hw_pos = i;
     317  	  tw_pos = i;
     318  	  break;
     319  	case '?':
     320  	  pending_jokers++;
     321  	  pspec->min_length++;
     322  	  pspec->max_length += 4; /* maximum UTF-8 character length */
     323  	  continue;
     324  	default:
     325  	  for (; pending_jokers; pending_jokers--, i++) {
     326  	    *d++ = '?';
     327    	    if (hj_pos < 0)
     328  	     hj_pos = i;
     329  	    tj_pos = i;
     330  	  }
     331  	  follows_wildcard = FALSE;
     332  	  pspec->min_length++;
     333  	  pspec->max_length++;
     334  	  break;
     335  	}
     336        *d++ = *s;
     337        i++;
     338      }
     339    for (; pending_jokers; pending_jokers--) {
     340      *d++ = '?';
     341      if (hj_pos < 0)
     342        hj_pos = i;
     343      tj_pos = i;
     344    }
     345    *d++ = 0;
     346    seen_joker = hj_pos >= 0;
     347    seen_wildcard = hw_pos >= 0;
     348    more_wildcards = seen_wildcard && hw_pos != tw_pos;
     349    if (seen_wildcard)
     350      pspec->max_length = G_MAXUINT;
     351  
     352    /* special case sole head/tail wildcard or exact matches */
     353    if (!seen_joker && !more_wildcards)
     354      {
     355        if (pspec->pattern[0] == '*')
     356  	{
     357  	  pspec->match_type = G_MATCH_TAIL;
     358            memmove (pspec->pattern, pspec->pattern + 1, --pspec->pattern_length);
     359  	  pspec->pattern[pspec->pattern_length] = 0;
     360  	  return pspec;
     361  	}
     362        if (pspec->pattern_length > 0 &&
     363  	  pspec->pattern[pspec->pattern_length - 1] == '*')
     364  	{
     365  	  pspec->match_type = G_MATCH_HEAD;
     366  	  pspec->pattern[--pspec->pattern_length] = 0;
     367  	  return pspec;
     368  	}
     369        if (!seen_wildcard)
     370  	{
     371  	  pspec->match_type = G_MATCH_EXACT;
     372  	  return pspec;
     373  	}
     374      }
     375  
     376    /* now just need to distinguish between head or tail match start */
     377    tw_pos = pspec->pattern_length - 1 - tw_pos;	/* last pos to tail distance */
     378    tj_pos = pspec->pattern_length - 1 - tj_pos;	/* last pos to tail distance */
     379    if (seen_wildcard)
     380      pspec->match_type = tw_pos > hw_pos ? G_MATCH_ALL_TAIL : G_MATCH_ALL;
     381    else /* seen_joker */
     382      pspec->match_type = tj_pos > hj_pos ? G_MATCH_ALL_TAIL : G_MATCH_ALL;
     383    if (pspec->match_type == G_MATCH_ALL_TAIL) {
     384      gchar *tmp = pspec->pattern;
     385      pspec->pattern = g_utf8_strreverse (pspec->pattern, pspec->pattern_length);
     386      g_free (tmp);
     387    }
     388    return pspec;
     389  }
     390  
     391  /**
     392   * g_pattern_spec_copy:
     393   * @pspec: a #GPatternSpec
     394   *
     395   * Copies @pspec in a new #GPatternSpec.
     396   *
     397   * Returns: (transfer full): a copy of @pspec.
     398   *
     399   * Since: 2.70
     400   **/
     401  GPatternSpec *
     402  g_pattern_spec_copy (GPatternSpec *pspec)
     403  {
     404    GPatternSpec *pspec_copy;
     405  
     406    g_return_val_if_fail (pspec != NULL, NULL);
     407  
     408    pspec_copy = g_new (GPatternSpec, 1);
     409    *pspec_copy = *pspec;
     410    pspec_copy->pattern = g_strndup (pspec->pattern, pspec->pattern_length);
     411  
     412    return pspec_copy;
     413  }
     414  
     415  /**
     416   * g_pattern_spec_free:
     417   * @pspec: a #GPatternSpec
     418   *
     419   * Frees the memory allocated for the #GPatternSpec.
     420   **/
     421  void
     422  g_pattern_spec_free (GPatternSpec *pspec)
     423  {
     424    g_return_if_fail (pspec != NULL);
     425  
     426    g_free (pspec->pattern);
     427    g_free (pspec);
     428  }
     429  
     430  /**
     431   * g_pattern_spec_equal:
     432   * @pspec1: a #GPatternSpec
     433   * @pspec2: another #GPatternSpec
     434   *
     435   * Compares two compiled pattern specs and returns whether they will
     436   * match the same set of strings.
     437   *
     438   * Returns: Whether the compiled patterns are equal
     439   **/
     440  gboolean
     441  g_pattern_spec_equal (GPatternSpec *pspec1,
     442  		      GPatternSpec *pspec2)
     443  {
     444    g_return_val_if_fail (pspec1 != NULL, FALSE);
     445    g_return_val_if_fail (pspec2 != NULL, FALSE);
     446  
     447    return (pspec1->pattern_length == pspec2->pattern_length &&
     448  	  pspec1->match_type == pspec2->match_type &&
     449  	  strcmp (pspec1->pattern, pspec2->pattern) == 0);
     450  }
     451  
     452  /**
     453   * g_pattern_spec_match_string:
     454   * @pspec: a #GPatternSpec
     455   * @string: the UTF-8 encoded string to match
     456   *
     457   * Matches a string against a compiled pattern. If the string is to be
     458   * matched against more than one pattern, consider using
     459   * g_pattern_match() instead while supplying the reversed string.
     460   *
     461   * Returns: %TRUE if @string matches @pspec
     462   *
     463   * Since: 2.70
     464   **/
     465  gboolean
     466  g_pattern_spec_match_string (GPatternSpec *pspec,
     467                               const gchar *string)
     468  {
     469    g_return_val_if_fail (pspec != NULL, FALSE);
     470    g_return_val_if_fail (string != NULL, FALSE);
     471  
     472    return g_pattern_spec_match (pspec, strlen (string), string, NULL);
     473  }
     474  
     475  /**
     476   * g_pattern_match_string: (skip)
     477   * @pspec: a #GPatternSpec
     478   * @string: the UTF-8 encoded string to match
     479   *
     480   * Matches a string against a compiled pattern. If the string is to be
     481   * matched against more than one pattern, consider using
     482   * g_pattern_match() instead while supplying the reversed string.
     483   *
     484   * Returns: %TRUE if @string matches @pspec
     485   * Deprecated: 2.70: Use g_pattern_spec_match_string() instead
     486   **/
     487  gboolean
     488  g_pattern_match_string (GPatternSpec *pspec,
     489                          const gchar *string)
     490  {
     491    return g_pattern_spec_match_string (pspec, string);
     492  }
     493  
     494  /**
     495   * g_pattern_match_simple:
     496   * @pattern: the UTF-8 encoded pattern
     497   * @string: the UTF-8 encoded string to match
     498   *
     499   * Matches a string against a pattern given as a string. If this
     500   * function is to be called in a loop, it's more efficient to compile
     501   * the pattern once with g_pattern_spec_new() and call
     502   * g_pattern_match_string() repeatedly.
     503   *
     504   * Returns: %TRUE if @string matches @pspec
     505   **/
     506  gboolean
     507  g_pattern_match_simple (const gchar *pattern,
     508  			const gchar *string)
     509  {
     510    GPatternSpec *pspec;
     511    gboolean ergo;
     512  
     513    g_return_val_if_fail (pattern != NULL, FALSE);
     514    g_return_val_if_fail (string != NULL, FALSE);
     515  
     516    pspec = g_pattern_spec_new (pattern);
     517    ergo = g_pattern_spec_match (pspec, strlen (string), string, NULL);
     518    g_pattern_spec_free (pspec);
     519  
     520    return ergo;
     521  }