(root)/
findutils-4.9.0/
lib/
regexprops.c
       1  /* regexprops.c -- document the properties of the regular expressions
       2     understood by gnulib.
       3  
       4     Copyright (C) 2005-2022 Free Software Foundation, Inc.
       5  
       6     This program is free software: you can redistribute it and/or modify
       7     it under the terms of the GNU General Public License as published by
       8     the Free Software Foundation, either version 3 of the License, or
       9     (at your option) any later version.
      10  
      11     This program is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14     GNU General Public License for more details.
      15  
      16     You should have received a copy of the GNU General Public License
      17     along with this program.  If not, see <https://www.gnu.org/licenses/>.
      18  */
      19  
      20  
      21  /*
      22    The output of this program is included in the GNU findutils source
      23    distribution.  The copying conditions for that file are generated
      24    by the copying() function below.
      25  */
      26  
      27  /* Written by James Youngman, <jay@gnu.org>. */
      28  
      29  /* config.h must be included first. */
      30  #include <config.h>
      31  
      32  /* system headers */
      33  #include <errno.h>
      34  #include <regex.h>
      35  #include <stdio.h>
      36  #include <string.h>
      37  #include <unistd.h>
      38  
      39  /* gnulib headers */
      40  #include "progname.h"
      41  
      42  /* find headers */
      43  #include "regextype.h"
      44  
      45  static void
      46  output (const char *s, int escape)
      47  {
      48    (void) escape;
      49  
      50    fputs (s, stdout);
      51  }
      52  
      53  
      54  static void
      55  newline (void)
      56  {
      57    output ("\n", 0);
      58  }
      59  
      60  static void
      61  content (const char *s)
      62  {
      63    output (s, 1);
      64  }
      65  
      66  static void
      67  literal (const char *s)
      68  {
      69    output (s, 0);
      70  }
      71  
      72  static void
      73  directive (const char *s)
      74  {
      75    output (s, 0);
      76  }
      77  
      78  static void
      79  comment (const char *s)
      80  {
      81    directive ("@c");
      82    if (s[0])
      83      {
      84        literal (" ");
      85        literal (s);
      86      }
      87    newline ();
      88  }
      89  
      90  static void
      91  enum_item (const char *s)
      92  {
      93    newline ();
      94    directive ("@item ");
      95    literal (s);
      96    newline ();
      97  }
      98  
      99  static void
     100  begin_subsection (const char *name,
     101  		  const char *next,
     102  		  const char *prev,
     103  		  const char *up)
     104  {
     105    (void) next;
     106    (void) prev;
     107    (void) up;
     108  
     109    newline ();
     110  
     111    directive ("@node ");
     112    content (name);
     113    content (" regular expression syntax");
     114    newline ();
     115  
     116    directive ("@subsection ");
     117    output ("@samp{", 0);
     118    content (name);
     119    output ("}", 0);
     120    content (" regular expression syntax");
     121    newline ();
     122  }
     123  
     124  static void
     125  begintable_markup (char const *markup)
     126  {
     127    newline ();
     128    directive ("@table ");
     129    literal (markup);
     130    newline ();
     131  }
     132  
     133  static void
     134  endtable (void)
     135  {
     136    newline ();
     137    directive ("@end table");
     138    newline ();
     139  }
     140  
     141  static void
     142  beginenum (void)
     143  {
     144    newline ();
     145    directive ("@enumerate");
     146    newline ();
     147  }
     148  
     149  static void
     150  endenum (void)
     151  {
     152    newline ();
     153    directive ("@end enumerate");
     154    newline ();
     155  }
     156  
     157  static void
     158  newpara (void)
     159  {
     160    content ("\n\n");
     161  }
     162  
     163  
     164  static void
     165  describe_regex_syntax (int options)
     166  {
     167    newpara ();
     168    content ("The character @samp{.} matches any single character");
     169    if ( (options & RE_DOT_NEWLINE)  == 0 )
     170      {
     171        content (" except newline");
     172      }
     173    if (options & RE_DOT_NOT_NULL)
     174      {
     175        if ( (options & RE_DOT_NEWLINE)  == 0 )
     176  	content (" and");
     177        else
     178  	content (" except");
     179  
     180        content (" the null character");
     181      }
     182    content (".");
     183    newpara ();
     184  
     185    if (!(options & RE_LIMITED_OPS))
     186      {
     187        begintable_markup ("@samp");
     188        if (options & RE_BK_PLUS_QM)
     189  	{
     190  	  enum_item ("\\+");
     191  	  content ("indicates that the regular expression should match one"
     192  		   " or more occurrences of the previous atom or regexp.");
     193  	  enum_item ("\\?");
     194  	  content ("indicates that the regular expression should match zero"
     195  		   " or one occurrence of the previous atom or regexp.");
     196  	  enum_item ("+ and ?");
     197  	  content ("match themselves.\n");
     198  	}
     199        else
     200  	{
     201  	  enum_item ("+");
     202  	  content ("indicates that the regular expression should match one"
     203  		   " or more occurrences of the previous atom or regexp.");
     204  	  enum_item ("?");
     205  	  content ("indicates that the regular expression should match zero"
     206  		   " or one occurrence of the previous atom or regexp.");
     207  	  enum_item ("\\+");
     208  	  literal ("matches a @samp{+}");
     209  	  enum_item ("\\?");
     210  	  literal ("matches a @samp{?}.");
     211  	}
     212        endtable ();
     213      }
     214  
     215    newpara ();
     216  
     217    content ("Bracket expressions are used to match ranges of characters.  ");
     218    literal ("Bracket expressions where the range is backward, for example @samp{[z-a]}, are ");
     219    if (options & RE_NO_EMPTY_RANGES)
     220      content ("invalid");
     221    else
     222      content ("ignored");
     223    content (".  ");
     224  
     225    if (options &  RE_BACKSLASH_ESCAPE_IN_LISTS)
     226      literal ("Within square brackets, @samp{\\} can be used to quote "
     227  	     "the following character.  ");
     228    else
     229      literal ("Within square brackets, @samp{\\} is taken literally.  ");
     230  
     231    if (options & RE_CHAR_CLASSES)
     232      content ("Character classes are supported; for example "
     233  	     "@samp{[[:digit:]]} will match a single decimal digit.\n");
     234    else
     235      literal ("Character classes are not supported, so for example "
     236  	     "you would need to use @samp{[0-9]} "
     237  	     "instead of @samp{[[:digit:]]}.\n");
     238  
     239    if (options & RE_HAT_LISTS_NOT_NEWLINE)
     240      {
     241        literal ("Non-matching lists @samp{[^@dots{}]} do not ever match newline.\n");
     242      }
     243    newpara ();
     244    if (options & RE_NO_GNU_OPS)
     245      {
     246        content ("GNU extensions are not supported and so "
     247  	       "@samp{\\w}, @samp{\\W}, @samp{\\<}, @samp{\\>}, @samp{\\b}, @samp{\\B}, @samp{\\`}, and @samp{\\'} "
     248  	       "match "
     249  	       "@samp{w}, @samp{W}, @samp{<}, @samp{>}, @samp{b}, @samp{B}, @samp{`}, and @samp{'} respectively.\n");
     250      }
     251    else
     252      {
     253        content ("GNU extensions are supported:");
     254        beginenum ();
     255        enum_item ("@samp{\\w} matches a character within a word");
     256        enum_item ("@samp{\\W} matches a character which is not within a word");
     257        enum_item ("@samp{\\<} matches the beginning of a word");
     258        enum_item ("@samp{\\>} matches the end of a word");
     259        enum_item ("@samp{\\b} matches a word boundary");
     260        enum_item ("@samp{\\B} matches characters which are not a word boundary");
     261        enum_item ("@samp{\\`} matches the beginning of the whole input");
     262        enum_item ("@samp{\\'} matches the end of the whole input");
     263        endenum ();
     264      }
     265  
     266    newpara ();
     267  
     268  
     269    if (options & RE_NO_BK_PARENS)
     270      {
     271        literal ("Grouping is performed with parentheses @samp{()}.  ");
     272  
     273        if (options & RE_UNMATCHED_RIGHT_PAREN_ORD)
     274  	literal ("An unmatched @samp{)} matches just itself.  ");
     275      }
     276    else
     277      {
     278        literal ("Grouping is performed with backslashes followed by parentheses @samp{\\(}, @samp{\\)}.  ");
     279      }
     280  
     281    if (options & RE_NO_BK_REFS)
     282      {
     283        content ("A backslash followed by a digit matches that digit.");
     284      }
     285    else
     286      {
     287        literal ("A backslash followed by a digit acts as a back-reference and matches the same thing as the previous grouped expression indicated by that number.  For example @samp{\\2} matches the second group expression.  The order of group expressions is determined by the position of their opening parenthesis ");
     288        if (options & RE_NO_BK_PARENS)
     289  	literal ("@samp{(}");
     290        else
     291  	literal ("@samp{\\(}");
     292        content (".");
     293      }
     294  
     295  
     296    newpara ();
     297    if (!(options & RE_LIMITED_OPS))
     298      {
     299        if (options & RE_NO_BK_VBAR)
     300  	literal ("The alternation operator is @samp{|}.");
     301        else
     302  	literal ("The alternation operator is @samp{\\|}.");
     303      }
     304    newpara ();
     305  
     306    if (options & RE_CONTEXT_INDEP_ANCHORS)
     307      {
     308        literal ("The characters @samp{^} and @samp{$} always represent the beginning and end of a string respectively, except within square brackets.  Within brackets, @samp{^} can be used to invert the membership of the character class being specified.\n");
     309      }
     310    else
     311      {
     312        literal ("The character @samp{^} only represents the beginning of a string when it appears:");
     313        beginenum ();
     314        enum_item ("At the beginning of a regular expression");
     315        if (options & RE_NO_BK_PARENS)
     316  	{
     317  	  enum_item ("After an open-group, signified by @samp{(}");
     318  	}
     319        else
     320  	{
     321  	  enum_item ("After an open-group, signified by @samp{\\(}");
     322  	}
     323        newline ();
     324        if (!(options & RE_LIMITED_OPS))
     325  	{
     326  	  if (options & RE_NEWLINE_ALT)
     327  	    enum_item ("After a newline");
     328  
     329  	  if (options & RE_NO_BK_VBAR )
     330  	    enum_item ("After the alternation operator @samp{|}");
     331  	  else
     332  	    enum_item ("After the alternation operator @samp{\\|}");
     333  	}
     334        endenum ();
     335  
     336        newpara ();
     337        literal ("The character @samp{$} only represents the end of a string when it appears:");
     338        beginenum ();
     339        enum_item ("At the end of a regular expression");
     340        if (options & RE_NO_BK_PARENS)
     341  	{
     342  	  enum_item ("Before a close-group, signified by @samp{)}");
     343  	}
     344        else
     345  	{
     346  	  enum_item ("Before a close-group, signified by @samp{\\)}");
     347  	}
     348        if (!(options & RE_LIMITED_OPS))
     349  	{
     350  	  if (options & RE_NEWLINE_ALT)
     351  	    enum_item ("Before a newline");
     352  
     353  	  if (options & RE_NO_BK_VBAR)
     354  	    enum_item ("Before the alternation operator @samp{|}");
     355  	  else
     356  	    enum_item ("Before the alternation operator @samp{\\|}");
     357  	}
     358        endenum ();
     359      }
     360    newpara ();
     361    if (!(options & RE_LIMITED_OPS) )
     362      {
     363        if ((options & RE_CONTEXT_INDEP_OPS)
     364  	  && !(options & RE_CONTEXT_INVALID_OPS))
     365  	{
     366  	  literal ("The characters @samp{*}, @samp{+} and @samp{?} are special anywhere in a regular expression.\n");
     367  	}
     368        else
     369  	{
     370  	  if (options & RE_BK_PLUS_QM)
     371  	    literal ("@samp{\\*}, @samp{\\+} and @samp{\\?} ");
     372  	  else
     373  	    literal ("@samp{*}, @samp{+} and @samp{?} ");
     374  
     375  	  if (options & RE_CONTEXT_INVALID_OPS)
     376  	    {
     377  	      content ("are special at any point in a regular expression except the following places, where they are not allowed:");
     378  	    }
     379  	  else
     380  	    {
     381  	      content ("are special at any point in a regular expression except:");
     382  	    }
     383  
     384  	  beginenum ();
     385  	  enum_item ("At the beginning of a regular expression");
     386  	  if (options & RE_NO_BK_PARENS)
     387  	    {
     388  	      enum_item ("After an open-group, signified by @samp{(}");
     389  	    }
     390  	  else
     391  	    {
     392  	      enum_item ("After an open-group, signified by @samp{\\(}");
     393  	    }
     394  	  if (!(options & RE_LIMITED_OPS))
     395  	    {
     396  	      if (options & RE_NEWLINE_ALT)
     397  		enum_item ("After a newline");
     398  
     399  	      if (options & RE_NO_BK_VBAR)
     400  		enum_item ("After the alternation operator @samp{|}");
     401  	      else
     402  		enum_item ("After the alternation operator @samp{\\|}");
     403  	    }
     404  	  endenum ();
     405  	}
     406      }
     407  
     408  
     409    newpara ();
     410    if (options & RE_INTERVALS)
     411      {
     412        if (options & RE_NO_BK_BRACES)
     413  	{
     414  	  literal ("Intervals are specified by @samp{@{} and @samp{@}}.\n");
     415  	  if (options & RE_INVALID_INTERVAL_ORD)
     416  	    {
     417  	      literal ("Invalid intervals are treated as literals, for example @samp{a@{1} is treated as @samp{a\\@{1}");
     418  	    }
     419  	  else
     420  	    {
     421  	      literal ("Invalid intervals such as @samp{a@{1z} are not accepted.\n");
     422  	    }
     423  	}
     424        else
     425  	{
     426  	  literal ("Intervals are specified by @samp{\\@{} and @samp{\\@}}.\n");
     427  	  if (options & RE_INVALID_INTERVAL_ORD)
     428  	    {
     429  	      literal ("Invalid intervals are treated as literals, for example @samp{a\\@{1} is treated as @samp{a@{1}");
     430  	    }
     431  	  else
     432  	    {
     433  	      literal ("Invalid intervals such as @samp{a\\@{1z} are not accepted.\n");
     434  	    }
     435  	}
     436      }
     437  
     438    newpara ();
     439    if (options & RE_NO_POSIX_BACKTRACKING)
     440      {
     441        content ("Matching succeeds as soon as the whole pattern is matched, meaning that the result may not be the longest possible match.");
     442      }
     443    else
     444      {
     445        content ("The longest possible match is returned; this applies to the regular expression as a whole and (subject to this constraint) to subexpressions within groups.");
     446      }
     447    newpara ();
     448  }
     449  
     450  
     451  static void
     452  copying (void)
     453  {
     454    static const char *copy_para[]=
     455      {
     456        /* The copyright year number range is with "--" in Texinfo files.  */
     457        "Copyright (C) 1994--2022 Free Software Foundation, Inc."
     458        ,""
     459        ,"Permission is granted to copy, distribute and/or modify this document"
     460        ,"under the terms of the GNU Free Documentation License, Version 1.3 or"
     461        ,"any later version published by the Free Software Foundation; with no"
     462        ,"Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts."
     463        ,"A copy of the license is included in the ``GNU Free"
     464        ,"Documentation License'' file as part of this distribution."
     465        ""
     466        ,NULL
     467      };
     468    const char **s = copy_para;
     469    while (*s)
     470      comment (*s++);
     471  }
     472  
     473  static int
     474  ignore (int ix, const unsigned int context)
     475  {
     476    return 0 == (get_regex_type_context (ix) & context);
     477  }
     478  
     479  static void
     480  menu (unsigned int context)
     481  {
     482    int i;
     483    const char *name;
     484  
     485    output ("@menu\n", 0);
     486    for (i=0;
     487         get_regex_type_flags (i),
     488  	 name=get_regex_type_name (i);
     489         ++i)
     490      {
     491        if (!ignore (i, context))
     492  	{
     493  	  output ("* ", 0);
     494  	  output (name, 0);
     495  	  content (" regular expression syntax");
     496  	  output ("::", 0);
     497  	  newline ();
     498  	}
     499      }
     500    output ("@end menu\n", 0);
     501  }
     502  
     503  
     504  
     505  static const char *
     506  get_next (unsigned int ix, unsigned int context)
     507  {
     508    const char *next;
     509    while (get_regex_type_name (ix))
     510      {
     511        if (!ignore (ix, context))
     512  	{
     513  	  next = get_regex_type_name (ix);
     514  	  if (NULL == next)
     515  	    return "";
     516  	  else
     517  	    return next;
     518  	}
     519        ++ix;
     520      }
     521    return "";
     522  }
     523  
     524  
     525  static void
     526  describe_all (const char *contextname,
     527  	      unsigned int context,
     528  	      const char *up)
     529  {
     530    const char *name, *next, *previous;
     531    int regopts;
     532    int i, parent;
     533  
     534    copying ();
     535    newline ();
     536    literal ("@c this regular expression description is for: ");
     537    literal (contextname);
     538    newline ();
     539    newline ();
     540    menu (context);
     541  
     542    previous = "";
     543  
     544    for (i=0;
     545         regopts = get_regex_type_flags (i),
     546  	 name=get_regex_type_name (i);
     547         ++i)
     548      {
     549        if (ignore (i, context))
     550  	{
     551  	  fprintf (stderr,
     552  		   "Skipping regexp type %s for context %s\n",
     553  		   name, contextname);
     554  	  name = previous;
     555  	  continue;
     556  	}
     557  
     558        next = get_next (i+1, context);
     559        if (NULL == next)
     560  	next = "";
     561        begin_subsection (name, next, previous, up);
     562        parent = get_regex_type_synonym (i, context);
     563        if (parent >= 0)
     564  	{
     565  	  content ("This is a synonym for ");
     566  	  content (get_regex_type_name (parent));
     567  	  content (".");
     568  	}
     569        else
     570  	{
     571  	  describe_regex_syntax (regopts);
     572  	}
     573        previous = name;
     574      }
     575  }
     576  
     577  
     578  
     579  int
     580  main (int argc, char *argv[])
     581  {
     582    const char *up = "";
     583    unsigned int context = CONTEXT_ALL;
     584    const char *contextname = "all";
     585  
     586    if (argc)
     587      set_program_name (argv[0]);
     588    else
     589      set_program_name ("regexprops");
     590  
     591    if (argc > 1)
     592      {
     593        up = argv[1];
     594      }
     595    if (argc > 2)
     596      {
     597        contextname = argv[2];
     598        if (0 == strcmp (contextname, "findutils"))
     599  	context = CONTEXT_FINDUTILS;
     600        else if (0 == strcmp (contextname, "generic"))
     601  	context = CONTEXT_GENERIC;
     602        else if (0 == strcmp (contextname, "all"))
     603  	context = CONTEXT_ALL;
     604        else
     605  	{
     606  	  fprintf (stderr, "Unexpected context %s",
     607  		   contextname);
     608  	  return 1;
     609  	}
     610      }
     611  
     612    describe_all (contextname, context, up);
     613    return 0;
     614  }