(root)/
m4-1.4.19/
src/
input.c
       1  /* GNU m4 -- A simple macro processor
       2  
       3     Copyright (C) 1989-1994, 2004-2014, 2016-2017, 2020-2021 Free
       4     Software Foundation, Inc.
       5  
       6     This file is part of GNU M4.
       7  
       8     GNU M4 is free software: you can redistribute it and/or modify
       9     it under the terms of the GNU General Public License as published by
      10     the Free Software Foundation, either version 3 of the License, or
      11     (at your option) any later version.
      12  
      13     GNU M4 is distributed in the hope that it will be useful,
      14     but WITHOUT ANY WARRANTY; without even the implied warranty of
      15     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      16     GNU General Public License for more details.
      17  
      18     You should have received a copy of the GNU General Public License
      19     along with this program.  If not, see <https://www.gnu.org/licenses/>.
      20  */
      21  
      22  /* Handling of different input sources, and lexical analysis.  */
      23  
      24  #include "m4.h"
      25  
      26  #include "memchr2.h"
      27  
      28  /* Unread input can be either files, that should be read (eg. included
      29     files), strings, which should be rescanned (eg. macro expansion text),
      30     or quoted macro definitions (as returned by the builtin "defn").
      31     Unread input is organised in a stack, implemented with an obstack.
      32     Each input source is described by a "struct input_block".  The obstack
      33     is "current_input".  The top of the input stack is "isp".
      34  
      35     The macro "m4wrap" places the text to be saved on another input
      36     stack, on the obstack "wrapup_stack", whose top is "wsp".  When EOF
      37     is seen on normal input (eg, when "current_input" is empty), input is
      38     switched over to "wrapup_stack", and the original "current_input" is
      39     freed.  A new stack is allocated for "wrapup_stack", which will
      40     accept any text produced by calls to "m4wrap" from within the
      41     wrapped text.  This process of shuffling "wrapup_stack" to
      42     "current_input" can continue indefinitely, even generating infinite
      43     loops (e.g. "define(`f',`m4wrap(`f')')f"), without memory leaks.
      44  
      45     Pushing new input on the input stack is done by push_file (),
      46     push_string (), push_wrapup () (for wrapup text), and push_macro ()
      47     (for macro definitions).  Because macro expansion needs direct access
      48     to the current input obstack (for optimisation), push_string () is
      49     split into two functions, push_string_init (), which returns a pointer
      50     to the current input stack, and push_string_finish (), which returns a
      51     pointer to the final text.  The input_block *next is used to manage
      52     the coordination between the different push routines.
      53  
      54     The current file and line number are stored in two global
      55     variables, for use by the error handling functions in m4.c.  Macro
      56     expansion wants to report the line where a macro name was detected,
      57     rather than where it finished collecting arguments.  This also
      58     applies to text resulting from macro expansions.  So each input
      59     block maintains its own notion of the current file and line, and
      60     swapping between input blocks updates the global variables
      61     accordingly.  */
      62  
      63  #ifdef ENABLE_CHANGEWORD
      64  #include "regex.h"
      65  #endif
      66  
      67  enum input_type
      68  {
            /* Kind of source an input_block reads from.  The value stored in
               the block's TYPE field tells which member of its union U is in
               use.  */
      69    INPUT_STRING,         /* String resulting from macro expansion.  */
      70    INPUT_FILE,           /* File from command line or include.  */
      71    INPUT_MACRO           /* Builtin resulting from defn.  */
      72  };
      73  
      74  typedef enum input_type input_type;
      75  
      76  struct input_block
      77  {
            /* One entry of an input stack.  Entries are chained through PREV
               with the most recently pushed entry first; TYPE selects which
               member of the union U is valid.  FILE and LINE record the
               location that is copied into current_file and current_line
               when this block becomes the one being read.  */
      78    struct input_block *prev;     /* previous input_block on the input stack */
      79    input_type type;              /* see enum values */
      80    const char *file;             /* file where this input is from */
      81    int line;                     /* line where this input is from */
      82    union
      83      {
      84        struct
      85          {
      86            char *string;         /* remaining string value */
      87            char *end;            /* terminating NUL of string */
      88          }
      89          u_s;    /* INPUT_STRING */
      90        struct
      91          {
      92            FILE *fp;                  /* input file handle */
      93            bool_bitfield end : 1;     /* true if peek has seen EOF */
      94            bool_bitfield close : 1;   /* true if we should close file on pop */
      95            bool_bitfield advance : 1; /* track previous start_of_input_line */
      96          }
      97          u_f;    /* INPUT_FILE */
      98        builtin_func *func;       /* pointer to macro's function */
      99      }
     100    u;
     101  };
     102  
     103  typedef struct input_block input_block;
     104  
     105  
     106  /* Current input file name.  */
     107  const char *current_file;
     108  
     109  /* Current input line number.  */
     110  int current_line;
     111  
     112  /* Obstack for storing individual tokens.  */
     113  static struct obstack token_stack;
     114  
     115  /* Obstack for storing file names.  */
     116  static struct obstack file_names;
     117  
     118  /* Wrapup input stack.  */
     119  static struct obstack *wrapup_stack;
     120  
     121  /* Current stack, from input or wrapup.  */
     122  static struct obstack *current_input;
     123  
     124  /* Bottom of token_stack, for obstack_free.  Set once by input_init ().  */
     125  static void *token_bottom;
     126  
     127  /* Pointer to top of current_input.  NULL when the stack is empty.  */
     128  static input_block *isp;
     129  
     130  /* Pointer to top of wrapup_stack.  NULL when no wrapup text is pending.  */
     131  static input_block *wsp;
     132  
     133  /* Aux. for handling split push_string ().  Non-NULL only between a
             call to push_string_init () and the matching
             push_string_finish (); push_file (), push_macro () and
             pop_input () reset it to NULL.  */
     134  static input_block *next;
     135  
     136  /* Flag for next_char () to increment current_line.  It is set after a
             newline has been read from a file, and acted upon when the
             following character is read.  */
     137  static bool start_of_input_line;
     138  
     139  /* Flag for next_char () to recognize change in input block.  Set by
             every push and pop; cleared once current_file and current_line
             have been refreshed.  */
     140  static bool input_change;
     141  
          /* Pseudo-characters returned by the character-level readers.  Both
             are above 255; presumably to_uchar () only yields unsigned char
             values, so they cannot collide with real input -- confirm against
             its definition.  */
     142  #define CHAR_EOF        256     /* character return on EOF */
     143  #define CHAR_MACRO      257     /* character return for MACRO token */
     144  
     145  /* Quote chars.  */
     146  STRING rquote;
     147  STRING lquote;
     148  
     149  /* Comment chars.  */
     150  STRING bcomm;
     151  STRING ecomm;
     152  
     153  #ifdef ENABLE_CHANGEWORD
     154  
     155  # define DEFAULT_WORD_REGEXP "[_a-zA-Z][_a-zA-Z0-9]*"
     156  
          /* State maintained by set_word_regexp (): the compiled pattern, a
             flag that is nonzero while the default word syntax is in effect,
             and the register set attached to the pattern.  */
     157  static struct re_pattern_buffer word_regexp;
     158  static int default_word_regexp;
     159  static struct re_registers regs;
     160  
     161  #else /* ! ENABLE_CHANGEWORD */
          /* Without changeword support the default word syntax always applies.  */
     162  # define default_word_regexp 1
     163  #endif /* ! ENABLE_CHANGEWORD */
     164  
     165  #ifdef DEBUG_INPUT
     166  static const char *token_type_string (token_type);
     167  #endif
     168  
          /* Declared early because push_string_init () calls it before its
             definition.  */
     169  static void pop_input (void);
     170  
     171  
     172  
     173  /*-------------------------------------------------------------------.
     174  | push_file () pushes an input file on the input stack, saving the   |
     175  | current file name and line number.  If next is non-NULL, this push |
     176  | invalidates a call to push_string_init (), whose storage is        |
     177  | consequently released.  If CLOSE_WHEN_DONE, then close FP after    |
     178  | EOF is detected.                                                   |
     179  `-------------------------------------------------------------------*/
     180  
     181  void
     182  push_file (FILE *fp, const char *title, bool close_when_done)
     183  {
     184    input_block *i;
     185  
     186    if (next != NULL)
     187      {
     188        obstack_free (current_input, next);
     189        next = NULL;
     190      }
     191  
     192    if (debug_level & DEBUG_TRACE_INPUT)
     193      DEBUG_MESSAGE1 ("input read from %s", title);
     194  
     195    i = (input_block *) obstack_alloc (current_input,
     196                                       sizeof (struct input_block));
     197    i->type = INPUT_FILE;
     198    i->file = (char *) obstack_copy0 (&file_names, title, strlen (title));
     199    i->line = 1;
     200    input_change = true;
     201  
     202    i->u.u_f.fp = fp;
     203    i->u.u_f.end = false;
     204    i->u.u_f.close = close_when_done;
     205    i->u.u_f.advance = start_of_input_line;
     206    output_current_line = -1;
     207  
     208    i->prev = isp;
     209    isp = i;
     210  }
     211  
     212  /*---------------------------------------------------------------.
     213  | push_macro () pushes a builtin macro's definition on the input |
     214  | stack.  If next is non-NULL, this push invalidates a call to   |
     215  | push_string_init (), whose storage is consequently released.   |
     216  `---------------------------------------------------------------*/
     217  
     218  void
     219  push_macro (builtin_func *func)
     220  {
     221    input_block *i;
     222  
     223    if (next != NULL)
     224      {
     225        obstack_free (current_input, next);
     226        next = NULL;
     227      }
     228  
     229    i = (input_block *) obstack_alloc (current_input,
     230                                       sizeof (struct input_block));
     231    i->type = INPUT_MACRO;
     232    i->file = current_file;
     233    i->line = current_line;
     234    input_change = true;
     235  
     236    i->u.func = func;
     237    i->prev = isp;
     238    isp = i;
     239  }
     240  
     241  /*------------------------------------------------------------------.
     242  | First half of push_string ().  The pointer next points to the new |
     243  | input_block.                                                      |
     244  `------------------------------------------------------------------*/
     245  
     246  struct obstack *
     247  push_string_init (void)
     248  {
     249    if (next != NULL)
     250      {
     251        M4ERROR ((warning_status, 0,
     252                  "INTERNAL ERROR: recursive push_string!"));
     253        abort ();
     254      }
     255  
     256    /* Prefer reusing an older block, for tail-call optimization.  */
     257    while (isp && isp->type == INPUT_STRING && !isp->u.u_s.string[0])
     258      pop_input ();
     259    next = (input_block *) obstack_alloc (current_input,
     260                                          sizeof (struct input_block));
     261    next->type = INPUT_STRING;
     262    next->file = current_file;
     263    next->line = current_line;
     264  
     265    return current_input;
     266  }
     267  
     268  /*-------------------------------------------------------------------.
     269  | Last half of push_string ().  If next is now NULL, a call to       |
     270  | push_file () has invalidated the previous call to push_string_init |
     271  | (), so we just give up.  If the new object is void, we do not push |
     272  | it.  The function push_string_finish () returns a pointer to the   |
     273  | finished object.  This pointer is only for temporary use, since    |
     274  | reading the next token might release the memory used for the       |
     275  | object.                                                            |
     276  `-------------------------------------------------------------------*/
     277  
     278  const char *
     279  push_string_finish (void)
     280  {
     281    const char *ret = NULL;
     282  
     283    if (next == NULL)
     284      return NULL;
     285  
     286    if (obstack_object_size (current_input) > 0)
     287      {
     288        size_t len = obstack_object_size (current_input);
     289        obstack_1grow (current_input, '\0');
     290        next->u.u_s.string = (char *) obstack_finish (current_input);
     291        next->u.u_s.end = next->u.u_s.string + len;
     292        next->prev = isp;
     293        isp = next;
     294        ret = isp->u.u_s.string; /* for immediate use only */
     295        input_change = true;
     296      }
     297    else
     298      obstack_free (current_input, next); /* people might leave garbage on it. */
     299    next = NULL;
     300    return ret;
     301  }
     302  
     303  /*------------------------------------------------------------------.
     304  | The function push_wrapup () pushes a string on the wrapup stack.  |
     305  | When the normal input stack gets empty, the wrapup stack will     |
     306  | become the input stack, and push_string () and push_file () will  |
     307  | operate on wrapup_stack.  Push_wrapup should be done as           |
     308  | push_string (), but this will suffice, as long as arguments to    |
     309  | m4_m4wrap () are moderate in size.                                |
     310  `------------------------------------------------------------------*/
     311  
     312  void
     313  push_wrapup (const char *s)
     314  {
     315    size_t len = strlen (s);
     316    input_block *i;
     317    i = (input_block *) obstack_alloc (wrapup_stack,
     318                                       sizeof (struct input_block));
     319    i->prev = wsp;
     320    i->type = INPUT_STRING;
     321    i->file = current_file;
     322    i->line = current_line;
     323    i->u.u_s.string = (char *) obstack_copy0 (wrapup_stack, s, len);
     324    i->u.u_s.end = i->u.u_s.string + len;
     325    wsp = i;
     326  }
     327  
     328  
     329  /*-------------------------------------------------------------------.
     330  | The function pop_input () pops one level of input sources.  If the |
     331  | popped input_block is a file, read and close errors are reported.  |
     332  | Setting input_change makes next_char () reload current_file and    |
     333  | current_line from the block that becomes the new top.              |
     334  `-------------------------------------------------------------------*/
     335  
     336  static void
     337  pop_input (void)
     338  {
     339    input_block *tmp = isp->prev;
     340  
     341    switch (isp->type)
     342      {
     343      case INPUT_STRING:
     344      case INPUT_MACRO:
     345        break;
     346  
     347      case INPUT_FILE:
     348        if (debug_level & DEBUG_TRACE_INPUT)
     349          {
     350            if (tmp)
     351              DEBUG_MESSAGE2 ("input reverted to %s, line %d",
     352                              tmp->file, tmp->line);
     353            else
     354              DEBUG_MESSAGE ("input exhausted");
     355          }
     356  
     357        if (ferror (isp->u.u_f.fp))
     358          {
     359            M4ERROR ((warning_status, 0, _("read error")));
     360            if (isp->u.u_f.close)
     361              fclose (isp->u.u_f.fp);
     362            retcode = EXIT_FAILURE;
     363          }
     364        else if (isp->u.u_f.close && fclose (isp->u.u_f.fp) == EOF)
     365          {
     366            M4ERROR ((warning_status, errno, _("error reading file")));
     367            retcode = EXIT_FAILURE;
     368          }
     369        start_of_input_line = isp->u.u_f.advance;
     370        output_current_line = -1;
     371        break;
     372  
     373      default:
     374        M4ERROR ((warning_status, 0,
     375                  "INTERNAL ERROR: input stack botch in pop_input ()"));
     376        abort ();
     377      }
     378    obstack_free (current_input, isp);
     379    next = NULL; /* might be set in push_string_init () */
     380  
     381    isp = tmp;
     382    input_change = true;
     383  }
     384  
     385  /*-------------------------------------------------------------------.
     386  | To switch input over to the wrapup stack, main calls pop_wrapup    |
     387  | ().  Since wrapup text can install new wrapup text, pop_wrapup ()  |
     388  | returns false when there is no wrapup text on the stack, and true  |
     389  | otherwise.                                                         |
     390  `-------------------------------------------------------------------*/
     391  
     392  bool
     393  pop_wrapup (void)
     394  {
     395    next = NULL;
     396    obstack_free (current_input, NULL);
     397    free (current_input);
     398  
     399    if (wsp == NULL)
     400      {
     401        /* End of the program.  Free all memory even though we are about
     402           to exit, since it makes leak detection easier.  */
     403        obstack_free (&token_stack, NULL);
     404        obstack_free (&file_names, NULL);
     405        obstack_free (wrapup_stack, NULL);
     406        free (wrapup_stack);
     407  #ifdef ENABLE_CHANGEWORD
     408        regfree (&word_regexp);
     409  #endif /* ENABLE_CHANGEWORD */
     410        return false;
     411      }
     412  
     413    current_input = wrapup_stack;
     414    wrapup_stack = (struct obstack *) xmalloc (sizeof (struct obstack));
     415    obstack_init (wrapup_stack);
     416  
     417    isp = wsp;
     418    wsp = NULL;
     419    input_change = true;
     420  
     421    return true;
     422  }
     423  
     424  /*-------------------------------------------------------------------.
     425  | When a MACRO token is seen, next_token () uses init_macro_token () |
     426  | to retrieve the value of the function pointer.                     |
     427  `-------------------------------------------------------------------*/
     428  
     429  static void
     430  init_macro_token (token_data *td)
     431  {
     432    if (isp->type != INPUT_MACRO)
     433      {
     434        M4ERROR ((warning_status, 0,
     435                  "INTERNAL ERROR: bad call to init_macro_token ()"));
     436        abort ();
     437      }
     438  
     439    TOKEN_DATA_TYPE (td) = TOKEN_FUNC;
     440    TOKEN_DATA_FUNC (td) = isp->u.func;
     441  }
     442  
     443  
     444  /*-----------------------------------------------------------------.
     445  | Low level input is done a character at a time.  The function     |
     446  | peek_input () is used to look at the next character in the input |
     447  | stream.  At any given time, it reads from the input_block on the |
     448  | top of the current input stack.                                  |
     449  `-----------------------------------------------------------------*/
     450  
     451  static int
     452  peek_input (void)
     453  {
     454    int ch;
     455    input_block *block = isp;
     456  
     457    while (1)
     458      {
     459        if (block == NULL)
     460          return CHAR_EOF;
     461  
     462        switch (block->type)
     463          {
     464          case INPUT_STRING:
     465            ch = to_uchar (block->u.u_s.string[0]);
     466            if (ch != '\0')
     467              return ch;
     468            break;
     469  
     470          case INPUT_FILE:
     471            ch = getc (block->u.u_f.fp);
     472            if (ch != EOF)
     473              {
     474                ungetc (ch, block->u.u_f.fp);
     475                return ch;
     476              }
     477            block->u.u_f.end = true;
     478            break;
     479  
     480          case INPUT_MACRO:
     481            return CHAR_MACRO;
     482  
     483          default:
     484            M4ERROR ((warning_status, 0,
     485                      "INTERNAL ERROR: input stack botch in peek_input ()"));
     486            abort ();
     487          }
     488        block = block->prev;
     489      }
     490  }
     491  
     492  /*-------------------------------------------------------------------.
     493  | The function next_char () is used to read and advance the input to |
     494  | the next character.  It also manages line numbers for error        |
     495  | messages, so they do not get wrong, due to lookahead.  The token   |
     496  | consisting of a newline alone is taken as belonging to the line it |
     497  | ends, and the current line number is not incremented until the     |
     498  | next character is read.  99.9% of all calls will read from a       |
     499  | string, so factor that out into a macro for speed.                 |
     500  `-------------------------------------------------------------------*/
     501  
     502  #define next_char() \
     503    (isp && isp->type == INPUT_STRING && isp->u.u_s.string[0]     \
     504     && !input_change                                             \
     505     ? to_uchar (*isp->u.u_s.string++)                            \
     506     : next_char_1 ())
     507  
          /* Slow path of next_char ().  It is taken whenever the top block is
             not a string with characters left, or when input_change is set.
             Returns the character read, CHAR_MACRO for a builtin block, or
             CHAR_EOF once the whole stack is exhausted.  Exhausted blocks are
             popped along the way.  */
     508  static int
     509  next_char_1 (void)
     510  {
     511    int ch;
     512  
     513    while (1)
     514      {
              /* Nothing left on the stack: clear the location and report EOF.  */
     515        if (isp == NULL)
     516          {
     517            current_file = "";
     518            current_line = 0;
     519            return CHAR_EOF;
     520          }
     521  
              /* A push or pop happened since the last read: adopt the location
                 recorded in the block that is now on top.  */
     522        if (input_change)
     523          {
     524            current_file = isp->file;
     525            current_line = isp->line;
     526            input_change = false;
     527          }
     528  
     529        switch (isp->type)
     530          {
     531          case INPUT_STRING:
                  /* The pointer is advanced even when the terminating NUL is
                     read; in that case the block is popped below.  */
     532            ch = to_uchar (*isp->u.u_s.string++);
     533            if (ch != '\0')
     534              return ch;
     535            break;
     536  
     537          case INPUT_FILE:
                  /* The previous character was a newline: only now does the
                     line number move on, so that the newline itself is still
                     attributed to the line it ended.  */
     538            if (start_of_input_line)
     539              {
     540                start_of_input_line = false;
     541                current_line = ++isp->line;
     542              }
     543  
     544            /* If stdin is a terminal, calling getc after peek_input
     545               already called it would make the user have to hit ^D
     546               twice to quit.  */
     547            ch = isp->u.u_f.end ? EOF : getc (isp->u.u_f.fp);
     548            if (ch != EOF)
     549              {
     550                if (ch == '\n')
     551                  start_of_input_line = true;
     552                return ch;
     553              }
     554            break;
     555  
     556          case INPUT_MACRO:
     557            pop_input (); /* INPUT_MACRO input sources has only one token */
     558            return CHAR_MACRO;
     559  
     560          default:
     561            M4ERROR ((warning_status, 0,
     562                      "INTERNAL ERROR: input stack botch in next_char ()"));
     563            abort ();
     564          }
     565  
     566        /* End of input source --- pop one level.  */
     567        pop_input ();
     568      }
     569  }
     570  
     571  /*-------------------------------------------------------------------.
     572  | skip_line () simply discards all immediately following characters, |
     573  | up to the first newline.  It is only used from m4_dnl ().          |
     574  `-------------------------------------------------------------------*/
     575  
     576  void
     577  skip_line (void)
     578  {
     579    int ch;
     580    const char *file = current_file;
     581    int line = current_line;
     582  
     583    while ((ch = next_char ()) != CHAR_EOF && ch != '\n')
     584      ;
     585    if (ch == CHAR_EOF)
     586      /* current_file changed to "" if we see CHAR_EOF, use the
     587         previous value we stored earlier.  */
     588      M4ERROR_AT_LINE ((warning_status, 0, file, line,
     589                        _("Warning: end of file treated as newline")));
     590    /* On the rare occasion that dnl crosses include file boundaries
     591       (either the input file did not end in a newline, or changeword
     592       was used), calling next_char can update current_file and
     593       current_line, and that update will be undone as we return to
     594       expand_macro.  This informs next_char to fix things again.  */
     595    if (file != current_file || line != current_line)
     596      input_change = true;
     597  }
     598  
     599  
     600  /*------------------------------------------------------------------.
     601  | This function is for matching a string against a prefix of the    |
     602  | input stream.  If the string matches the input and consume is     |
     603  | true, the input is discarded; otherwise any characters read are   |
     604  | pushed back again.  The function is used only when multicharacter |
     605  | quotes or comment delimiters are used.                            |
     606  `------------------------------------------------------------------*/
     607  
     608  static bool
     609  match_input (const char *s, bool consume)
     610  {
     611    int n;                        /* number of characters matched */
     612    int ch;                       /* input character */
     613    const char *t;                /* start of S, kept for the push-back */
     614    bool result = false;
     615  
            /* Cheap rejection: nothing is consumed when the very first
               character already differs.  */
     616    ch = peek_input ();
     617    if (ch != to_uchar (*s))
     618      return false;                       /* fail */
     619  
            /* S is a single character, so the peek above decided the match.  */
     620    if (s[1] == '\0')
     621      {
     622        if (consume)
     623          next_char ();
     624        return true;                      /* short match */
     625      }
     626  
            /* Consume the first character, then keep consuming for as long as
               the input agrees with S.  N counts the characters consumed so
               far.  Note that S is advanced by the loop condition itself, so
               inside the body it already points at the character to be
               compared next.  */
     627    next_char ();
     628    for (n = 1, t = s++; peek_input () == to_uchar (*s++); )
     629      {
     630        next_char ();
     631        n++;
     632        if (*s == '\0')           /* long match */
     633          {
     634            if (consume)
     635              return true;
     636            result = true;
     637            break;
     638          }
     639      }
     640  
     641    /* Failed or shouldn't consume, push back input.  */
            /* The N characters consumed above equal the first N characters of
               T, so they are re-inserted from T as a new string block.  */
     642    {
     643      struct obstack *h = push_string_init ();
     644  
     645      /* `obstack_grow' may be macro evaluating its arg 1 several times. */
     646      obstack_grow (h, t, n);
     647    }
     648    push_string_finish ();
     649    return result;
     650  }
     651  
     652  /*--------------------------------------------------------------------.
     653  | The macro MATCH() is used to match a string S against the input.    |
     654  | The first character is handled inline, for speed.  Hopefully, this  |
     655  | will not hurt efficiency too much when single character quotes and  |
     656  | comment delimiters are used.  If CONSUME, then CH is the result of  |
     657  | next_char, and a successful match will discard the matched string.  |
     658  | Otherwise, CH is the result of peek_input, and the input stream is  |
     659  | effectively unchanged.                                              |
     660  `--------------------------------------------------------------------*/
     661  
     662  #define MATCH(ch, s, consume)                                           \
     663    (to_uchar ((s)[0]) == (ch)                                            \
     664     && (ch) != '\0'                                                      \
     665     && ((s)[1] == '\0' || (match_input ((s) + (consume), consume))))
          /* Notes on MATCH: CH, S and CONSUME each appear more than once in
             the expansion, so arguments with side effects must be avoided.
             When CONSUME is nonzero, match_input () starts at the second
             character of S; otherwise it starts again at the first
             character.  */
     666  
     667  
     668  /*--------------------------------------------------------.
     669  | Initialize input stacks, and quote/comment characters.  |
     670  `--------------------------------------------------------*/
     671  
     672  void
     673  input_init (void)
     674  {
     675    current_file = "";
     676    current_line = 0;
     677  
     678    current_input = (struct obstack *) xmalloc (sizeof (struct obstack));
     679    obstack_init (current_input);
     680    wrapup_stack = (struct obstack *) xmalloc (sizeof (struct obstack));
     681    obstack_init (wrapup_stack);
     682  
     683    obstack_init (&file_names);
     684  
     685    /* Allocate an object in the current chunk, so that obstack_free
     686       will always work even if the first token parsed spills to a new
     687       chunk.  */
     688    obstack_init (&token_stack);
     689    obstack_alloc (&token_stack, 1);
     690    token_bottom = obstack_base (&token_stack);
     691  
     692    isp = NULL;
     693    wsp = NULL;
     694    next = NULL;
     695  
     696    start_of_input_line = false;
     697  
     698    lquote.string = xstrdup (DEF_LQUOTE);
     699    lquote.length = strlen (lquote.string);
     700    rquote.string = xstrdup (DEF_RQUOTE);
     701    rquote.length = strlen (rquote.string);
     702    bcomm.string = xstrdup (DEF_BCOMM);
     703    bcomm.length = strlen (bcomm.string);
     704    ecomm.string = xstrdup (DEF_ECOMM);
     705    ecomm.length = strlen (ecomm.string);
     706  
     707  #ifdef ENABLE_CHANGEWORD
     708    set_word_regexp (user_word_regexp);
     709  #endif
     710  }
     711  
     712  
     713  /*------------------------------------------------------------------.
     714  | Functions for setting quotes and comment delimiters.  Used by     |
     715  | m4_changecom () and m4_changequote ().  Pass NULL if the argument |
     716  | was not present, to distinguish from an explicit empty string.    |
     717  `------------------------------------------------------------------*/
     718  
     719  void
     720  set_quotes (const char *lq, const char *rq)
     721  {
     722    free (lquote.string);
     723    free (rquote.string);
     724  
     725    /* POSIX states that with 0 arguments, the default quotes are used.
     726       POSIX XCU ERN 112 states that behavior is implementation-defined
     727       if there was only one argument, or if there is an empty string in
     728       either position when there are two arguments.  We allow an empty
     729       left quote to disable quoting, but a non-empty left quote will
     730       always create a non-empty right quote.  See the texinfo for what
     731       some other implementations do.  */
     732    if (!lq)
     733      {
     734        lq = DEF_LQUOTE;
     735        rq = DEF_RQUOTE;
     736      }
     737    else if (!rq || (*lq && !*rq))
     738      rq = DEF_RQUOTE;
     739  
     740    lquote.string = xstrdup (lq);
     741    lquote.length = strlen (lquote.string);
     742    rquote.string = xstrdup (rq);
     743    rquote.length = strlen (rquote.string);
     744  }
     745  
     746  void
     747  set_comment (const char *bc, const char *ec)
     748  {
     749    free (bcomm.string);
     750    free (ecomm.string);
     751  
     752    /* POSIX requires no arguments to disable comments.  It requires
     753       empty arguments to be used as-is, but this is counter to
     754       traditional behavior, because a non-null begin and null end makes
     755       it impossible to end a comment.  An aardvark has been filed:
     756       http://www.opengroup.org/austin/mailarchives/ag-review/msg02168.html
     757       This implementation assumes the aardvark will be approved.  See
     758       the texinfo for what some other implementations do.  */
     759    if (!bc)
     760      bc = ec = "";
     761    else if (!ec || (*bc && !*ec))
     762      ec = DEF_ECOMM;
     763  
     764    bcomm.string = xstrdup (bc);
     765    bcomm.length = strlen (bcomm.string);
     766    ecomm.string = xstrdup (ec);
     767    ecomm.length = strlen (ecomm.string);
     768  }
     769  
     770  #ifdef ENABLE_CHANGEWORD
     771  
     772  void
     773  set_word_regexp (const char *regexp)
     774  {
     775    const char *msg;
     776    struct re_pattern_buffer new_word_regexp;
     777  
     778    if (!*regexp || STREQ (regexp, DEFAULT_WORD_REGEXP))
     779      {
     780        default_word_regexp = true;
     781        return;
     782      }
     783  
     784    /* Dry run to see whether the new expression is compilable.  */
     785    init_pattern_buffer (&new_word_regexp, NULL);
     786    msg = re_compile_pattern (regexp, strlen (regexp), &new_word_regexp);
     787    regfree (&new_word_regexp);
     788  
     789    if (msg != NULL)
     790      {
     791        M4ERROR ((warning_status, 0,
     792                  _("bad regular expression `%s': %s"), regexp, msg));
     793        return;
     794      }
     795  
     796    /* If compilation worked, retry using the word_regexp struct.  We
     797       can't rely on struct assigns working, so redo the compilation.
     798       The fastmap can be reused between compilations, and will be freed
     799       by the final regfree.  */
     800    if (!word_regexp.fastmap)
     801      word_regexp.fastmap = xcharalloc (UCHAR_MAX + 1);
     802    msg = re_compile_pattern (regexp, strlen (regexp), &word_regexp);
     803    assert (!msg);
     804    re_set_registers (&word_regexp, &regs, regs.num_regs, regs.start, regs.end);
     805    if (re_compile_fastmap (&word_regexp))
     806      assert (false);
     807  
     808    default_word_regexp = false;
     809  }
     810  
     811  #endif /* ENABLE_CHANGEWORD */
     812  
     813  
/*--------------------------------------------------------------------.
| Parse and return a single token from the input stream.  A token     |
| can either be TOKEN_EOF, if the input_stack is empty; it can be     |
| TOKEN_STRING for a quoted string; TOKEN_WORD for something that is  |
| a potential macro name; and TOKEN_SIMPLE for any single character   |
| that is not a part of any of the previous types.  If LINE is not    |
| NULL, set *LINE to the line where the token starts.                 |
|                                                                     |
| In addition, a comment is returned as TOKEN_STRING (delimiters      |
| included), `(', `,' and `)' are returned as TOKEN_OPEN,             |
| TOKEN_COMMA and TOKEN_CLOSE, and a CHAR_MACRO marker in the input   |
| is returned as TOKEN_MACDEF.                                        |
|                                                                     |
| next_token () returns the token type, and passes back a pointer to  |
| the token data through TD.  The token text is collected on the      |
| obstack token_stack, which never contains more than one token text  |
| at a time.  The storage pointed to by the fields in TD is           |
| therefore subject to change the next time next_token () is called.  |
`--------------------------------------------------------------------*/

token_type
next_token (token_data *td, int *line)
{
  int ch;
  int quote_level;
  token_type type;
#ifdef ENABLE_CHANGEWORD
  int startpos;
  char *orig_text = NULL;
#endif
  const char *file;
  int dummy;

  /* Release whatever the previous call left on token_stack.  */
  obstack_free (&token_stack, token_bottom);
  /* Point LINE at a scratch variable so that the code below can store
     through it unconditionally.  */
  if (!line)
    line = &dummy;

 /* Can't consume character until after CHAR_MACRO is handled.  */
  ch = peek_input ();
  if (ch == CHAR_EOF)
    {
      /* TD is left untouched on this path.  */
#ifdef DEBUG_INPUT
      xfprintf (stderr, "next_token -> EOF\n");
#endif
      next_char ();
      return TOKEN_EOF;
    }
  if (ch == CHAR_MACRO)
    {
      /* TD is filled in by init_macro_token while the marker is still
         unconsumed; only afterwards is the marker read.  */
      init_macro_token (td);
      next_char ();
#ifdef DEBUG_INPUT
      xfprintf (stderr, "next_token -> MACDEF (%s)\n",
                find_builtin_by_addr (TOKEN_DATA_FUNC (td))->name);
#endif
      return TOKEN_MACDEF;
    }

  next_char (); /* Consume character we already peeked at.  */
  /* Remember where the token starts, for the end-of-file diagnostics
     below.  */
  file = current_file;
  *line = current_line;
  if (MATCH (ch, bcomm.string, true))
    {
      /* Comment: collect the begin delimiter, every byte up to the end
         delimiter, and the end delimiter itself.  */
      obstack_grow (&token_stack, bcomm.string, bcomm.length);
      while ((ch = next_char ()) != CHAR_EOF
             && !MATCH (ch, ecomm.string, true))
        obstack_1grow (&token_stack, ch);
      if (ch != CHAR_EOF)
        obstack_grow (&token_stack, ecomm.string, ecomm.length);
      else
        /* current_file changed to "" if we see CHAR_EOF, use the
           previous value we stored earlier.  */
        m4_failure_at_line (0, file, *line, _("ERROR: end of file in comment"));

      type = TOKEN_STRING;
    }
  else if (default_word_regexp && (c_isalpha (ch) || ch == '_'))
    {
      /* Default word syntax: a letter or underscore, followed by any
         number of letters, digits and underscores.  Each following
         byte is peeked first and only consumed once it is known to
         belong to the word.  */
      obstack_1grow (&token_stack, ch);
      while ((ch = peek_input ()) != CHAR_EOF && (c_isalnum (ch) || ch == '_'))
        {
          obstack_1grow (&token_stack, ch);
          next_char ();
        }
      type = TOKEN_WORD;
    }

#ifdef ENABLE_CHANGEWORD

  else if (!default_word_regexp && word_regexp.fastmap[ch])
    {
      /* User-supplied word syntax.  The token is extended one byte at
         a time for as long as word_regexp, searched only at offset 0,
         matches the whole of the text collected so far.  */
      obstack_1grow (&token_stack, ch);
      while (1)
        {
          ch = peek_input ();
          if (ch == CHAR_EOF)
            break;
          obstack_1grow (&token_stack, ch);
          startpos = re_search (&word_regexp,
                                (char *) obstack_base (&token_stack),
                                obstack_object_size (&token_stack), 0, 0,
                                &regs);
          if (startpos ||
              regs.end [0] != (regoff_t) obstack_object_size (&token_stack))
            {
              /* The byte just added is not part of the word: overwrite
                 it with a NUL and leave it unconsumed in the input.  */
              *(((char *) obstack_base (&token_stack)
                 + obstack_object_size (&token_stack)) - 1) = '\0';
              break;
            }
          next_char ();
        }

      /* Finish the text as read; it becomes the token's original
         text.  The text handed back as the token proper is built as a
         second object on the same obstack.  */
      obstack_1grow (&token_stack, '\0');
      orig_text = (char *) obstack_finish (&token_stack);

      /* If subexpression 1 took part in the match, it alone is the
         token text; otherwise the whole match is.
         NOTE(review): regs is read here as left by the most recent
         re_search call -- confirm what it holds when the loop above
         exits without a successful search for this token.  */
      if (regs.start[1] != -1)
        obstack_grow (&token_stack,orig_text + regs.start[1],
                      regs.end[1] - regs.start[1]);
      else
        obstack_grow (&token_stack, orig_text,regs.end[0]);

      type = TOKEN_WORD;
    }

#endif /* ENABLE_CHANGEWORD */

  else if (!MATCH (ch, lquote.string, true))
    {
      /* Not a quote either: a single-character token.  */
      switch (ch)
        {
        case '(':
          type = TOKEN_OPEN;
          break;
        case ',':
          type = TOKEN_COMMA;
          break;
        case ')':
          type = TOKEN_CLOSE;
          break;
        default:
          type = TOKEN_SIMPLE;
          break;
        }
      obstack_1grow (&token_stack, ch);
    }
  else
    {
      /* Quoted string.  The outermost pair of quotes is dropped, nested
         quotes are kept.  FAST is set when both delimiters are a single
         byte, in which case nesting can be tracked directly on the
         bytes located in a string buffer.  */
      bool fast = lquote.length == 1 && rquote.length == 1;
      quote_level = 1;
      while (1)
        {
          /* Try scanning a buffer first.  */
          const char *buffer = (isp && isp->type == INPUT_STRING
                                ? isp->u.u_s.string : NULL);
          if (buffer && *buffer)
            {
              size_t len = isp->u.u_s.end - buffer;
              const char *p = buffer;
              /* Look for the next byte equal to the first byte of
                 either delimiter.  In FAST mode every hit adjusts
                 quote_level and the scan goes on, stopping only when
                 the level drops to zero or the buffer has no further
                 hit.  Otherwise the scan stops at the first hit.  */
              do
                {
                  p = (char *) memchr2 (p, *lquote.string, *rquote.string,
                                        buffer + len - p);
                }
              while (p && fast && (*p++ == *rquote.string
                                   ? --quote_level : ++quote_level));
              if (p)
                {
                  if (fast)
                    {
                      /* P is one past the closing quote: keep the text
                         before that quote and advance the input past
                         it.  */
                      assert (!quote_level);
                      obstack_grow (&token_stack, buffer, p - buffer - 1);
                      isp->u.u_s.string += p - buffer;
                      break;
                    }
                  /* Multi-byte delimiters: keep the text before the
                     candidate byte, consume that byte, and let the
                     generic tests below decide what it starts.  */
                  obstack_grow (&token_stack, buffer, p - buffer);
                  ch = to_uchar (*p);
                  isp->u.u_s.string += p - buffer + 1;
                }
              else
                {
                  /* Nothing left to act on in this buffer: take all of
                     it and go around for more input.  */
                  obstack_grow (&token_stack, buffer, len);
                  isp->u.u_s.string += len;
                  continue;
                }
            }
          /* Fall back to a byte.  */
          else
            ch = next_char ();
          /* NOTE(review): presumably m4_failure_at_line does not return;
             otherwise CHAR_EOF would reach the tests below -- confirm.  */
          if (ch == CHAR_EOF)
            /* current_file changed to "" if we see CHAR_EOF, use
               the previous value we stored earlier.  */
            m4_failure_at_line (0, file, *line,
                                _("ERROR: end of file in string"));

          if (MATCH (ch, rquote.string, true))
            {
              /* The quote that closes the outermost level ends the
                 token and is not stored.  */
              if (--quote_level == 0)
                break;
              obstack_grow (&token_stack, rquote.string, rquote.length);
            }
          else if (MATCH (ch, lquote.string, true))
            {
              quote_level++;
              obstack_grow (&token_stack, lquote.string, lquote.length);
            }
          else
            obstack_1grow (&token_stack, ch);
        }
      type = TOKEN_STRING;
    }

  /* Terminate the collected text and hand it back through TD.  */
  obstack_1grow (&token_stack, '\0');

  TOKEN_DATA_TYPE (td) = TOKEN_TEXT;
  TOKEN_DATA_TEXT (td) = (char *) obstack_finish (&token_stack);
#ifdef ENABLE_CHANGEWORD
  /* Outside of the changeword branch, the original text and the token
     text are one and the same.  */
  if (orig_text == NULL)
    orig_text = TOKEN_DATA_TEXT (td);
  TOKEN_DATA_ORIG_TEXT (td) = orig_text;
#endif
#ifdef DEBUG_INPUT
  xfprintf (stderr, "next_token -> %s (%s)\n",
            token_type_string (type), TOKEN_DATA_TEXT (td));
#endif
  return type;
}
    1035  
    1036  /*-----------------------------------------------.
    1037  | Peek at the next token from the input stream.  |
    1038  `-----------------------------------------------*/
    1039  
    1040  token_type
    1041  peek_token (void)
    1042  {
    1043    token_type result;
    1044    int ch = peek_input ();
    1045  
    1046    if (ch == CHAR_EOF)
    1047      {
    1048        result = TOKEN_EOF;
    1049      }
    1050    else if (ch == CHAR_MACRO)
    1051      {
    1052        result = TOKEN_MACDEF;
    1053      }
    1054    else if (MATCH (ch, bcomm.string, false))
    1055      {
    1056        result = TOKEN_STRING;
    1057      }
    1058    else if ((default_word_regexp && (c_isalpha (ch) || ch == '_'))
    1059  #ifdef ENABLE_CHANGEWORD
    1060             || (! default_word_regexp && word_regexp.fastmap[ch])
    1061  #endif /* ENABLE_CHANGEWORD */
    1062             )
    1063      {
    1064        result = TOKEN_WORD;
    1065      }
    1066    else if (MATCH (ch, lquote.string, false))
    1067      {
    1068        result = TOKEN_STRING;
    1069      }
    1070    else
    1071      switch (ch)
    1072        {
    1073        case '(':
    1074          result = TOKEN_OPEN;
    1075          break;
    1076        case ',':
    1077          result = TOKEN_COMMA;
    1078          break;
    1079        case ')':
    1080          result = TOKEN_CLOSE;
    1081          break;
    1082        default:
    1083          result = TOKEN_SIMPLE;
    1084        }
    1085  
    1086  #ifdef DEBUG_INPUT
    1087    xfprintf (stderr, "peek_token -> %s\n", token_type_string (result));
    1088  #endif /* DEBUG_INPUT */
    1089    return result;
    1090  }
    1091  
    1092  
    1093  #ifdef DEBUG_INPUT
    1094  
    1095  static const char *
    1096  token_type_string (token_type t)
    1097  {
    1098   switch (t)
    1099      { /* TOKSW */
    1100      case TOKEN_EOF:
    1101        return "EOF";
    1102      case TOKEN_STRING:
    1103        return "STRING";
    1104      case TOKEN_WORD:
    1105        return "WORD";
    1106      case TOKEN_OPEN:
    1107        return "OPEN";
    1108      case TOKEN_COMMA:
    1109        return "COMMA";
    1110      case TOKEN_CLOSE:
    1111        return "CLOSE";
    1112      case TOKEN_SIMPLE:
    1113        return "SIMPLE";
    1114      case TOKEN_MACDEF:
    1115        return "MACDEF";
    1116      default:
    1117        abort ();
    1118      }
    1119   }
    1120  
    1121  static void
    1122  print_token (const char *s, token_type t, token_data *td)
    1123  {
    1124    xfprintf (stderr, "%s: ", s);
    1125    switch (t)
    1126      { /* TOKSW */
    1127      case TOKEN_OPEN:
    1128      case TOKEN_COMMA:
    1129      case TOKEN_CLOSE:
    1130      case TOKEN_SIMPLE:
    1131        xfprintf (stderr, "char:");
    1132        break;
    1133  
    1134      case TOKEN_WORD:
    1135        xfprintf (stderr, "word:");
    1136        break;
    1137  
    1138      case TOKEN_STRING:
    1139        xfprintf (stderr, "string:");
    1140        break;
    1141  
    1142      case TOKEN_MACDEF:
    1143        xfprintf (stderr, "macro: %p\n", TOKEN_DATA_FUNC (td));
    1144        break;
    1145  
    1146      case TOKEN_EOF:
    1147        xfprintf (stderr, "eof\n");
    1148        break;
    1149      }
    1150    xfprintf (stderr, "\t\"%s\"\n", TOKEN_DATA_TEXT (td));
    1151  }
    1152  
    1153  static void MAYBE_UNUSED
    1154  lex_debug (void)
    1155  {
    1156    token_type t;
    1157    token_data td;
    1158  
    1159    while ((t = next_token (&td, NULL)) != TOKEN_EOF)
    1160      print_token ("lex", t, &td);
    1161  }
    1162  #endif /* DEBUG_INPUT */