(root)/
gettext-0.22.4/
gettext-tools/
src/
x-tcl.c
       1  /* xgettext Tcl backend.
       2     Copyright (C) 2002-2003, 2005-2009, 2013, 2018-2023 Free Software Foundation, Inc.
       3  
       4     This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
       5  
       6     This program is free software: you can redistribute it and/or modify
       7     it under the terms of the GNU General Public License as published by
       8     the Free Software Foundation; either version 3 of the License, or
       9     (at your option) any later version.
      10  
      11     This program is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14     GNU General Public License for more details.
      15  
      16     You should have received a copy of the GNU General Public License
      17     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      18  
      19  #ifdef HAVE_CONFIG_H
      20  # include "config.h"
      21  #endif
      22  
      23  /* Specification.  */
      24  #include "x-tcl.h"
      25  
      26  #include <assert.h>
      27  #include <errno.h>
      28  #include <limits.h>
      29  #include <stdbool.h>
      30  #include <stdio.h>
      31  #include <stdlib.h>
      32  #include <string.h>
      33  
      34  #include "attribute.h"
      35  #include "message.h"
      36  #include "xgettext.h"
      37  #include "xg-pos.h"
      38  #include "xg-encoding.h"
      39  #include "xg-mixed-string.h"
      40  #include "xg-arglist-context.h"
      41  #include "xg-arglist-callshape.h"
      42  #include "xg-arglist-parser.h"
      43  #include "xg-message.h"
      44  #include "error.h"
      45  #include "error-progname.h"
      46  #include "xalloc.h"
      47  #include "mem-hash-map.h"
      48  #include "c-ctype.h"
      49  #include "po-charset.h"
      50  #include "unistr.h"
      51  #include "gettext.h"
      52  
      53  #define _(s) gettext(s)
      54  
      55  #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
      56  
      57  
      58  /* The Tcl syntax is defined in the Tcl.n manual page, see
      59     https://www.tcl-lang.org/man/tcl8.6/TclCmd/Tcl.htm .
      60     Summary of Tcl syntax:
      61     Like sh syntax, except that `...` is replaced with [...]. In detail:
      62     - In a preprocessing pass, backslash-newline-anywhitespace is replaced
      63       with single space.
      64     - Input is broken into words, which are then subject to command
      65       substitution [...] , variable substitution $var, backslash substitution
      66       \escape.
      67     - Strings are enclosed in "..."; command substitution, variable
      68       substitution and backslash substitutions are performed here as well.
      69     - {...} is a string without substitutions.
      70     - The list of resulting words is split into commands by semicolon and
      71       newline.
      72     - '#' at the beginning of a command introduces a comment until end of line.
      73     The parser is implemented in tcl8.6/generic/tclParse.c.  */
      74  
      75  
      76  /* ====================== Keyword set customization.  ====================== */
      77  
      78  /* If true extract all strings.  */
      79  static bool extract_all = false;
      80  
      81  static hash_table keywords;
      82  static bool default_keywords = true;
      83  
      84  
      85  void
      86  x_tcl_extract_all ()
      87  {
      88    extract_all = true;
      89  }
      90  
      91  
      92  void
      93  x_tcl_keyword (const char *name)
      94  {
      95    if (name == NULL)
      96      default_keywords = false;
      97    else
      98      {
      99        const char *end;
     100        struct callshape shape;
     101  
     102        if (keywords.table == NULL)
     103          hash_init (&keywords, 100);
     104  
     105        split_keywordspec (name, &end, &shape);
     106  
     107        /* The characters between name and end should form a valid Tcl
     108           function name.  A leading "::" is redundant.  */
     109        if (end - name >= 2 && name[0] == ':' && name[1] == ':')
     110          name += 2;
     111  
     112        insert_keyword_callshape (&keywords, name, end - name, &shape);
     113      }
     114  }
     115  
     116  /* Finish initializing the keywords hash table.
     117     Called after argument processing, before each file is processed.  */
     118  static void
     119  init_keywords ()
     120  {
     121    if (default_keywords)
     122      {
     123        /* When adding new keywords here, also update the documentation in
     124           xgettext.texi!  */
     125        x_tcl_keyword ("::msgcat::mc");
     126        default_keywords = false;
     127      }
     128  }
     129  
     130  void
     131  init_flag_table_tcl ()
     132  {
     133    xgettext_record_flag ("::msgcat::mc:1:pass-tcl-format");
     134    xgettext_record_flag ("format:1:tcl-format");
     135  }
     136  
     137  
     138  /* ======================== Reading of characters.  ======================== */
     139  
     140  /* The input file stream.  */
     141  static FILE *fp;
     142  
     143  
     144  /* Fetch the next character from the input file.  */
     145  static int
     146  do_getc ()
     147  {
     148    int c = getc (fp);
     149  
     150    if (c == EOF)
     151      {
     152        if (ferror (fp))
     153          error (EXIT_FAILURE, errno,
     154                 _("error while reading \"%s\""), real_file_name);
     155      }
     156    else if (c == '\n')
     157     line_number++;
     158  
     159    return c;
     160  }
     161  
     162  /* Put back the last fetched character, not EOF.  */
     163  static void
     164  do_ungetc (int c)
     165  {
     166    if (c == '\n')
     167      line_number--;
     168    ungetc (c, fp);
     169  }
     170  
     171  
     172  /* Combine backslash followed by newline and additional whitespace to
     173     a single space.  */
     174  
     175  /* An int that becomes a space when casted to 'unsigned char'.  */
     176  #define BS_NL (UCHAR_MAX + 1 + ' ')
     177  
     178  static int phase1_pushback[5];
     179  static int phase1_pushback_length;
     180  
     181  static int
     182  phase1_getc ()
     183  {
     184    int c;
     185  
     186    if (phase1_pushback_length)
     187      {
     188        c = phase1_pushback[--phase1_pushback_length];
     189        if (c == '\n' || c == BS_NL)
     190          ++line_number;
     191        return c;
     192      }
     193    c = do_getc ();
     194    if (c != '\\')
     195      return c;
     196    c = do_getc ();
     197    if (c != '\n')
     198      {
     199        if (c != EOF)
     200          do_ungetc (c);
     201        return '\\';
     202      }
     203    for (;;)
     204      {
     205        c = do_getc ();
     206        if (!(c == ' ' || c == '\t'))
     207          break;
     208      }
     209    if (c != EOF)
     210      do_ungetc (c);
     211    return BS_NL;
     212  }
     213  
     214  /* Supports only one pushback character.  */
     215  static void
     216  phase1_ungetc (int c)
     217  {
     218    switch (c)
     219      {
     220      case EOF:
     221        break;
     222  
     223      case '\n':
     224      case BS_NL:
     225        --line_number;
     226        FALLTHROUGH;
     227  
     228      default:
     229        if (phase1_pushback_length == SIZEOF (phase1_pushback))
     230          abort ();
     231        phase1_pushback[phase1_pushback_length++] = c;
     232        break;
     233      }
     234  }
     235  
     236  
     237  /* Keep track of brace nesting depth.
     238     When a word starts with an opening brace, a character group begins that
     239     ends with the corresponding closing brace.  In theory these character
     240     groups are string literals, but they are used by so many Tcl primitives
     241     (proc, if, ...) as representing command lists, that we treat them as
     242     command lists.  */
     243  
     244  /* An int that becomes a closing brace when casted to 'unsigned char'.  */
     245  #define CL_BRACE (UCHAR_MAX + 1 + '}')
     246  
     247  static int phase2_pushback[2];
     248  static int phase2_pushback_length;
     249  
     250  /* Brace nesting depth inside the current character group.  */
     251  static int brace_depth;
     252  
     253  static int
     254  phase2_push ()
     255  {
     256    int previous_depth = brace_depth;
     257    brace_depth = 1;
     258    return previous_depth;
     259  }
     260  
     261  static void
     262  phase2_pop (int previous_depth)
     263  {
     264    brace_depth = previous_depth;
     265  }
     266  
     267  static int
     268  phase2_getc ()
     269  {
     270    int c;
     271  
     272    if (phase2_pushback_length)
     273      {
     274        c = phase2_pushback[--phase2_pushback_length];
     275        if (c == '\n' || c == BS_NL)
     276          ++line_number;
     277        else if (c == '{')
     278          ++brace_depth;
     279        else if (c == '}')
     280          --brace_depth;
     281        return c;
     282      }
     283    c = phase1_getc ();
     284    if (c == '{')
     285      ++brace_depth;
     286    else if (c == '}')
     287      {
     288        if (--brace_depth == 0)
     289          c = CL_BRACE;
     290      }
     291    return c;
     292  }
     293  
     294  /* Supports 2 characters of pushback.  */
     295  static void
     296  phase2_ungetc (int c)
     297  {
     298    if (c != EOF)
     299      {
     300        switch (c)
     301          {
     302          case '\n':
     303          case BS_NL:
     304            --line_number;
     305            break;
     306  
     307          case '{':
     308            --brace_depth;
     309            break;
     310  
     311          case '}':
     312            ++brace_depth;
     313            break;
     314          }
     315        if (phase2_pushback_length == SIZEOF (phase2_pushback))
     316          abort ();
     317        phase2_pushback[phase2_pushback_length++] = c;
     318      }
     319  }
     320  
     321  
     322  /* ========================== Reading of tokens.  ========================== */
     323  
     324  
     325  /* A token consists of a sequence of characters.  */
     326  struct token
     327  {
     328    int allocated;                /* number of allocated 'token_char's */
     329    int charcount;                /* number of used 'token_char's */
     330    char *chars;                  /* the token's constituents */
     331  };
     332  
     333  /* Initialize a 'struct token'.  */
     334  static inline void
     335  init_token (struct token *tp)
     336  {
     337    tp->allocated = 10;
     338    tp->chars = XNMALLOC (tp->allocated, char);
     339    tp->charcount = 0;
     340  }
     341  
     342  /* Free the memory pointed to by a 'struct token'.  */
     343  static inline void
     344  free_token (struct token *tp)
     345  {
     346    free (tp->chars);
     347  }
     348  
     349  /* Ensure there is enough room in the token for one more character.  */
     350  static inline void
     351  grow_token (struct token *tp)
     352  {
     353    if (tp->charcount == tp->allocated)
     354      {
     355        tp->allocated *= 2;
     356        tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
     357      }
     358  }
     359  
     360  
     361  /* ========================= Accumulating comments ========================= */
     362  
     363  
     364  static char *buffer;
     365  static size_t bufmax;
     366  static size_t buflen;
     367  
     368  static inline void
     369  comment_start ()
     370  {
     371    buflen = 0;
     372  }
     373  
     374  static inline void
     375  comment_add (int c)
     376  {
     377    if (buflen >= bufmax)
     378      {
     379        bufmax = 2 * bufmax + 10;
     380        buffer = xrealloc (buffer, bufmax);
     381      }
     382    buffer[buflen++] = c;
     383  }
     384  
     385  static inline void
     386  comment_line_end ()
     387  {
     388    while (buflen >= 1
     389           && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
     390      --buflen;
     391    if (buflen >= bufmax)
     392      {
     393        bufmax = 2 * bufmax + 10;
     394        buffer = xrealloc (buffer, bufmax);
     395      }
     396    buffer[buflen] = '\0';
     397    savable_comment_add (buffer);
     398  }
     399  
     400  
     401  /* These are for tracking whether comments count as immediately before
     402     keyword.  */
     403  static int last_comment_line;
     404  static int last_non_comment_line;
     405  
     406  
     407  /* ========================= Accumulating messages ========================= */
     408  
     409  
     410  static message_list_ty *mlp;
     411  
     412  
     413  /* ========================== Reading of commands ========================== */
     414  
     415  
     416  /* We are only interested in constant strings (e.g. "msgcat::mc" or other
     417     string literals).  Other words need not to be represented precisely.  */
     418  enum word_type
     419  {
     420    t_string,     /* constant string */
     421    t_other,      /* other string */
     422    t_separator,  /* command separator: semicolon or newline */
     423    t_bracket,    /* ']' pseudo word */
     424    t_brace,      /* '}' pseudo word */
     425    t_eof         /* EOF marker */
     426  };
     427  
     428  struct word
     429  {
     430    enum word_type type;
     431    struct token *token;          /* for t_string */
     432    int line_number_at_start;     /* for t_string */
     433  };
     434  
     435  /* Free the memory pointed to by a 'struct word'.  */
     436  static inline void
     437  free_word (struct word *wp)
     438  {
     439    if (wp->type == t_string)
     440      {
     441        free_token (wp->token);
     442        free (wp->token);
     443      }
     444  }
     445  
     446  /* Convert a t_string token to a char*.  */
     447  static char *
     448  string_of_word (const struct word *wp)
     449  {
     450    char *str;
     451    int n;
     452  
     453    if (!(wp->type == t_string))
     454      abort ();
     455    n = wp->token->charcount;
     456    str = XNMALLOC (n + 1, char);
     457    memcpy (str, wp->token->chars, n);
     458    str[n] = '\0';
     459    return str;
     460  }
     461  
     462  
     463  /* Context lookup table.  */
     464  static flag_context_list_table_ty *flag_context_list_table;
     465  
     466  
     467  /* Maximum supported nesting depth.  */
     468  #define MAX_NESTING_DEPTH 1000
     469  
     470  /* Current nesting depths.  */
     471  static int bracket_nesting_depth;
     472  static int brace_nesting_depth;
     473  
     474  
     475  /* Read an escape sequence.  The value is an ISO-8859-1 character (in the
     476     range 0x00..0xff) or a Unicode character (in the range 0x0000..0x10FFFF).  */
     477  static int
     478  do_getc_escaped ()
     479  {
     480    int c;
     481  
     482    c = phase1_getc ();
     483    switch (c)
     484      {
     485      case EOF:
     486        return '\\';
     487      case 'a':
     488        return '\a';
     489      case 'b':
     490        return '\b';
     491      case 'f':
     492        return '\f';
     493      case 'n':
     494        return '\n';
     495      case 'r':
     496        return '\r';
     497      case 't':
     498        return '\t';
     499      case 'v':
     500        return '\v';
     501      case 'x':
     502        {
     503          unsigned int n = 0;
     504          unsigned int i;
     505  
     506          for (i = 0; i < 2; i++)
     507            {
     508              c = phase1_getc ();
     509              if (c == EOF || !c_isxdigit ((unsigned char) c))
     510                {
     511                  phase1_ungetc (c);
     512                  break;
     513                }
     514  
     515              if (c >= '0' && c <= '9')
     516                n = (n << 4) + (c - '0');
     517              else if (c >= 'A' && c <= 'F')
     518                n = (n << 4) + (c - 'A' + 10);
     519              else if (c >= 'a' && c <= 'f')
     520                n = (n << 4) + (c - 'a' + 10);
     521            }
     522          return (i > 0 ? (unsigned char) n : 'x');
     523        }
     524      case 'u':
     525        {
     526          unsigned int n = 0;
     527          unsigned int i;
     528  
     529          for (i = 0; i < 4; i++)
     530            {
     531              c = phase1_getc ();
     532              if (c == EOF || !c_isxdigit ((unsigned char) c))
     533                {
     534                  phase1_ungetc (c);
     535                  break;
     536                }
     537  
     538              if (c >= '0' && c <= '9')
     539                n = (n << 4) + (c - '0');
     540              else if (c >= 'A' && c <= 'F')
     541                n = (n << 4) + (c - 'A' + 10);
     542              else if (c >= 'a' && c <= 'f')
     543                n = (n << 4) + (c - 'a' + 10);
     544            }
     545          return (i > 0 ? n : 'u');
     546        }
     547      case 'U':
     548        {
     549          unsigned int n = 0;
     550          unsigned int i;
     551  
     552          for (i = 0; i < 8; i++)
     553            {
     554              c = phase1_getc ();
     555              if (c == EOF || !c_isxdigit ((unsigned char) c) || n >= 0x11000)
     556                {
     557                  phase1_ungetc (c);
     558                  break;
     559                }
     560  
     561              if (c >= '0' && c <= '9')
     562                n = (n << 4) + (c - '0');
     563              else if (c >= 'A' && c <= 'F')
     564                n = (n << 4) + (c - 'A' + 10);
     565              else if (c >= 'a' && c <= 'f')
     566                n = (n << 4) + (c - 'a' + 10);
     567            }
     568          return (i > 0 ? n : 'u');
     569        }
     570      case '0': case '1': case '2': case '3': case '4':
     571      case '5': case '6': case '7':
     572        {
     573          int n = c - '0';
     574  
     575          c = phase1_getc ();
     576          if (c != EOF)
     577            {
     578              if (c >= '0' && c <= '7')
     579                {
     580                  n = (n << 3) + (c - '0');
     581                  c = phase1_getc ();
     582                  if (c != EOF)
     583                    {
     584                      if (c >= '0' && c <= '7')
     585                        n = (n << 3) + (c - '0');
     586                      else
     587                        phase1_ungetc (c);
     588                    }
     589                }
     590              else
     591                phase1_ungetc (c);
     592            }
     593          return (unsigned char) n;
     594        }
     595      default:
     596        /* Note: If c is non-ASCII, Tcl's behaviour is undefined here.  */
     597        return (unsigned char) c;
     598      }
     599  }
     600  
     601  /* Read an escape sequence for a low surrogate Unicode character.
     602     The value is in the range 0xDC00..0xDFFF.
     603     Return -1 when none was seen.  */
     604  static int
     605  do_getc_escaped_low_surrogate ()
     606  {
     607    int c;
     608  
     609    c = phase1_getc ();
     610    switch (c)
     611      {
     612      case 'u':
     613        {
     614          unsigned char buf[4];
     615          unsigned int n = 0;
     616          unsigned int i;
     617  
     618          for (i = 0; i < 4; i++)
     619            {
     620              c = phase1_getc ();
     621              if (c == EOF || !c_isxdigit ((unsigned char) c))
     622                {
     623                  phase1_ungetc (c);
     624                  while (i > 0)
     625                    phase1_ungetc (buf[--i]);
     626                  phase1_ungetc ('u');
     627                  return -1;
     628                }
     629              buf[i] = c;
     630  
     631              if (c >= '0' && c <= '9')
     632                n = (n << 4) + (c - '0');
     633              else if (c >= 'A' && c <= 'F')
     634                n = (n << 4) + (c - 'A' + 10);
     635              else if (c >= 'a' && c <= 'f')
     636                n = (n << 4) + (c - 'a' + 10);
     637            }
     638          if (n >= 0xdc00 && n <= 0xdfff)
     639            return n;
     640          else
     641            {
     642              while (i > 0)
     643                phase1_ungetc (buf[--i]);
     644              phase1_ungetc ('u');
     645              return -1;
     646            }
     647        }
     648      default:
     649        phase1_ungetc (c);
     650        return -1;
     651      }
     652  }
     653  
     654  
     655  enum terminator
     656  {
     657    te_space_separator,           /* looking for space semicolon newline */
     658    te_space_separator_bracket,   /* looking for space semicolon newline ']' */
     659    te_paren,                     /* looking for ')' */
     660    te_quote                      /* looking for '"' */
     661  };
     662  
     663  /* Forward declaration of local functions.  */
     664  static enum word_type read_command_list (int looking_for,
     665                                           flag_context_ty outer_context);
     666  
     667  /* Accumulate tokens into the given word.
     668     'looking_for' denotes a parse terminator combination.
     669     Return the first character past the token.  */
     670  static int
     671  accumulate_word (struct word *wp, enum terminator looking_for,
     672                   flag_context_ty context)
     673  {
     674    int c;
     675  
     676    for (;;)
     677      {
     678        c = phase2_getc ();
     679  
     680        if (c == EOF || c == CL_BRACE)
     681          return c;
     682        if ((looking_for == te_space_separator
     683             || looking_for == te_space_separator_bracket)
     684            && (c == ' ' || c == BS_NL
     685                || c == '\t' || c == '\v' || c == '\f' || c == '\r'
     686                || c == ';' || c == '\n'))
     687          return c;
     688        if (looking_for == te_space_separator_bracket && c == ']')
     689          return c;
     690        if (looking_for == te_paren && c == ')')
     691          return c;
     692        if (looking_for == te_quote && c == '"')
     693          return c;
     694  
     695        if (c == '$')
     696          {
     697            /* Distinguish $varname, ${varname} and lone $.  */
     698            c = phase2_getc ();
     699            if (c == '{')
     700              {
     701                /* ${varname} */
     702                do
     703                  c = phase2_getc ();
     704                while (c != EOF && c != '}');
     705                wp->type = t_other;
     706              }
     707            else
     708              {
     709                bool nonempty = false;
     710  
     711                for (; c != EOF && c != CL_BRACE; c = phase2_getc ())
     712                  {
     713                    if (c_isalnum ((unsigned char) c) || (c == '_'))
     714                      {
     715                        nonempty = true;
     716                        continue;
     717                      }
     718                    if (c == ':')
     719                      {
     720                        c = phase2_getc ();
     721                        if (c == ':')
     722                          {
     723                            do
     724                              c = phase2_getc ();
     725                            while (c == ':');
     726  
     727                            phase2_ungetc (c);
     728                            nonempty = true;
     729                            continue;
     730                          }
     731                        phase2_ungetc (c);
     732                        c = ':';
     733                      }
     734                    break;
     735                  }
     736                if (c == '(')
     737                  {
     738                    /* $varname(index) */
     739                    struct word index_word;
     740  
     741                    index_word.type = t_other;
     742                    c = accumulate_word (&index_word, te_paren, null_context);
     743                    if (c != EOF && c != ')')
     744                      phase2_ungetc (c);
     745                    wp->type = t_other;
     746                  }
     747                else
     748                  {
     749                    phase2_ungetc (c);
     750                    if (nonempty)
     751                      {
     752                        /* $varname */
     753                        wp->type = t_other;
     754                      }
     755                    else
     756                      {
     757                        /* lone $ */
     758                        if (wp->type == t_string)
     759                          {
     760                            grow_token (wp->token);
     761                            wp->token->chars[wp->token->charcount++] = '$';
     762                          }
     763                      }
     764                  }
     765              }
     766          }
     767        else if (c == '[')
     768          {
     769            if (++bracket_nesting_depth > MAX_NESTING_DEPTH)
     770              {
     771                error_with_progname = false;
     772                error (EXIT_FAILURE, 0, _("%s:%d: error: too many open brackets"),
     773                       logical_file_name, line_number);
     774              }
     775            read_command_list (']', context);
     776            bracket_nesting_depth--;
     777            wp->type = t_other;
     778          }
     779        else if (c == '\\')
     780          {
     781            unsigned int uc = do_getc_escaped ();
     782            assert (uc < 0x110000);
     783            if (uc >= 0xd800 && uc <= 0xdfff)
     784              {
     785                if (uc < 0xdc00)
     786                  {
     787                    /* Saw a high surrogate Unicode character.
     788                       Is it followed by a low surrogate Unicode character?  */
     789                    c = phase2_getc ();
     790                    if (c == '\\')
     791                      {
     792                        int uc2 = do_getc_escaped_low_surrogate ();
     793                        if (uc2 >= 0)
     794                          {
     795                            /* Saw a low surrogate Unicode character.  */
     796                            assert (uc2 >= 0xdc00 && uc2 <= 0xdfff);
     797                            uc = 0x10000 + ((uc - 0xd800) << 10) + (uc2 - 0xdc00);
     798                            goto saw_unicode_escape;
     799                          }
     800                      }
     801                    phase2_ungetc (c);
     802                  }
     803                error_with_progname = false;
     804                error (0, 0, _("%s:%d: warning: invalid Unicode character"),
     805                       logical_file_name, line_number);
     806                error_with_progname = true;
     807                goto done_escape;
     808              }
     809           saw_unicode_escape:
     810            {
     811              unsigned char utf8buf[6];
     812              int count = u8_uctomb (utf8buf, uc, 6);
     813              int i;
     814              assert (count > 0);
     815              if (wp->type == t_string)
     816                for (i = 0; i < count; i++)
     817                  {
     818                    grow_token (wp->token);
     819                    wp->token->chars[wp->token->charcount++] = utf8buf[i];
     820                  }
     821            }
     822           done_escape: ;
     823          }
     824        else
     825          {
     826            if (wp->type == t_string)
     827              {
     828                grow_token (wp->token);
     829                wp->token->chars[wp->token->charcount++] = (unsigned char) c;
     830              }
     831          }
     832      }
     833  }
     834  
     835  
     836  /* Read the next word.
     837     'looking_for' denotes a parse terminator, either ']' or '\0'.  */
     838  static void
     839  read_word (struct word *wp, int looking_for, flag_context_ty context)
     840  {
     841    int c;
     842  
     843    do
     844      c = phase2_getc ();
     845    while (c == ' ' || c == BS_NL
     846           || c == '\t' || c == '\v' || c == '\f' || c == '\r');
     847  
     848    if (c == EOF)
     849      {
     850        wp->type = t_eof;
     851        return;
     852      }
     853  
     854    if (c == CL_BRACE)
     855      {
     856        wp->type = t_brace;
     857        last_non_comment_line = line_number;
     858        return;
     859      }
     860  
     861    if (c == '\n')
     862      {
     863        /* Comments assumed to be grouped with a message must immediately
     864           precede it, with no non-whitespace token on a line between both.  */
     865        if (last_non_comment_line > last_comment_line)
     866          savable_comment_reset ();
     867        wp->type = t_separator;
     868        return;
     869      }
     870  
     871    if (c == ';')
     872      {
     873        wp->type = t_separator;
     874        last_non_comment_line = line_number;
     875        return;
     876      }
     877  
     878    if (looking_for == ']' && c == ']')
     879      {
     880        wp->type = t_bracket;
     881        last_non_comment_line = line_number;
     882        return;
     883      }
     884  
     885    if (c == '{')
     886      {
     887        int previous_depth;
     888        enum word_type terminator;
     889  
     890        /* Start a new nested character group, which lasts until the next
     891           balanced '}' (ignoring \} things).  */
     892        previous_depth = phase2_push () - 1;
     893  
     894        /* Interpret it as a command list.  */
     895        if (++brace_nesting_depth > MAX_NESTING_DEPTH)
     896          {
     897            error_with_progname = false;
     898            error (EXIT_FAILURE, 0, _("%s:%d: error: too many open braces"),
     899                   logical_file_name, line_number);
     900          }
     901        terminator = read_command_list ('\0', null_context);
     902        brace_nesting_depth--;
     903  
     904        if (terminator == t_brace)
     905          phase2_pop (previous_depth);
     906  
     907        wp->type = t_other;
     908        last_non_comment_line = line_number;
     909        return;
     910      }
     911  
     912    wp->type = t_string;
     913    wp->token = XMALLOC (struct token);
     914    init_token (wp->token);
     915    wp->line_number_at_start = line_number;
     916  
     917    if (c == '"')
     918      {
     919        c = accumulate_word (wp, te_quote, context);
     920        if (c != EOF && c != '"')
     921          phase2_ungetc (c);
     922      }
     923    else
     924      {
     925        phase2_ungetc (c);
     926        c = accumulate_word (wp,
     927                             looking_for == ']'
     928                             ? te_space_separator_bracket
     929                             : te_space_separator,
     930                             context);
     931        if (c != EOF)
     932          phase2_ungetc (c);
     933      }
     934  
     935    if (wp->type != t_string)
     936      {
     937        free_token (wp->token);
     938        free (wp->token);
     939      }
     940    last_non_comment_line = line_number;
     941  }
     942  
     943  
     944  /* Read the next command.
     945     'looking_for' denotes a parse terminator, either ']' or '\0'.
     946     Returns the type of the word that terminated the command: t_separator or
     947     t_bracket (only if looking_for is ']') or t_brace or t_eof.  */
     948  static enum word_type
     949  read_command (int looking_for, flag_context_ty outer_context)
     950  {
     951    int c;
     952  
     953    /* Skip whitespace and comments.  */
     954    for (;;)
     955      {
     956        c = phase2_getc ();
     957  
     958        if (c == ' ' || c == BS_NL
     959            || c == '\t' || c == '\v' || c == '\f' || c == '\r')
     960          continue;
     961        if (c == '#')
     962          {
     963            /* Skip a comment up to end of line.  */
     964            last_comment_line = line_number;
     965            comment_start ();
     966            for (;;)
     967              {
     968                c = phase2_getc ();
     969                if (c == EOF || c == CL_BRACE || c == '\n')
     970                  break;
     971                /* We skip all leading white space, but not EOLs.  */
     972                if (!(buflen == 0 && (c == ' ' || c == '\t')))
     973                  comment_add (c);
     974              }
     975            comment_line_end ();
     976            continue;
     977          }
     978        break;
     979      }
     980    phase2_ungetc (c);
     981  
     982    /* Read the words that make up the command.  */
     983    {
     984      int arg = 0;                /* Current argument number.  */
     985      flag_context_list_iterator_ty context_iter;
     986      const struct callshapes *shapes = NULL;
     987      struct arglist_parser *argparser = NULL;
     988  
     989      for (;; arg++)
     990        {
     991          struct word inner;
     992          flag_context_ty inner_context;
     993  
     994          if (arg == 0)
     995            inner_context = null_context;
     996          else
     997            inner_context =
     998              inherited_context (outer_context,
     999                                 flag_context_list_iterator_advance (
    1000                                   &context_iter));
    1001  
    1002          read_word (&inner, looking_for, inner_context);
    1003  
    1004          /* Recognize end of command.  */
    1005          if (inner.type == t_separator || inner.type == t_bracket
    1006              || inner.type == t_brace || inner.type == t_eof)
    1007            {
    1008              if (argparser != NULL)
    1009                arglist_parser_done (argparser, arg);
    1010              return inner.type;
    1011            }
    1012  
    1013          if (extract_all)
    1014            {
    1015              if (inner.type == t_string)
    1016                {
    1017                  lex_pos_ty pos;
    1018  
    1019                  pos.file_name = logical_file_name;
    1020                  pos.line_number = inner.line_number_at_start;
    1021                  remember_a_message (mlp, NULL, string_of_word (&inner), false,
    1022                                      false, inner_context, &pos,
    1023                                      NULL, savable_comment, false);
    1024                }
    1025            }
    1026  
    1027          if (arg == 0)
    1028            {
    1029              /* This is the function position.  */
    1030              if (inner.type == t_string)
    1031                {
    1032                  char *function_name = string_of_word (&inner);
    1033                  char *stripped_name;
    1034                  void *keyword_value;
    1035  
    1036                  /* A leading "::" is redundant.  */
    1037                  stripped_name = function_name;
    1038                  if (function_name[0] == ':' && function_name[1] == ':')
    1039                    stripped_name += 2;
    1040  
    1041                  if (hash_find_entry (&keywords,
    1042                                       stripped_name, strlen (stripped_name),
    1043                                       &keyword_value)
    1044                      == 0)
    1045                    shapes = (const struct callshapes *) keyword_value;
    1046  
    1047                  argparser = arglist_parser_alloc (mlp, shapes);
    1048  
    1049                  context_iter =
    1050                    flag_context_list_iterator (
    1051                      flag_context_list_table_lookup (
    1052                        flag_context_list_table,
    1053                        stripped_name, strlen (stripped_name)));
    1054  
    1055                  free (function_name);
    1056                }
    1057              else
    1058                context_iter = null_context_list_iterator;
    1059            }
    1060          else
    1061            {
    1062              /* These are the argument positions.  */
    1063              if (argparser != NULL && inner.type == t_string)
    1064                {
    1065                  char *s = string_of_word (&inner);
    1066                  mixed_string_ty *ms =
    1067                    mixed_string_alloc_simple (s, lc_string,
    1068                                               logical_file_name,
    1069                                               inner.line_number_at_start);
    1070                  free (s);
    1071                  arglist_parser_remember (argparser, arg, ms,
    1072                                           inner_context,
    1073                                           logical_file_name,
    1074                                           inner.line_number_at_start,
    1075                                           savable_comment, false);
    1076                }
    1077            }
    1078  
    1079          free_word (&inner);
    1080        }
    1081    }
    1082  }
    1083  
    1084  
    1085  /* Read a list of commands.
    1086     'looking_for' denotes a parse terminator, either ']' or '\0'.
    1087     Returns the type of the word that terminated the command list:
    1088     t_bracket (only if looking_for is ']') or t_brace or t_eof.  */
    1089  static enum word_type
    1090  read_command_list (int looking_for, flag_context_ty outer_context)
    1091  {
    1092    for (;;)
    1093      {
    1094        enum word_type terminator;
    1095  
    1096        terminator = read_command (looking_for, outer_context);
    1097        if (terminator != t_separator)
    1098          return terminator;
    1099      }
    1100  }
    1101  
    1102  
    1103  void
    1104  extract_tcl (FILE *f,
    1105               const char *real_filename, const char *logical_filename,
    1106               flag_context_list_table_ty *flag_table,
    1107               msgdomain_list_ty *mdlp)
    1108  {
    1109    mlp = mdlp->item[0]->messages;
    1110  
    1111    /* We convert our strings to UTF-8 encoding.  */
    1112    xgettext_current_source_encoding = po_charset_utf8;
    1113  
    1114    fp = f;
    1115    real_file_name = real_filename;
    1116    logical_file_name = xstrdup (logical_filename);
    1117    line_number = 1;
    1118  
    1119    phase1_pushback_length = 0;
    1120    phase2_pushback_length = 0;
    1121  
    1122    /* Initially, no brace is open.  */
    1123    brace_depth = 1000000;
    1124  
    1125    last_comment_line = -1;
    1126    last_non_comment_line = -1;
    1127  
    1128    flag_context_list_table = flag_table;
    1129    bracket_nesting_depth = 0;
    1130    brace_nesting_depth = 0;
    1131  
    1132    init_keywords ();
    1133  
    1134    /* Eat tokens until eof is seen.  */
    1135    read_command_list ('\0', null_context);
    1136  
    1137    fp = NULL;
    1138    real_file_name = NULL;
    1139    logical_file_name = NULL;
    1140    line_number = 0;
    1141  }