1  /* xgettext Lisp backend.
       2     Copyright (C) 2001-2003, 2005-2009, 2018-2023 Free Software Foundation, Inc.
       3  
       4     This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.
       5  
       6     This program is free software: you can redistribute it and/or modify
       7     it under the terms of the GNU General Public License as published by
       8     the Free Software Foundation; either version 3 of the License, or
       9     (at your option) any later version.
      10  
      11     This program is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14     GNU General Public License for more details.
      15  
      16     You should have received a copy of the GNU General Public License
      17     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      18  
      19  #ifdef HAVE_CONFIG_H
      20  # include "config.h"
      21  #endif
      22  
      23  /* Specification.  */
      24  #include "x-lisp.h"
      25  
      26  #include <errno.h>
      27  #include <stdbool.h>
      28  #include <stdio.h>
      29  #include <stdlib.h>
      30  #include <string.h>
      31  
      32  #include "attribute.h"
      33  #include "message.h"
      34  #include "xgettext.h"
      35  #include "xg-pos.h"
      36  #include "xg-mixed-string.h"
      37  #include "xg-arglist-context.h"
      38  #include "xg-arglist-callshape.h"
      39  #include "xg-arglist-parser.h"
      40  #include "xg-message.h"
      41  #include "error.h"
      42  #include "error-progname.h"
      43  #include "xalloc.h"
      44  #include "mem-hash-map.h"
      45  #include "gettext.h"
      46  
      47  #define _(s) gettext(s)
      48  
      49  
      50  /* The Common Lisp syntax is described in the Common Lisp HyperSpec, chapter 2.
      51     Since we are interested only in strings and in forms similar to
      52          (gettext msgid ...)
      53     or   (ngettext msgid msgid_plural ...)
      54     we make the following simplifications:
      55  
      56     - Assume the keywords and strings are in an ASCII compatible encoding.
      57       This means we can read the input file one byte at a time, instead of
      58       one character at a time.  No need to worry about multibyte characters:
      59       If they occur as part of identifiers, they most probably act as
      60       constituent characters, and the byte based approach will do the same.
      61  
      62     - Assume the read table is the standard Common Lisp read table.
      63       Non-standard read tables are mostly used to read data, not programs.
      64  
      65     - Assume the read table case is :UPCASE, and *READ-BASE* is 10.
      66  
      67     - Don't interpret #n= and #n#, they usually don't appear in programs.
      68  
      69     - Don't interpret #+, #-, they are unlikely to appear in a gettext form.
      70  
      71     The remaining syntax rules are:
      72  
      73     - The syntax code assigned to each character, and how tokens are built
      74       up from characters (single escape, multiple escape etc.).
      75  
      76     - Comment syntax: ';' and '#| ... |#'.
      77  
      78     - String syntax: "..." with single escapes.
      79  
      80     - Read macros and dispatch macro character '#'.  Needed to be able to
      81       tell which is the n-th argument of a function call.
      82  
      83   */
      84  
      85  
      86  /* ========================= Lexer customization.  ========================= */
      87  
      88  /* 'readtable_case' is the case conversion that is applied to non-escaped
      89      parts of symbol tokens.  In Common Lisp: (readtable-case *readtable*).  */
      90  
      91  enum rtcase
      92  {
      93    case_upcase,
      94    case_downcase,
      95    case_preserve,
      96    case_invert
      97  };
      98  
      99  static enum rtcase readtable_case = case_upcase;
     100  
     101  /* 'read_base' is the assumed radix of integers and rational numbers.
     102     In Common Lisp: *read-base*.  */
     103  static int read_base = 10;
     104  
     105  /* 'read_preserve_whitespace' specifies whether a whitespace character
     106     that terminates a token must be pushed back on the input stream.
     107     We set it to true, because the special newline side effect in read_object()
     108     requires that read_object() sees every newline not inside a token.  */
     109  static bool read_preserve_whitespace = true;
     110  
     111  
     112  /* ====================== Keyword set customization.  ====================== */
     113  
     114  /* If true extract all strings.  */
     115  static bool extract_all = false;
     116  
     117  static hash_table keywords;
     118  static bool default_keywords = true;
     119  
     120  
     121  void
     122  x_lisp_extract_all ()
     123  {
     124    extract_all = true;
     125  }
     126  
     127  
     128  void
     129  x_lisp_keyword (const char *name)
     130  {
     131    if (name == NULL)
     132      default_keywords = false;
     133    else
     134      {
     135        const char *end;
     136        struct callshape shape;
     137        const char *colon;
     138        size_t len;
     139        char *symname;
     140        size_t i;
     141  
     142        if (keywords.table == NULL)
     143          hash_init (&keywords, 100);
     144  
     145        split_keywordspec (name, &end, &shape);
     146  
     147        /* The characters between name and end should form a valid Lisp symbol.
     148           Extract the symbol name part.  */
     149        colon = strchr (name, ':');
     150        if (colon != NULL && colon < end)
     151          {
     152            name = colon + 1;
     153            if (name < end && *name == ':')
     154              name++;
     155            colon = strchr (name, ':');
     156            if (colon != NULL && colon < end)
     157              return;
     158          }
     159  
     160        /* Uppercase it.  */
     161        len = end - name;
     162        symname = XNMALLOC (len, char);
     163        for (i = 0; i < len; i++)
     164          symname[i] =
     165            (name[i] >= 'a' && name[i] <= 'z' ? name[i] - 'a' + 'A' : name[i]);
     166  
     167        insert_keyword_callshape (&keywords, symname, len, &shape);
     168      }
     169  }
     170  
     171  /* Finish initializing the keywords hash table.
     172     Called after argument processing, before each file is processed.  */
     173  static void
     174  init_keywords ()
     175  {
     176    if (default_keywords)
     177      {
     178        /* When adding new keywords here, also update the documentation in
     179           xgettext.texi!  */
     180        x_lisp_keyword ("gettext");       /* I18N:GETTEXT */
     181        x_lisp_keyword ("ngettext:1,2");  /* I18N:NGETTEXT */
     182        x_lisp_keyword ("gettext-noop");
     183        default_keywords = false;
     184      }
     185  }
     186  
     187  void
     188  init_flag_table_lisp ()
     189  {
     190    xgettext_record_flag ("gettext:1:pass-lisp-format");
     191    xgettext_record_flag ("ngettext:1:pass-lisp-format");
     192    xgettext_record_flag ("ngettext:2:pass-lisp-format");
     193    xgettext_record_flag ("gettext-noop:1:pass-lisp-format");
     194    xgettext_record_flag ("format:2:lisp-format");
     195  }
     196  
     197  
     198  /* ======================== Reading of characters.  ======================== */
     199  
     200  /* The input file stream.  */
     201  static FILE *fp;
     202  
     203  
     204  /* Fetch the next character from the input file.  */
     205  static int
     206  do_getc ()
     207  {
     208    int c = getc (fp);
     209  
     210    if (c == EOF)
     211      {
     212        if (ferror (fp))
     213          error (EXIT_FAILURE, errno,
     214                 _("error while reading \"%s\""), real_file_name);
     215      }
     216    else if (c == '\n')
     217     line_number++;
     218  
     219    return c;
     220  }
     221  
     222  /* Put back the last fetched character, not EOF.  */
     223  static void
     224  do_ungetc (int c)
     225  {
     226    if (c == '\n')
     227      line_number--;
     228    ungetc (c, fp);
     229  }
     230  
     231  
     232  /* ========= Reading of tokens.  See CLHS 2.2 "Reader Algorithm".  ========= */
     233  
     234  
     235  /* Syntax code.  See CLHS 2.1.4 "Character Syntax Types".  */
     236  
     237  enum syntax_code
     238  {
     239    syntax_illegal,       /* non-printable, except whitespace     */
     240    syntax_single_esc,    /* '\' (single escape)                  */
     241    syntax_multi_esc,     /* '|' (multiple escape)                */
     242    syntax_constituent,   /* everything else (constituent)        */
     243    syntax_whitespace,    /* TAB,LF,FF,CR,' ' (whitespace)        */
     244    syntax_eof,           /* EOF                                  */
     245    syntax_t_macro,       /* '()'"' (terminating macro)           */
     246    syntax_nt_macro       /* '#' (non-terminating macro)          */
     247  };
     248  
     249  /* Returns the syntax code of a character.  */
     250  static enum syntax_code
     251  syntax_code_of (unsigned char c)
     252  {
     253    switch (c)
     254      {
     255      case '\\':
     256        return syntax_single_esc;
     257      case '|':
     258        return syntax_multi_esc;
     259      case '\t': case '\n': case '\f': case '\r': case ' ':
     260        return syntax_whitespace;
     261      case '(': case ')': case '\'': case '"': case ',': case ';': case '`':
     262        return syntax_t_macro;
     263      case '#':
     264        return syntax_nt_macro;
     265      default:
     266        if (c < ' ' && c != '\b')
     267          return syntax_illegal;
     268        else
     269          return syntax_constituent;
     270      }
     271  }
     272  
     273  struct char_syntax
     274  {
     275    int ch;                       /* character */
     276    enum syntax_code scode;       /* syntax code */
     277  };
     278  
     279  /* Returns the next character and its syntax code.  */
     280  static void
     281  read_char_syntax (struct char_syntax *p)
     282  {
     283    int c = do_getc ();
     284  
     285    p->ch = c;
     286    p->scode = (c == EOF ? syntax_eof : syntax_code_of (c));
     287  }
     288  
     289  /* Every character in a token has an attribute assigned.  The attributes
     290     help during interpretation of the token.  See
     291     CLHS 2.3 "Interpretation of Tokens" for the possible interpretations,
     292     and CLHS 2.1.4.2 "Constituent Traits".  */
     293  
     294  enum attribute
     295  {
     296    a_illg,       /* invalid constituent */
     297    a_pack_m,     /* ':' package marker */
     298    a_alpha,      /* normal alphabetic */
     299    a_escaped,    /* alphabetic but not subject to case conversion */
     300    a_ratio,      /* '/' */
     301    a_dot,        /* '.' */
     302    a_sign,       /* '+-' */
     303    a_extens,     /* '_^' extension characters */
     304    a_digit,      /* '0123456789' */
     305    a_letterdigit,/* 'A'-'Z','a'-'z' below base, except 'esfdlESFDL' */
     306    a_expodigit,  /* 'esfdlESFDL' below base */
     307    a_letter,     /* 'A'-'Z','a'-'z', except 'esfdlESFDL' */
     308    a_expo        /* 'esfdlESFDL' */
     309  };
     310  
     311  #define is_letter_attribute(a) ((a) >= a_letter)
     312  #define is_number_attribute(a) ((a) >= a_ratio)
     313  
     314  /* Returns the attribute of a character, assuming base 10.  */
     315  static enum attribute
     316  attribute_of (unsigned char c)
     317  {
     318    switch (c)
     319      {
     320      case ':':
     321        return a_pack_m;
     322      case '/':
     323        return a_ratio;
     324      case '.':
     325        return a_dot;
     326      case '+': case '-':
     327        return a_sign;
     328      case '_': case '^':
     329        return a_extens;
     330      case '0': case '1': case '2': case '3': case '4':
     331      case '5': case '6': case '7': case '8': case '9':
     332        return a_digit;
     333      case 'a': case 'b': case 'c': case 'g': case 'h': case 'i': case 'j':
     334      case 'k': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
     335      case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
     336      case 'A': case 'B': case 'C': case 'G': case 'H': case 'I': case 'J':
     337      case 'K': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
     338      case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
     339        return a_letter;
     340      case 'e': case 's': case 'd': case 'f': case 'l':
     341      case 'E': case 'S': case 'D': case 'F': case 'L':
     342        return a_expo;
     343      default:
     344        /* Treat everything as valid.  Never return a_illg.  */
     345        return a_alpha;
     346      }
     347  }
     348  
     349  struct token_char
     350  {
     351    unsigned char ch;             /* character */
     352    unsigned char attribute;      /* attribute */
     353  };
     354  
     355  /* A token consists of a sequence of characters with associated attribute.  */
     356  struct token
     357  {
     358    int allocated;                /* number of allocated 'token_char's */
     359    int charcount;                /* number of used 'token_char's */
     360    struct token_char *chars;     /* the token's constituents */
     361    bool with_escape;             /* whether single-escape or multiple escape occurs */
     362  };
     363  
     364  /* Initialize a 'struct token'.  */
     365  static inline void
     366  init_token (struct token *tp)
     367  {
     368    tp->allocated = 10;
     369    tp->chars = XNMALLOC (tp->allocated, struct token_char);
     370    tp->charcount = 0;
     371  }
     372  
     373  /* Free the memory pointed to by a 'struct token'.  */
     374  static inline void
     375  free_token (struct token *tp)
     376  {
     377    free (tp->chars);
     378  }
     379  
     380  /* Ensure there is enough room in the token for one more character.  */
     381  static inline void
     382  grow_token (struct token *tp)
     383  {
     384    if (tp->charcount == tp->allocated)
     385      {
     386        tp->allocated *= 2;
     387        tp->chars = (struct token_char *) xrealloc (tp->chars, tp->allocated * sizeof (struct token_char));
     388      }
     389  }
     390  
     391  /* Read the next token.  If 'first' is given, it points to the first
     392     character, which has already been read.
     393     The algorithm follows CLHS 2.2 "Reader Algorithm".  */
     394  static void
     395  read_token (struct token *tp, const struct char_syntax *first)
     396  {
     397    bool multiple_escape_flag;
     398    struct char_syntax curr;
     399  
     400    init_token (tp);
     401    tp->with_escape = false;
     402  
     403    multiple_escape_flag = false;
     404    if (first)
     405      curr = *first;
     406    else
     407      read_char_syntax (&curr);
     408  
     409    for (;; read_char_syntax (&curr))
     410      {
     411        switch (curr.scode)
     412          {
     413          case syntax_illegal:
     414            /* Invalid input.  Be tolerant, no error message.  */
     415            do_ungetc (curr.ch);
     416            return;
     417  
     418          case syntax_single_esc:
     419            tp->with_escape = true;
     420            read_char_syntax (&curr);
     421            if (curr.scode == syntax_eof)
     422              /* Invalid input.  Be tolerant, no error message.  */
     423              return;
     424            grow_token (tp);
     425            tp->chars[tp->charcount].ch = curr.ch;
     426            tp->chars[tp->charcount].attribute = a_escaped;
     427            tp->charcount++;
     428            break;
     429  
     430          case syntax_multi_esc:
     431            multiple_escape_flag = !multiple_escape_flag;
     432            tp->with_escape = true;
     433            break;
     434  
     435          case syntax_constituent:
     436          case syntax_nt_macro:
     437            grow_token (tp);
     438            if (multiple_escape_flag)
     439              {
     440                tp->chars[tp->charcount].ch = curr.ch;
     441                tp->chars[tp->charcount].attribute = a_escaped;
     442                tp->charcount++;
     443              }
     444            else
     445              {
     446                tp->chars[tp->charcount].ch = curr.ch;
     447                tp->chars[tp->charcount].attribute = attribute_of (curr.ch);
     448                tp->charcount++;
     449              }
     450            break;
     451  
     452          case syntax_whitespace:
     453          case syntax_t_macro:
     454            if (multiple_escape_flag)
     455              {
     456                grow_token (tp);
     457                tp->chars[tp->charcount].ch = curr.ch;
     458                tp->chars[tp->charcount].attribute = a_escaped;
     459                tp->charcount++;
     460              }
     461            else
     462              {
     463                if (curr.scode != syntax_whitespace || read_preserve_whitespace)
     464                  do_ungetc (curr.ch);
     465                return;
     466              }
     467            break;
     468  
     469          case syntax_eof:
     470            if (multiple_escape_flag)
     471              /* Invalid input.  Be tolerant, no error message.  */
     472              ;
     473            return;
     474          }
     475      }
     476  }
     477  
     478  /* A potential number is a token which
     479     1. consists only of digits, '+','-','/','^','_','.' and number markers.
     480        The base for digits is context dependent, but always 10 if a dot '.'
     481        occurs. A number marker is a non-digit letter which is not adjacent
     482        to a non-digit letter.
     483     2. has at least one digit.
     484     3. starts with a digit, '+','-','.','^' or '_'.
     485     4. does not end with '+' or '-'.
     486     See CLHS 2.3.1.1 "Potential Numbers as Tokens".
     487   */
     488  
     489  static inline bool
     490  has_a_dot (const struct token *tp)
     491  {
     492    int n = tp->charcount;
     493    int i;
     494  
     495    for (i = 0; i < n; i++)
     496      if (tp->chars[i].attribute == a_dot)
     497        return true;
     498    return false;
     499  }
     500  
     501  static inline bool
     502  all_a_number (const struct token *tp)
     503  {
     504    int n = tp->charcount;
     505    int i;
     506  
     507    for (i = 0; i < n; i++)
     508      if (!is_number_attribute (tp->chars[i].attribute))
     509        return false;
     510    return true;
     511  }
     512  
     513  static inline void
     514  a_letter_to_digit (const struct token *tp, int base)
     515  {
     516    int n = tp->charcount;
     517    int i;
     518  
     519    for (i = 0; i < n; i++)
     520      if (is_letter_attribute (tp->chars[i].attribute))
     521        {
     522          int c = tp->chars[i].ch;
     523  
     524          if (c >= 'a')
     525            c -= 'a' - 'A';
     526          if (c - 'A' + 10 < base)
     527            tp->chars[i].attribute -= 2; /* a_letter -> a_letterdigit,
     528                                            a_expo -> a_expodigit */
     529        }
     530  }
     531  
     532  static inline bool
     533  has_a_digit (const struct token *tp)
     534  {
     535    int n = tp->charcount;
     536    int i;
     537  
     538    for (i = 0; i < n; i++)
     539      if (tp->chars[i].attribute == a_digit
     540          || tp->chars[i].attribute == a_letterdigit
     541          || tp->chars[i].attribute == a_expodigit)
     542        return true;
     543    return false;
     544  }
     545  
     546  static inline bool
     547  has_adjacent_letters (const struct token *tp)
     548  {
     549    int n = tp->charcount;
     550    int i;
     551  
     552    for (i = 1; i < n; i++)
     553      if (is_letter_attribute (tp->chars[i-1].attribute)
     554          && is_letter_attribute (tp->chars[i].attribute))
     555        return true;
     556    return false;
     557  }
     558  
     559  static bool
     560  is_potential_number (const struct token *tp, int *basep)
     561  {
     562    /* CLHS 2.3.1.1.1:
     563       "A potential number cannot contain any escape characters."  */
     564    if (tp->with_escape)
     565      return false;
     566  
     567    if (has_a_dot (tp))
     568      *basep = 10;
     569  
     570    if (!all_a_number (tp))
     571      return false;
     572  
     573    a_letter_to_digit (tp, *basep);
     574  
     575    if (!has_a_digit (tp))
     576      return false;
     577  
     578    if (has_adjacent_letters (tp))
     579      return false;
     580  
     581    if (!(tp->chars[0].attribute >= a_dot
     582          && tp->chars[0].attribute <= a_expodigit))
     583      return false;
     584  
     585    if (tp->chars[tp->charcount - 1].attribute == a_sign)
     586      return false;
     587  
     588    return true;
     589  }
     590  
     591  /* A number is one of integer, ratio, float.  Each has a particular syntax.
     592     See CLHS 2.3.1 "Numbers as Tokens".
     593     But note a mistake: The exponent rule should read:
     594         exponent ::= exponent-marker [sign] {decimal-digit}+
     595     (see 22.1.3.1.3 "Printing Floats").  */
     596  
     597  enum number_type
     598  {
     599    n_none,
     600    n_integer,
     601    n_ratio,
     602    n_float
     603  };
     604  
     605  static enum number_type
     606  is_number (const struct token *tp, int *basep)
     607  {
     608    struct token_char *ptr_limit;
     609    struct token_char *ptr1;
     610  
     611    if (!is_potential_number (tp, basep))
     612      return n_none;
     613  
     614    /* is_potential_number guarantees
     615       - all attributes are >= a_ratio,
     616       - there is at least one a_digit or a_letterdigit or a_expodigit, and
     617       - if there is an a_dot, then *basep = 10.  */
     618  
     619    ptr1 = &tp->chars[0];
     620    ptr_limit = &tp->chars[tp->charcount];
     621  
     622    if (ptr1->attribute == a_sign)
     623      ptr1++;
     624  
     625    /* Test for syntax
     626     * { a_sign | }
     627     * { a_digit < base }+ { a_ratio { a_digit < base }+ | }
     628     */
     629    {
     630      bool seen_a_ratio = false;
     631      bool seen_a_digit = false;  /* seen a digit in last digit block? */
     632      struct token_char *ptr;
     633  
     634      for (ptr = ptr1;; ptr++)
     635        {
     636          if (ptr >= ptr_limit)
     637            {
     638              if (!seen_a_digit)
     639                break;
     640              if (seen_a_ratio)
     641                return n_ratio;
     642              else
     643                return n_integer;
     644            }
     645          if (ptr->attribute == a_digit
     646              || ptr->attribute == a_letterdigit
     647              || ptr->attribute == a_expodigit)
     648            {
     649              int c = ptr->ch;
     650  
     651              c = (c < 'A' ? c - '0' : c < 'a' ? c - 'A' + 10 : c - 'a' + 10);
     652              if (c >= *basep)
     653                break;
     654              seen_a_digit = true;
     655            }
     656          else if (ptr->attribute == a_ratio)
     657            {
     658              if (seen_a_ratio || !seen_a_digit)
     659                break;
     660              seen_a_ratio = true;
     661              seen_a_digit = false;
     662            }
     663          else
     664            break;
     665        }
     666    }
     667  
     668    /* Test for syntax
     669     * { a_sign | }
     670     * { a_digit }* { a_dot { a_digit }* | }
     671     * { a_expo { a_sign | } { a_digit }+ | }
     672     *
     673     * If there is an exponent part, there must be digits before the dot or
     674     * after the dot. The result is a float.
     675     * If there is no exponen:
     676     *   If there is no dot, it would an integer in base 10, but is has already
     677     *   been verified to not be an integer in the current base.
     678     *   If there is a dot:
     679     *     If there are digits after the dot, it's a float.
     680     *     Otherwise, if there are digits before the dot, it's an integer.
     681     */
     682    *basep = 10;
     683    {
     684      bool seen_a_dot = false;
     685      bool seen_a_dot_with_leading_digits = false;
     686      bool seen_a_digit = false;  /* seen a digit in last digit block? */
     687      struct token_char *ptr;
     688  
     689      for (ptr = ptr1;; ptr++)
     690        {
     691          if (ptr >= ptr_limit)
     692            {
     693              /* no exponent */
     694              if (!seen_a_dot)
     695                return n_none;
     696              if (seen_a_digit)
     697                return n_float;
     698              if (seen_a_dot_with_leading_digits)
     699                return n_integer;
     700              else
     701                return n_none;
     702            }
     703          if (ptr->attribute == a_digit)
     704            {
     705              seen_a_digit = true;
     706            }
     707          else if (ptr->attribute == a_dot)
     708            {
     709              if (seen_a_dot)
     710                return n_none;
     711              seen_a_dot = true;
     712              if (seen_a_digit)
     713                seen_a_dot_with_leading_digits = true;
     714              seen_a_digit = false;
     715            }
     716          else if (ptr->attribute == a_expo || ptr->attribute == a_expodigit)
     717            break;
     718          else
     719            return n_none;
     720        }
     721      ptr++;
     722      if (!seen_a_dot_with_leading_digits || !seen_a_digit)
     723        return n_none;
     724      if (ptr >= ptr_limit)
     725        return n_none;
     726      if (ptr->attribute == a_sign)
     727        ptr++;
     728      seen_a_digit = false;
     729      for (;; ptr++)
     730        {
     731          if (ptr >= ptr_limit)
     732            break;
     733          if (ptr->attribute != a_digit)
     734            return n_none;
     735          seen_a_digit = true;
     736        }
     737      if (!seen_a_digit)
     738        return n_none;
     739      return n_float;
     740    }
     741  }
     742  
     743  /* A token representing a symbol must be case converted.
     744     For portability, we convert only ASCII characters here.  */
     745  
     746  static void
     747  upcase_token (struct token *tp)
     748  {
     749    int n = tp->charcount;
     750    int i;
     751  
     752    for (i = 0; i < n; i++)
     753      if (tp->chars[i].attribute != a_escaped)
     754        {
     755          unsigned char c = tp->chars[i].ch;
     756          if (c >= 'a' && c <= 'z')
     757            tp->chars[i].ch = c - 'a' + 'A';
     758        }
     759  }
     760  
     761  static void
     762  downcase_token (struct token *tp)
     763  {
     764    int n = tp->charcount;
     765    int i;
     766  
     767    for (i = 0; i < n; i++)
     768      if (tp->chars[i].attribute != a_escaped)
     769        {
     770          unsigned char c = tp->chars[i].ch;
     771          if (c >= 'A' && c <= 'Z')
     772            tp->chars[i].ch = c - 'A' + 'a';
     773        }
     774  }
     775  
     776  static void
     777  case_convert_token (struct token *tp)
     778  {
     779    int n = tp->charcount;
     780    int i;
     781  
     782    switch (readtable_case)
     783      {
     784      case case_upcase:
     785        upcase_token (tp);
     786        break;
     787  
     788      case case_downcase:
     789        downcase_token (tp);
     790        break;
     791  
     792      case case_preserve:
     793        break;
     794  
     795      case case_invert:
     796        {
     797          bool seen_uppercase = false;
     798          bool seen_lowercase = false;
     799          for (i = 0; i < n; i++)
     800            if (tp->chars[i].attribute != a_escaped)
     801              {
     802                unsigned char c = tp->chars[i].ch;
     803                if (c >= 'a' && c <= 'z')
     804                  seen_lowercase = true;
     805                if (c >= 'A' && c <= 'Z')
     806                  seen_uppercase = true;
     807              }
     808          if (seen_uppercase)
     809            {
     810              if (!seen_lowercase)
     811                downcase_token (tp);
     812            }
     813          else
     814            {
     815              if (seen_lowercase)
     816                upcase_token (tp);
     817            }
     818        }
     819        break;
     820      }
     821  }
     822  
     823  
     824  /* ========================= Accumulating comments ========================= */
     825  
     826  
     827  static char *buffer;
     828  static size_t bufmax;
     829  static size_t buflen;
     830  
     831  static inline void
     832  comment_start ()
     833  {
     834    buflen = 0;
     835  }
     836  
     837  static inline void
     838  comment_add (int c)
     839  {
     840    if (buflen >= bufmax)
     841      {
     842        bufmax = 2 * bufmax + 10;
     843        buffer = xrealloc (buffer, bufmax);
     844      }
     845    buffer[buflen++] = c;
     846  }
     847  
     848  static inline void
     849  comment_line_end (size_t chars_to_remove)
     850  {
     851    buflen -= chars_to_remove;
     852    while (buflen >= 1
     853           && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
     854      --buflen;
     855    if (chars_to_remove == 0 && buflen >= bufmax)
     856      {
     857        bufmax = 2 * bufmax + 10;
     858        buffer = xrealloc (buffer, bufmax);
     859      }
     860    buffer[buflen] = '\0';
     861    savable_comment_add (buffer);
     862  }
     863  
     864  
     865  /* These are for tracking whether comments count as immediately before
     866     keyword.  */
     867  static int last_comment_line;
     868  static int last_non_comment_line;
     869  
     870  
     871  /* ========================= Accumulating messages ========================= */
     872  
     873  
     874  static message_list_ty *mlp;
     875  
     876  
     877  /* ============== Reading of objects.  See CLHS 2 "Syntax".  ============== */
     878  
     879  
     880  /* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings.
     881     Other objects need not to be represented precisely.  */
     882  enum object_type
     883  {
     884    t_symbol,     /* symbol */
     885    t_string,     /* string */
     886    t_other,      /* other kind of real object */
     887    t_dot,        /* '.' pseudo object */
     888    t_close,      /* ')' pseudo object */
     889    t_eof         /* EOF marker */
     890  };
     891  
     892  struct object
     893  {
     894    enum object_type type;
     895    struct token *token;          /* for t_symbol and t_string */
     896    int line_number_at_start;     /* for t_string */
     897  };
     898  
     899  /* Free the memory pointed to by a 'struct object'.  */
     900  static inline void
     901  free_object (struct object *op)
     902  {
     903    if (op->type == t_symbol || op->type == t_string)
     904      {
     905        free_token (op->token);
     906        free (op->token);
     907      }
     908  }
     909  
     910  /* Convert a t_symbol/t_string token to a char*.  */
     911  static char *
     912  string_of_object (const struct object *op)
     913  {
     914    char *str;
     915    const struct token_char *p;
     916    char *q;
     917    int n;
     918  
     919    if (!(op->type == t_symbol || op->type == t_string))
     920      abort ();
     921    n = op->token->charcount;
     922    str = XNMALLOC (n + 1, char);
     923    q = str;
     924    for (p = op->token->chars; n > 0; p++, n--)
     925      *q++ = p->ch;
     926    *q = '\0';
     927    return str;
     928  }
     929  
     930  
     931  /* Context lookup table.  */
     932  static flag_context_list_table_ty *flag_context_list_table;
     933  
     934  
     935  /* Maximum supported nesting depth.  */
     936  #define MAX_NESTING_DEPTH 1000
     937  
     938  /* Current nesting depth.  */
     939  static int nesting_depth;
     940  
     941  
     942  /* Read the next object.  */
     943  static void
     944  read_object (struct object *op, flag_context_ty outer_context)
     945  {
     946    if (nesting_depth > MAX_NESTING_DEPTH)
     947      {
     948        error_with_progname = false;
     949        error (EXIT_FAILURE, 0, _("%s:%d: error: too deeply nested objects"),
     950               logical_file_name, line_number);
     951      }
     952    for (;;)
     953      {
     954        struct char_syntax curr;
     955  
     956        read_char_syntax (&curr);
     957  
     958        switch (curr.scode)
     959          {
     960          case syntax_eof:
     961            op->type = t_eof;
     962            return;
     963  
     964          case syntax_whitespace:
     965            if (curr.ch == '\n')
     966              /* Comments assumed to be grouped with a message must immediately
     967                 precede it, with no non-whitespace token on a line between
     968                 both.  */
     969              if (last_non_comment_line > last_comment_line)
     970                savable_comment_reset ();
     971            continue;
     972  
     973          case syntax_illegal:
     974            op->type = t_other;
     975            return;
     976  
     977          case syntax_single_esc:
     978          case syntax_multi_esc:
     979          case syntax_constituent:
     980            /* Start reading a token.  */
     981            op->token = XMALLOC (struct token);
     982            read_token (op->token, &curr);
     983            last_non_comment_line = line_number;
     984  
     985            /* Interpret the token.  */
     986  
     987            /* Dots.  */
     988            if (!op->token->with_escape
     989                && op->token->charcount == 1
     990                && op->token->chars[0].attribute == a_dot)
     991              {
     992                free_token (op->token);
     993                free (op->token);
     994                op->type = t_dot;
     995                return;
     996              }
     997            /* Tokens consisting entirely of dots are illegal, but be tolerant
     998               here.  */
     999  
    1000            /* Number.  */
    1001            {
    1002              int base = read_base;
    1003  
    1004              if (is_number (op->token, &base) != n_none)
    1005                {
    1006                  free_token (op->token);
    1007                  free (op->token);
    1008                  op->type = t_other;
    1009                  return;
    1010                }
    1011            }
    1012  
    1013            /* We interpret all other tokens as symbols (including 'reserved
    1014               tokens', i.e. potential numbers which are not numbers).  */
    1015            case_convert_token (op->token);
    1016            op->type = t_symbol;
    1017            return;
    1018  
    1019          case syntax_t_macro:
    1020          case syntax_nt_macro:
    1021            /* Read a macro.  */
    1022            switch (curr.ch)
    1023              {
    1024              case '(':
    1025                {
    1026                  int arg = 0;            /* Current argument number.  */
    1027                  flag_context_list_iterator_ty context_iter;
    1028                  const struct callshapes *shapes = NULL;
    1029                  struct arglist_parser *argparser = NULL;
    1030  
    1031                  for (;; arg++)
    1032                    {
    1033                      struct object inner;
    1034                      flag_context_ty inner_context;
    1035  
    1036                      if (arg == 0)
    1037                        inner_context = null_context;
    1038                      else
    1039                        inner_context =
    1040                          inherited_context (outer_context,
    1041                                             flag_context_list_iterator_advance (
    1042                                               &context_iter));
    1043  
    1044                      ++nesting_depth;
    1045                      read_object (&inner, inner_context);
    1046                      nesting_depth--;
    1047  
    1048                      /* Recognize end of list.  */
    1049                      if (inner.type == t_close)
    1050                        {
    1051                          op->type = t_other;
    1052                          /* Don't bother converting "()" to "NIL".  */
    1053                          last_non_comment_line = line_number;
    1054                          if (argparser != NULL)
    1055                            arglist_parser_done (argparser, arg);
    1056                          return;
    1057                        }
    1058  
    1059                      /* Dots are not allowed in every position.
    1060                         But be tolerant.  */
    1061  
    1062                      /* EOF inside list is illegal.
    1063                         But be tolerant.  */
    1064                      if (inner.type == t_eof)
    1065                        break;
    1066  
    1067                      if (arg == 0)
    1068                        {
    1069                          /* This is the function position.  */
    1070                          if (inner.type == t_symbol)
    1071                            {
    1072                              char *symbol_name = string_of_object (&inner);
    1073                              int i;
    1074                              int prefix_len;
    1075                              void *keyword_value;
    1076  
    1077                              /* Omit any package name.  */
    1078                              i = inner.token->charcount;
    1079                              while (i > 0
    1080                                     && inner.token->chars[i-1].attribute != a_pack_m)
    1081                                i--;
    1082                              prefix_len = i;
    1083  
    1084                              if (hash_find_entry (&keywords,
    1085                                                   symbol_name + prefix_len,
    1086                                                   strlen (symbol_name + prefix_len),
    1087                                                   &keyword_value)
    1088                                  == 0)
    1089                                shapes = (const struct callshapes *) keyword_value;
    1090  
    1091                              argparser = arglist_parser_alloc (mlp, shapes);
    1092  
    1093                              context_iter =
    1094                                flag_context_list_iterator (
    1095                                  flag_context_list_table_lookup (
    1096                                    flag_context_list_table,
    1097                                    symbol_name, strlen (symbol_name)));
    1098  
    1099                              free (symbol_name);
    1100                            }
    1101                          else
    1102                            context_iter = null_context_list_iterator;
    1103                        }
    1104                      else
    1105                        {
    1106                          /* These are the argument positions.  */
    1107                          if (argparser != NULL && inner.type == t_string)
    1108                            {
    1109                              char *s = string_of_object (&inner);
    1110                              mixed_string_ty *ms =
    1111                                mixed_string_alloc_simple (s, lc_string,
    1112                                                           logical_file_name,
    1113                                                           inner.line_number_at_start);
    1114                              free (s);
    1115                              arglist_parser_remember (argparser, arg, ms,
    1116                                                       inner_context,
    1117                                                       logical_file_name,
    1118                                                       inner.line_number_at_start,
    1119                                                       savable_comment, false);
    1120                            }
    1121                        }
    1122  
    1123                      free_object (&inner);
    1124                    }
    1125  
    1126                  if (argparser != NULL)
    1127                    arglist_parser_done (argparser, arg);
    1128                }
    1129                op->type = t_other;
    1130                last_non_comment_line = line_number;
    1131                return;
    1132  
    1133              case ')':
    1134                /* Tell the caller about the end of list.
    1135                   Unmatched closing parenthesis is illegal.
    1136                   But be tolerant.  */
    1137                op->type = t_close;
    1138                last_non_comment_line = line_number;
    1139                return;
    1140  
    1141              case ',':
    1142                {
    1143                  int c = do_getc ();
    1144                  /* The ,@ handling inside lists is wrong anyway, because
    1145                     ,@form expands to an unknown number of elements.  */
    1146                  if (c != EOF && c != '@' && c != '.')
    1147                    do_ungetc (c);
    1148                }
    1149                FALLTHROUGH;
    1150              case '\'':
    1151              case '`':
    1152                {
    1153                  struct object inner;
    1154  
    1155                  ++nesting_depth;
    1156                  read_object (&inner, null_context);
    1157                  nesting_depth--;
    1158  
    1159                  /* Dots and EOF are not allowed here.  But be tolerant.  */
    1160  
    1161                  free_object (&inner);
    1162  
    1163                  op->type = t_other;
    1164                  last_non_comment_line = line_number;
    1165                  return;
    1166                }
    1167  
    1168              case ';':
    1169                {
    1170                  bool all_semicolons = true;
    1171  
    1172                  last_comment_line = line_number;
    1173                  comment_start ();
    1174                  for (;;)
    1175                    {
    1176                      int c = do_getc ();
    1177                      if (c == EOF || c == '\n')
    1178                        break;
    1179                      if (c != ';')
    1180                        all_semicolons = false;
    1181                      if (!all_semicolons)
    1182                        {
    1183                          /* We skip all leading white space, but not EOLs.  */
    1184                          if (!(buflen == 0 && (c == ' ' || c == '\t')))
    1185                            comment_add (c);
    1186                        }
    1187                    }
    1188                  comment_line_end (0);
    1189                  continue;
    1190                }
    1191  
    1192              case '"':
    1193                {
    1194                  op->token = XMALLOC (struct token);
    1195                  init_token (op->token);
    1196                  op->line_number_at_start = line_number;
    1197                  for (;;)
    1198                    {
    1199                      int c = do_getc ();
    1200                      if (c == EOF)
    1201                        /* Invalid input.  Be tolerant, no error message.  */
    1202                        break;
    1203                      if (c == '"')
    1204                        break;
    1205                      if (c == '\\') /* syntax_single_esc */
    1206                        {
    1207                          c = do_getc ();
    1208                          if (c == EOF)
    1209                            /* Invalid input.  Be tolerant, no error message.  */
    1210                            break;
    1211                        }
    1212                      grow_token (op->token);
    1213                      op->token->chars[op->token->charcount++].ch = c;
    1214                    }
    1215                  op->type = t_string;
    1216  
    1217                  if (extract_all)
    1218                    {
    1219                      lex_pos_ty pos;
    1220  
    1221                      pos.file_name = logical_file_name;
    1222                      pos.line_number = op->line_number_at_start;
    1223                      remember_a_message (mlp, NULL, string_of_object (op), false,
    1224                                          false, null_context, &pos,
    1225                                          NULL, savable_comment, false);
    1226                    }
    1227                  last_non_comment_line = line_number;
    1228                  return;
    1229                }
    1230  
    1231              case '#':
    1232                /* Dispatch macro handling.  */
    1233                {
    1234                  int dmc;
    1235  
    1236                  for (;;)
    1237                    {
    1238                      dmc = do_getc ();
    1239                      if (dmc == EOF)
    1240                        /* Invalid input.  Be tolerant, no error message.  */
    1241                        {
    1242                          op->type = t_other;
    1243                          return;
    1244                        }
    1245                      if (!(dmc >= '0' && dmc <= '9'))
    1246                        break;
    1247                    }
    1248  
    1249                  switch (dmc)
    1250                    {
    1251                    case '(':
    1252                    case '"':
    1253                      do_ungetc (dmc);
    1254                      FALLTHROUGH;
    1255                    case '\'':
    1256                    case ':':
    1257                    case '.':
    1258                    case ',':
    1259                    case 'A': case 'a':
    1260                    case 'C': case 'c':
    1261                    case 'P': case 'p':
    1262                    case 'S': case 's':
    1263                      {
    1264                        struct object inner;
    1265                        ++nesting_depth;
    1266                        read_object (&inner, null_context);
    1267                        nesting_depth--;
    1268                        /* Dots and EOF are not allowed here.
    1269                           But be tolerant.  */
    1270                        free_object (&inner);
    1271                        op->type = t_other;
    1272                        last_non_comment_line = line_number;
    1273                        return;
    1274                      }
    1275  
    1276                    case '|':
    1277                      {
    1278                        int depth = 0;
    1279                        int c;
    1280  
    1281                        comment_start ();
    1282                        c = do_getc ();
    1283                        for (;;)
    1284                          {
    1285                            if (c == EOF)
    1286                              break;
    1287                            if (c == '|')
    1288                              {
    1289                                c = do_getc ();
    1290                                if (c == EOF)
    1291                                  break;
    1292                                if (c == '#')
    1293                                  {
    1294                                    if (depth == 0)
    1295                                      {
    1296                                        comment_line_end (0);
    1297                                        break;
    1298                                      }
    1299                                    depth--;
    1300                                    comment_add ('|');
    1301                                    comment_add ('#');
    1302                                    c = do_getc ();
    1303                                  }
    1304                                else
    1305                                  comment_add ('|');
    1306                              }
    1307                            else if (c == '#')
    1308                              {
    1309                                c = do_getc ();
    1310                                if (c == EOF)
    1311                                  break;
    1312                                comment_add ('#');
    1313                                if (c == '|')
    1314                                  {
    1315                                    depth++;
    1316                                    comment_add ('|');
    1317                                    c = do_getc ();
    1318                                  }
    1319                              }
    1320                            else
    1321                              {
    1322                                /* We skip all leading white space.  */
    1323                                if (!(buflen == 0 && (c == ' ' || c == '\t')))
    1324                                  comment_add (c);
    1325                                if (c == '\n')
    1326                                  {
    1327                                    comment_line_end (1);
    1328                                    comment_start ();
    1329                                  }
    1330                                c = do_getc ();
    1331                              }
    1332                          }
    1333                        if (c == EOF)
    1334                          {
    1335                            /* EOF not allowed here.  But be tolerant.  */
    1336                            op->type = t_eof;
    1337                            return;
    1338                          }
    1339                        last_comment_line = line_number;
    1340                        continue;
    1341                      }
    1342  
    1343                    case '\\':
    1344                      {
    1345                        struct token token;
    1346                        struct char_syntax first;
    1347                        first.ch = '\\';
    1348                        first.scode = syntax_single_esc;
    1349                        read_token (&token, &first);
    1350                        free_token (&token);
    1351                        op->type = t_other;
    1352                        last_non_comment_line = line_number;
    1353                        return;
    1354                      }
    1355  
    1356                    case 'B': case 'b':
    1357                    case 'O': case 'o':
    1358                    case 'X': case 'x':
    1359                    case 'R': case 'r':
    1360                    case '*':
    1361                      {
    1362                        struct token token;
    1363                        read_token (&token, NULL);
    1364                        free_token (&token);
    1365                        op->type = t_other;
    1366                        last_non_comment_line = line_number;
    1367                        return;
    1368                      }
    1369  
    1370                    case '=':
    1371                      /* Ignore read labels.  */
    1372                      continue;
    1373  
    1374                    case '#':
    1375                      /* Don't bother looking up the corresponding object.  */
    1376                      op->type = t_other;
    1377                      last_non_comment_line = line_number;
    1378                      return;
    1379  
    1380                    case '+':
    1381                    case '-':
    1382                      /* Simply assume every feature expression is true.  */
    1383                      {
    1384                        struct object inner;
    1385                        ++nesting_depth;
    1386                        read_object (&inner, null_context);
    1387                        nesting_depth--;
    1388                        /* Dots and EOF are not allowed here.
    1389                           But be tolerant.  */
    1390                        free_object (&inner);
    1391                        continue;
    1392                      }
    1393  
    1394                    default:
    1395                      op->type = t_other;
    1396                      last_non_comment_line = line_number;
    1397                      return;
    1398                    }
    1399                  /*NOTREACHED*/
    1400                  abort ();
    1401                }
    1402  
    1403              default:
    1404                /*NOTREACHED*/
    1405                abort ();
    1406              }
    1407  
    1408          default:
    1409            /*NOTREACHED*/
    1410            abort ();
    1411          }
    1412      }
    1413  }
    1414  
    1415  
    1416  void
    1417  extract_lisp (FILE *f,
    1418                const char *real_filename, const char *logical_filename,
    1419                flag_context_list_table_ty *flag_table,
    1420                msgdomain_list_ty *mdlp)
    1421  {
    1422    mlp = mdlp->item[0]->messages;
    1423  
    1424    fp = f;
    1425    real_file_name = real_filename;
    1426    logical_file_name = xstrdup (logical_filename);
    1427    line_number = 1;
    1428  
    1429    last_comment_line = -1;
    1430    last_non_comment_line = -1;
    1431  
    1432    flag_context_list_table = flag_table;
    1433    nesting_depth = 0;
    1434  
    1435    init_keywords ();
    1436  
    1437    /* Eat tokens until eof is seen.  When read_object returns
    1438       due to an unbalanced closing parenthesis, just restart it.  */
    1439    do
    1440      {
    1441        struct object toplevel_object;
    1442  
    1443        read_object (&toplevel_object, null_context);
    1444  
    1445        if (toplevel_object.type == t_eof)
    1446          break;
    1447  
    1448        free_object (&toplevel_object);
    1449      }
    1450    while (!feof (fp));
    1451  
    1452    /* Close scanner.  */
    1453    fp = NULL;
    1454    real_file_name = NULL;
    1455    logical_file_name = NULL;
    1456    line_number = 0;
    1457  }