1  /* xgettext Vala backend.
       2     Copyright (C) 2013-2014, 2018-2023 Free Software Foundation, Inc.
       3  
       4     This file was written by Daiki Ueno <ueno@gnu.org>, 2013.
       5  
       6     This program is free software: you can redistribute it and/or modify
       7     it under the terms of the GNU General Public License as published by
       8     the Free Software Foundation; either version 3 of the License, or
       9     (at your option) any later version.
      10  
      11     This program is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14     GNU General Public License for more details.
      15  
      16     You should have received a copy of the GNU General Public License
      17     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      18  
      19  #ifdef HAVE_CONFIG_H
      20  # include "config.h"
      21  #endif
      22  
      23  /* Specification.  */
      24  #include "x-vala.h"
      25  
      26  #include <assert.h>
      27  #include <errno.h>
      28  #include <stdbool.h>
      29  #include <stdio.h>
      30  #include <stdlib.h>
      31  #include <string.h>
      32  
      33  #include "attribute.h"
      34  #include "message.h"
      35  #include "rc-str-list.h"
      36  #include "xgettext.h"
      37  #include "xg-pos.h"
      38  #include "xg-encoding.h"
      39  #include "xg-mixed-string.h"
      40  #include "xg-arglist-context.h"
      41  #include "xg-arglist-callshape.h"
      42  #include "xg-arglist-parser.h"
      43  #include "xg-message.h"
      44  #include "error.h"
      45  #include "error-progname.h"
      46  #include "xalloc.h"
      47  #include "xvasprintf.h"
      48  #include "mem-hash-map.h"
      49  #include "po-charset.h"
      50  #include "gettext.h"
      51  
      52  #define _(s) gettext(s)
      53  
      54  #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
      55  
      56  /* The Vala syntax is defined in the Vala Reference Manual
      57     https://www.vala-project.org/doc/vala/.
      58     See also vala/valascanner.vala.  */
      59  
      60  /* ====================== Keyword set customization.  ====================== */
      61  
/* If true extract all strings.  */
static bool extract_all = false;

/* Keyword table filled by add_keyword (reached through x_vala_keyword);
   its storage is created on the first insertion.  */
static hash_table keywords;
/* True until either add_keyword is called with a NULL name or
   init_keywords has installed the built-in keyword set.  */
static bool default_keywords = true;
      67  
      68  
/* Request extraction of all strings by setting the extract_all flag.  */
void
x_vala_extract_all ()
{
  extract_all = true;
}
      74  
      75  
      76  static void
      77  add_keyword (const char *name, hash_table *keywords)
      78  {
      79    if (name == NULL)
      80      default_keywords = false;
      81    else
      82      {
      83        const char *end;
      84        struct callshape shape;
      85        const char *colon;
      86  
      87        if (keywords->table == NULL)
      88          hash_init (keywords, 100);
      89  
      90        split_keywordspec (name, &end, &shape);
      91  
      92        /* The characters between name and end should form a valid C identifier.
      93           A colon means an invalid parse in split_keywordspec().  */
      94        colon = strchr (name, ':');
      95        if (colon == NULL || colon >= end)
      96          insert_keyword_callshape (keywords, name, end - name, &shape);
      97      }
      98  }
      99  
/* Register the keyword specification NAME in the file-level keyword table.
   See add_keyword for the treatment of a NULL NAME.  */
void
x_vala_keyword (const char *name)
{
  add_keyword (name, &keywords);
}
     105  
     106  static void
     107  init_keywords ()
     108  {
     109    if (default_keywords)
     110      {
     111        /* When adding new keywords here, also update the documentation in
     112           xgettext.texi!  */
     113        x_vala_keyword ("dgettext:2");
     114        x_vala_keyword ("dcgettext:2");
     115        x_vala_keyword ("ngettext:1,2");
     116        x_vala_keyword ("dngettext:2,3");
     117        x_vala_keyword ("dpgettext:2g");
     118        x_vala_keyword ("dpgettext2:2c,3");
     119        x_vala_keyword ("_");
     120        x_vala_keyword ("Q_");
     121        x_vala_keyword ("N_");
     122        x_vala_keyword ("NC_:1c,2");
     123  
     124        default_keywords = false;
     125      }
     126  }
     127  
     128  void
     129  init_flag_table_vala ()
     130  {
     131    /* Vala leaves string formatting to Glib functions and thus the
     132       format string is exactly same as C.  See also
     133       vapi/glib-2.0.vapi.  */
     134  
     135    xgettext_record_flag ("dgettext:2:pass-c-format!Vala");
     136    xgettext_record_flag ("dcgettext:2:pass-c-format!Vala");
     137    xgettext_record_flag ("ngettext:1:pass-c-format!Vala");
     138    xgettext_record_flag ("ngettext:2:pass-c-format!Vala");
     139    xgettext_record_flag ("dngettext:2:pass-c-format!Vala");
     140    xgettext_record_flag ("dngettext:3:pass-c-format!Vala");
     141    xgettext_record_flag ("dpgettext:2:pass-c-format!Vala");
     142    xgettext_record_flag ("dpgettext2:3:pass-c-format!Vala");
     143    xgettext_record_flag ("_:1:pass-c-format!Vala");
     144    xgettext_record_flag ("Q_:1:pass-c-format!Vala");
     145    xgettext_record_flag ("N_:1:pass-c-format!Vala");
     146    xgettext_record_flag ("NC_:2:pass-c-format!Vala");
     147  
     148    xgettext_record_flag ("printf:1:c-format!Vala");
     149    xgettext_record_flag ("vprintf:1:c-format!Vala");
     150  }
     151  
     152  
     153  /* ======================== Reading of characters.  ======================== */
     154  
/* The input file stream.  */
static FILE *fp;


/* 1. line_number handling.  */

/* Capacity of the phase 1 pushback stack; phase1_ungetc aborts rather
   than exceed it.  */
#define MAX_PHASE1_PUSHBACK 16
/* Pushed-back characters, used as a stack: phase1_getc returns the most
   recently pushed character first.  */
static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
/* Number of characters currently stored in phase1_pushback.  */
static int phase1_pushback_length;
     164  
     165  
     166  static int
     167  phase1_getc ()
     168  {
     169    int c;
     170  
     171    if (phase1_pushback_length)
     172      c = phase1_pushback[--phase1_pushback_length];
     173    else
     174      {
     175        c = getc (fp);
     176        if (c == EOF)
     177          {
     178            if (ferror (fp))
     179              error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
     180                     real_file_name);
     181            return EOF;
     182          }
     183      }
     184  
     185    if (c == '\n')
     186      ++line_number;
     187    return c;
     188  }
     189  
     190  
     191  /* Supports 2 characters of pushback.  */
     192  static void
     193  phase1_ungetc (int c)
     194  {
     195    if (c != EOF)
     196      {
     197        if (c == '\n')
     198          --line_number;
     199  
     200        if (phase1_pushback_length == SIZEOF (phase1_pushback))
     201          abort ();
     202        phase1_pushback[phase1_pushback_length++] = c;
     203      }
     204  }
     205  
     206  
/* These are for tracking whether comments count as immediately before
   keyword.  */
static int last_comment_line;
static int last_non_comment_line;

/* Accumulating comments.  */

/* Growable buffer holding the text of the comment line being read.  */
static char *buffer;
/* Number of bytes allocated for buffer.  */
static size_t bufmax;
/* Number of bytes of buffer currently in use.  */
static size_t buflen;
     217  
/* Begin accumulating a new comment line: discard the buffered text while
   keeping the allocated storage.  */
static inline void
comment_start ()
{
  buflen = 0;
}
     223  
     224  static inline void
     225  comment_add (int c)
     226  {
     227    if (buflen >= bufmax)
     228      {
     229        bufmax = 2 * bufmax + 10;
     230        buffer = xrealloc (buffer, bufmax);
     231      }
     232    buffer[buflen++] = c;
     233  }
     234  
     235  static inline void
     236  comment_line_end (size_t chars_to_remove)
     237  {
     238    buflen -= chars_to_remove;
     239    while (buflen >= 1
     240           && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
     241      --buflen;
     242    if (chars_to_remove == 0 && buflen >= bufmax)
     243      {
     244        bufmax = 2 * bufmax + 10;
     245        buffer = xrealloc (buffer, bufmax);
     246      }
     247    buffer[buflen] = '\0';
     248    savable_comment_add (buffer);
     249  }
     250  
     251  
     252  /* 2. Replace each comment that is not inside a character constant or
     253     string literal with a space character.  */
     254  
     255  static int
     256  phase2_getc ()
     257  {
     258    int c;
     259    bool last_was_star;
     260  
     261    c = phase1_getc ();
     262    if (c != '/')
     263      return c;
     264    c = phase1_getc ();
     265    switch (c)
     266      {
     267      default:
     268        phase1_ungetc (c);
     269        return '/';
     270  
     271      case '*':
     272        /* C comment.  */
     273        comment_start ();
     274        last_was_star = false;
     275        for (;;)
     276          {
     277            c = phase1_getc ();
     278            if (c == EOF)
     279              break;
     280            /* We skip all leading white space, but not EOLs.  */
     281            if (!(buflen == 0 && (c == ' ' || c == '\t')))
     282              comment_add (c);
     283            switch (c)
     284              {
     285              case '\n':
     286                comment_line_end (1);
     287                comment_start ();
     288                last_was_star = false;
     289                continue;
     290  
     291              case '*':
     292                last_was_star = true;
     293                continue;
     294  
     295              case '/':
     296                if (last_was_star)
     297                  {
     298                    comment_line_end (2);
     299                    break;
     300                  }
     301                FALLTHROUGH;
     302  
     303              default:
     304                last_was_star = false;
     305                continue;
     306              }
     307            break;
     308          }
     309        last_comment_line = line_number;
     310        return ' ';
     311  
     312      case '/':
     313        /* C++ or ISO C 99 comment.  */
     314        comment_start ();
     315        for (;;)
     316          {
     317            c = phase1_getc ();
     318            if (c == '\n' || c == EOF)
     319              break;
     320            /* We skip all leading white space, but not EOLs.  */
     321            if (!(buflen == 0 && (c == ' ' || c == '\t')))
     322              comment_add (c);
     323          }
     324        comment_line_end (0);
     325        last_comment_line = line_number;
     326        return '\n';
     327      }
     328  }
     329  
     330  
/* Push C back for phase2_getc; this simply forwards to phase1_ungetc.  */
static void
phase2_ungetc (int c)
{
  phase1_ungetc (c);
}
     336  
     337  
     338  /* ========================== Reading of tokens.  ========================== */
     339  
/* Kinds of tokens distinguished by the tokenizer.  The comment next to
   each enumerator shows the source text it stands for.  */
enum token_type_ty
{
  token_type_character_constant,        /* 'x' */
  token_type_eof,
  token_type_lparen,                    /* ( */
  token_type_rparen,                    /* ) */
  token_type_lbrace,                    /* { */
  token_type_rbrace,                    /* } */
  token_type_assign,                    /* = += -= *= /= %= <<= >>= &= |= ^= */
  token_type_return,                    /* return */
  token_type_plus,                      /* + */
  token_type_arithmetic_operator,       /* - * / % << >> & | ^ */
  token_type_equality_test_operator,    /* == < > >= <= != */
  token_type_logic_operator,            /* ! && || */
  token_type_comma,                     /* , */
  token_type_question,                  /* ? */
  token_type_colon,                     /* : */
  token_type_number,                    /* 2.7 */
  token_type_string_literal,            /* "abc" */
  token_type_string_template,           /* @"abc" */
  token_type_regex_literal,             /* /.../ */
  token_type_symbol,                    /* if else etc. */
  token_type_other
};
typedef enum token_type_ty token_type_ty;
     365  
/* A token.  Which of the pointer members is meaningful depends on the
   type, as noted next to each member; free_token releases them
   accordingly.  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  char *string;                         /* for token_type_symbol */
  mixed_string_ty *mixed_string;        /* for token_type_string_literal */
  refcounted_string_list_ty *comment;   /* for token_type_string_literal */
  int line_number;                      /* line on which the token starts */
};
     375  
     376  /* Free the memory pointed to by a 'struct token_ty'.  */
     377  static inline void
     378  free_token (token_ty *tp)
     379  {
     380    if (tp->type == token_type_symbol)
     381      free (tp->string);
     382    if (tp->type == token_type_string_literal)
     383      {
     384        mixed_string_free (tp->mixed_string);
     385        drop_reference (tp->comment);
     386      }
     387  }
     388  
     389  
/* Return value of phase7_getc when EOF is reached.  */
#define P7_EOF (-1)

/* Replace escape sequences within character strings with their single
   character equivalents.  */
/* Return values of phase7_getc for an unescaped '"', an unescaped '\''
   and an unescaped newline, respectively.  Being negative, they cannot be
   confused with a byte value or with a UNICODE () value.  */
#define P7_QUOTES (-3)
#define P7_QUOTE (-4)
#define P7_NEWLINE (-5)

/* Convert an UTF-16 or UTF-32 code point to a return value that can be
   distinguished from a single-byte return value.  */
#define UNICODE(code) (0x100 + (code))

/* Test a return value of phase7_getc whether it designates an UTF-16 or
   UTF-32 code point.  */
#define IS_UNICODE(p7_result) ((p7_result) >= 0x100)

/* Extract the UTF-16 or UTF-32 code of a return value that satisfies
   IS_UNICODE.  */
#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
     410  
     411  
/* Read one character of a string literal or character constant, decoding
   escape sequences.
   Returns P7_EOF at end of input, P7_NEWLINE for an unescaped newline,
   P7_QUOTES for an unescaped '"', P7_QUOTE for an unescaped '\'',
   UNICODE (code) for a \uXXXX escape, and otherwise the byte value of the
   character or of the decoded escape sequence.  For an unrecognized or
   malformed escape sequence the backslash itself is returned and the
   characters after it are pushed back, to be read again as ordinary
   characters.  */
static int
phase7_getc ()
{
  int c, j;

  /* Use phase 1, because phase 2 elides comments.  */
  c = phase1_getc ();

  if (c == EOF)
    return P7_EOF;

  /* Return a magic newline indicator, so that we can distinguish
     between the user requesting a newline in the string (e.g. using
     "\n" or "\012") from the user failing to terminate the string or
     character constant.  The ANSI C standard says: 3.1.3.4 Character
     Constants contain "any character except single quote, backslash or
     newline; or an escape sequence" and 3.1.4 String Literals contain
     "any character except double quote, backslash or newline; or an
     escape sequence".

     Most compilers give a fatal error in this case, however gcc is
     stupidly silent, even though this is a very common typo.  OK, so
     "gcc --pedantic" will tell me, but that gripes about too much other
     stuff.  Could I have a "gcc -Wnewline-in-string" option, or
     better yet a "gcc -fno-newline-in-string" option, please?  Gcc is
     also inconsistent between string literals and character constants:
     you may not embed newlines in character constants; try it, you get
     a useful diagnostic.  --PMiller  */
  if (c == '\n')
    return P7_NEWLINE;

  if (c == '"')
    return P7_QUOTES;
  if (c == '\'')
    return P7_QUOTE;
  if (c != '\\')
    return c;
  /* A backslash: look at the character that follows it.  */
  c = phase1_getc ();
  switch (c)
    {
    default:
      /* Unknown escape sequences really should be an error, but just
         ignore them, and let the real compiler complain.  */
      phase1_ungetc (c);
      return '\\';

    case '"':
    case '\'':
    case '\\':
    case '$':
      /* The escaped character stands for itself.  */
      return c;

    case 'b':
      return '\b';

    case 'f':
      return '\f';
    case 'n':
      return '\n';
    case 'r':
      return '\r';
    case 't':
      return '\t';
    case 'v':
      return '\v';

    case 'x':
      /* Hexadecimal escape: at least one hex digit must follow.  */
      c = phase1_getc ();
      switch (c)
        {
        default:
          /* Not a hex digit: push back both characters, so that they are
             read again after the backslash returned here.  */
          phase1_ungetc (c);
          phase1_ungetc ('x');
          return '\\';

        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
          break;
        }
      {
        int n;
        bool overflow;

        n = 0;
        overflow = false;

        /* Consume all following hex digits.  Digits that would make the
           value exceed 0xFF are consumed but not accumulated; a warning
           is issued once the sequence ends.  */
        for (;;)
          {
            switch (c)
              {
              default:
                phase1_ungetc (c);
                if (overflow)
                  {
                    error_with_progname = false;
                    error (0, 0, _("%s:%d: warning: hexadecimal escape sequence out of range"),
                           logical_file_name, line_number);
                    error_with_progname = true;
                  }
                return n;

              case '0': case '1': case '2': case '3': case '4':
              case '5': case '6': case '7': case '8': case '9':
                if (n < 0x100 / 16)
                  n = n * 16 + c - '0';
                else
                  overflow = true;
              break;

              case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
                if (n < 0x100 / 16)
                  n = n * 16 + 10 + c - 'A';
                else
                  overflow = true;
                break;

              case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
                if (n < 0x100 / 16)
                  n = n * 16 + 10 + c - 'a';
                else
                  overflow = true;
                break;
              }
            c = phase1_getc ();
          }
      }

    case '0':
      /* Octal escape starting with '0': the '0' and up to two further
         octal digits are accumulated.  */
      {
        int n;

        n = 0;
        for (j = 0; j < 3; ++j)
          {
            n = n * 8 + c - '0';
            c = phase1_getc ();
            switch (c)
              {
              default:
                break;

              case '0': case '1': case '2': case '3':
              case '4': case '5': case '6': case '7':
                continue;
              }
            break;
          }
        /* Push back the first character that is not part of the escape.  */
        phase1_ungetc (c);
        return n;
      }

    case 'u':
      /* Unicode escape: exactly four hex digits must follow.  */
      {
        /* Digits read so far, kept so that they can be pushed back.  */
        unsigned char buf[8];
        int n;

        n = 0;
        for (j = 0; j < 4; j++)
          {
            int c1 = phase1_getc ();

            if (c1 >= '0' && c1 <= '9')
              n = (n << 4) + (c1 - '0');
            else if (c1 >= 'A' && c1 <= 'F')
              n = (n << 4) + (c1 - 'A' + 10);
            else if (c1 >= 'a' && c1 <= 'f')
              n = (n << 4) + (c1 - 'a' + 10);
            else
              {
                /* Too few digits: push everything back in reverse order
                   (c still holds the 'u'), so that it is read again
                   after the backslash returned here.  */
                phase1_ungetc (c1);
                while (--j >= 0)
                  phase1_ungetc (buf[j]);
                phase1_ungetc (c);
                return '\\';
              }

            buf[j] = c1;
          }

        if (n < 0x110000)
          return UNICODE (n);

        /* NOTE(review): four hex digits give at most 0xFFFF, so the code
           below appears to be unreachable here.  */
        error_with_progname = false;
        error (0, 0, _("%s:%d: warning: invalid Unicode character"),
               logical_file_name, line_number);
        error_with_progname = true;

        while (--j >= 0)
          phase1_ungetc (buf[j]);
        phase1_ungetc (c);
        return '\\';
      }
    }
}
     608  
     609  
/* Push C back for phase7_getc; this simply forwards to phase1_ungetc, so
   C must be a plain character, not one of the negative P7_* codes other
   than P7_EOF (which equals EOF and is ignored).  */
static void
phase7_ungetc (int c)
{
  phase1_ungetc (c);
}
     615  
     616  
/* 3. Parse each resulting logical line as preprocessing tokens and
   white space.  Preprocessing tokens and Vala tokens don't always
   match.  */

/* Tokens pushed back for phase3_get, which returns the most recently
   pushed one first.  */
static token_ty phase3_pushback[2];
/* Number of tokens currently stored in phase3_pushback.  */
static int phase3_pushback_length;


/* Type of the token most recently produced by phase3_get.  */
static token_type_ty last_token_type;
     626  
     627  static void
     628  phase3_scan_regex ()
     629  {
     630      int c;
     631  
     632      for (;;)
     633        {
     634          c = phase1_getc ();
     635          if (c == '/')
     636            break;
     637          if (c == '\\')
     638            {
     639              c = phase1_getc ();
     640              if (c != EOF)
     641                continue;
     642            }
     643          if (c == EOF)
     644            {
     645              error_with_progname = false;
     646              error (0, 0,
     647                     _("%s:%d: warning: regular expression literal terminated too early"),
     648                     logical_file_name, line_number);
     649              error_with_progname = true;
     650              return;
     651            }
     652        }
     653  
     654      c = phase2_getc ();
     655      if (!(c == 'i' || c == 's' || c == 'm' || c == 'x'))
     656        phase2_ungetc (c);
     657  }
     658  
     659  static void
     660  phase3_get (token_ty *tp)
     661  {
     662    static char *buffer;
     663    static int bufmax;
     664    int bufpos;
     665  
     666  #undef APPEND
     667  #define APPEND(c)                               \
     668    do                                            \
     669      {                                           \
     670        if (bufpos >= bufmax)                     \
     671          {                                       \
     672            bufmax = 2 * bufmax + 10;             \
     673            buffer = xrealloc (buffer, bufmax);   \
     674          }                                       \
     675        buffer[bufpos++] = c;                     \
     676      }                                           \
     677    while (0)
     678  
     679    if (phase3_pushback_length)
     680      {
     681        *tp = phase3_pushback[--phase3_pushback_length];
     682        last_token_type = tp->type;
     683        return;
     684      }
     685  
     686    for (;;)
     687      {
     688        bool template;
     689        bool verbatim;
     690        int c;
     691  
     692        tp->line_number = line_number;
     693        c = phase2_getc ();
     694  
     695        switch (c)
     696          {
     697          case EOF:
     698            tp->type = last_token_type = token_type_eof;
     699            return;
     700  
     701          case '\n':
     702            if (last_non_comment_line > last_comment_line)
     703              savable_comment_reset ();
     704            FALLTHROUGH;
     705          case ' ':
     706          case '\f':
     707          case '\t':
     708            /* Ignore whitespace and comments.  */
     709            continue;
     710          default:
     711            break;
     712          }
     713  
     714        last_non_comment_line = tp->line_number;
     715        template = false;
     716        verbatim = false;
     717  
     718        switch (c)
     719          {
     720          case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
     721          case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
     722          case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
     723          case 'V': case 'W': case 'X': case 'Y': case 'Z':
     724          case '_':
     725          case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
     726          case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
     727          case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
     728          case 'v': case 'w': case 'x': case 'y': case 'z':
     729            bufpos = 0;
     730            for (;;)
     731              {
     732                APPEND (c);
     733                c = phase2_getc ();
     734                switch (c)
     735                  {
     736                  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
     737                  case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
     738                  case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
     739                  case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
     740                  case 'Y': case 'Z':
     741                  case '_':
     742                  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
     743                  case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
     744                  case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
     745                  case 's': case 't': case 'u': case 'v': case 'w': case 'x':
     746                  case 'y': case 'z':
     747                  case '0': case '1': case '2': case '3': case '4':
     748                  case '5': case '6': case '7': case '8': case '9':
     749                    continue;
     750  
     751                  default:
     752                    phase2_ungetc (c);
     753                    break;
     754                  }
     755                break;
     756              }
     757            APPEND (0);
     758            if (strcmp (buffer, "return") == 0)
     759              tp->type = last_token_type = token_type_return;
     760            else
     761              {
     762                tp->string = xstrdup (buffer);
     763                tp->type = last_token_type = token_type_symbol;
     764              }
     765            return;
     766  
     767          case '.':
     768            c = phase2_getc ();
     769            phase2_ungetc (c);
     770            switch (c)
     771              {
     772              default:
     773                tp->string = xstrdup (".");
     774                tp->type = last_token_type = token_type_symbol;
     775                return;
     776  
     777              case '0': case '1': case '2': case '3': case '4':
     778              case '5': case '6': case '7': case '8': case '9':
     779                c = '.';
     780                break;
     781              }
     782            FALLTHROUGH;
     783  
     784          case '0': case '1': case '2': case '3': case '4':
     785          case '5': case '6': case '7': case '8': case '9':
     786            /* The preprocessing number token is more "generous" than the C
     787               number tokens.  This is mostly due to token pasting (another
     788               thing we can ignore here).  */
     789            bufpos = 0;
     790            for (;;)
     791              {
     792                APPEND (c);
     793                c = phase2_getc ();
     794                switch (c)
     795                  {
     796                  case 'e':
     797                  case 'E':
     798                    APPEND (c);
     799                    c = phase2_getc ();
     800                    if (c != '+' && c != '-')
     801                      {
     802                        phase2_ungetc (c);
     803                        break;
     804                      }
     805                    continue;
     806  
     807                  case 'A': case 'B': case 'C': case 'D':           case 'F':
     808                  case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
     809                  case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
     810                  case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
     811                  case 'Y': case 'Z':
     812                  case 'a': case 'b': case 'c': case 'd':           case 'f':
     813                  case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
     814                  case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
     815                  case 's': case 't': case 'u': case 'v': case 'w': case 'x':
     816                  case 'y': case 'z':
     817                  case '0': case '1': case '2': case '3': case '4':
     818                  case '5': case '6': case '7': case '8': case '9':
     819                  case '.':
     820                    continue;
     821  
     822                  default:
     823                    phase2_ungetc (c);
     824                    break;
     825                  }
     826                break;
     827              }
     828            APPEND (0);
     829            tp->type = last_token_type = token_type_number;
     830            return;
     831  
     832          case '\'':
     833            for (;;)
     834              {
     835                c = phase7_getc ();
     836                if (c == P7_NEWLINE)
     837                  {
     838                    error_with_progname = false;
     839                    error (0, 0, _("%s:%d: warning: unterminated character constant"),
     840                           logical_file_name, line_number - 1);
     841                    error_with_progname = true;
     842                    phase7_ungetc ('\n');
     843                    break;
     844                  }
     845                if (c == P7_EOF || c == P7_QUOTE)
     846                  break;
     847              }
     848            tp->type = last_token_type = token_type_character_constant;
     849            return;
     850  
     851            /* Vala provides strings in three different formats.
     852  
     853               Usual string literals:
     854                 "..."
     855               Verbatim string literals:
     856                 """...""" (where ... can include newlines and double quotes)
     857               String templates.
     858                 @"...", @"""..."""
     859  
     860               Note that, with the current implementation string
     861               templates are not subject to translation, because they are
     862               inspected at compile time.  For example, the following code
     863  
     864                 string bar = "bar";
     865                 string foo = _(@"foo $bar");
     866  
     867               will be translated into the C code, like:
     868  
     869                 _(g_strconcat ("foo ", "bar", NULL));  */
     870          case '@':
     871            c = phase2_getc ();
     872            if (c != '"')
     873              {
     874                phase2_ungetc (c);
     875                tp->type = last_token_type = token_type_other;
     876                return;
     877              }
     878            template = true;
     879            FALLTHROUGH;
     880          case '"':
     881            {
     882              struct mixed_string_buffer msb;
     883              {
     884                int c2 = phase1_getc ();
     885  
     886                if (c2 == '"')
     887                  {
     888                    int c3 = phase1_getc ();
     889                    if (c3 == '"')
     890                      verbatim = true;
     891                    else
     892                      {
     893                        phase1_ungetc (c3);
     894                        phase1_ungetc (c2);
     895                      }
     896                  }
     897                else
     898                  phase2_ungetc (c2);
     899              }
     900  
     901              /* Start accumulating the string.  */
     902              mixed_string_buffer_init (&msb, lc_string,
     903                                        logical_file_name, line_number);
     904              if (verbatim)
     905                for (;;)
     906                  {
     907                    c = phase1_getc ();
     908  
     909                    /* Keep line_number in sync.  */
     910                    msb.line_number = line_number;
     911  
     912                    if (c == '"')
     913                      {
     914                        int c2 = phase1_getc ();
     915                        if (c2 == '"')
     916                          {
     917                            int c3 = phase1_getc ();
     918                            if (c3 == '"')
     919                              break;
     920                            phase1_ungetc (c3);
     921                          }
     922                        phase1_ungetc (c2);
     923                      }
     924                    if (c == EOF)
     925                      break;
     926                    mixed_string_buffer_append_char (&msb, c);
     927                  }
     928              else
     929                for (;;)
     930                  {
     931                    c = phase7_getc ();
     932  
     933                    /* Keep line_number in sync.  */
     934                    msb.line_number = line_number;
     935  
     936                    if (c == P7_NEWLINE)
     937                      {
     938                        error_with_progname = false;
     939                        error (0, 0,
     940                               _("%s:%d: warning: unterminated string literal"),
     941                               logical_file_name, line_number - 1);
     942                        error_with_progname = true;
     943                        phase7_ungetc ('\n');
     944                        break;
     945                      }
     946                    if (c == P7_QUOTES)
     947                      break;
     948                    if (c == P7_EOF)
     949                      break;
     950                    if (c == P7_QUOTE)
     951                      c = '\'';
     952                    if (IS_UNICODE (c))
     953                      {
     954                        assert (UNICODE_VALUE (c) >= 0
     955                                && UNICODE_VALUE (c) < 0x110000);
     956                        mixed_string_buffer_append_unicode (&msb,
     957                                                            UNICODE_VALUE (c));
     958                      }
     959                    else
     960                      mixed_string_buffer_append_char (&msb, c);
     961                  }
     962              /* Done accumulating the string.  */
     963              if (template)
     964                {
     965                  tp->type = token_type_string_template;
     966                  mixed_string_buffer_destroy (&msb);
     967                }
     968              else
     969                {
     970                  tp->type = token_type_string_literal;
     971                  tp->mixed_string = mixed_string_buffer_result (&msb);
     972                  tp->comment = add_reference (savable_comment);
     973                }
     974              last_token_type = tp->type;
     975              return;
     976            }
     977  
     978          case '/':
     979            switch (last_token_type)
     980              {
     981              case token_type_lparen:
     982              case token_type_lbrace:
     983              case token_type_assign:
     984              case token_type_return:
     985              case token_type_plus:
     986              case token_type_arithmetic_operator:
     987              case token_type_equality_test_operator:
     988              case token_type_logic_operator:
     989              case token_type_comma:
     990              case token_type_question:
     991              case token_type_colon:
     992                phase3_scan_regex ();
     993                tp->type = last_token_type = token_type_regex_literal;
     994                break;
     995              default:
     996                {
     997                  int c2 = phase2_getc ();
     998                  if (c2 == '=')
     999                    tp->type = last_token_type = token_type_assign;
    1000                  else
    1001                    {
    1002                      phase2_ungetc (c2);
    1003                      tp->type = last_token_type = token_type_arithmetic_operator;
    1004                    }
    1005                  break;
    1006                }
    1007              }
    1008            return;
    1009  
    1010          case '(':
    1011            tp->type = last_token_type = token_type_lparen;
    1012            return;
    1013  
    1014          case ')':
    1015            tp->type = last_token_type = token_type_rparen;
    1016            return;
    1017  
    1018          case '{':
    1019            tp->type = last_token_type = token_type_lbrace;
    1020            return;
    1021  
    1022          case '}':
    1023            tp->type = last_token_type = token_type_rbrace;
    1024            return;
    1025  
    1026          case '+':
    1027            {
    1028              int c2 = phase2_getc ();
    1029              switch (c2)
    1030                {
    1031                case '+':
    1032                  tp->type = last_token_type = token_type_other;
    1033                  break;
    1034                case '=':
    1035                  tp->type = last_token_type = token_type_assign;
    1036                  break;
    1037                default:
    1038                  phase2_ungetc (c2);
    1039                  tp->type = last_token_type = token_type_plus;
    1040                  break;
    1041                }
    1042              return;
    1043            }
    1044  
    1045          case '-':
    1046            {
    1047              int c2 = phase2_getc ();
    1048              switch (c2)
    1049                {
    1050                case '-':
    1051                  tp->type = last_token_type = token_type_other;
    1052                  break;
    1053                case '=':
    1054                  tp->type = last_token_type = token_type_assign;
    1055                  break;
    1056                default:
    1057                  phase2_ungetc (c2);
    1058                  tp->type = last_token_type = token_type_arithmetic_operator;
    1059                  break;
    1060                }
    1061              return;
    1062            }
    1063  
    1064          case '%':
    1065          case '^':
    1066            {
    1067              int c2 = phase2_getc ();
    1068              if (c2 == '=')
    1069  	      tp->type = last_token_type = token_type_assign;
    1070              else
    1071                {
    1072                  phase2_ungetc (c2);
    1073                  tp->type = last_token_type = token_type_logic_operator;
    1074                }
    1075              return;
    1076            }
    1077  
    1078          case '=':
    1079            {
    1080              int c2 = phase2_getc ();
    1081              switch (c2)
    1082                {
    1083                case '=':
    1084                  tp->type = last_token_type = token_type_equality_test_operator;
    1085                  break;
    1086                case '>':
    1087                  tp->type = last_token_type = token_type_other;
    1088                  break;
    1089                default:
    1090                  phase2_ungetc (c2);
    1091                  tp->type = last_token_type = token_type_assign;
    1092                  break;
    1093                }
    1094              return;
    1095            }
    1096  
    1097          case '!':
    1098            {
    1099              int c2 = phase2_getc ();
    1100              if (c2 == '=')
    1101                tp->type = last_token_type = token_type_equality_test_operator;
    1102              else
    1103                {
    1104                  phase2_ungetc (c2);
    1105                  tp->type = last_token_type = token_type_logic_operator;
    1106                }
    1107              return;
    1108            }
    1109  
    1110          case '>':
    1111          case '<':
    1112            {
    1113              int c2 = phase2_getc ();
    1114              if (c2 == '=')
    1115  	      tp->type = last_token_type = token_type_equality_test_operator;
    1116              else if (c2 == c)
    1117                {
    1118                  int c3 = phase2_getc ();
    1119                  if (c3 == '=')
    1120                    tp->type = last_token_type = token_type_assign;
    1121                  else
    1122                    {
    1123                      phase2_ungetc (c2);
    1124                      phase2_ungetc (c3);
    1125                      tp->type = last_token_type = token_type_other;
    1126                    }
    1127                }
    1128              else
    1129                {
    1130                  phase2_ungetc (c2);
    1131                  tp->type = last_token_type = token_type_equality_test_operator;
    1132                }
    1133              return;
    1134            }
    1135  
    1136          case ',':
    1137            tp->type = last_token_type = token_type_comma;
    1138            return;
    1139  
    1140          case ':':
    1141            tp->type = last_token_type = token_type_colon;
    1142            return;
    1143  
    1144          case '&':
    1145          case '|':
    1146            {
    1147              int c2 = phase2_getc ();
    1148              if (c2 == c)
    1149  	      tp->type = last_token_type = token_type_logic_operator;
    1150              else if (c2 == '=')
    1151  	      tp->type = last_token_type = token_type_assign;
    1152              else
    1153                {
    1154                  phase2_ungetc (c2);
    1155                  tp->type = last_token_type = token_type_arithmetic_operator;
    1156                }
    1157              return;
    1158            }
    1159  
    1160          case '?':
    1161            {
    1162              int c2 = phase2_getc ();
    1163              if (c2 == '?')
    1164                tp->type = last_token_type = token_type_logic_operator;
    1165              else
    1166                {
    1167                  phase2_ungetc (c2);
    1168                  tp->type = last_token_type = token_type_question;
    1169                }
    1170              return;
    1171            }
    1172  
    1173          default:
    1174            tp->type = last_token_type = token_type_other;
    1175            return;
    1176          }
    1177      }
    1178  #undef APPEND
    1179  }
    1180  
    1181  static void
    1182  phase3_unget (token_ty *tp)
    1183  {
    1184    if (tp->type != token_type_eof)
    1185      {
    1186        if (phase3_pushback_length == SIZEOF (phase3_pushback))
    1187          abort ();
    1188        phase3_pushback[phase3_pushback_length++] = *tp;
    1189      }
    1190  }
    1191  
    1192  
    1193  /* String concatenation with '+'.  */
    1194  
    1195  static void
    1196  x_vala_lex (token_ty *tp)
    1197  {
    1198    phase3_get (tp);
    1199    if (tp->type == token_type_string_literal)
    1200      {
    1201        mixed_string_ty *sum = tp->mixed_string;
    1202  
    1203        for (;;)
    1204          {
    1205            token_ty token2;
    1206  
    1207            phase3_get (&token2);
    1208            if (token2.type == token_type_plus)
    1209              {
    1210                token_ty token3;
    1211  
    1212                phase3_get (&token3);
    1213                if (token3.type == token_type_string_literal)
    1214                  {
    1215                    sum = mixed_string_concat_free1 (sum, token3.mixed_string);
    1216  
    1217                    free_token (&token3);
    1218                    free_token (&token2);
    1219                    continue;
    1220                  }
    1221                phase3_unget (&token3);
    1222              }
    1223            phase3_unget (&token2);
    1224            break;
    1225          }
    1226        tp->mixed_string = sum;
    1227      }
    1228  }
    1229  
    1230  
    1231  /* ========================= Extracting strings.  ========================== */
    1232  
    1233  
/* Context lookup table.  Set by extract_vala from its FLAG_TABLE argument
   and consulted by extract_balanced for every symbol token.  */
static flag_context_list_table_ty *flag_context_list_table;


/* Maximum supported nesting depth of open parentheses.  extract_balanced
   reports a fatal error when nesting_depth would exceed it.  */
#define MAX_NESTING_DEPTH 1000

/* Current nesting depth.  Incremented by extract_balanced on each '(' token
   and decremented when the nested scan returns at its closing parenthesis;
   reset to 0 by extract_vala.  */
static int nesting_depth;
    1243  
    1244  
    1245  /* The file is broken into tokens.  Scan the token stream, looking for
    1246     a keyword, followed by a left paren, followed by a string.  When we
    1247     see this sequence, we have something to remember.  We assume we are
    1248     looking at a valid Vala program, and leave the complaints about the
    1249     grammar to the compiler.
    1250  
    1251       Normal handling: Look for
    1252         keyword ( ... msgid ... )
    1253         keyword msgid
    1254       Plural handling: Look for
    1255         keyword ( ... msgid ... msgid_plural ... )
    1256  
    1257     We use recursion because the arguments before msgid or between msgid
    1258     and msgid_plural can contain subexpressions of the same form.  */
    1259  
    1260  /* Extract messages until the next balanced closing parenthesis or bracket.
    1261     Extracted messages are added to MLP.
    1262     DELIM can be either token_type_rparen or token_type_rbracket, or
    1263     token_type_eof to accept both.
    1264     Return true upon eof, false upon closing parenthesis or bracket.  */
    1265  static bool
    1266  extract_balanced (message_list_ty *mlp, token_type_ty delim,
    1267                    flag_context_ty outer_context,
    1268                    flag_context_list_iterator_ty context_iter,
    1269                    struct arglist_parser *argparser)
    1270  {
    1271    /* Current argument number.  */
    1272    int arg = 1;
    1273    /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
    1274    int state;
    1275    /* Parameters of the keyword just seen.  Defined only in state 1.  */
    1276    const struct callshapes *next_shapes = NULL;
    1277    /* Context iterator that will be used if the next token is a '('.  */
    1278    flag_context_list_iterator_ty next_context_iter =
    1279      passthrough_context_list_iterator;
    1280    /* Current context.  */
    1281    flag_context_ty inner_context =
    1282      inherited_context (outer_context,
    1283                         flag_context_list_iterator_advance (&context_iter));
    1284  
    1285    /* Start state is 0.  */
    1286    state = 0;
    1287  
    1288    for (;;)
    1289      {
    1290        token_ty token;
    1291  
    1292        x_vala_lex (&token);
    1293  
    1294        switch (token.type)
    1295          {
    1296          case token_type_symbol:
    1297            {
    1298              void *keyword_value;
    1299  
    1300              if (hash_find_entry (&keywords, token.string, strlen (token.string),
    1301                                   &keyword_value)
    1302                  == 0)
    1303                {
    1304                  next_shapes = (const struct callshapes *) keyword_value;
    1305                  state = 1;
    1306                }
    1307              else
    1308                state = 0;
    1309            }
    1310            next_context_iter =
    1311              flag_context_list_iterator (
    1312                flag_context_list_table_lookup (
    1313                  flag_context_list_table,
    1314                  token.string, strlen (token.string)));
    1315            free (token.string);
    1316            continue;
    1317  
    1318          case token_type_lparen:
    1319            if (++nesting_depth > MAX_NESTING_DEPTH)
    1320              {
    1321                error_with_progname = false;
    1322                error (EXIT_FAILURE, 0, _("%s:%d: error: too many open parentheses"),
    1323                       logical_file_name, line_number);
    1324              }
    1325            if (extract_balanced (mlp, token_type_rparen,
    1326                                  inner_context, next_context_iter,
    1327                                  arglist_parser_alloc (mlp,
    1328                                                        state ? next_shapes : NULL)))
    1329              {
    1330                arglist_parser_done (argparser, arg);
    1331                return true;
    1332              }
    1333            nesting_depth--;
    1334            next_context_iter = null_context_list_iterator;
    1335            state = 0;
    1336            break;
    1337  
    1338          case token_type_rparen:
    1339            if (delim == token_type_rparen || delim == token_type_eof)
    1340              {
    1341                arglist_parser_done (argparser, arg);
    1342                return false;
    1343              }
    1344  
    1345            next_context_iter = null_context_list_iterator;
    1346            state = 0;
    1347            continue;
    1348  
    1349          case token_type_comma:
    1350            arg++;
    1351            inner_context =
    1352              inherited_context (outer_context,
    1353                                 flag_context_list_iterator_advance (
    1354                                   &context_iter));
    1355            next_context_iter = passthrough_context_list_iterator;
    1356            state = 0;
    1357            continue;
    1358  
    1359          case token_type_eof:
    1360            arglist_parser_done (argparser, arg);
    1361            return true;
    1362  
    1363          case token_type_string_literal:
    1364            {
    1365              lex_pos_ty pos;
    1366  
    1367              pos.file_name = logical_file_name;
    1368              pos.line_number = token.line_number;
    1369  
    1370              if (extract_all)
    1371                {
    1372                  char *string = mixed_string_contents (token.mixed_string);
    1373                  mixed_string_free (token.mixed_string);
    1374                  remember_a_message (mlp, NULL, string, true, false,
    1375                                      inner_context, &pos,
    1376                                      NULL, token.comment, false);
    1377                }
    1378              else
    1379                {
    1380                  /* A string immediately after a symbol means a function call.  */
    1381                  if (state)
    1382                    {
    1383                      struct arglist_parser *tmp_argparser;
    1384                      tmp_argparser = arglist_parser_alloc (mlp, next_shapes);
    1385  
    1386                      arglist_parser_remember (tmp_argparser, 1,
    1387                                               token.mixed_string, inner_context,
    1388                                               pos.file_name, pos.line_number,
    1389                                               token.comment, false);
    1390                      arglist_parser_done (tmp_argparser, 1);
    1391                    }
    1392                  else
    1393                    arglist_parser_remember (argparser, arg,
    1394                                             token.mixed_string, inner_context,
    1395                                             pos.file_name, pos.line_number,
    1396                                             token.comment, false);
    1397                }
    1398            }
    1399            drop_reference (token.comment);
    1400            next_context_iter = null_context_list_iterator;
    1401            state = 0;
    1402            continue;
    1403  
    1404          case token_type_character_constant:
    1405          case token_type_lbrace:
    1406          case token_type_rbrace:
    1407          case token_type_assign:
    1408          case token_type_return:
    1409          case token_type_plus:
    1410          case token_type_arithmetic_operator:
    1411          case token_type_equality_test_operator:
    1412          case token_type_logic_operator:
    1413          case token_type_question:
    1414          case token_type_colon:
    1415          case token_type_number:
    1416          case token_type_string_template:
    1417          case token_type_regex_literal:
    1418          case token_type_other:
    1419            next_context_iter = null_context_list_iterator;
    1420            state = 0;
    1421            continue;
    1422  
    1423          default:
    1424            abort ();
    1425          }
    1426      }
    1427  }
    1428  
    1429  void
    1430  extract_vala (FILE *f,
    1431                const char *real_filename, const char *logical_filename,
    1432                flag_context_list_table_ty *flag_table,
    1433                msgdomain_list_ty *mdlp)
    1434  {
    1435    message_list_ty *mlp = mdlp->item[0]->messages;
    1436  
    1437    fp = f;
    1438    real_file_name = real_filename;
    1439    logical_file_name = xstrdup (logical_filename);
    1440    line_number = 1;
    1441  
    1442    phase1_pushback_length = 0;
    1443  
    1444    last_comment_line = -1;
    1445    last_non_comment_line = -1;
    1446  
    1447    phase3_pushback_length = 0;
    1448    last_token_type = token_type_other;
    1449  
    1450    flag_context_list_table = flag_table;
    1451    nesting_depth = 0;
    1452  
    1453    init_keywords ();
    1454  
    1455    /* Eat tokens until eof is seen.  When extract_parenthesized returns
    1456       due to an unbalanced closing parenthesis, just restart it.  */
    1457    while (!extract_balanced (mlp, token_type_eof,
    1458                              null_context, null_context_list_iterator,
    1459                              arglist_parser_alloc (mlp, NULL)))
    1460      ;
    1461  
    1462    fp = NULL;
    1463    real_file_name = NULL;
    1464    logical_file_name = NULL;
    1465    line_number = 0;
    1466  }