1  /* xgettext Lua backend.
       2     Copyright (C) 2012-2013, 2016, 2018-2023 Free Software Foundation, Inc.
       3  
       4     This file was written by Ľubomír Remák <lubomirr@lubomirr.eu>, 2012.
       5  
       6     This program is free software: you can redistribute it and/or modify
       7     it under the terms of the GNU General Public License as published by
       8     the Free Software Foundation; either version 3 of the License, or
       9     (at your option) any later version.
      10  
      11     This program is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14     GNU General Public License for more details.
      15  
      16     You should have received a copy of the GNU General Public License
      17     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      18  
      19  #ifdef HAVE_CONFIG_H
      20  #include "config.h"
      21  #endif
      22  
      23  /* Specification.  */
      24  #include "x-lua.h"
      25  
      26  #include <errno.h>
      27  #include <stdbool.h>
      28  #include <stdio.h>
      29  #include <stdlib.h>
      30  
      31  #include "attribute.h"
      32  #include "message.h"
      33  #include "rc-str-list.h"
      34  #include "xgettext.h"
      35  #include "xg-pos.h"
      36  #include "xg-mixed-string.h"
      37  #include "xg-arglist-context.h"
      38  #include "xg-arglist-callshape.h"
      39  #include "xg-arglist-parser.h"
      40  #include "xg-message.h"
      41  #include "error.h"
      42  #include "error-progname.h"
      43  #include "xalloc.h"
      44  #include "gettext.h"
      45  #include "po-charset.h"
      46  
      47  #define _(s) gettext(s)
      48  
      49  #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
      50  
      51  /* The Lua syntax is defined in the Lua manual sections 3.1 and 9,
      52     which can be found at
      53     https://www.lua.org/manual/5.2/manual.html#3.1
      54     https://www.lua.org/manual/5.2/manual.html#9  */
      55  
      56  /* If true extract all strings.  */
      57  static bool extract_all = false;
      58  
      59  /* A hash table for keywords.  */
      60  static hash_table keywords;
      61  static bool default_keywords = true;
      62  
      63  /* Set extract_all flag (gettext will extract all strings).  */
      64  void
      65  x_lua_extract_all ()
      66  {
      67    extract_all = true;
      68  }
      69  
      70  /* Adds a keyword.  Copied from other lexers.  */
      71  void
      72  x_lua_keyword (const char *name)
      73  {
      74    if (name == NULL)
      75      default_keywords = false;
      76    else
      77      {
      78        const char *end;
      79        struct callshape shape;
      80        const char *colon;
      81  
      82        if (keywords.table == NULL)
      83          hash_init (&keywords, 100);
      84  
      85        split_keywordspec (name, &end, &shape);
      86  
      87        /* The characters between name and end should form a valid C identifier.
      88           A colon means an invalid parse in split_keywordspec().  */
      89        colon = strchr (name, ':');
      90        if (colon == NULL || colon >= end)
      91          insert_keyword_callshape (&keywords, name, end - name, &shape);
      92      }
      93  }
      94  
      95  /* Finish initializing the keywords hash table.
      96     Called after argument processing, before each file is processed.  */
      97  static void
      98  init_keywords ()
      99  {
     100    if (default_keywords)
     101      {
     102        /* When adding new keywords here, also update the documentation in
     103           xgettext.texi!  */
     104        x_lua_keyword ("_");
     105        x_lua_keyword ("gettext.gettext");
     106        x_lua_keyword ("gettext.dgettext:2");
     107        x_lua_keyword ("gettext.dcgettext:2");
     108        x_lua_keyword ("gettext.ngettext:1,2");
     109        x_lua_keyword ("gettext.dngettext:2,3");
     110        x_lua_keyword ("gettext.dcngettext:2,3");
     111        default_keywords = false;
     112      }
     113  }
     114  
     115  void
     116  init_flag_table_lua ()
     117  {
     118    xgettext_record_flag ("_:1:pass-lua-format");
     119    xgettext_record_flag ("gettext.gettext:1:pass-lua-format");
     120    xgettext_record_flag ("gettext.dgettext:2:pass-lua-format");
     121    xgettext_record_flag ("gettext.dcgettext:2:pass-lua-format");
     122    xgettext_record_flag ("gettext.ngettext:1:pass-lua-format");
     123    xgettext_record_flag ("gettext.ngettext:2:pass-lua-format");
     124    xgettext_record_flag ("gettext.dngettext:2:pass-lua-format");
     125    xgettext_record_flag ("gettext.dngettext:3:pass-lua-format");
     126    xgettext_record_flag ("gettext.dcngettext:2:pass-lua-format");
     127    xgettext_record_flag ("gettext.dcngettext:3:pass-lua-format");
     128    xgettext_record_flag ("string.format:1:lua-format");
     129  }
     130  
     131  
     132  /* ======================== Reading of characters.  ======================== */
     133  
     134  /* The input file stream.  */
     135  static FILE *fp;
     136  
     137  
     138  /* 1. line_number handling.  */
     139  
     140  static unsigned char phase1_pushback[2];
     141  static int phase1_pushback_length;
     142  
     143  static bool first_character;
     144  
     145  static int
     146  phase1_getc ()
     147  {
     148    int c;
     149  
     150    if (phase1_pushback_length)
     151      c = phase1_pushback[--phase1_pushback_length];
     152    else
     153      {
     154        c = getc (fp);
     155  
     156        if (first_character)
     157          {
     158            first_character = false;
     159  
     160            /* Ignore shebang line.  No pushback required in this case.  */
     161            if (c == '#')
     162              {
     163                while (c != '\n' && c != EOF)
     164                  c = getc (fp);
     165                if (c == '\n')
     166                  {
     167                    line_number++;
     168                    c = getc (fp);
     169                  }
     170              }
     171          }
     172  
     173        if (c == EOF)
     174          {
     175            if (ferror (fp))
     176              error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
     177                     real_file_name);
     178            return EOF;
     179          }
     180      }
     181  
     182    if (c == '\n')
     183      line_number++;
     184  
     185    return c;
     186  }
     187  
     188  /* Supports 2 characters of pushback.  */
     189  
     190  static void
     191  phase1_ungetc (int c)
     192  {
     193    if (c != EOF)
     194      {
     195        if (c == '\n')
     196          --line_number;
     197  
     198        if (phase1_pushback_length == SIZEOF (phase1_pushback))
     199          abort ();
     200        phase1_pushback[phase1_pushback_length++] = c;
     201      }
     202  }
     203  
     204  
     205  /* These are for tracking whether comments count as immediately before
     206     keyword.  */
     207  static int last_comment_line;
     208  static int last_non_comment_line;
     209  
     210  /* Accumulating comments.  */
     211  
     212  static char *buffer;
     213  static size_t bufmax;
     214  static size_t buflen;
     215  
     216  static inline void
     217  comment_start ()
     218  {
     219    buflen = 0;
     220  }
     221  
     222  static inline void
     223  comment_add (int c)
     224  {
     225    if (buflen >= bufmax)
     226      {
     227        bufmax = 2 * bufmax + 10;
     228        buffer = xrealloc (buffer, bufmax);
     229      }
     230    buffer[buflen++] = c;
     231  }
     232  
     233  static inline void
     234  comment_line_end (size_t chars_to_remove)
     235  {
     236    buflen -= chars_to_remove;
     237    while (buflen >= 1
     238           && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
     239      --buflen;
     240    if (chars_to_remove == 0 && buflen >= bufmax)
     241      {
     242        bufmax = 2 * bufmax + 10;
     243        buffer = xrealloc (buffer, bufmax);
     244      }
     245    buffer[buflen] = '\0';
     246    savable_comment_add (buffer);
     247  }
     248  
     249  /* Eats characters until '\n' and adds them to the comment.  */
     250  static void
     251  eat_comment_line ()
     252  {
     253    for (;;)
     254      {
     255        int c = phase1_getc ();
     256        if (c == '\n' || c == EOF)
     257          {
     258            comment_line_end (0);
     259            break;
     260          }
     261  
     262        if (!(buflen == 0 && (c == ' ' || c == '\t')))
     263          comment_add (c);
     264      }
     265  }
     266  
     267  static int
     268  phase2_getc ()
     269  {
     270    int c;
     271    int lineno;
     272  
     273    c = phase1_getc ();
     274  
     275    if (c == '-')
     276      {
     277        c = phase1_getc ();
     278  
     279        if (c == '-')
     280          {
     281            /* It starts with '--', so it must be either a short or a long
     282               comment.  */
     283            c = phase1_getc ();
     284  
     285            if (c == '[')
     286              {
     287                c = phase1_getc ();
     288  
     289                int esigns = 0;
     290                while (c == '=')
     291                  {
     292                    esigns++;
     293                    c = phase1_getc ();
     294                  }
     295  
     296                if (c == '[')
     297                  {
     298                    /* Long comment.  */
     299                    bool right_bracket = false;
     300                    bool end = false;
     301                    int esigns2 = 0;
     302  
     303                    lineno = line_number;
     304                    comment_start ();
     305                    while (!end)
     306                      {
     307                        c = phase1_getc ();
     308  
     309                        if (c == EOF)
     310                          break;
     311  
     312                        /* Ignore leading spaces and tabs.  */
     313                        if (!(buflen == 0 && (c == ' ' || c == '\t')))
     314                          {
     315                            comment_add (c);
     316  
     317                            switch (c)
     318                              {
     319                              case ']':
     320                                if (!right_bracket)
     321                                  {
     322                                    right_bracket = true;
     323                                    esigns2 = 0;
     324                                  }
     325                                else
     326                                  {
     327                                    if (esigns2 == esigns)
     328                                      {
     329                                        comment_line_end (2 + esigns);
     330                                        end = true;
     331                                      }
     332                                  }
     333                                break;
     334  
     335                              case '=':
     336                                if (right_bracket)
     337                                  esigns2++;
     338                                break;
     339  
     340                              case '\n':
     341                                comment_line_end (1);
     342                                comment_start ();
     343                                lineno = line_number;
     344                                FALLTHROUGH;
     345                              default:
     346                                right_bracket = false;
     347                              }
     348                          }
     349                      }
     350                    last_comment_line = lineno;
     351                    return ' ';
     352                  }
     353                else
     354                  {
     355                    /* One line (short) comment, starting with '--[=...='.  */
     356                    lineno = last_comment_line;
     357                    comment_start ();
     358                    comment_add ('[');
     359                    while (esigns--)
     360                      comment_add ('=');
     361                    phase1_ungetc (c);
     362                    eat_comment_line ();
     363                    last_comment_line = lineno;
     364                    return '\n';
     365                  }
     366              }
     367            else
     368              {
     369                /* One line (short) comment.  */
     370                lineno = line_number;
     371                comment_start ();
     372                phase1_ungetc (c);
     373                eat_comment_line ();
     374                last_comment_line = lineno;
     375                return '\n';
     376              }
     377          }
     378        else
     379          {
     380            /* Minus sign.  */
     381            phase1_ungetc (c);
     382            return '-';
     383          }
     384      }
     385    else
     386      return c;
     387  }
     388  
     389  
     390  /* ========================== Reading of tokens.  ========================== */
     391  
     392  enum token_type_ty
     393  {
     394    token_type_eof,
     395    token_type_lparen,            /* ( */
     396    token_type_rparen,            /* ) */
     397    token_type_lbracket,          /* [ */
     398    token_type_rbracket,          /* ] */
     399    token_type_comma,             /* , */
     400    token_type_dot,               /* . */
     401    token_type_doubledot,         /* .. */
     402    token_type_operator1,         /* + - * / % not # - ^ */
     403    token_type_operator2,         /* < > <= >= ~= == and or */
     404    token_type_string,
     405    token_type_number,
     406    token_type_symbol,
     407    token_type_other
     408  };
     409  
     410  typedef enum token_type_ty token_type_ty;
     411  
     412  typedef struct token_ty token_ty;
     413  struct token_ty
     414  {
     415    token_type_ty type;
     416    char *string; /* for token_type_string_literal, token_type_symbol */
     417    refcounted_string_list_ty *comment;  /* for token_type_string_literal */
     418    int line_number;
     419  };
     420  
     421  /* Free the memory pointed to by a 'struct token_ty'.  */
     422  static inline void
     423  free_token (token_ty *tp)
     424  {
     425    if (tp->type == token_type_string || tp->type == token_type_symbol)
     426      free (tp->string);
     427    if (tp->type == token_type_string)
     428      drop_reference (tp->comment);
     429  }
     430  
     431  /* Our current string.  */
     432  static int string_buf_length;
     433  static int string_buf_alloc;
     434  static char *string_buf;
     435  
     436  static void
     437  string_start ()
     438  {
     439    string_buf_length = 0;
     440  }
     441  
     442  static void
     443  string_add (int c)
     444  {
     445    if (string_buf_length >= string_buf_alloc)
     446      {
     447        string_buf_alloc = 2 * string_buf_alloc + 10;
     448        string_buf = xrealloc (string_buf, string_buf_alloc);
     449      }
     450  
     451    string_buf[string_buf_length++] = c;
     452  }
     453  
     454  static void
     455  string_end ()
     456  {
     457    if (string_buf_length >= string_buf_alloc)
     458      {
     459        string_buf_alloc = string_buf_alloc + 1;
     460        string_buf = xrealloc (string_buf, string_buf_alloc);
     461      }
     462  
     463    string_buf[string_buf_length] = '\0';
     464  }
     465  
     466  
     467  /* We need 3 pushback tokens for string optimization.  */
     468  static int phase3_pushback_length;
     469  static token_ty phase3_pushback[3];
     470  
     471  
     472  static void
     473  phase3_unget (token_ty *tp)
     474  {
     475    if (tp->type != token_type_eof)
     476      {
     477        if (phase3_pushback_length == SIZEOF (phase3_pushback))
     478          abort ();
     479        phase3_pushback[phase3_pushback_length++] = *tp;
     480      }
     481  }
     482  
     483  static void
     484  phase3_get (token_ty *tp)
     485  {
     486    int c;
     487    int c2;
     488    int c_start;
     489  
     490    if (phase3_pushback_length)
     491      {
     492        *tp = phase3_pushback[--phase3_pushback_length];
     493        return;
     494      }
     495  
     496    tp->string = NULL;
     497  
     498    for (;;)
     499      {
     500        tp->line_number = line_number;
     501        c = phase2_getc ();
     502  
     503        switch (c)
     504          {
     505          case EOF:
     506            tp->type = token_type_eof;
     507            return;
     508  
     509          case '\n':
     510            if (last_non_comment_line > last_comment_line)
     511              savable_comment_reset ();
     512            FALLTHROUGH;
     513          case ' ':
     514          case '\t':
     515          case '\f':
     516            continue;
     517  
     518          case '+':
     519          case '-':
     520          case '*':
     521          case '/':
     522          case '^':
     523          case '%':
     524          case '#':
     525            tp->type = token_type_operator1;
     526            return;
     527          case '<':
     528          case '>':
     529          case '=':
     530            c2 = phase1_getc ();
     531            if (c2 != '=')
     532              phase1_ungetc (c2);
     533            tp->type = token_type_operator2;
     534            return;
     535          case '~':
     536            c2 = phase1_getc ();
     537            if (c2 == '=')
     538              {
     539                tp->type = token_type_operator2;
     540                return;
     541              }
     542            else
     543              phase1_ungetc (c2);
     544            continue;
     545          case '(':
     546            tp->type = token_type_lparen;
     547            return;
     548          case ')':
     549            tp->type = token_type_rparen;
     550            return;
     551          case ',':
     552            tp->type = token_type_comma;
     553            return;
     554  
     555          case ';':
     556            tp->type = token_type_other;
     557            return;
     558  
     559            /* There are three operators beginning with a dot.  '.',
     560               '..' and '...'.  The most useful for us is the string
     561               concatenation operator ('..').  */
     562          case '.':
     563            c = phase1_getc ();
     564            if (c == '.')
     565              {
     566                c = phase1_getc ();
     567                if (c == '.')
     568                  {
     569                    tp->type = token_type_other;
     570                    return;
     571                  }
     572                else
     573                  {
     574                    phase1_ungetc (c);
     575                    tp->type = token_type_doubledot;
     576                    return;
     577                  }
     578              }
     579            else if (c >= '0' && c <= '9')
     580              {
     581                /* It's a number.  We aren't interested in the actual
     582                   numeric value, so ignore the dot and let next
     583                   iteration eat the number.  */
     584                phase1_ungetc (c);
     585                continue;
     586              }
     587            else
     588              {
     589                phase1_ungetc (c);
     590                tp->type = token_type_dot;
     591                return;
     592              }
     593  
     594          case '"':
     595          case '\'':
     596            c_start = c;
     597            string_start ();
     598  
     599            for (;;)
     600              {
     601                /* We need unprocessed characters from phase 1.  */
     602                c = phase1_getc ();
     603  
     604                if (c == EOF || c == c_start || c == '\n')
     605                  {
     606                    /* End of string.  */
     607                    string_end ();
     608                    tp->string = xstrdup (string_buf);
     609                    tp->comment = add_reference (savable_comment);
     610                    tp->type = token_type_string;
     611                    return;
     612                  }
     613  
     614                /* We got '\', this is probably an escape sequence.  */
     615                if (c == '\\')
     616                  {
     617                    c = phase1_getc ();
     618                    switch (c)
     619                      {
     620                      case 'a':
     621                        string_add ('\a');
     622                        break;
     623                      case 'b':
     624                        string_add ('\b');
     625                        break;
     626                      case 'f':
     627                        string_add ('\f');
     628                        break;
     629                      case 'n':
     630                        string_add ('\n');
     631                        break;
     632                      case 'r':
     633                        string_add ('\r');
     634                        break;
     635                      case 't':
     636                        string_add ('\t');
     637                        break;
     638                      case 'v':
     639                        string_add ('\v');
     640                        break;
     641                      case 'x':
     642                        {
     643                          int num = 0;
     644                          int i = 0;
     645  
     646                          for (i = 0; i < 2; i++)
     647                            {
     648                              c = phase1_getc ();
     649                              if (c >= '0' && c <= '9')
     650                                num += c - '0';
     651                              else if (c >= 'a' && c <= 'f')
     652                                num += c - 'a' + 10;
     653                              else if (c >= 'A' && c <= 'F')
     654                                num += c - 'A' + 10;
     655                              else
     656                                {
     657                                  phase1_ungetc (c);
     658                                  break;
     659                                }
     660  
     661                              if (i == 0)
     662                                num *= 16;
     663                            }
     664  
     665                          if (i == 2)
     666                            string_add (num);
     667                        }
     668  
     669                        break;
     670                      case 'z':
     671                        /* Ignore the following whitespace.  */
     672                        do
     673                          {
     674                            c = phase1_getc ();
     675                          }
     676                        while (c == ' ' || c == '\n' || c == '\t' || c == '\r'
     677                               || c == '\f' || c == '\v');
     678  
     679                        phase1_ungetc (c);
     680  
     681                        break;
     682                      default:
     683                        /* Check if it's a '\ddd' sequence.  */
     684                        if (c >= '0' && c <= '9')
     685                          {
     686                            int num = 0;
     687                            int i = 0;
     688  
     689                            while (c >= '0' && c <= '9' && i < 3)
     690                              {
     691                                num *= 10;
     692                                num += (c - '0');
     693                                c = phase1_getc ();
     694                                i++;
     695                              }
     696  
     697                            /* The last read character is either a
     698                               non-number or another number after our
     699                               '\ddd' sequence.  We need to ungetc it.  */
     700                            phase1_ungetc (c);
     701  
     702                            /* The sequence number is too big, this
     703                               causes a lexical error.  Ignore it.  */
     704                            if (num < 256)
     705                              string_add (num);
     706                          }
     707                        else
     708                          string_add (c);
     709                      }
     710                  }
     711                else
     712                  string_add (c);
     713              }
     714            break;
     715  
     716          case '[':
     717            c = phase1_getc ();
     718  
     719            /* Count the number of equal signs.  */
     720            int esigns = 0;
     721            while (c == '=')
     722              {
     723                esigns++;
     724                c = phase1_getc ();
     725              }
     726  
     727            if (c != '[')
     728              {
     729                /* We did not find what we were looking for, ungetc it.  */
     730                phase1_ungetc (c);
     731                if (esigns == 0)
     732                  {
     733                    /* Our current character isn't '[' and we got 0 equal
     734                       signs, so the first '[' must have been a left
     735                       bracket.  */
     736                    tp->type = token_type_lbracket;
     737                    return;
     738                  }
     739                else
     740                  /* Lexical error, ignore it.  */
     741                  continue;
     742              }
     743  
     744            /* Found an opening long bracket.  */
     745            string_start ();
     746  
     747            /* See if it is immediately followed by a newline.  */
     748            c = phase1_getc ();
     749            if (c != '\n')
     750              phase1_ungetc (c);
     751  
     752            for (;;)
     753              {
     754                c = phase1_getc ();
     755  
     756                if (c == EOF)
     757                  {
     758                    string_end ();
     759                    tp->string = xstrdup (string_buf);
     760                    tp->comment = add_reference (savable_comment);
     761                    tp->type = token_type_string;
     762                    return;
     763                  }
     764                if (c == ']')
     765                  {
     766                    c = phase1_getc ();
     767  
     768                    /* Count the number of equal signs.  */
     769                    int esigns2 = 0;
     770                    while (c == '=')
     771                      {
     772                        esigns2++;
     773                        c = phase1_getc ();
     774                      }
     775  
     776                    if (c == ']' && esigns == esigns2)
     777                      {
     778                        /* We got ']==...==]', where the number of equal
     779                           signs matches the number of equal signs in
     780                           the opening bracket.  */
     781                        string_end ();
     782                        tp->string = xstrdup (string_buf);
     783                        tp->comment = add_reference (savable_comment);
     784                        tp->type = token_type_string;
     785                        return;
     786                      }
     787                    else
     788                      {
     789                        /* Otherwise we got either ']==' garbage or
     790                           ']==...==]' with a different number of equal
     791                           signs.
     792  
     793                           Add ']' and equal signs to the string, and
     794                           ungetc the current character, because the
     795                           second ']' might be a part of another closing
     796                           long bracket, e.g. '==]===]'.  */
     797                        phase1_ungetc (c);
     798  
     799                        string_add (']');
     800                        while (esigns2--)
     801                          string_add ('=');
     802                      }
     803                  }
     804                else
     805                  string_add (c);
     806              }
     807            break;
     808  
     809          case ']':
     810            tp->type = token_type_rbracket;
     811            return;
     812  
     813          default:
     814            if (c >= '0' && c <= '9')
     815              {
     816                while (c >= '0' && c <= '9')
     817                  c = phase1_getc ();
     818  
     819                if (c == '.')
     820                  {
     821                    c = phase1_getc ();
     822                    while (c >= '0' && c <= '9')
     823                      c = phase1_getc ();
     824                  }
     825  
     826                if (c == 'e' || c == 'E')
     827                  {
     828                    if (c == '+' || c == '-')
     829                      c = phase1_getc ();
     830                    while (c >= '0' && c <= '9')
     831                      c = phase1_getc ();
     832                  }
     833  
     834                phase1_ungetc (c);
     835  
     836                tp->type = token_type_number;
     837                return;
     838              }
     839            else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
     840                     || c == '_')
     841              {
     842                string_start ();
     843                while ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
     844                       || c == '_' || (c >= '0' && c <= '9'))
     845                  {
     846                    string_add (c);
     847                    c = phase1_getc ();
     848                  }
     849                string_end ();
     850                phase1_ungetc (c);
     851  
     852                if (strcmp (string_buf, "not") == 0)
     853                  tp->type = token_type_operator1;
     854                else if (strcmp (string_buf, "and") == 0)
     855                  tp->type = token_type_operator2;
     856                else if (strcmp (string_buf, "or") == 0)
     857                  tp->type = token_type_operator2;
     858                else
     859                  {
     860                    tp->string = xstrdup (string_buf);
     861                    tp->type = token_type_symbol;
     862                  }
     863                return;
     864              }
     865            else
     866              tp->type = token_type_other;
     867          }
     868      }
     869  }
     870  
     871  /* String and symbol concatenation.  */
     872  
/* Type of the token most recently handed out by phase4_get (whether it
   came from the pushback stack or from phase 3).  phase4_get consults it
   to decide whether a string literal may start a '..' concatenation.
   extract_lua resets it to token_type_eof before scanning a file.  */
static token_type_ty phase4_last;

/* We need 3 pushback tokens for string and symbol concatenation.  */
/* phase4_pushback is used as a stack: phase4_unget pushes onto it and
   phase4_get pops from it, so tokens come back in the reverse order of
   their pushback.  phase4_pushback_length is the number of tokens
   currently stored.  */
static int phase4_pushback_length;
static token_ty phase4_pushback[3];
     878  
     879  static void
     880  phase4_unget (token_ty *tp)
     881  {
     882    if (tp->type != token_type_eof)
     883      {
     884        if (phase4_pushback_length == SIZEOF (phase4_pushback))
     885          abort ();
     886        phase4_pushback[phase4_pushback_length++] = *tp;
     887      }
     888  }
     889  
     890  static void
     891  phase4_get (token_ty *tp)
     892  {
     893    if (phase4_pushback_length)
     894      {
     895        *tp = phase4_pushback[--phase4_pushback_length];
     896        phase4_last = tp->type;
     897        return;
     898      }
     899  
     900    phase3_get (tp);
     901    if (tp->type == token_type_string
     902        && !(phase4_last == token_type_operator1
     903             || phase4_last == token_type_dot
     904             || phase4_last == token_type_symbol
     905             || phase4_last == token_type_doubledot
     906             || phase4_last == token_type_rparen))
     907      {
     908        char *sum = tp->string;
     909        size_t sum_len = strlen (sum);
     910  
     911        for (;;)
     912          {
     913            token_ty token2;
     914  
     915            phase3_get (&token2);
     916            if (token2.type == token_type_doubledot)
     917              {
     918                token_ty token3;
     919  
     920                phase3_get (&token3);
     921                if (token3.type == token_type_string)
     922                  {
     923                    token_ty token_after;
     924  
     925                    phase3_get (&token_after);
     926                    if (token_after.type != token_type_operator1)
     927                      {
     928                        char *addend = token3.string;
     929                        size_t addend_len = strlen (addend);
     930  
     931                        sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
     932                        memcpy (sum + sum_len, addend, addend_len + 1);
     933                        sum_len += addend_len;
     934  
     935                        phase3_unget (&token_after);
     936                        free_token (&token3);
     937                        free_token (&token2);
     938                        continue;
     939                      }
     940                    phase3_unget (&token_after);
     941                  }
     942                phase3_unget (&token3);
     943              }
     944            phase3_unget (&token2);
     945            break;
     946          }
     947        tp->string = sum;
     948      }
     949    phase4_last = tp->type;
     950  }
     951  
     952  static void
     953  phase5_get (token_ty *tp)
     954  {
     955    phase4_get (tp);
     956  
     957    /* Combine symbol1 . ... . symbolN to a single strings, so that
     958       we can recognize function calls like
     959       gettext.gettext.  The information present for
     960       symbolI.....symbolN has precedence over the information for
     961       symbolJ.....symbolN with J > I.  */
     962    if (tp->type == token_type_symbol)
     963      {
     964        char *sum = tp->string;
     965        size_t sum_len = strlen (sum);
     966  
     967        for (;;)
     968          {
     969            token_ty token2;
     970  
     971            phase4_get (&token2);
     972            if (token2.type == token_type_dot)
     973              {
     974                token_ty token3;
     975  
     976                phase4_get (&token3);
     977                if (token3.type == token_type_symbol)
     978                  {
     979                    char *addend = token3.string;
     980                    size_t addend_len = strlen (addend);
     981  
     982                    sum = (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
     983                    sum[sum_len] = '.';
     984                    memcpy (sum + sum_len + 1, addend, addend_len + 1);
     985                    sum_len += 1 + addend_len;
     986  
     987                    free_token (&token2);
     988                    free_token (&token3);
     989                    continue;
     990                  }
     991                phase4_unget (&token3);
     992              }
     993            phase4_unget (&token2);
     994            break;
     995          }
     996        tp->string = sum;
     997      }
     998  }
     999  
/* Store the next token of the input in *TOK.  This is the lexer entry
   point used by extract_balanced; it simply forwards to phase5_get, the
   last lexer phase.  */
static void
x_lua_lex (token_ty *tok)
{
  phase5_get (tok);
}
    1005  
    1006  
    1007  /* ========================= Extracting strings.  ========================== */
    1008  
    1009  
/* Context lookup table.  Set by extract_lua from its FLAG_TABLE argument
   and queried by extract_balanced for every symbol token.  */
static flag_context_list_table_ty *flag_context_list_table;


/* Maximum supported nesting depth.  extract_balanced reports a fatal
   error when the number of open parentheses or of open brackets exceeds
   this value.  */
#define MAX_NESTING_DEPTH 1000

/* Current nesting depths.  Reset to 0 by extract_lua; incremented by
   extract_balanced when it descends into a '(' resp. '[' and decremented
   again when the matching recursive call returns without hitting eof.  */
static int paren_nesting_depth;
static int bracket_nesting_depth;
    1020  
    1021  
    1022  /* The file is broken into tokens.  Scan the token stream, looking for
    1023     a keyword, followed by a left paren, followed by a string.  When we
    1024     see this sequence, we have something to remember.  We assume we are
    1025     looking at a valid Lua program, and leave the complaints about the
    1026     grammar to the compiler.
    1027  
    1028       Normal handling: Look for
    1029         keyword ( ... msgid ... )
    1030         keyword msgid
    1031       Plural handling: Look for
    1032         keyword ( ... msgid ... msgid_plural ... )
    1033  
    1034     We use recursion because the arguments before msgid or between msgid
    1035     and msgid_plural can contain subexpressions of the same form.  */
    1036  
/* Extract messages until the next balanced closing parenthesis or bracket.
   Extracted messages are added to MLP.
   DELIM can be either token_type_rparen or token_type_rbracket, or
   token_type_eof to accept both.
   OUTER_CONTEXT is the flag context of the enclosing level; combined with
   the entries delivered by CONTEXT_ITER (advanced once per argument) it
   yields the context of each argument at this level.
   ARGPARSER collects the string arguments seen at this level.  It is
   handed to arglist_parser_done on every return path, so the caller must
   not use it afterwards.
   Return true upon eof, false upon closing parenthesis or bracket.  */
static bool
extract_balanced (message_list_ty *mlp, token_type_ty delim,
                  flag_context_ty outer_context,
                  flag_context_list_iterator_ty context_iter,
                  struct arglist_parser *argparser)
{
  /* Current argument number.  */
  int arg = 1;
  /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
  int state;
  /* Parameters of the keyword just seen.  Defined only in state 1.  */
  const struct callshapes *next_shapes = NULL;
  /* Context iterator that will be used if the next token is a '('.  */
  flag_context_list_iterator_ty next_context_iter =
    passthrough_context_list_iterator;
  /* Current context.  */
  flag_context_ty inner_context =
    inherited_context (outer_context,
                       flag_context_list_iterator_advance (&context_iter));

  /* Start state is 0.  */
  state = 0;

  /* Note: inside the switch below, 'continue' and 'break' both lead to
     the next loop iteration, since nothing follows the switch.  */
  for (;;)
    {
      token_ty token;

      x_lua_lex (&token);

      switch (token.type)
        {
        case token_type_symbol:
          {
            void *keyword_value;

            /* A symbol found in the keywords table arms state 1, so that
               a directly following '(' or string literal is treated as
               the argument list of that keyword.  */
            if (hash_find_entry (&keywords, token.string, strlen (token.string),
                                 &keyword_value)
                == 0)
              {
                next_shapes = (const struct callshapes *) keyword_value;
                state = 1;
              }
            else
              state = 0;
          }
          /* Independently of the keyword lookup, fetch the flag contexts
             registered for this symbol; they apply if a '(' follows.  */
          next_context_iter =
            flag_context_list_iterator (
              flag_context_list_table_lookup (
                flag_context_list_table,
                token.string, strlen (token.string)));
          free (token.string);
          continue;

        case token_type_lparen:
          if (++paren_nesting_depth > MAX_NESTING_DEPTH)
            {
              error_with_progname = false;
              error (EXIT_FAILURE, 0, _("%s:%d: error: too many open parentheses"),
                     logical_file_name, line_number);
            }
          /* Process the parenthesized part recursively, with a fresh
             argument list parser.  The call shapes are passed along only
             when the '(' directly follows a keyword.  */
          if (extract_balanced (mlp, token_type_rparen,
                                inner_context, next_context_iter,
                                arglist_parser_alloc (mlp,
                                                      state ? next_shapes : NULL)))
            {
              /* The nested level ran into eof: finish this level too and
                 propagate the eof indication.  */
              arglist_parser_done (argparser, arg);
              return true;
            }
          paren_nesting_depth--;
          next_context_iter = null_context_list_iterator;
          state = 0;
          break;

        case token_type_rparen:
          if (delim == token_type_rparen || delim == token_type_eof)
            {
              arglist_parser_done (argparser, arg);
              return false;
            }

          /* A ')' while waiting for ']': skip it and keep scanning.  */
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case token_type_lbracket:
          if (++bracket_nesting_depth > MAX_NESTING_DEPTH)
            {
              error_with_progname = false;
              error (EXIT_FAILURE, 0, _("%s:%d: error: too many open brackets"),
                     logical_file_name, line_number);
            }
          /* The contents of brackets are scanned with a null context and
             without call shapes, whatever token preceded the '['.  */
          if (extract_balanced (mlp, token_type_rbracket,
                                null_context, null_context_list_iterator,
                                arglist_parser_alloc (mlp, NULL)))
            {
              arglist_parser_done (argparser, arg);
              return true;
            }
          bracket_nesting_depth--;
          next_context_iter = null_context_list_iterator;
          state = 0;
          break;

        case token_type_rbracket:
          if (delim == token_type_rbracket || delim == token_type_eof)
            {
              arglist_parser_done (argparser, arg);
              return false;
            }

          /* A ']' while waiting for ')': skip it and keep scanning.  */
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case token_type_comma:
          /* Move on to the next argument and to its flag context.  */
          arg++;
          inner_context =
            inherited_context (outer_context,
                               flag_context_list_iterator_advance (
                                 &context_iter));
          next_context_iter = passthrough_context_list_iterator;
          state = 0;
          continue;

        case token_type_eof:
          arglist_parser_done (argparser, arg);
          return true;

        case token_type_string:
          {
            lex_pos_ty pos;
            pos.file_name = logical_file_name;
            pos.line_number = token.line_number;

            /* With extract_all, every string literal is recorded as a
               message right away.  token.string is not freed on this
               path (presumably remember_a_message takes ownership of
               it -- confirm against its contract).  */
            if (extract_all)
              remember_a_message (mlp, NULL, token.string, false, false,
                                  inner_context, &pos,
                                  NULL, token.comment, false);
            else
              {
                mixed_string_ty *ms =
                  mixed_string_alloc_simple (token.string, lc_string,
                                             pos.file_name, pos.line_number);
                free (token.string);
                /* A string immediately after a symbol means a function call.  */
                if (state)
                  {
                    /* Treat 'keyword "string"' as a call with exactly
                       one argument, handled by a parser of its own.  */
                    struct arglist_parser *tmp_argparser;
                    tmp_argparser = arglist_parser_alloc (mlp, next_shapes);

                    arglist_parser_remember (tmp_argparser, 1, ms,
                                             inner_context,
                                             pos.file_name, pos.line_number,
                                             token.comment, false);
                    arglist_parser_done (tmp_argparser, 1);
                  }
                else
                  /* Otherwise the string is argument number ARG of the
                     call being parsed at this level.  */
                  arglist_parser_remember (argparser, arg, ms,
                                           inner_context,
                                           pos.file_name, pos.line_number,
                                           token.comment, false);
              }
          }
          drop_reference (token.comment);
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        /* All other tokens only cancel a pending keyword.  */
        case token_type_dot:
        case token_type_doubledot:
        case token_type_operator1:
        case token_type_operator2:
        case token_type_number:
        case token_type_other:
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        default:
          abort ();
        }
    }
}
    1225  
    1226  void
    1227  extract_lua (FILE *f,
    1228               const char *real_filename, const char *logical_filename,
    1229               flag_context_list_table_ty *flag_table,
    1230               msgdomain_list_ty *mdlp)
    1231  {
    1232    message_list_ty *mlp = mdlp->item[0]->messages;
    1233  
    1234    fp = f;
    1235    real_file_name = real_filename;
    1236    logical_file_name = xstrdup (logical_filename);
    1237    line_number = 1;
    1238  
    1239    phase1_pushback_length = 0;
    1240    first_character = true;
    1241  
    1242    last_comment_line = -1;
    1243    last_non_comment_line = -1;
    1244  
    1245    phase3_pushback_length = 0;
    1246  
    1247    phase4_last = token_type_eof;
    1248    phase4_pushback_length = 0;
    1249  
    1250    flag_context_list_table = flag_table;
    1251    paren_nesting_depth = 0;
    1252    bracket_nesting_depth = 0;
    1253  
    1254    init_keywords ();
    1255  
    1256    /* Eat tokens until eof is seen.  When extract_parenthesized returns
    1257       due to an unbalanced closing parenthesis, just restart it.  */
    1258    while (!extract_balanced (mlp, token_type_eof,
    1259                              null_context, null_context_list_iterator,
    1260                              arglist_parser_alloc (mlp, NULL)))
    1261      ;
    1262  
    1263    fp = NULL;
    1264    real_file_name = NULL;
    1265    logical_file_name = NULL;
    1266    line_number = 0;
    1267  }