1  /* xgettext Python backend.
       2     Copyright (C) 2002-2003, 2005-2011, 2013-2014, 2018-2023 Free Software Foundation, Inc.
       3  
       4     This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
       5  
       6     This program is free software: you can redistribute it and/or modify
       7     it under the terms of the GNU General Public License as published by
       8     the Free Software Foundation; either version 3 of the License, or
       9     (at your option) any later version.
      10  
      11     This program is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14     GNU General Public License for more details.
      15  
      16     You should have received a copy of the GNU General Public License
      17     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      18  
      19  #ifdef HAVE_CONFIG_H
      20  # include "config.h"
      21  #endif
      22  
      23  /* Specification.  */
      24  #include "x-python.h"
      25  
      26  #include <assert.h>
      27  #include <errno.h>
      28  #include <stdbool.h>
      29  #include <stdio.h>
      30  #include <stdlib.h>
      31  #include <string.h>
      32  
      33  #include "attribute.h"
      34  #include "message.h"
      35  #include "rc-str-list.h"
      36  #include "xgettext.h"
      37  #include "xg-pos.h"
      38  #include "xg-encoding.h"
      39  #include "xg-mixed-string.h"
      40  #include "xg-arglist-context.h"
      41  #include "xg-arglist-callshape.h"
      42  #include "xg-arglist-parser.h"
      43  #include "xg-message.h"
      44  #include "error.h"
      45  #include "error-progname.h"
      46  #include "progname.h"
      47  #include "basename-lgpl.h"
      48  #include "xerror.h"
      49  #include "xvasprintf.h"
      50  #include "xalloc.h"
      51  #include "c-strstr.h"
      52  #include "c-ctype.h"
      53  #include "po-charset.h"
      54  #include "uniname.h"
      55  #include "unistr.h"
      56  #include "gettext.h"
      57  
      58  #define _(s) gettext(s)
      59  
      60  #undef max /* clean up after MSVC's <stdlib.h> */
      61  #define max(a,b) ((a) > (b) ? (a) : (b))
      62  
      63  #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
      64  
      65  
      66  /* The Python syntax is defined in the Python Reference Manual
      67     /usr/share/doc/packages/python/html/ref/index.html.
      68     See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c,
      69     Python-2.0/Objects/unicodeobject.c.  */
      70  
      71  
      72  /* ====================== Keyword set customization.  ====================== */
      73  
/* If true extract all strings.  */
static bool extract_all = false;

/* Table of the keywords to look for.  It is created lazily by
   x_python_keyword on the first insertion.  */
static hash_table keywords;
/* True as long as the built-in default keywords still have to be registered
   by init_keywords.  Cleared by x_python_keyword (NULL) and by init_keywords
   once the defaults have been inserted.  */
static bool default_keywords = true;


/* Request that all strings be extracted: sets the extract_all flag.  */
void
x_python_extract_all ()
{
  extract_all = true;
}
      86  
      87  
      88  void
      89  x_python_keyword (const char *name)
      90  {
      91    if (name == NULL)
      92      default_keywords = false;
      93    else
      94      {
      95        const char *end;
      96        struct callshape shape;
      97        const char *colon;
      98  
      99        if (keywords.table == NULL)
     100          hash_init (&keywords, 100);
     101  
     102        split_keywordspec (name, &end, &shape);
     103  
     104        /* The characters between name and end should form a valid C identifier.
     105           A colon means an invalid parse in split_keywordspec().  */
     106        colon = strchr (name, ':');
     107        if (colon == NULL || colon >= end)
     108          insert_keyword_callshape (&keywords, name, end - name, &shape);
     109      }
     110  }
     111  
     112  /* Finish initializing the keywords hash table.
     113     Called after argument processing, before each file is processed.  */
     114  static void
     115  init_keywords ()
     116  {
     117    if (default_keywords)
     118      {
     119        /* When adding new keywords here, also update the documentation in
     120           xgettext.texi!  */
     121        x_python_keyword ("gettext");
     122        x_python_keyword ("ugettext");
     123        x_python_keyword ("dgettext:2");
     124        x_python_keyword ("ngettext:1,2");
     125        x_python_keyword ("ungettext:1,2");
     126        x_python_keyword ("dngettext:2,3");
     127        x_python_keyword ("_");
     128        default_keywords = false;
     129      }
     130  }
     131  
     132  void
     133  init_flag_table_python ()
     134  {
     135    xgettext_record_flag ("gettext:1:pass-python-format");
     136    xgettext_record_flag ("ugettext:1:pass-python-format");
     137    xgettext_record_flag ("dgettext:2:pass-python-format");
     138    xgettext_record_flag ("ngettext:1:pass-python-format");
     139    xgettext_record_flag ("ngettext:2:pass-python-format");
     140    xgettext_record_flag ("ungettext:1:pass-python-format");
     141    xgettext_record_flag ("ungettext:2:pass-python-format");
     142    xgettext_record_flag ("dngettext:2:pass-python-format");
     143    xgettext_record_flag ("dngettext:3:pass-python-format");
     144    xgettext_record_flag ("_:1:pass-python-format");
     145    /* xgettext_record_flag ("%:1:python-format"); // % is an infix operator! */
     146  
     147    xgettext_record_flag ("gettext:1:pass-python-brace-format");
     148    xgettext_record_flag ("ugettext:1:pass-python-brace-format");
     149    xgettext_record_flag ("dgettext:2:pass-python-brace-format");
     150    xgettext_record_flag ("ngettext:1:pass-python-brace-format");
     151    xgettext_record_flag ("ngettext:2:pass-python-brace-format");
     152    xgettext_record_flag ("ungettext:1:pass-python-brace-format");
     153    xgettext_record_flag ("ungettext:2:pass-python-brace-format");
     154    xgettext_record_flag ("dngettext:2:pass-python-brace-format");
     155    xgettext_record_flag ("dngettext:3:pass-python-brace-format");
     156    xgettext_record_flag ("_:1:pass-python-brace-format");
     157    /* xgettext_record_flag ("format:1:python-brace-format"); */
     158  }
     159  
     160  
     161  /* ======================== Reading of characters.  ======================== */
     162  
     163  /* The input file stream.  */
     164  static FILE *fp;
     165  
     166  
     167  /* 0. Terminate line by \n, regardless whether the external
     168     representation of a line terminator is CR (Mac), and CR/LF
     169     (DOS/Windows), as Python treats them equally.  */
     170  static int
     171  phase0_getc ()
     172  {
     173    int c;
     174  
     175    c = getc (fp);
     176    if (c == EOF)
     177      {
     178        if (ferror (fp))
     179          error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
     180                 real_file_name);
     181        return EOF;
     182      }
     183  
     184    if (c == '\r')
     185      {
     186        int c1 = getc (fp);
     187  
     188        if (c1 != EOF && c1 != '\n')
     189          ungetc (c1, fp);
     190  
     191        /* Seen line terminator CR or CR/LF.  */
     192        return '\n';
     193      }
     194  
     195    return c;
     196  }
     197  
     198  /* Supports only one pushback character, and not '\n'.  */
     199  static inline void
     200  phase0_ungetc (int c)
     201  {
     202    if (c != EOF)
     203      ungetc (c, fp);
     204  }
     205  
     206  
     207  /* 1. line_number handling.  */
     208  
     209  /* Maximum used, roughly a safer MB_LEN_MAX.  */
     210  #define MAX_PHASE1_PUSHBACK 16
     211  static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
     212  static int phase1_pushback_length;
     213  
     214  /* Read the next single byte from the input file.  */
     215  static int
     216  phase1_getc ()
     217  {
     218    int c;
     219  
     220    if (phase1_pushback_length)
     221      c = phase1_pushback[--phase1_pushback_length];
     222    else
     223      c = phase0_getc ();
     224  
     225    if (c == '\n')
     226      ++line_number;
     227  
     228    return c;
     229  }
     230  
     231  /* Supports MAX_PHASE1_PUSHBACK characters of pushback.  */
     232  static void
     233  phase1_ungetc (int c)
     234  {
     235    if (c != EOF)
     236      {
     237        if (c == '\n')
     238          --line_number;
     239  
     240        if (phase1_pushback_length == SIZEOF (phase1_pushback))
     241          abort ();
     242        phase1_pushback[phase1_pushback_length++] = c;
     243      }
     244  }
     245  
     246  
     247  /* Phase 2: Conversion to Unicode.
     248     This is done early because PEP 0263 specifies that conversion to Unicode
     249     conceptually occurs before tokenization.  A test case where it matters
     250     is with encodings like BIG5: when a double-byte character ending in 0x5C
     251     is followed by '\' or 'u0021', the tokenizer must not treat the second
     252     half of the double-byte character as a backslash.  */
     253  
/* End-of-file indicator for functions returning an UCS-4 character.  */
#define UEOF -1

/* The lexical context that phase2_getc passes to non_ascii_error_message
   when it meets a non-ASCII byte in ASCII-only input.  comment_line_end
   resets it to lc_outside.  */
static lexical_context_ty lexical_context;

/* Stack of UCS-4 characters pushed back by phase2_ungetc; phase2_getc pops
   from it before reading new input.  */
static int phase2_pushback[max (9, UNINAME_MAX + 3)];
/* Number of valid entries in phase2_pushback.  */
static int phase2_pushback_length;
     261  
     262  /* Read the next Unicode UCS-4 character from the input file.  */
     263  static int
     264  phase2_getc ()
     265  {
     266    if (phase2_pushback_length)
     267      return phase2_pushback[--phase2_pushback_length];
     268  
     269    if (xgettext_current_source_encoding == po_charset_ascii)
     270      {
     271        int c = phase1_getc ();
     272        if (c == EOF)
     273          return UEOF;
     274        if (!c_isascii (c))
     275          {
     276            multiline_error (xstrdup (""),
     277                             xasprintf ("%s\n%s\n",
     278                                        non_ascii_error_message (lexical_context,
     279                                                                 real_file_name,
     280                                                                 line_number),
     281                                        _("\
     282  Please specify the source encoding through --from-code or through a comment\n\
     283  as specified in https://www.python.org/peps/pep-0263.html.\n")));
     284            exit (EXIT_FAILURE);
     285          }
     286        return c;
     287      }
     288    else if (xgettext_current_source_encoding != po_charset_utf8)
     289      {
     290  #if HAVE_ICONV
     291        /* Use iconv on an increasing number of bytes.  Read only as many bytes
     292           through phase1_getc as needed.  This is needed to give reasonable
     293           interactive behaviour when fp is connected to an interactive tty.  */
     294        unsigned char buf[MAX_PHASE1_PUSHBACK];
     295        size_t bufcount;
     296  
     297        {
     298          int c = phase1_getc ();
     299          if (c == EOF)
     300            return UEOF;
     301          buf[0] = (unsigned char) c;
     302          bufcount = 1;
     303        }
     304  
     305        for (;;)
     306          {
     307            unsigned char scratchbuf[6];
     308            const char *inptr = (const char *) &buf[0];
     309            size_t insize = bufcount;
     310            char *outptr = (char *) &scratchbuf[0];
     311            size_t outsize = sizeof (scratchbuf);
     312  
     313            size_t res = iconv (xgettext_current_source_iconv,
     314                                (ICONV_CONST char **) &inptr, &insize,
     315                                &outptr, &outsize);
     316            /* We expect that a character has been produced if and only if
     317               some input bytes have been consumed.  */
     318            if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
     319              abort ();
     320            if (outsize == sizeof (scratchbuf))
     321              {
     322                /* No character has been produced.  Must be an error.  */
     323                if (res != (size_t)(-1))
     324                  abort ();
     325  
     326                if (errno == EILSEQ)
     327                  {
     328                    /* An invalid multibyte sequence was encountered.  */
     329                    goto invalid;
     330                  }
     331                else if (errno == EINVAL)
     332                  {
     333                    /* An incomplete multibyte character.  */
     334                    int c;
     335  
     336                    if (bufcount == MAX_PHASE1_PUSHBACK)
     337                      {
     338                        /* An overlong incomplete multibyte sequence was
     339                           encountered.  */
     340                        multiline_error (xstrdup (""),
     341                                         xasprintf (_("\
     342  %s:%d: Long incomplete multibyte sequence.\n\
     343  Please specify the correct source encoding through --from-code or through a\n\
     344  comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
     345                                         real_file_name, line_number));
     346                        exit (EXIT_FAILURE);
     347                      }
     348  
     349                    /* Read one more byte and retry iconv.  */
     350                    c = phase1_getc ();
     351                    if (c == EOF)
     352                      goto incomplete_at_eof;
     353                    if (c == '\n')
     354                      goto incomplete_at_eol;
     355                    buf[bufcount++] = (unsigned char) c;
     356                  }
     357                else
     358                  error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
     359                         real_file_name, line_number);
     360              }
     361            else
     362              {
     363                size_t outbytes = sizeof (scratchbuf) - outsize;
     364                size_t bytes = bufcount - insize;
     365                ucs4_t uc;
     366  
     367                /* We expect that one character has been produced.  */
     368                if (bytes == 0)
     369                  abort ();
     370                if (outbytes == 0)
     371                  abort ();
     372                /* Push back the unused bytes.  */
     373                while (insize > 0)
     374                  phase1_ungetc (buf[--insize]);
     375                /* Convert the character from UTF-8 to UCS-4.  */
     376                if (u8_mbtoucr (&uc, scratchbuf, outbytes) < (int) outbytes)
     377                  {
     378                    /* scratchbuf contains an out-of-range Unicode character
     379                       (> 0x10ffff).  */
     380                    goto invalid;
     381                  }
     382                return uc;
     383              }
     384          }
     385  #else
     386        /* If we don't have iconv(), the only supported values for
     387           xgettext_global_source_encoding and thus also for
     388           xgettext_current_source_encoding are ASCII and UTF-8.  */
     389        abort ();
     390  #endif
     391      }
     392    else
     393      {
     394        /* Read an UTF-8 encoded character.
     395           Reject invalid input, like u8_mbtouc does.  */
     396        int c;
     397        ucs4_t uc;
     398  
     399        c = phase1_getc ();
     400        if (c == EOF)
     401          return UEOF;
     402        if (c < 0x80)
     403          {
     404            uc = c;
     405          }
     406        else if (c < 0xc2)
     407          goto invalid;
     408        else if (c < 0xe0)
     409          {
     410            int c1 = phase1_getc ();
     411            if (c1 == EOF)
     412              goto incomplete_at_eof;
     413            if (c1 == '\n')
     414              goto incomplete_at_eol;
     415            if ((c1 ^ 0x80) < 0x40)
     416              uc = ((unsigned int) (c & 0x1f) << 6)
     417                   | (unsigned int) (c1 ^ 0x80);
     418            else
     419              goto invalid;
     420          }
     421        else if (c < 0xf0)
     422          {
     423            int c1 = phase1_getc ();
     424            if (c1 == EOF)
     425              goto incomplete_at_eof;
     426            if (c1 == '\n')
     427              goto incomplete_at_eol;
     428            if ((c1 ^ 0x80) < 0x40
     429                && (c >= 0xe1 || c1 >= 0xa0)
     430                && (c != 0xed || c1 < 0xa0))
     431              {
     432                int c2 = phase1_getc ();
     433                if (c2 == EOF)
     434                  goto incomplete_at_eof;
     435                if (c2 == '\n')
     436                  goto incomplete_at_eol;
     437                if ((c2 ^ 0x80) < 0x40)
     438                  uc = ((unsigned int) (c & 0x0f) << 12)
     439                       | ((unsigned int) (c1 ^ 0x80) << 6)
     440                       | (unsigned int) (c2 ^ 0x80);
     441                else
     442                  goto invalid;
     443              }
     444            else
     445              goto invalid;
     446          }
     447        else if (c < 0xf8)
     448          {
     449            int c1 = phase1_getc ();
     450            if (c1 == EOF)
     451              goto incomplete_at_eof;
     452            if (c1 == '\n')
     453              goto incomplete_at_eol;
     454            if ((c1 ^ 0x80) < 0x40
     455                && (c >= 0xf1 || c1 >= 0x90)
     456                && (c < 0xf4 || (c == 0xf4 && c1 < 0x90)))
     457              {
     458                int c2 = phase1_getc ();
     459                if (c2 == EOF)
     460                  goto incomplete_at_eof;
     461                if (c2 == '\n')
     462                  goto incomplete_at_eol;
     463                if ((c2 ^ 0x80) < 0x40)
     464                  {
     465                    int c3 = phase1_getc ();
     466                    if (c3 == EOF)
     467                      goto incomplete_at_eof;
     468                    if (c3 == '\n')
     469                      goto incomplete_at_eol;
     470                    if ((c3 ^ 0x80) < 0x40)
     471                      uc = ((unsigned int) (c & 0x07) << 18)
     472                           | ((unsigned int) (c1 ^ 0x80) << 12)
     473                           | ((unsigned int) (c2 ^ 0x80) << 6)
     474                           | (unsigned int) (c3 ^ 0x80);
     475                    else
     476                      goto invalid;
     477                  }
     478                else
     479                  goto invalid;
     480              }
     481            else
     482              goto invalid;
     483          }
     484        else
     485          goto invalid;
     486  
     487        return uc;
     488      }
     489  
     490   invalid:
     491    /* An invalid multibyte sequence was encountered.  */
     492    multiline_error (xstrdup (""),
     493                     xasprintf (_("\
     494  %s:%d: Invalid multibyte sequence.\n\
     495  Please specify the correct source encoding through --from-code or through a\n\
     496  comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
     497                     real_file_name, line_number));
     498    exit (EXIT_FAILURE);
     499  
     500   incomplete_at_eof:
     501    multiline_error (xstrdup (""),
     502                     xasprintf (_("\
     503  %s:%d: Incomplete multibyte sequence at end of file.\n\
     504  Please specify the correct source encoding through --from-code or through a\n\
     505  comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
     506                     real_file_name, line_number));
     507    exit (EXIT_FAILURE);
     508  
     509   incomplete_at_eol:
     510    multiline_error (xstrdup (""),
     511                     xasprintf (_("\
     512  %s:%d: Incomplete multibyte sequence at end of line.\n\
     513  Please specify the correct source encoding through --from-code or through a\n\
     514  comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
     515                     real_file_name, line_number - 1));
     516    exit (EXIT_FAILURE);
     517  }
     518  
     519  /* Supports max (9, UNINAME_MAX + 3) pushback characters.  */
     520  static void
     521  phase2_ungetc (int c)
     522  {
     523    if (c != UEOF)
     524      {
     525        if (phase2_pushback_length == SIZEOF (phase2_pushback))
     526          abort ();
     527        phase2_pushback[phase2_pushback_length++] = c;
     528      }
     529  }
     530  
     531  
     532  /* ========================= Accumulating strings.  ======================== */
     533  
     534  /* See xg-mixed-string.h for the API.  */
     535  
     536  
     537  /* ======================== Accumulating comments.  ======================== */
     538  
     539  
/* Accumulating a single comment line.  */

/* Buffer that collects the characters of the comment line currently being
   read.  */
static struct mixed_string_buffer comment_buffer;

/* Begin accumulating a new comment line: initialize comment_buffer for the
   lc_comment lexical context, with the current file name and line number.  */
static inline void
comment_start ()
{
  mixed_string_buffer_init (&comment_buffer, lc_comment,
                            logical_file_name, line_number);
}
     550  
/* Return true if nothing has been added to comment_buffer yet.  phase3_getc
   uses this to skip the white space at the beginning of a comment.  */
static inline bool
comment_at_start ()
{
  return mixed_string_buffer_is_empty (&comment_buffer);
}
     556  
/* Append the Unicode character C to comment_buffer.  */
static inline void
comment_add (int c)
{
  mixed_string_buffer_append_unicode (&comment_buffer, c);
}
     562  
     563  static inline const char *
     564  comment_line_end ()
     565  {
     566    char *buffer =
     567      mixed_string_contents_free1 (mixed_string_buffer_result (&comment_buffer));
     568    size_t buflen = strlen (buffer);
     569  
     570    while (buflen >= 1
     571           && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
     572      --buflen;
     573    buffer[buflen] = '\0';
     574    savable_comment_add (buffer);
     575    lexical_context = lc_outside;
     576    return buffer;
     577  }
     578  
     579  
/* These are for tracking whether comments count as immediately before
   keyword.  */
/* Line number at which phase3_getc most recently encountered the start of a
   '#' comment.  */
static int last_comment_line;
/* NOTE(review): not assigned in the part of the file visible here;
   presumably the line of the most recent non-comment token -- confirm
   against the tokenizer.  */
static int last_non_comment_line;
     584  
     585  
     586  /* ======================== Recognizing comments.  ======================== */
     587  
     588  
/* Recognizing the "coding" comment.
   As specified in PEP 0263, it takes the form
     "coding" [":"|"="] {alphanumeric or "-" or "_" or "."}*
   or
     "set" "fileencoding" "=" {alphanumeric or "-" or "_" or "."}*
   and is located in a comment in a line that
     - is either the first or second line,
     - is not a continuation line,
     - in the first form, contains no other tokens except this comment.  */
     598  
     599  /* Canonicalized encoding name for the current input file.  */
     600  static const char *xgettext_current_file_source_encoding;
     601  
     602  #if HAVE_ICONV
     603  /* Converter from xgettext_current_file_source_encoding to UTF-8 (except from
     604     ASCII or UTF-8, when this conversion is a no-op).  */
     605  static iconv_t xgettext_current_file_source_iconv;
     606  #endif
     607  
     608  static inline void
     609  set_current_file_source_encoding (const char *canon_encoding)
     610  {
     611    xgettext_current_file_source_encoding = canon_encoding;
     612  
     613    if (xgettext_current_file_source_encoding != po_charset_ascii
     614        && xgettext_current_file_source_encoding != po_charset_utf8)
     615      {
     616  #if HAVE_ICONV
     617        iconv_t cd;
     618  
     619        /* Avoid glibc-2.1 bug with EUC-KR.  */
     620  # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
     621       && !defined _LIBICONV_VERSION
     622        if (strcmp (xgettext_current_file_source_encoding, "EUC-KR") == 0)
     623          cd = (iconv_t)(-1);
     624        else
     625  # endif
     626        cd = iconv_open (po_charset_utf8, xgettext_current_file_source_encoding);
     627        if (cd == (iconv_t)(-1))
     628          error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1,
     629                         _("Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), and iconv() does not support this conversion."),
     630                         xgettext_current_file_source_encoding, po_charset_utf8,
     631                         last_component (program_name));
     632        xgettext_current_file_source_iconv = cd;
     633  #else
     634        error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1,
     635                       _("Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). This version was built without iconv()."),
     636                       xgettext_current_file_source_encoding, po_charset_utf8,
     637                       last_component (program_name));
     638  #endif
     639      }
     640  
     641    xgettext_current_source_encoding = xgettext_current_file_source_encoding;
     642  #if HAVE_ICONV
     643    xgettext_current_source_iconv = xgettext_current_file_source_iconv;
     644  #endif
     645  }
     646  
     647  static inline void
     648  try_to_extract_coding (const char *comment)
     649  {
     650    const char *p = c_strstr (comment, "coding");
     651  
     652    if (p != NULL)
     653      {
     654        p += 6;
     655        if (*p == ':' || *p == '=')
     656          {
     657            p++;
     658            while (*p == ' ' || *p == '\t')
     659              p++;
     660            {
     661              const char *encoding_start = p;
     662  
     663              while (c_isalnum (*p) || *p == '-' || *p == '_' || *p == '.')
     664                p++;
     665              {
     666                const char *encoding_end = p;
     667  
     668                if (encoding_end > encoding_start)
     669                  {
     670                    /* Extract the encoding string.  */
     671                    size_t encoding_len = encoding_end - encoding_start;
     672                    char *encoding = XNMALLOC (encoding_len + 1, char);
     673  
     674                    memcpy (encoding, encoding_start, encoding_len);
     675                    encoding[encoding_len] = '\0';
     676  
     677                    {
     678                      /* Canonicalize it.  */
     679                      const char *canon_encoding = po_charset_canonicalize (encoding);
     680                      if (canon_encoding == NULL)
     681                        {
     682                          error_at_line (0, 0,
     683                                         logical_file_name, line_number - 1,
     684                                         _("Unknown encoding \"%s\". Proceeding with ASCII instead."),
     685                                         encoding);
     686                          canon_encoding = po_charset_ascii;
     687                        }
     688  
     689                      /* Activate it.  */
     690                      set_current_file_source_encoding (canon_encoding);
     691                    }
     692  
     693                    free (encoding);
     694                  }
     695              }
     696            }
     697          }
     698      }
     699  }
     700  
     701  /* Tracking whether the current line is a continuation line or contains a
     702     non-blank character.  */
     703  static bool continuation_or_nonblank_line;
     704  
     705  
     706  /* Phase 3: Outside strings, replace backslash-newline with nothing and a
     707     comment with nothing.  */
     708  
     709  static int
     710  phase3_getc ()
     711  {
     712    int c;
     713  
     714    for (;;)
     715      {
     716        c = phase2_getc ();
     717        if (c == '\\')
     718          {
     719            c = phase2_getc ();
     720            if (c != '\n')
     721              {
     722                phase2_ungetc (c);
     723                /* This shouldn't happen usually, because "A backslash is
     724                   illegal elsewhere on a line outside a string literal."  */
     725                return '\\';
     726              }
     727            /* Eat backslash-newline.  */
     728            continuation_or_nonblank_line = true;
     729          }
     730        else if (c == '#')
     731          {
     732            /* Eat a comment.  */
     733            const char *comment;
     734  
     735            last_comment_line = line_number;
     736            comment_start ();
     737            for (;;)
     738              {
     739                c = phase2_getc ();
     740                if (c == UEOF || c == '\n')
     741                  break;
     742                /* We skip all leading white space, but not EOLs.  */
     743                if (!(comment_at_start () && (c == ' ' || c == '\t')))
     744                  comment_add (c);
     745              }
     746            comment = comment_line_end ();
     747            if (line_number - 1 <= 2 && !continuation_or_nonblank_line)
     748              try_to_extract_coding (comment);
     749            continuation_or_nonblank_line = false;
     750            return c;
     751          }
     752        else
     753          {
     754            if (c == '\n')
     755              continuation_or_nonblank_line = false;
     756            else if (!(c == ' ' || c == '\t' || c == '\f'))
     757              continuation_or_nonblank_line = true;
     758            return c;
     759          }
     760      }
     761  }
     762  
/* Push the character C back by handing it to phase2_ungetc, so that the next
   phase3_getc reads it again.
   Supports only one pushback character.  */
static void
phase3_ungetc (int c)
{
  phase2_ungetc (c);
}
     769  
     770  
     771  /* ========================= Accumulating strings.  ======================== */
     772  
/* Return value of phase7_getuc when EOF is reached.  */
#define P7_EOF (-1)
/* NOTE(review): presumably the return value of phase7_getuc when the end of
   the string literal is reached; phase7_getuc is not visible here --
   confirm.  Like P7_EOF it is negative, so it never satisfies
   IS_UNICODE.  */
#define P7_STRING_END (-2)

/* Convert an UTF-16 or UTF-32 code point to a return value that can be
   distinguished from a single-byte return value.  */
#define UNICODE(code) (0x100 + (code))

/* Test a return value of phase7_getuc whether it designates an UTF-16 or
   UTF-32 code point.  */
#define IS_UNICODE(p7_result) ((p7_result) >= 0x100)

/* Extract the UTF-16 or UTF-32 code of a return value that satisfies
   IS_UNICODE.  */
#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
     788  
     789  
     790  /* ========================== Reading of tokens.  ========================== */
     791  
     792  
     793  enum token_type_ty
     794  {
     795    token_type_eof,
     796    token_type_lparen,            /* ( */
     797    token_type_rparen,            /* ) */
     798    token_type_comma,             /* , */
     799    token_type_lbracket,          /* [ */
     800    token_type_rbracket,          /* ] */
     801    token_type_string,            /* "abc", 'abc', """abc""", '''abc''' */
     802    token_type_symbol,            /* symbol, number */
     803    token_type_plus,              /* + */
     804    token_type_other              /* misc. operator */
     805  };
     806  typedef enum token_type_ty token_type_ty;
     807  
     808  typedef struct token_ty token_ty;
     809  struct token_ty
     810  {
     811    token_type_ty type;
     812    char *string;                         /* for token_type_symbol */
     813    mixed_string_ty *mixed_string;        /* for token_type_string */
     814    refcounted_string_list_ty *comment;   /* for token_type_string */
     815    int line_number;
     816  };
     817  
     818  /* Free the memory pointed to by a 'struct token_ty'.  */
     819  static inline void
     820  free_token (token_ty *tp)
     821  {
     822    if (tp->type == token_type_symbol)
     823      free (tp->string);
     824    if (tp->type == token_type_string)
     825      {
     826        mixed_string_free (tp->mixed_string);
     827        drop_reference (tp->comment);
     828      }
     829  }
     830  
     831  
     832  /* There are two different input syntaxes for strings, "abc" and r"abc",
     833     and two different input syntaxes for Unicode strings, u"abc" and ur"abc".
     834     Which escape sequences are understood, i.e. what is interpreted specially
     835     after backslash?
     836      "abc"     \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn
     837      r"abc"
     838      u"abc"    \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...}
     839      ur"abc"                                           \unnnn
     840     The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two
     841     \unnnn items.  The \ooo and \xnn values are in the current source encoding
     842     for byte strings, and Unicode code points for Unicode strings.
     843   */
     844  
     845  static int
     846  phase7_getuc (int quote_char,
     847                bool triple, bool interpret_ansic, bool interpret_unicode,
     848                unsigned int *backslash_counter)
     849  {
     850    int c;
     851  
     852    for (;;)
     853      {
     854        /* Use phase 2, because phase 3 elides comments.  */
     855        c = phase2_getc ();
     856  
     857        if (c == UEOF)
     858          return P7_EOF;
     859  
     860        if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0))
     861          {
     862            if (triple)
     863              {
     864                int c1 = phase2_getc ();
     865                if (c1 == quote_char)
     866                  {
     867                    int c2 = phase2_getc ();
     868                    if (c2 == quote_char)
     869                      return P7_STRING_END;
     870                    phase2_ungetc (c2);
     871                  }
     872                phase2_ungetc (c1);
     873                return UNICODE (c);
     874              }
     875            else
     876              return P7_STRING_END;
     877          }
     878  
     879        if (c == '\n')
     880          {
     881            if (triple)
     882              {
     883                *backslash_counter = 0;
     884                return UNICODE ('\n');
     885              }
     886            /* In r"..." and ur"..." strings, newline is only allowed
     887               immediately after an odd number of backslashes (although the
     888               backslashes are not interpreted!).  */
     889            if (!(interpret_ansic || (*backslash_counter & 1) == 0))
     890              {
     891                *backslash_counter = 0;
     892                return UNICODE ('\n');
     893              }
     894            phase2_ungetc (c);
     895            error_with_progname = false;
     896            error (0, 0, _("%s:%d: warning: unterminated string"),
     897                   logical_file_name, line_number);
     898            error_with_progname = true;
     899            return P7_STRING_END;
     900          }
     901  
     902        if (c != '\\')
     903          {
     904            *backslash_counter = 0;
     905            return UNICODE (c);
     906          }
     907  
     908        /* Backslash handling.  */
     909  
     910        if (!interpret_ansic && !interpret_unicode)
     911          {
     912            ++*backslash_counter;
     913            return UNICODE ('\\');
     914          }
     915  
     916        /* Dispatch according to the character following the backslash.  */
     917        c = phase2_getc ();
     918        if (c == UEOF)
     919          {
     920            ++*backslash_counter;
     921            return UNICODE ('\\');
     922          }
     923  
     924        if (interpret_ansic)
     925          switch (c)
     926            {
     927            case '\n':
     928              continue;
     929            case '\\':
     930              ++*backslash_counter;
     931              return UNICODE (c);
     932            case '\'': case '"':
     933              *backslash_counter = 0;
     934              return UNICODE (c);
     935            case 'a':
     936              *backslash_counter = 0;
     937              return UNICODE ('\a');
     938            case 'b':
     939              *backslash_counter = 0;
     940              return UNICODE ('\b');
     941            case 'f':
     942              *backslash_counter = 0;
     943              return UNICODE ('\f');
     944            case 'n':
     945              *backslash_counter = 0;
     946              return UNICODE ('\n');
     947            case 'r':
     948              *backslash_counter = 0;
     949              return UNICODE ('\r');
     950            case 't':
     951              *backslash_counter = 0;
     952              return UNICODE ('\t');
     953            case 'v':
     954              *backslash_counter = 0;
     955              return UNICODE ('\v');
     956            case '0': case '1': case '2': case '3': case '4':
     957            case '5': case '6': case '7':
     958              {
     959                int n = c - '0';
     960  
     961                c = phase2_getc ();
     962                if (c != UEOF)
     963                  {
     964                    if (c >= '0' && c <= '7')
     965                      {
     966                        n = (n << 3) + (c - '0');
     967                        c = phase2_getc ();
     968                        if (c != UEOF)
     969                          {
     970                            if (c >= '0' && c <= '7')
     971                              n = (n << 3) + (c - '0');
     972                            else
     973                              phase2_ungetc (c);
     974                          }
     975                      }
     976                    else
     977                      phase2_ungetc (c);
     978                  }
     979                *backslash_counter = 0;
     980                if (interpret_unicode)
     981                  return UNICODE (n);
     982                else
     983                  return (unsigned char) n;
     984              }
     985            case 'x':
     986              {
     987                int c1 = phase2_getc ();
     988                int n1;
     989  
     990                if (c1 >= '0' && c1 <= '9')
     991                  n1 = c1 - '0';
     992                else if (c1 >= 'A' && c1 <= 'F')
     993                  n1 = c1 - 'A' + 10;
     994                else if (c1 >= 'a' && c1 <= 'f')
     995                  n1 = c1 - 'a' + 10;
     996                else
     997                  n1 = -1;
     998  
     999                if (n1 >= 0)
    1000                  {
    1001                    int c2 = phase2_getc ();
    1002                    int n2;
    1003  
    1004                    if (c2 >= '0' && c2 <= '9')
    1005                      n2 = c2 - '0';
    1006                    else if (c2 >= 'A' && c2 <= 'F')
    1007                      n2 = c2 - 'A' + 10;
    1008                    else if (c2 >= 'a' && c2 <= 'f')
    1009                      n2 = c2 - 'a' + 10;
    1010                    else
    1011                      n2 = -1;
    1012  
    1013                    if (n2 >= 0)
    1014                      {
    1015                        int n = (n1 << 4) + n2;
    1016                        *backslash_counter = 0;
    1017                        if (interpret_unicode)
    1018                          return UNICODE (n);
    1019                        else
    1020                          return (unsigned char) n;
    1021                      }
    1022  
    1023                    phase2_ungetc (c2);
    1024                  }
    1025                phase2_ungetc (c1);
    1026                phase2_ungetc (c);
    1027                ++*backslash_counter;
    1028                return UNICODE ('\\');
    1029              }
    1030            }
    1031  
    1032        if (interpret_unicode)
    1033          {
    1034            if (c == 'u')
    1035              {
    1036                unsigned char buf[4];
    1037                unsigned int n = 0;
    1038                int i;
    1039  
    1040                for (i = 0; i < 4; i++)
    1041                  {
    1042                    int c1 = phase2_getc ();
    1043  
    1044                    if (c1 >= '0' && c1 <= '9')
    1045                      n = (n << 4) + (c1 - '0');
    1046                    else if (c1 >= 'A' && c1 <= 'F')
    1047                      n = (n << 4) + (c1 - 'A' + 10);
    1048                    else if (c1 >= 'a' && c1 <= 'f')
    1049                      n = (n << 4) + (c1 - 'a' + 10);
    1050                    else
    1051                      {
    1052                        phase2_ungetc (c1);
    1053                        while (--i >= 0)
    1054                          phase2_ungetc (buf[i]);
    1055                        phase2_ungetc (c);
    1056                        ++*backslash_counter;
    1057                        return UNICODE ('\\');
    1058                      }
    1059  
    1060                    buf[i] = c1;
    1061                  }
    1062                *backslash_counter = 0;
    1063                return UNICODE (n);
    1064              }
    1065  
    1066            if (interpret_ansic)
    1067              {
    1068                if (c == 'U')
    1069                  {
    1070                    unsigned char buf[8];
    1071                    unsigned int n = 0;
    1072                    int i;
    1073  
    1074                    for (i = 0; i < 8; i++)
    1075                      {
    1076                        int c1 = phase2_getc ();
    1077  
    1078                        if (c1 >= '0' && c1 <= '9')
    1079                          n = (n << 4) + (c1 - '0');
    1080                        else if (c1 >= 'A' && c1 <= 'F')
    1081                          n = (n << 4) + (c1 - 'A' + 10);
    1082                        else if (c1 >= 'a' && c1 <= 'f')
    1083                          n = (n << 4) + (c1 - 'a' + 10);
    1084                        else
    1085                          {
    1086                            phase2_ungetc (c1);
    1087                            while (--i >= 0)
    1088                              phase2_ungetc (buf[i]);
    1089                            phase2_ungetc (c);
    1090                            ++*backslash_counter;
    1091                            return UNICODE ('\\');
    1092                          }
    1093  
    1094                        buf[i] = c1;
    1095                      }
    1096                    if (n < 0x110000)
    1097                      {
    1098                        *backslash_counter = 0;
    1099                        return UNICODE (n);
    1100                      }
    1101  
    1102                    error_with_progname = false;
    1103                    error (0, 0, _("%s:%d: warning: invalid Unicode character"),
    1104                           logical_file_name, line_number);
    1105                    error_with_progname = true;
    1106  
    1107                    while (--i >= 0)
    1108                      phase2_ungetc (buf[i]);
    1109                    phase2_ungetc (c);
    1110                    ++*backslash_counter;
    1111                    return UNICODE ('\\');
    1112                  }
    1113  
    1114                if (c == 'N')
    1115                  {
    1116                    int c1 = phase2_getc ();
    1117                    if (c1 == '{')
    1118                      {
    1119                        unsigned char buf[UNINAME_MAX + 1];
    1120                        int i;
    1121                        unsigned int n;
    1122  
    1123                        for (i = 0; i < UNINAME_MAX; i++)
    1124                          {
    1125                            int c2 = phase2_getc ();
    1126                            if (!(c2 >= ' ' && c2 <= '~'))
    1127                              {
    1128                                phase2_ungetc (c2);
    1129                                while (--i >= 0)
    1130                                  phase2_ungetc (buf[i]);
    1131                                phase2_ungetc (c1);
    1132                                phase2_ungetc (c);
    1133                                ++*backslash_counter;
    1134                                return UNICODE ('\\');
    1135                              }
    1136                            if (c2 == '}')
    1137                              break;
    1138                            buf[i] = c2;
    1139                          }
    1140                        buf[i] = '\0';
    1141  
    1142                        n = unicode_name_character ((char *) buf);
    1143                        if (n != UNINAME_INVALID)
    1144                          {
    1145                            *backslash_counter = 0;
    1146                            return UNICODE (n);
    1147                          }
    1148  
    1149                        phase2_ungetc ('}');
    1150                        while (--i >= 0)
    1151                          phase2_ungetc (buf[i]);
    1152                      }
    1153                    phase2_ungetc (c1);
    1154                    phase2_ungetc (c);
    1155                    ++*backslash_counter;
    1156                    return UNICODE ('\\');
    1157                  }
    1158              }
    1159          }
    1160  
    1161        phase2_ungetc (c);
    1162        ++*backslash_counter;
    1163        return UNICODE ('\\');
    1164      }
    1165  }
    1166  
    1167  
    1168  /* Combine characters into tokens.  Discard whitespace except newlines at
    1169     the end of logical lines.  */
    1170  
    1171  /* Number of pending open parentheses/braces/brackets.  */
    1172  static int open_pbb;
    1173  
    1174  static token_ty phase5_pushback[2];
    1175  static int phase5_pushback_length;
    1176  
    1177  static void
    1178  phase5_get (token_ty *tp)
    1179  {
    1180    int c;
    1181  
    1182    if (phase5_pushback_length)
    1183      {
    1184        *tp = phase5_pushback[--phase5_pushback_length];
    1185        return;
    1186      }
    1187  
    1188    for (;;)
    1189      {
    1190        tp->line_number = line_number;
    1191        c = phase3_getc ();
    1192  
    1193        switch (c)
    1194          {
    1195          case UEOF:
    1196            tp->type = token_type_eof;
    1197            return;
    1198  
    1199          case ' ':
    1200          case '\t':
    1201          case '\f':
    1202            /* Ignore whitespace and comments.  */
    1203            continue;
    1204  
    1205          case '\n':
    1206            if (last_non_comment_line > last_comment_line)
    1207              savable_comment_reset ();
    1208            /* Ignore newline if and only if it is used for implicit line
    1209               joining.  */
    1210            if (open_pbb > 0)
    1211              continue;
    1212            tp->type = token_type_other;
    1213            return;
    1214          }
    1215  
    1216        last_non_comment_line = tp->line_number;
    1217  
    1218        switch (c)
    1219          {
    1220          case '.':
    1221            {
    1222              int c1 = phase3_getc ();
    1223              phase3_ungetc (c1);
    1224              if (!(c1 >= '0' && c1 <= '9'))
    1225                {
    1226  
    1227                  tp->type = token_type_other;
    1228                  return;
    1229                }
    1230            }
    1231            FALLTHROUGH;
    1232          case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    1233          case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
    1234          case 'M': case 'N': case 'O': case 'P': case 'Q':
    1235          case 'S': case 'T':           case 'V': case 'W': case 'X':
    1236          case 'Y': case 'Z':
    1237          case '_':
    1238          case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    1239          case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    1240          case 'm': case 'n': case 'o': case 'p': case 'q':
    1241          case 's': case 't':           case 'v': case 'w': case 'x':
    1242          case 'y': case 'z':
    1243          case '0': case '1': case '2': case '3': case '4':
    1244          case '5': case '6': case '7': case '8': case '9':
    1245          symbol:
    1246            /* Symbol, or part of a number.  */
    1247            {
    1248              static char *buffer;
    1249              static int bufmax;
    1250              int bufpos;
    1251  
    1252              bufpos = 0;
    1253              for (;;)
    1254                {
    1255                  if (bufpos >= bufmax)
    1256                    {
    1257                      bufmax = 2 * bufmax + 10;
    1258                      buffer = xrealloc (buffer, bufmax);
    1259                    }
    1260                  buffer[bufpos++] = c;
    1261                  c = phase3_getc ();
    1262                  switch (c)
    1263                    {
    1264                    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    1265                    case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
    1266                    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    1267                    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
    1268                    case 'Y': case 'Z':
    1269                    case '_':
    1270                    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    1271                    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    1272                    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    1273                    case 's': case 't': case 'u': case 'v': case 'w': case 'x':
    1274                    case 'y': case 'z':
    1275                    case '0': case '1': case '2': case '3': case '4':
    1276                    case '5': case '6': case '7': case '8': case '9':
    1277                      continue;
    1278                    default:
    1279                      phase3_ungetc (c);
    1280                      break;
    1281                    }
    1282                  break;
    1283                }
    1284              if (bufpos >= bufmax)
    1285                {
    1286                  bufmax = 2 * bufmax + 10;
    1287                  buffer = xrealloc (buffer, bufmax);
    1288                }
    1289              buffer[bufpos] = '\0';
    1290              tp->string = xstrdup (buffer);
    1291              tp->type = token_type_symbol;
    1292              return;
    1293            }
    1294  
    1295          /* Strings.  */
    1296            {
    1297              int quote_char;
    1298              bool interpret_ansic;
    1299              bool interpret_unicode;
    1300              bool triple;
    1301              unsigned int backslash_counter;
    1302  
    1303              case 'R': case 'r':
    1304                {
    1305                  int c1 = phase2_getc ();
    1306                  if (c1 == '"' || c1 == '\'')
    1307                    {
    1308                      quote_char = c1;
    1309                      interpret_ansic = false;
    1310                      interpret_unicode = false;
    1311                      goto string;
    1312                    }
    1313                  phase2_ungetc (c1);
    1314                  goto symbol;
    1315                }
    1316  
    1317              case 'U': case 'u':
    1318                {
    1319                  int c1 = phase2_getc ();
    1320                  if (c1 == '"' || c1 == '\'')
    1321                    {
    1322                      quote_char = c1;
    1323                      interpret_ansic = true;
    1324                      interpret_unicode = true;
    1325                      goto string;
    1326                    }
    1327                  if (c1 == 'R' || c1 == 'r')
    1328                    {
    1329                      int c2 = phase2_getc ();
    1330                      if (c2 == '"' || c2 == '\'')
    1331                        {
    1332                          quote_char = c2;
    1333                          interpret_ansic = false;
    1334                          interpret_unicode = true;
    1335                          goto string;
    1336                        }
    1337                      phase2_ungetc (c2);
    1338                    }
    1339                  phase2_ungetc (c1);
    1340                  goto symbol;
    1341                }
    1342  
    1343              case '"': case '\'':
    1344                quote_char = c;
    1345                interpret_ansic = true;
    1346                interpret_unicode = false;
    1347              string:
    1348                triple = false;
    1349                lexical_context = lc_string;
    1350                {
    1351                  int c1 = phase2_getc ();
    1352                  if (c1 == quote_char)
    1353                    {
    1354                      int c2 = phase2_getc ();
    1355                      if (c2 == quote_char)
    1356                        triple = true;
    1357                      else
    1358                        {
    1359                          phase2_ungetc (c2);
    1360                          phase2_ungetc (c1);
    1361                        }
    1362                    }
    1363                  else
    1364                    phase2_ungetc (c1);
    1365                }
    1366                backslash_counter = 0;
    1367                {
    1368                  struct mixed_string_buffer msb;
    1369  
    1370                  /* Start accumulating the string.  */
    1371                  mixed_string_buffer_init (&msb, lexical_context,
    1372                                            logical_file_name, line_number);
    1373                  for (;;)
    1374                    {
    1375                      int uc = phase7_getuc (quote_char, triple, interpret_ansic,
    1376                                             interpret_unicode, &backslash_counter);
    1377  
    1378                      /* Keep line_number in sync.  */
    1379                      msb.line_number = line_number;
    1380  
    1381                      if (uc == P7_EOF || uc == P7_STRING_END)
    1382                        break;
    1383  
    1384                      if (IS_UNICODE (uc))
    1385                        {
    1386                          assert (UNICODE_VALUE (uc) >= 0
    1387                                  && UNICODE_VALUE (uc) < 0x110000);
    1388                          mixed_string_buffer_append_unicode (&msb,
    1389                                                              UNICODE_VALUE (uc));
    1390                        }
    1391                      else
    1392                        mixed_string_buffer_append_char (&msb, uc);
    1393                    }
    1394                  tp->mixed_string = mixed_string_buffer_result (&msb);
    1395                  tp->comment = add_reference (savable_comment);
    1396                  lexical_context = lc_outside;
    1397                  tp->type = token_type_string;
    1398                }
    1399                return;
    1400            }
    1401  
    1402          case '(':
    1403            open_pbb++;
    1404            tp->type = token_type_lparen;
    1405            return;
    1406  
    1407          case ')':
    1408            if (open_pbb > 0)
    1409              open_pbb--;
    1410            tp->type = token_type_rparen;
    1411            return;
    1412  
    1413          case ',':
    1414            tp->type = token_type_comma;
    1415            return;
    1416  
    1417          case '[': case '{':
    1418            open_pbb++;
    1419            tp->type = (c == '[' ? token_type_lbracket : token_type_other);
    1420            return;
    1421  
    1422          case ']': case '}':
    1423            if (open_pbb > 0)
    1424              open_pbb--;
    1425            tp->type = (c == ']' ? token_type_rbracket : token_type_other);
    1426            return;
    1427  
    1428          case '+':
    1429            tp->type = token_type_plus;
    1430            return;
    1431  
    1432          default:
    1433            /* We could carefully recognize each of the 2 and 3 character
    1434               operators, but it is not necessary, as we only need to recognize
    1435               gettext invocations.  Don't bother.  */
    1436            tp->type = token_type_other;
    1437            return;
    1438          }
    1439      }
    1440  }
    1441  
    1442  /* Supports only one pushback token.  */
    1443  static void
    1444  phase5_unget (token_ty *tp)
    1445  {
    1446    if (tp->type != token_type_eof)
    1447      {
    1448        if (phase5_pushback_length == SIZEOF (phase5_pushback))
    1449          abort ();
    1450        phase5_pushback[phase5_pushback_length++] = *tp;
    1451      }
    1452  }
    1453  
    1454  
    1455  /* Combine adjacent strings to form a single string.  Note that the end
    1456     of a logical line appears as a token of its own, therefore strings that
    1457     belong to different logical lines will not be concatenated.  */
    1458  
    1459  static void
    1460  x_python_lex (token_ty *tp)
    1461  {
    1462    phase5_get (tp);
    1463    if (tp->type == token_type_string)
    1464      {
    1465        mixed_string_ty *sum = tp->mixed_string;
    1466  
    1467        for (;;)
    1468          {
    1469            token_ty token2;
    1470            token_ty token3;
    1471            token_ty *tp2 = NULL;
    1472  
    1473            phase5_get (&token2);
    1474            switch (token2.type)
    1475              {
    1476              case token_type_plus:
    1477                {
    1478                  phase5_get (&token3);
    1479                  if (token3.type == token_type_string)
    1480                    {
    1481                      free_token (&token2);
    1482                      tp2 = &token3;
    1483                    }
    1484                  else
    1485                    phase5_unget (&token3);
    1486                }
    1487                break;
    1488              case token_type_string:
    1489                tp2 = &token2;
    1490                break;
    1491              default:
    1492                break;
    1493              }
    1494  
    1495            if (tp2)
    1496              {
    1497                sum = mixed_string_concat_free1 (sum, tp2->mixed_string);
    1498  
    1499                free_token (tp2);
    1500                continue;
    1501              }
    1502            phase5_unget (&token2);
    1503            break;
    1504          }
    1505        tp->mixed_string = sum;
    1506      }
    1507  }
    1508  
    1509  
    1510  /* ========================= Extracting strings.  ========================== */
    1511  
    1512  
    1513  /* Context lookup table.  */
    1514  static flag_context_list_table_ty *flag_context_list_table;
    1515  
    1516  
    1517  /* Maximum supported nesting depth.  */
    1518  #define MAX_NESTING_DEPTH 1000
    1519  
    1520  /* Current nesting depths.  */
    1521  static int paren_nesting_depth;
    1522  static int bracket_nesting_depth;
    1523  
    1524  
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid Python program, and leave the complaints about
   the grammar to the Python interpreter.
    1530  
    1531       Normal handling: Look for
    1532         keyword ( ... msgid ... )
    1533       Plural handling: Look for
    1534         keyword ( ... msgid ... msgid_plural ... )
    1535  
    1536     We use recursion because the arguments before msgid or between msgid
    1537     and msgid_plural can contain subexpressions of the same form.  */
    1538  
    1539  
    1540  /* Extract messages until the next balanced closing parenthesis or bracket.
    1541     Extracted messages are added to MLP.
    1542     DELIM can be either token_type_rparen or token_type_rbracket, or
    1543     token_type_eof to accept both.
    1544     Return true upon eof, false upon closing parenthesis or bracket.  */
    1545  static bool
    1546  extract_balanced (message_list_ty *mlp,
    1547                    token_type_ty delim,
    1548                    flag_context_ty outer_context,
    1549                    flag_context_list_iterator_ty context_iter,
    1550                    struct arglist_parser *argparser)
    1551  {
    1552    /* Current argument number.  */
    1553    int arg = 1;
    1554    /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
    1555    int state;
    1556    /* Parameters of the keyword just seen.  Defined only in state 1.  */
    1557    const struct callshapes *next_shapes = NULL;
    1558    /* Context iterator that will be used if the next token is a '('.  */
    1559    flag_context_list_iterator_ty next_context_iter =
    1560      passthrough_context_list_iterator;
    1561    /* Current context.  */
    1562    flag_context_ty inner_context =
    1563      inherited_context (outer_context,
    1564                         flag_context_list_iterator_advance (&context_iter));
    1565  
    1566    /* Start state is 0.  */
    1567    state = 0;
    1568  
    1569    for (;;)
    1570      {
    1571        token_ty token;
    1572  
    1573        x_python_lex (&token);
    1574        switch (token.type)
    1575          {
    1576          case token_type_symbol:
    1577            {
    1578              void *keyword_value;
    1579  
    1580              if (hash_find_entry (&keywords, token.string, strlen (token.string),
    1581                                   &keyword_value)
    1582                  == 0)
    1583                {
    1584                  next_shapes = (const struct callshapes *) keyword_value;
    1585                  state = 1;
    1586                }
    1587              else
    1588                state = 0;
    1589            }
    1590            next_context_iter =
    1591              flag_context_list_iterator (
    1592                flag_context_list_table_lookup (
    1593                  flag_context_list_table,
    1594                  token.string, strlen (token.string)));
    1595            free (token.string);
    1596            continue;
    1597  
    1598          case token_type_lparen:
    1599            if (++paren_nesting_depth > MAX_NESTING_DEPTH)
    1600              {
    1601                error_with_progname = false;
    1602                error (EXIT_FAILURE, 0, _("%s:%d: error: too many open parentheses"),
    1603                       logical_file_name, line_number);
    1604              }
    1605            if (extract_balanced (mlp, token_type_rparen,
    1606                                  inner_context, next_context_iter,
    1607                                  arglist_parser_alloc (mlp,
    1608                                                        state ? next_shapes : NULL)))
    1609              {
    1610                arglist_parser_done (argparser, arg);
    1611                return true;
    1612              }
    1613            paren_nesting_depth--;
    1614            next_context_iter = null_context_list_iterator;
    1615            state = 0;
    1616            continue;
    1617  
    1618          case token_type_rparen:
    1619            if (delim == token_type_rparen || delim == token_type_eof)
    1620              {
    1621                arglist_parser_done (argparser, arg);
    1622                return false;
    1623              }
    1624            next_context_iter = null_context_list_iterator;
    1625            state = 0;
    1626            continue;
    1627  
    1628          case token_type_comma:
    1629            arg++;
    1630            inner_context =
    1631              inherited_context (outer_context,
    1632                                 flag_context_list_iterator_advance (
    1633                                   &context_iter));
    1634            next_context_iter = passthrough_context_list_iterator;
    1635            state = 0;
    1636            continue;
    1637  
    1638          case token_type_lbracket:
    1639            if (++bracket_nesting_depth > MAX_NESTING_DEPTH)
    1640              {
    1641                error_with_progname = false;
    1642                error (EXIT_FAILURE, 0, _("%s:%d: error: too many open brackets"),
    1643                       logical_file_name, line_number);
    1644              }
    1645            if (extract_balanced (mlp, token_type_rbracket,
    1646                                  null_context, null_context_list_iterator,
    1647                                  arglist_parser_alloc (mlp, NULL)))
    1648              {
    1649                arglist_parser_done (argparser, arg);
    1650                return true;
    1651              }
    1652            bracket_nesting_depth--;
    1653            next_context_iter = null_context_list_iterator;
    1654            state = 0;
    1655            continue;
    1656  
    1657          case token_type_rbracket:
    1658            if (delim == token_type_rbracket || delim == token_type_eof)
    1659              {
    1660                arglist_parser_done (argparser, arg);
    1661                return false;
    1662              }
    1663            next_context_iter = null_context_list_iterator;
    1664            state = 0;
    1665            continue;
    1666  
    1667          case token_type_string:
    1668            {
    1669              lex_pos_ty pos;
    1670  
    1671              pos.file_name = logical_file_name;
    1672              pos.line_number = token.line_number;
    1673  
    1674              if (extract_all)
    1675                {
    1676                  char *string = mixed_string_contents (token.mixed_string);
    1677                  mixed_string_free (token.mixed_string);
    1678                  remember_a_message (mlp, NULL, string, true, false,
    1679                                      inner_context, &pos,
    1680                                      NULL, token.comment, true);
    1681                }
    1682              else
    1683                arglist_parser_remember (argparser, arg, token.mixed_string,
    1684                                         inner_context,
    1685                                         pos.file_name, pos.line_number,
    1686                                         token.comment, true);
    1687            }
    1688            drop_reference (token.comment);
    1689            next_context_iter = null_context_list_iterator;
    1690            state = 0;
    1691            continue;
    1692  
    1693          case token_type_eof:
    1694            arglist_parser_done (argparser, arg);
    1695            return true;
    1696  
    1697          case token_type_plus:
    1698          case token_type_other:
    1699            next_context_iter = null_context_list_iterator;
    1700            state = 0;
    1701            continue;
    1702  
    1703          default:
    1704            abort ();
    1705          }
    1706      }
    1707  }
    1708  
    1709  
    1710  void
    1711  extract_python (FILE *f,
    1712                  const char *real_filename, const char *logical_filename,
    1713                  flag_context_list_table_ty *flag_table,
    1714                  msgdomain_list_ty *mdlp)
    1715  {
    1716    message_list_ty *mlp = mdlp->item[0]->messages;
    1717  
    1718    fp = f;
    1719    real_file_name = real_filename;
    1720    logical_file_name = xstrdup (logical_filename);
    1721    line_number = 1;
    1722  
    1723    phase1_pushback_length = 0;
    1724  
    1725    lexical_context = lc_outside;
    1726  
    1727    phase2_pushback_length = 0;
    1728  
    1729    last_comment_line = -1;
    1730    last_non_comment_line = -1;
    1731  
    1732    /* For Python, the default source file encoding is UTF-8.  This is specified
    1733       in PEP 3120.  */
    1734    xgettext_current_file_source_encoding =
    1735     (xgettext_global_source_encoding != NULL ? xgettext_global_source_encoding :
    1736      po_charset_utf8);
    1737  #if HAVE_ICONV
    1738    xgettext_current_file_source_iconv = xgettext_global_source_iconv;
    1739  #endif
    1740  
    1741    xgettext_current_source_encoding = xgettext_current_file_source_encoding;
    1742  #if HAVE_ICONV
    1743    xgettext_current_source_iconv = xgettext_current_file_source_iconv;
    1744  #endif
    1745  
    1746    continuation_or_nonblank_line = false;
    1747  
    1748    open_pbb = 0;
    1749  
    1750    phase5_pushback_length = 0;
    1751  
    1752    flag_context_list_table = flag_table;
    1753    paren_nesting_depth = 0;
    1754    bracket_nesting_depth = 0;
    1755  
    1756    init_keywords ();
    1757  
    1758    /* Eat tokens until eof is seen.  When extract_balanced returns
    1759       due to an unbalanced closing parenthesis, just restart it.  */
    1760    while (!extract_balanced (mlp, token_type_eof,
    1761                              null_context, null_context_list_iterator,
    1762                              arglist_parser_alloc (mlp, NULL)))
    1763      ;
    1764  
    1765    fp = NULL;
    1766    real_file_name = NULL;
    1767    logical_file_name = NULL;
    1768    line_number = 0;
    1769  }