(root)/
gettext-0.22.4/
gettext-tools/
src/
po-lex.c
       1  /* GNU gettext - internationalization aids
       2     Copyright (C) 1995-2009, 2011, 2019 Free Software Foundation, Inc.
       3  
       4     This file was written by Peter Miller <millerp@canb.auug.org.au>.
       5     Multibyte character handling by Bruno Haible <haible@clisp.cons.org>.
       6  
       7     This program is free software: you can redistribute it and/or modify
       8     it under the terms of the GNU General Public License as published by
       9     the Free Software Foundation; either version 3 of the License, or
      10     (at your option) any later version.
      11  
      12     This program is distributed in the hope that it will be useful,
      13     but WITHOUT ANY WARRANTY; without even the implied warranty of
      14     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      15     GNU General Public License for more details.
      16  
      17     You should have received a copy of the GNU General Public License
      18     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      19  
      20  
      21  #ifdef HAVE_CONFIG_H
      22  # include "config.h"
      23  #endif
      24  
      25  /* Specification.  */
      26  #include "po-lex.h"
      27  
      28  #include <errno.h>
      29  #include <limits.h>
      30  #include <stdio.h>
      31  #include <stdlib.h>
      32  #include <string.h>
      33  #include <stdarg.h>
      34  
      35  #if HAVE_ICONV
      36  # include <iconv.h>
      37  #endif
      38  
      39  #include "c-ctype.h"
      40  #include "uniwidth.h"
      41  #include "gettext.h"
      42  #include "po-charset.h"
      43  #include "xalloc.h"
      44  #include "error.h"
      45  #include "error-progname.h"
      46  #include "xvasprintf.h"
      47  #include "po-error.h"
      48  #include "po-xerror.h"
      49  #include "pos.h"
      50  #include "message.h"
      51  #include "str-list.h"
      52  #include "po-gram-gen2.h"
      53  
      54  #define _(str) gettext(str)
      55  
      56  #if HAVE_ICONV
      57  # include "unistr.h"
      58  #endif
      59  
      60  #if HAVE_DECL_GETC_UNLOCKED
      61  # undef getc
      62  # define getc getc_unlocked
      63  #endif
      64  
      65  
/* Current position within the PO file.
   gram_pos carries the file name and line number that are attached to
   error messages.  gram_pos_column is the zero-based column on the current
   line: lex_getc advances it by mb_width of each character read and resets
   it to 0 at every newline; po_gram_error reports it as
   gram_pos_column + 1.  */
lex_pos_ty gram_pos;
int gram_pos_column;
      69  
      70  
      71  /* Error handling during the parsing of a PO file.
      72     These functions can access gram_pos and gram_pos_column.  */
      73  
      74  /* VARARGS1 */
      75  void
      76  po_gram_error (const char *fmt, ...)
      77  {
      78    va_list ap;
      79    char *buffer;
      80  
      81    va_start (ap, fmt);
      82    if (vasprintf (&buffer, fmt, ap) < 0)
      83      error (EXIT_FAILURE, 0, _("memory exhausted"));
      84    va_end (ap);
      85    po_xerror (PO_SEVERITY_ERROR, NULL, gram_pos.file_name, gram_pos.line_number,
      86               gram_pos_column + 1, false, buffer);
      87    free (buffer);
      88  
      89    if (error_message_count >= gram_max_allowed_errors)
      90      po_error (EXIT_FAILURE, 0, _("too many errors, aborting"));
      91  }
      92  
      93  /* VARARGS2 */
      94  void
      95  po_gram_error_at_line (const lex_pos_ty *pp, const char *fmt, ...)
      96  {
      97    va_list ap;
      98    char *buffer;
      99  
     100    va_start (ap, fmt);
     101    if (vasprintf (&buffer, fmt, ap) < 0)
     102      error (EXIT_FAILURE, 0, _("memory exhausted"));
     103    va_end (ap);
     104    po_xerror (PO_SEVERITY_ERROR, NULL, pp->file_name, pp->line_number,
     105               (size_t)(-1), false, buffer);
     106    free (buffer);
     107  
     108    if (error_message_count >= gram_max_allowed_errors)
     109      po_error (EXIT_FAILURE, 0, _("too many errors, aborting"));
     110  }
     111  
     112  
     113  /* The lowest level of PO file parsing converts bytes to multibyte characters.
     114     This is needed
     115     1. for C compatibility: ISO C 99 section 5.1.1.2 says that the first
     116        translation phase maps bytes to characters.
     117     2. to keep track of the current column, for the sake of precise error
     118        location. Emacs compile.el interprets the column in error messages
     119        by default as a screen column number, not as character number.
     120     3. to avoid skipping backslash-newline in the midst of a multibyte
     121        character. If XY is a multibyte character,  X \ newline Y  is invalid.
     122   */
     123  
/* Multibyte character data type.  */
/* Note this depends on po_lex_charset and po_lex_iconv, which get set
   while the file is being parsed.  */

/* Capacity, in bytes, of the byte buffer of one multibyte character.  */
#define MBCHAR_BUF_SIZE 24

/* One character as read from the input: its raw bytes and, when the
   conversion succeeded, its Unicode value.  */
struct mbchar
{
  size_t bytes;         /* number of bytes of current character, > 0;
                           the value 0 is used as the EOF marker */
#if HAVE_ICONV
  bool uc_valid;        /* true if uc is a valid Unicode character */
  ucs4_t uc;            /* if uc_valid: the current character */
#endif
  char buf[MBCHAR_BUF_SIZE]; /* room for the bytes; only the first 'bytes'
                                elements are meaningful */
};

/* We want to pass multibyte characters by reference automatically,
   therefore we use an array type.  */
typedef struct mbchar mbchar_t[1];
     143  
     144  /* A version of memcpy optimized for the case n <= 1.  */
     145  static inline void
     146  memcpy_small (void *dst, const void *src, size_t n)
     147  {
     148    if (n > 0)
     149      {
     150        char *q = (char *) dst;
     151        const char *p = (const char *) src;
     152  
     153        *q = *p;
     154        if (--n > 0)
     155          do *++q = *++p; while (--n > 0);
     156      }
     157  }
     158  
     159  /* EOF (not a real character) is represented with bytes = 0 and
     160     uc_valid = false.  */
     161  static inline bool
     162  mb_iseof (const mbchar_t mbc)
     163  {
     164    return (mbc->bytes == 0);
     165  }
     166  
     167  /* Access the current character.  */
     168  static inline const char *
     169  mb_ptr (const mbchar_t mbc)
     170  {
     171    return mbc->buf;
     172  }
     173  static inline size_t
     174  mb_len (const mbchar_t mbc)
     175  {
     176    return mbc->bytes;
     177  }
     178  
     179  /* Comparison of characters.  */
     180  
     181  static inline bool
     182  mb_iseq (const mbchar_t mbc, char sc)
     183  {
     184    /* Note: It is wrong to compare only mbc->uc, because when the encoding is
     185       SHIFT_JIS, mbc->buf[0] == '\\' corresponds to mbc->uc == 0x00A5, but we
     186       want to treat it as an escape character, although it looks like a Yen
     187       sign.  */
     188  #if HAVE_ICONV && 0
     189    if (mbc->uc_valid)
     190      return (mbc->uc == sc); /* wrong! */
     191    else
     192  #endif
     193      return (mbc->bytes == 1 && mbc->buf[0] == sc);
     194  }
     195  
     196  static inline bool
     197  mb_isnul (const mbchar_t mbc)
     198  {
     199  #if HAVE_ICONV
     200    if (mbc->uc_valid)
     201      return (mbc->uc == 0);
     202    else
     203  #endif
     204      return (mbc->bytes == 1 && mbc->buf[0] == 0);
     205  }
     206  
     207  static inline int
     208  mb_cmp (const mbchar_t mbc1, const mbchar_t mbc2)
     209  {
     210  #if HAVE_ICONV
     211    if (mbc1->uc_valid && mbc2->uc_valid)
     212      return (int) mbc1->uc - (int) mbc2->uc;
     213    else
     214  #endif
     215      return (mbc1->bytes == mbc2->bytes
     216              ? memcmp (mbc1->buf, mbc2->buf, mbc1->bytes)
     217              : mbc1->bytes < mbc2->bytes
     218                ? (memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) > 0 ? 1 : -1)
     219                : (memcmp (mbc1->buf, mbc2->buf, mbc2->bytes) >= 0 ? 1 : -1));
     220  }
     221  
     222  static inline bool
     223  mb_equal (const mbchar_t mbc1, const mbchar_t mbc2)
     224  {
     225  #if HAVE_ICONV
     226    if (mbc1->uc_valid && mbc2->uc_valid)
     227      return mbc1->uc == mbc2->uc;
     228    else
     229  #endif
     230      return (mbc1->bytes == mbc2->bytes
     231              && memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) == 0);
     232  }
     233  
     234  /* <ctype.h>, <wctype.h> classification.  */
     235  
     236  static inline bool
     237  mb_isascii (const mbchar_t mbc)
     238  {
     239  #if HAVE_ICONV
     240    if (mbc->uc_valid)
     241      return (mbc->uc >= 0x0000 && mbc->uc <= 0x007F);
     242    else
     243  #endif
     244      return (mbc->bytes == 1
     245  #if CHAR_MIN < 0x00 /* to avoid gcc warning */
     246              && mbc->buf[0] >= 0x00
     247  #endif
     248  #if CHAR_MAX > 0x7F /* to avoid gcc warning */
     249              && mbc->buf[0] <= 0x7F
     250  #endif
     251             );
     252  }
     253  
     254  /* Extra <wchar.h> function.  */
     255  
     256  /* Unprintable characters appear as a small box of width 1.  */
     257  #define MB_UNPRINTABLE_WIDTH 1
     258  
     259  static int
     260  mb_width (const mbchar_t mbc)
     261  {
     262  #if HAVE_ICONV
     263    if (mbc->uc_valid)
     264      {
     265        ucs4_t uc = mbc->uc;
     266        const char *encoding =
     267          (po_lex_iconv != (iconv_t)(-1) ? po_lex_charset : "");
     268        int w = uc_width (uc, encoding);
     269        /* For unprintable characters, arbitrarily return 0 for control
     270           characters (except tab) and MB_UNPRINTABLE_WIDTH otherwise.  */
     271        if (w >= 0)
     272          return w;
     273        if (uc >= 0x0000 && uc <= 0x001F)
     274          {
     275            if (uc == 0x0009)
     276              return 8 - (gram_pos_column & 7);
     277            return 0;
     278          }
     279        if ((uc >= 0x007F && uc <= 0x009F) || (uc >= 0x2028 && uc <= 0x2029))
     280          return 0;
     281        return MB_UNPRINTABLE_WIDTH;
     282      }
     283    else
     284  #endif
     285      {
     286        if (mbc->bytes == 1)
     287          {
     288            if (
     289  #if CHAR_MIN < 0x00 /* to avoid gcc warning */
     290                mbc->buf[0] >= 0x00 &&
     291  #endif
     292                mbc->buf[0] <= 0x1F)
     293              {
     294                if (mbc->buf[0] == 0x09)
     295                  return 8 - (gram_pos_column & 7);
     296                return 0;
     297              }
     298            if (mbc->buf[0] == 0x7F)
     299              return 0;
     300          }
     301        return MB_UNPRINTABLE_WIDTH;
     302      }
     303  }
     304  
     305  /* Output.  */
     306  static inline void
     307  mb_putc (const mbchar_t mbc, FILE *stream)
     308  {
     309    fwrite (mbc->buf, 1, mbc->bytes, stream);
     310  }
     311  
     312  /* Assignment.  */
     313  static inline void
     314  mb_setascii (mbchar_t mbc, char sc)
     315  {
     316    mbc->bytes = 1;
     317  #if HAVE_ICONV
     318    mbc->uc_valid = 1;
     319    mbc->uc = sc;
     320  #endif
     321    mbc->buf[0] = sc;
     322  }
     323  
     324  /* Copying a character.  */
     325  static inline void
     326  mb_copy (mbchar_t new_mbc, const mbchar_t old_mbc)
     327  {
     328    memcpy_small (&new_mbc->buf[0], &old_mbc->buf[0], old_mbc->bytes);
     329    new_mbc->bytes = old_mbc->bytes;
     330  #if HAVE_ICONV
     331    if ((new_mbc->uc_valid = old_mbc->uc_valid))
     332      new_mbc->uc = old_mbc->uc;
     333  #endif
     334  }
     335  
     336  
     337  /* Multibyte character input.  */
     338  
/* Number of characters that can be pushed back.
   We need 1 for lex_getc, plus 1 for lex_ungetc.  */
#define NPUSHBACK 2

/* Data type of a multibyte character input stream.  */
struct mbfile
{
  /* Underlying byte stream.  */
  FILE *fp;
  /* Set once getc on fp has returned EOF.  */
  bool eof_seen;
  /* Number of valid entries in pushback[]; mbfile_ungetc stores at this
     index and mbfile_getc takes the most recently stored entry first.  */
  int have_pushback;
  /* Number of bytes in buf[] that were read from fp but not yet returned
     as part of a character.  */
  unsigned int bufcount;
  char buf[MBCHAR_BUF_SIZE];
  struct mbchar pushback[NPUSHBACK];
};

/* We want to pass multibyte streams by reference automatically,
   therefore we use an array type.  */
typedef struct mbfile mbfile_t[1];

/* Whether invalid multibyte sequences in the input shall be signalled
   or silently tolerated.  Set by lex_start, cleared by lex_end and while
   the lexer accumulates a comment.  */
static bool signal_eilseq;
     361  
     362  static inline void
     363  mbfile_init (mbfile_t mbf, FILE *stream)
     364  {
     365    mbf->fp = stream;
     366    mbf->eof_seen = false;
     367    mbf->have_pushback = 0;
     368    mbf->bufcount = 0;
     369  }
     370  
     371  /* Read the next multibyte character from mbf and put it into mbc.
     372     If a read error occurs, errno is set and ferror (mbf->fp) becomes true.  */
     373  static void
     374  mbfile_getc (mbchar_t mbc, mbfile_t mbf)
     375  {
     376    size_t bytes;
     377  
     378    /* If EOF has already been seen, don't use getc.  This matters if
     379       mbf->fp is connected to an interactive tty.  */
     380    if (mbf->eof_seen)
     381      goto eof;
     382  
     383    /* Return character pushed back, if there is one.  */
     384    if (mbf->have_pushback > 0)
     385      {
     386        mbf->have_pushback--;
     387        mb_copy (mbc, &mbf->pushback[mbf->have_pushback]);
     388        return;
     389      }
     390  
     391    /* Before using iconv, we need at least one byte.  */
     392    if (mbf->bufcount == 0)
     393      {
     394        int c = getc (mbf->fp);
     395        if (c == EOF)
     396          {
     397            mbf->eof_seen = true;
     398            goto eof;
     399          }
     400        mbf->buf[0] = (unsigned char) c;
     401        mbf->bufcount++;
     402      }
     403  
     404  #if HAVE_ICONV
     405    if (po_lex_iconv != (iconv_t)(-1))
     406      {
     407        /* Use iconv on an increasing number of bytes.  Read only as many
     408           bytes from mbf->fp as needed.  This is needed to give reasonable
     409           interactive behaviour when mbf->fp is connected to an interactive
     410           tty.  */
     411        for (;;)
     412          {
     413            unsigned char scratchbuf[64];
     414            const char *inptr = &mbf->buf[0];
     415            size_t insize = mbf->bufcount;
     416            char *outptr = (char *) &scratchbuf[0];
     417            size_t outsize = sizeof (scratchbuf);
     418  
     419            size_t res = iconv (po_lex_iconv,
     420                                (ICONV_CONST char **) &inptr, &insize,
     421                                &outptr, &outsize);
     422            /* We expect that a character has been produced if and only if
     423               some input bytes have been consumed.  */
     424            if ((insize < mbf->bufcount) != (outsize < sizeof (scratchbuf)))
     425              abort ();
     426            if (outsize == sizeof (scratchbuf))
     427              {
     428                /* No character has been produced.  Must be an error.  */
     429                if (res != (size_t)(-1))
     430                  abort ();
     431  
     432                if (errno == EILSEQ)
     433                  {
     434                    /* An invalid multibyte sequence was encountered.  */
     435                    /* Return a single byte.  */
     436                    if (signal_eilseq)
     437                      po_gram_error (_("invalid multibyte sequence"));
     438                    bytes = 1;
     439                    mbc->uc_valid = false;
     440                    break;
     441                  }
     442                else if (errno == EINVAL)
     443                  {
     444                    /* An incomplete multibyte character.  */
     445                    int c;
     446  
     447                    if (mbf->bufcount == MBCHAR_BUF_SIZE)
     448                      {
     449                        /* An overlong incomplete multibyte sequence was
     450                           encountered.  */
     451                        /* Return a single byte.  */
     452                        bytes = 1;
     453                        mbc->uc_valid = false;
     454                        break;
     455                      }
     456  
     457                    /* Read one more byte and retry iconv.  */
     458                    c = getc (mbf->fp);
     459                    if (c == EOF)
     460                      {
     461                        mbf->eof_seen = true;
     462                        if (ferror (mbf->fp))
     463                          goto eof;
     464                        if (signal_eilseq)
     465                          po_gram_error (_("incomplete multibyte sequence at end of file"));
     466                        bytes = mbf->bufcount;
     467                        mbc->uc_valid = false;
     468                        break;
     469                      }
     470                    mbf->buf[mbf->bufcount++] = (unsigned char) c;
     471                    if (c == '\n')
     472                      {
     473                        if (signal_eilseq)
     474                          po_gram_error (_("incomplete multibyte sequence at end of line"));
     475                        bytes = mbf->bufcount - 1;
     476                        mbc->uc_valid = false;
     477                        break;
     478                      }
     479                  }
     480                else
     481                  {
     482                    const char *errno_description = strerror (errno);
     483                    po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
     484                               xasprintf ("%s: %s",
     485                                          _("iconv failure"),
     486                                          errno_description));
     487                  }
     488              }
     489            else
     490              {
     491                size_t outbytes = sizeof (scratchbuf) - outsize;
     492                bytes = mbf->bufcount - insize;
     493  
     494                /* We expect that one character has been produced.  */
     495                if (bytes == 0)
     496                  abort ();
     497                if (outbytes == 0)
     498                  abort ();
     499                /* Convert it from UTF-8 to UCS-4.  */
     500                if (u8_mbtoucr (&mbc->uc, scratchbuf, outbytes) < (int) outbytes)
     501                  {
     502                    /* scratchbuf contains an out-of-range Unicode character
     503                       (> 0x10ffff).  */
     504                    if (signal_eilseq)
     505                      po_gram_error (_("invalid multibyte sequence"));
     506                    mbc->uc_valid = false;
     507                    break;
     508                  }
     509                mbc->uc_valid = true;
     510                break;
     511              }
     512          }
     513      }
     514    else
     515  #endif
     516      {
     517        if (po_lex_weird_cjk
     518            /* Special handling of encodings with CJK structure.  */
     519            && (unsigned char) mbf->buf[0] >= 0x80)
     520          {
     521            if (mbf->bufcount == 1)
     522              {
     523                /* Read one more byte.  */
     524                int c = getc (mbf->fp);
     525                if (c == EOF)
     526                  {
     527                    if (ferror (mbf->fp))
     528                      {
     529                        mbf->eof_seen = true;
     530                        goto eof;
     531                      }
     532                  }
     533                else
     534                  {
     535                    mbf->buf[1] = (unsigned char) c;
     536                    mbf->bufcount++;
     537                  }
     538              }
     539            if (mbf->bufcount >= 2 && (unsigned char) mbf->buf[1] >= 0x30)
     540              /* Return a double byte.  */
     541              bytes = 2;
     542            else
     543              /* Return a single byte.  */
     544              bytes = 1;
     545          }
     546        else
     547          {
     548            /* Return a single byte.  */
     549            bytes = 1;
     550          }
     551  #if HAVE_ICONV
     552        mbc->uc_valid = false;
     553  #endif
     554      }
     555  
     556    /* Return the multibyte sequence mbf->buf[0..bytes-1].  */
     557    memcpy_small (&mbc->buf[0], &mbf->buf[0], bytes);
     558    mbc->bytes = bytes;
     559  
     560    mbf->bufcount -= bytes;
     561    if (mbf->bufcount > 0)
     562      {
     563        /* It's not worth calling memmove() for so few bytes.  */
     564        unsigned int count = mbf->bufcount;
     565        char *p = &mbf->buf[0];
     566  
     567        do
     568          {
     569            *p = *(p + bytes);
     570            p++;
     571          }
     572        while (--count > 0);
     573      }
     574    return;
     575  
     576  eof:
     577    /* An mbchar_t with bytes == 0 is used to indicate EOF.  */
     578    mbc->bytes = 0;
     579  #if HAVE_ICONV
     580    mbc->uc_valid = false;
     581  #endif
     582    return;
     583  }
     584  
     585  static void
     586  mbfile_ungetc (const mbchar_t mbc, mbfile_t mbf)
     587  {
     588    if (mbf->have_pushback >= NPUSHBACK)
     589      abort ();
     590    mb_copy (&mbf->pushback[mbf->have_pushback], mbc);
     591    mbf->have_pushback++;
     592  }
     593  
     594  
/* Lexer variables.  */

/* The input stream of the PO file being lexed; set up by lex_start.  */
static mbfile_t mbf;
/* Once error_message_count reaches this value, po_gram_error and
   po_gram_error_at_line abort with "too many errors, aborting".  */
unsigned int gram_max_allowed_errors = 20;
/* True after a "#~" prefix has been seen on the current line; reset at
   each newline.  */
static bool po_lex_obsolete;
/* True after a "#|" or "#~|" prefix has been seen on the current line;
   reset at each newline.  Makes keyword_p return the PREV_* tokens.  */
static bool po_lex_previous;
/* Tested by po_gram_lex when it encounters an ordinary "#" comment.  */
static bool pass_comments = false;
/* NOTE(review): not referenced in the visible part of this file; presumably
   controls whether obsolete entries are passed on - confirm.  */
bool pass_obsolete_entries = false;
     603  
     604  
     605  /* Prepare lexical analysis.  */
     606  void
     607  lex_start (FILE *fp, const char *real_filename, const char *logical_filename)
     608  {
     609    /* Ignore the logical_filename, because PO file entries already have
     610       their file names attached.  But use real_filename for error messages.  */
     611    gram_pos.file_name = xstrdup (real_filename);
     612  
     613    mbfile_init (mbf, fp);
     614  
     615    gram_pos.line_number = 1;
     616    gram_pos_column = 0;
     617    signal_eilseq = true;
     618    po_lex_obsolete = false;
     619    po_lex_previous = false;
     620    po_lex_charset_init ();
     621  }
     622  
     623  /* Terminate lexical analysis.  */
     624  void
     625  lex_end ()
     626  {
     627    mbf->fp = NULL;
     628    gram_pos.file_name = NULL;
     629    gram_pos.line_number = 0;
     630    gram_pos_column = 0;
     631    signal_eilseq = false;
     632    po_lex_obsolete = false;
     633    po_lex_previous = false;
     634    po_lex_charset_close ();
     635  }
     636  
     637  
     638  /* Read a single character, dealing with backslash-newline.
     639     Also keep track of the current line number and column number.  */
     640  static void
     641  lex_getc (mbchar_t mbc)
     642  {
     643    for (;;)
     644      {
     645        mbfile_getc (mbc, mbf);
     646  
     647        if (mb_iseof (mbc))
     648          {
     649            if (ferror (mbf->fp))
     650             bomb:
     651              {
     652                const char *errno_description = strerror (errno);
     653                po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
     654                           xasprintf ("%s: %s",
     655                                      xasprintf (_("error while reading \"%s\""),
     656                                                 gram_pos.file_name),
     657                                      errno_description));
     658              }
     659            break;
     660          }
     661  
     662        if (mb_iseq (mbc, '\n'))
     663          {
     664            gram_pos.line_number++;
     665            gram_pos_column = 0;
     666            break;
     667          }
     668  
     669        gram_pos_column += mb_width (mbc);
     670  
     671        if (mb_iseq (mbc, '\\'))
     672          {
     673            mbchar_t mbc2;
     674  
     675            mbfile_getc (mbc2, mbf);
     676  
     677            if (mb_iseof (mbc2))
     678              {
     679                if (ferror (mbf->fp))
     680                  goto bomb;
     681                break;
     682              }
     683  
     684            if (!mb_iseq (mbc2, '\n'))
     685              {
     686                mbfile_ungetc (mbc2, mbf);
     687                break;
     688              }
     689  
     690            gram_pos.line_number++;
     691            gram_pos_column = 0;
     692          }
     693        else
     694          break;
     695      }
     696  }
     697  
     698  
     699  static void
     700  lex_ungetc (const mbchar_t mbc)
     701  {
     702    if (!mb_iseof (mbc))
     703      {
     704        if (mb_iseq (mbc, '\n'))
     705          /* Decrement the line number, but don't care about the column.  */
     706          gram_pos.line_number--;
     707        else
     708          /* Decrement the column number.  Also works well enough for tabs.  */
     709          gram_pos_column -= mb_width (mbc);
     710  
     711        mbfile_ungetc (mbc, mbf);
     712      }
     713  }
     714  
     715  
     716  static int
     717  keyword_p (const char *s)
     718  {
     719    if (!po_lex_previous)
     720      {
     721        if (!strcmp (s, "domain"))
     722          return DOMAIN;
     723        if (!strcmp (s, "msgid"))
     724          return MSGID;
     725        if (!strcmp (s, "msgid_plural"))
     726          return MSGID_PLURAL;
     727        if (!strcmp (s, "msgstr"))
     728          return MSGSTR;
     729        if (!strcmp (s, "msgctxt"))
     730          return MSGCTXT;
     731      }
     732    else
     733      {
     734        /* Inside a "#|" context, the keywords have a different meaning.  */
     735        if (!strcmp (s, "msgid"))
     736          return PREV_MSGID;
     737        if (!strcmp (s, "msgid_plural"))
     738          return PREV_MSGID_PLURAL;
     739        if (!strcmp (s, "msgctxt"))
     740          return PREV_MSGCTXT;
     741      }
     742    po_gram_error_at_line (&gram_pos, _("keyword \"%s\" unknown"), s);
     743    return NAME;
     744  }
     745  
     746  
     747  static int
     748  control_sequence ()
     749  {
     750    mbchar_t mbc;
     751    int val;
     752    int max;
     753  
     754    lex_getc (mbc);
     755    if (mb_len (mbc) == 1)
     756      switch (mb_ptr (mbc) [0])
     757        {
     758        case 'n':
     759          return '\n';
     760  
     761        case 't':
     762          return '\t';
     763  
     764        case 'b':
     765          return '\b';
     766  
     767        case 'r':
     768          return '\r';
     769  
     770        case 'f':
     771          return '\f';
     772  
     773        case 'v':
     774          return '\v';
     775  
     776        case 'a':
     777          return '\a';
     778  
     779        case '\\':
     780        case '"':
     781          return mb_ptr (mbc) [0];
     782  
     783        case '0': case '1': case '2': case '3':
     784        case '4': case '5': case '6': case '7':
     785          val = 0;
     786          max = 0;
     787          for (;;)
     788            {
     789              char c = mb_ptr (mbc) [0];
     790              /* Warning: not portable, can't depend on '0'..'7' ordering.  */
     791              val = val * 8 + (c - '0');
     792              if (++max == 3)
     793                break;
     794              lex_getc (mbc);
     795              if (mb_len (mbc) == 1)
     796                switch (mb_ptr (mbc) [0])
     797                  {
     798                  case '0': case '1': case '2': case '3':
     799                  case '4': case '5': case '6': case '7':
     800                    continue;
     801  
     802                  default:
     803                    break;
     804                  }
     805              lex_ungetc (mbc);
     806              break;
     807            }
     808          return val;
     809  
     810        case 'x':
     811          lex_getc (mbc);
     812          if (mb_iseof (mbc) || mb_len (mbc) != 1
     813              || !c_isxdigit (mb_ptr (mbc) [0]))
     814            break;
     815  
     816          val = 0;
     817          for (;;)
     818            {
     819              char c = mb_ptr (mbc) [0];
     820              val *= 16;
     821              if (c_isdigit (c))
     822                /* Warning: not portable, can't depend on '0'..'9' ordering */
     823                val += c - '0';
     824              else if (c_isupper (c))
     825                /* Warning: not portable, can't depend on 'A'..'F' ordering */
     826                val += c - 'A' + 10;
     827              else
     828                /* Warning: not portable, can't depend on 'a'..'f' ordering */
     829                val += c - 'a' + 10;
     830  
     831              lex_getc (mbc);
     832              if (mb_len (mbc) == 1)
     833                switch (mb_ptr (mbc) [0])
     834                  {
     835                  case '0': case '1': case '2': case '3': case '4':
     836                  case '5': case '6': case '7': case '8': case '9':
     837                  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
     838                  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
     839                    continue;
     840  
     841                  default:
     842                    break;
     843                  }
     844              lex_ungetc (mbc);
     845              break;
     846            }
     847          return val;
     848  
     849        /* FIXME: \u and \U are not handled.  */
     850        }
     851    lex_ungetc (mbc);
     852    po_gram_error (_("invalid control sequence"));
     853    return ' ';
     854  }
     855  
     856  
     857  /* Return the next token in the PO file.  The return codes are defined
     858     in "po-gram-gen2.h".  Associated data is put in 'po_gram_lval'.

             Return value: 0 at end of input; COMMENT (only when comments are
             passed), STRING or PREV_STRING, NUMBER, '[' or ']'; for a word, the
             code that keyword_p() assigns to it; JUNK for any other character.

             Whitespace, "#~" and "#|" prefixes and comments that are not passed
             produce no token; they only update po_lex_obsolete and
             po_lex_previous.

             The accumulator 'buf' is static and reused by every call.  STRING,
             PREV_STRING and NAME values are fresh xstrdup() copies, whereas a
             COMMENT value points at 'buf' itself and is therefore only valid
             until the next call that accumulates text.  */
     859  int
     860  po_gram_lex ()
     861  {
            /* Accumulator shared by all calls.  bufmax is its allocated size; it
               grows in steps of 100 bytes.  */
     862    static char *buf;
     863    static size_t bufmax;
     864    mbchar_t mbc;
     865    size_t bufpos;
     866  
            /* Loop until some case returns a token; the cases that 'break' out of
               the switch have consumed input without producing one.  */
     867    for (;;)
     868      {
     869        lex_getc (mbc);
     870  
     871        if (mb_iseof (mbc))
     872          /* Yacc want this for end of file.  */
     873          return 0;
     874  
                /* Every token starts with a single-byte character; anything
                   longer is JUNK (see the else branch at the bottom).  */
     875        if (mb_len (mbc) == 1)
     876          switch (mb_ptr (mbc) [0])
     877            {
     878            case '\n':
                      /* A "#~" or "#|" prefix applies to one line only.  */
     879              po_lex_obsolete = false;
     880              po_lex_previous = false;
     881              /* Ignore whitespace, not relevant for the grammar.  */
     882              break;
     883  
     884            case ' ':
     885            case '\t':
     886            case '\r':
     887            case '\f':
     888            case '\v':
     889              /* Ignore whitespace, not relevant for the grammar.  */
     890              break;
     891  
     892            case '#':
     893              lex_getc (mbc);
     894              if (mb_iseq (mbc, '~'))
     895                /* A pseudo-comment beginning with #~ is found.  This is
     896                   not a comment.  It is the format for obsolete entries.
     897                   We simply discard the "#~" prefix.  The following
     898                   characters are expected to be well formed.  */
     899                {
     900                  po_lex_obsolete = true;
     901                  /* A pseudo-comment beginning with #~| denotes a previous
     902                     untranslated string in an obsolete entry.  This does not
     903                     make much sense semantically, and is implemented here
     904                     for completeness only.  */
     905                  lex_getc (mbc);
     906                  if (mb_iseq (mbc, '|'))
     907                    po_lex_previous = true;
     908                  else
     909                    lex_ungetc (mbc);
     910                  break;
     911                }
     912              if (mb_iseq (mbc, '|'))
     913                /* A pseudo-comment beginning with #| is found.  This is
     914                   the previous untranslated string.  We discard the "#|"
     915                   prefix, but change the keywords and string returns
     916                   accordingly.  */
     917                {
     918                  po_lex_previous = true;
     919                  break;
     920                }
     921  
     922              /* Accumulate comments into a buffer.  If we have been asked
     923                 to pass comments, generate a COMMENT token, otherwise
     924                 discard it.
                         NOTE(review): signal_eilseq is cleared while the comment
                         text is read and set again afterwards; presumably this
                         silences invalid-multibyte-sequence diagnostics inside
                         comments -- confirm against the code that reads it.  */
     925              signal_eilseq = false;
     926              if (pass_comments)
     927                {
     928                  bufpos = 0;
     929                  for (;;)
     930                    {
                              /* Make room for mbc plus the terminating NUL.  */
     931                      while (bufpos + mb_len (mbc) >= bufmax)
     932                        {
     933                          bufmax += 100;
     934                          buf = xrealloc (buf, bufmax);
     935                        }
                              /* The comment extends to the end of the line or of
                                 the file; the newline itself is not stored.  */
     936                      if (mb_iseof (mbc) || mb_iseq (mbc, '\n'))
     937                        break;
     938  
     939                      memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
     940                      bufpos += mb_len (mbc);
     941  
     942                      lex_getc (mbc);
     943                    }
     944                  buf[bufpos] = '\0';
     945  
                          /* The token refers to buf directly, not to a copy.  */
     946                  po_gram_lval.string.string = buf;
     947                  po_gram_lval.string.pos = gram_pos;
     948                  po_gram_lval.string.obsolete = po_lex_obsolete;
                          /* The terminating newline was consumed above, so the
                             '\n' case will not see it; reset the flag here.  */
     949                  po_lex_obsolete = false;
     950                  signal_eilseq = true;
     951                  return COMMENT;
     952                }
     953              else
     954                {
     955                  /* We do this in separate loop because collecting large
     956                     comments while they get not passed to the upper layers
     957                     is not very efficient.  */
     958                  while (!mb_iseof (mbc) && !mb_iseq (mbc, '\n'))
     959                    lex_getc (mbc);
     960                  po_lex_obsolete = false;
     961                  signal_eilseq = true;
     962                }
     963              break;
     964  
     965            case '"':
     966              /* Accumulate a string.  */
     967              bufpos = 0;
     968              for (;;)
     969                {
     970                  lex_getc (mbc);
                          /* Make room for mbc plus the terminating NUL.  For a
                             backslash this also covers the single byte stored
                             for the escape below.  */
     971                  while (bufpos + mb_len (mbc) >= bufmax)
     972                    {
     973                      bufmax += 100;
     974                      buf = xrealloc (buf, bufmax);
     975                    }
                          /* After either error below, what has been accumulated so
                             far is still returned as a string token.  */
     976                  if (mb_iseof (mbc))
     977                    {
     978                      po_gram_error_at_line (&gram_pos,
     979                                             _("end-of-file within string"));
     980                      break;
     981                    }
     982                  if (mb_iseq (mbc, '\n'))
     983                    {
     984                      po_gram_error_at_line (&gram_pos,
     985                                             _("end-of-line within string"));
     986                      break;
     987                    }
     988                  if (mb_iseq (mbc, '"'))
     989                    break;
     990                  if (mb_iseq (mbc, '\\'))
     991                    {
                              /* control_sequence () reads the rest of the escape
                                 and returns the one byte to store.  */
     992                      buf[bufpos++] = control_sequence ();
     993                      continue;
     994                    }
     995  
     996                  /* Add mbc to the accumulator.  */
     997                  memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
     998                  bufpos += mb_len (mbc);
     999                }
    1000              buf[bufpos] = '\0';
    1001  
    1002              /* Strings cannot contain the msgctxt separator, because it cannot
    1003                 be faithfully represented in the msgid of a .mo file.  */
    1004              if (strchr (buf, MSGCTXT_SEPARATOR) != NULL)
    1005                po_gram_error_at_line (&gram_pos,
    1006                                       _("context separator <EOT> within string"));
    1007  
    1008              /* FIXME: Treatment of embedded \000 chars is incorrect.  */
    1009              po_gram_lval.string.string = xstrdup (buf);
    1010              po_gram_lval.string.pos = gram_pos;
    1011              po_gram_lval.string.obsolete = po_lex_obsolete;
    1012              return (po_lex_previous ? PREV_STRING : STRING);
    1013  
                    /* A word: a letter, '_' or '$', followed by any number of
                       letters, digits, '_' or '$'.  */
    1014            case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    1015            case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    1016            case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    1017            case 's': case 't': case 'u': case 'v': case 'w': case 'x':
    1018            case 'y': case 'z':
    1019            case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    1020            case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
    1021            case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    1022            case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
    1023            case 'Y': case 'Z':
    1024            case '_': case '$':
    1025              bufpos = 0;
    1026              for (;;)
    1027                {
                          /* mbc is a single byte here: guaranteed by the outer
                             switch on entry and by the inner one afterwards.  */
    1028                  char c = mb_ptr (mbc) [0];
                          /* Keep room for c plus the terminating NUL.  */
    1029                  if (bufpos + 1 >= bufmax)
    1030                    {
    1031                      bufmax += 100;
    1032                      buf = xrealloc (buf, bufmax);
    1033                    }
    1034                  buf[bufpos++] = c;
    1035                  lex_getc (mbc);
    1036                  if (mb_len (mbc) == 1)
    1037                    switch (mb_ptr (mbc) [0])
    1038                      {
    1039                      default:
    1040                        break;
    1041                      case 'a': case 'b': case 'c': case 'd': case 'e':
    1042                      case 'f': case 'g': case 'h': case 'i': case 'j':
    1043                      case 'k': case 'l': case 'm': case 'n': case 'o':
    1044                      case 'p': case 'q': case 'r': case 's': case 't':
    1045                      case 'u': case 'v': case 'w': case 'x': case 'y':
    1046                      case 'z':
    1047                      case 'A': case 'B': case 'C': case 'D': case 'E':
    1048                      case 'F': case 'G': case 'H': case 'I': case 'J':
    1049                      case 'K': case 'L': case 'M': case 'N': case 'O':
    1050                      case 'P': case 'Q': case 'R': case 'S': case 'T':
    1051                      case 'U': case 'V': case 'W': case 'X': case 'Y':
    1052                      case 'Z':
    1053                      case '_': case '$':
    1054                      case '0': case '1': case '2': case '3': case '4':
    1055                      case '5': case '6': case '7': case '8': case '9':
    1056                        continue;
    1057                      }
    1058                  break;
    1059                }
                      /* The character that ended the word belongs to the next
                         token.  */
    1060              lex_ungetc (mbc);
    1061  
    1062              buf[bufpos] = '\0';
    1063  
    1064              {
    1065                int k = keyword_p (buf);
                        /* Only a NAME carries its spelling; every other code
                           carries just the position.  */
    1066                if (k == NAME)
    1067                  {
    1068                    po_gram_lval.string.string = xstrdup (buf);
    1069                    po_gram_lval.string.pos = gram_pos;
    1070                    po_gram_lval.string.obsolete = po_lex_obsolete;
    1071                  }
    1072                else
    1073                  {
    1074                    po_gram_lval.pos.pos = gram_pos;
    1075                    po_gram_lval.pos.obsolete = po_lex_obsolete;
    1076                  }
    1077                return k;
    1078              }
    1079  
                    /* A number: a run of decimal digits.  */
    1080            case '0': case '1': case '2': case '3': case '4':
    1081            case '5': case '6': case '7': case '8': case '9':
    1082              bufpos = 0;
    1083              for (;;)
    1084                {
    1085                  char c = mb_ptr (mbc) [0];
                          /* Keep room for c plus the terminating NUL.  */
    1086                  if (bufpos + 1 >= bufmax)
    1087                    {
    1088                      bufmax += 100;
    1089                      buf = xrealloc (buf, bufmax);
    1090                    }
    1091                  buf[bufpos++] = c;
    1092                  lex_getc (mbc);
    1093                  if (mb_len (mbc) == 1)
    1094                    switch (mb_ptr (mbc) [0])
    1095                      {
    1096                      default:
    1097                        break;
    1098  
    1099                      case '0': case '1': case '2': case '3': case '4':
    1100                      case '5': case '6': case '7': case '8': case '9':
    1101                        continue;
    1102                      }
    1103                  break;
    1104                }
                      /* The character that ended the number belongs to the next
                         token.  */
    1105              lex_ungetc (mbc);
    1106  
    1107              buf[bufpos] = '\0';
    1108  
                      /* strtol rather than atol: atol has undefined behaviour when
                         the value does not fit in a long, whereas strtol is
                         defined to return LONG_MAX in that case.  buf holds
                         decimal digits only, so no end pointer is needed.  */
    1109              po_gram_lval.number.number = strtol (buf, NULL, 10);
    1110              po_gram_lval.number.pos = gram_pos;
    1111              po_gram_lval.number.obsolete = po_lex_obsolete;
    1112              return NUMBER;
    1113  
    1114            case '[':
    1115              po_gram_lval.pos.pos = gram_pos;
    1116              po_gram_lval.pos.obsolete = po_lex_obsolete;
    1117              return '[';
    1118  
    1119            case ']':
    1120              po_gram_lval.pos.pos = gram_pos;
    1121              po_gram_lval.pos.obsolete = po_lex_obsolete;
    1122              return ']';
    1123  
    1124            default:
    1125              /* This will cause a syntax error.  */
    1126              return JUNK;
    1127            }
    1128        else
    1129          /* This will cause a syntax error.  */
    1130          return JUNK;
    1131      }
    1132  }
    1133  
    1134  
    1135  /* po_gram_lex() can return comments as COMMENT.  Switch this on or off.
             FLAG is stored in 'pass_comments', which po_gram_lex() tests each
             time it meets a '#' comment: when true the comment text is returned
             as a COMMENT token, when false it is read and discarded.  */
    1136  void
    1137  po_lex_pass_comments (bool flag)
    1138  {
    1139    pass_comments = flag;
    1140  }
    1141  
    1142  
    1143  /* po_gram_lex() can return obsolete entries as if they were normal entries.
    1144     Switch this on or off.
             This function only records FLAG in 'pass_obsolete_entries'; it has no
             other effect.  */
    1145  void
    1146  po_lex_pass_obsolete_entries (bool flag)
    1147  {
    1148    pass_obsolete_entries = flag;
    1149  }
    1149  }