(root)/
gettext-0.22.4/
gettext-tools/
src/
x-rst.c
       1  /* xgettext RST/RSJ backend.
       2     Copyright (C) 2001-2003, 2005-2009, 2018-2019 Free Software Foundation, Inc.
       3  
       4     This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.
       5  
       6     This program is free software: you can redistribute it and/or modify
       7     it under the terms of the GNU General Public License as published by
       8     the Free Software Foundation; either version 3 of the License, or
       9     (at your option) any later version.
      10  
      11     This program is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14     GNU General Public License for more details.
      15  
      16     You should have received a copy of the GNU General Public License
      17     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      18  
      19  #ifdef HAVE_CONFIG_H
      20  # include "config.h"
      21  #endif
      22  
      23  /* Specification.  */
      24  #include "x-rst.h"
      25  
      26  #include <errno.h>
      27  #include <stdbool.h>
      28  #include <stdio.h>
      29  #include <stddef.h>
      30  #include <stdlib.h>
      31  
      32  #include "c-ctype.h"
      33  #include "po-charset.h"
      34  #include "message.h"
      35  #include "xgettext.h"
      36  #include "xg-pos.h"
      37  #include "xg-encoding.h"
      38  #include "xg-mixed-string.h"
      39  #include "xg-message.h"
      40  #include "error.h"
      41  #include "error-progname.h"
      42  #include "xalloc.h"
      43  #include "gettext.h"
      44  
      45  #define _(s) gettext(s)
      46  
      47  /* RST stands for Resource String Table.
      48  
      49     An RST file consists of several string definitions.  A string definition
      50     starts at the beginning of a line and looks like this:
      51         ModuleName.ConstName=StringExpression
      52     A StringExpression consists of string pieces of the form 'xyz',
      53     single characters of the form #nnn (decimal integer), and +
      54     at the end of the line to designate continuation on the next line.
      55     String definitions can be separated by blank lines or comment lines
      56     beginning with '#'.
      57  
      58     This backend attempts to be functionally equivalent to the 'rstconv'
      59     program, part of the Free Pascal run time library, written by
      60     Sebastian Guenther.  Except that
      61       * the locations are output as "ModuleName.ConstName",
      62         not "ModuleName:ConstName",
      63       * we add the flag '#, object-pascal-format' where appropriate.
      64   */
      65  
      66  void
      67  extract_rst (FILE *f,
      68               const char *real_filename, const char *logical_filename,
      69               flag_context_list_table_ty *flag_table,
      70               msgdomain_list_ty *mdlp)
      71  {
      72    static char *buffer;
      73    static int bufmax;
      74    message_list_ty *mlp = mdlp->item[0]->messages;
      75    int line_number;
      76  
      77    line_number = 1;
      78    for (;;)
      79      {
      80        int c;
      81        int bufpos;
      82        char *location;
      83        char *msgid;
      84        lex_pos_ty pos;
      85  
      86        c = getc (f);
      87        if (c == EOF)
      88          break;
      89  
      90        /* Ignore blank line.  */
      91        if (c == '\n')
      92          {
      93            line_number++;
      94            continue;
      95          }
      96  
      97        /* Ignore comment line.  */
      98        if (c == '#')
      99          {
     100            do
     101              c = getc (f);
     102            while (c != EOF && c != '\n');
     103            if (c == EOF)
     104              break;
     105            line_number++;
     106            continue;
     107          }
     108  
     109        /* Read ModuleName.ConstName.  */
     110        bufpos = 0;
     111        for (;;)
     112          {
     113            if (c == EOF || c == '\n')
     114              {
     115                error_with_progname = false;
     116                error (EXIT_FAILURE, 0, _("%s:%d: invalid string definition"),
     117                       logical_filename, line_number);
     118                error_with_progname = true;
     119              }
     120            if (bufpos >= bufmax)
     121              {
     122                bufmax = 2 * bufmax + 10;
     123                buffer = xrealloc (buffer, bufmax);
     124              }
     125            if (c == '=')
     126              break;
     127            buffer[bufpos++] = c;
     128            c = getc (f);
     129            if (c == EOF && ferror (f))
     130              goto bomb;
     131          }
     132        buffer[bufpos] = '\0';
     133        location = xstrdup (buffer);
     134  
     135        /* Read StringExpression.  */
     136        bufpos = 0;
     137        for (;;)
     138          {
     139            c = getc (f);
     140            if (c == EOF)
     141              break;
     142            else if (c == '\n')
     143              {
     144                line_number++;
     145                break;
     146              }
     147            else if (c == '\'')
     148              {
     149                for (;;)
     150                  {
     151                    c = getc (f);
     152                    /* Embedded single quotes like 'abc''def' don't occur.
     153                       See fpc-1.0.4/compiler/cresstr.pas.  */
     154                    if (c == EOF || c == '\n' || c == '\'')
     155                      break;
     156                    if (bufpos >= bufmax)
     157                      {
     158                        bufmax = 2 * bufmax + 10;
     159                        buffer = xrealloc (buffer, bufmax);
     160                      }
     161                    buffer[bufpos++] = c;
     162                  }
     163                if (c == EOF)
     164                  break;
     165                else if (c == '\n')
     166                  {
     167                    line_number++;
     168                    break;
     169                  }
     170              }
     171            else if (c == '#')
     172              {
     173                int n;
     174                c = getc (f);
     175                if (c == EOF && ferror (f))
     176                  goto bomb;
     177                if (c == EOF || !c_isdigit (c))
     178                  {
     179                    error_with_progname = false;
     180                    error (EXIT_FAILURE, 0, _("%s:%d: missing number after #"),
     181                           logical_filename, line_number);
     182                    error_with_progname = true;
     183                  }
     184                n = (c - '0');
     185                for (;;)
     186                  {
     187                    c = getc (f);
     188                    if (c == EOF || !c_isdigit (c))
     189                      break;
     190                    n = n * 10 + (c - '0');
     191                  }
     192                if (bufpos >= bufmax)
     193                  {
     194                    bufmax = 2 * bufmax + 10;
     195                    buffer = xrealloc (buffer, bufmax);
     196                  }
     197                buffer[bufpos++] = (unsigned char) n;
     198                if (c == EOF)
     199                  break;
     200                ungetc (c, f);
     201              }
     202            else if (c == '+')
     203              {
     204                c = getc (f);
     205                if (c == EOF)
     206                  break;
     207                if (c == '\n')
     208                  line_number++;
     209                else
     210                  ungetc (c, f);
     211              }
     212            else
     213              {
     214                error_with_progname = false;
     215                error (EXIT_FAILURE, 0, _("%s:%d: invalid string expression"),
     216                       logical_filename, line_number);
     217                error_with_progname = true;
     218              }
     219          }
     220        if (bufpos >= bufmax)
     221          {
     222            bufmax = 2 * bufmax + 10;
     223            buffer = xrealloc (buffer, bufmax);
     224          }
     225        buffer[bufpos] = '\0';
     226        msgid = xstrdup (buffer);
     227  
     228        pos.file_name = location;
     229        pos.line_number = (size_t)(-1);
     230  
     231        remember_a_message (mlp, NULL, msgid, false, false, null_context, &pos,
     232                            NULL, NULL, false);
     233  
     234        /* Here c is the last read character: EOF or '\n'.  */
     235        if (c == EOF)
     236          break;
     237      }
     238  
     239    if (ferror (f))
     240      {
     241      bomb:
     242        error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
     243               real_filename);
     244      }
     245  }
     246  
     247  
     248  /* RSJ stands for Resource String Table in JSON.
     249  
     250     An RSJ file is a JSON file that contains several string definitions.
     251     It has the format (modulo whitespace)
     252       {
     253         "version": 1,
     254         "strings":
     255           [
     256             {
     257               "hash": <integer>,
     258               "name": <string>,
     259               "sourcebytes": [ <integer>... ],
     260               "value": <string>
     261             },
     262             ...
     263           ]
     264       }
     265     The sourcebytes array contains the original source bytes, in the
     266     source encoding (not guaranteed to be ISO-8859-1, see
     267     <http://wiki.freepascal.org/FPC_Unicode_support#Source_file_codepage>).
     268  
     269     This backend attempts to be functionally equivalent to the 'rstconv'
     270     program, part of the Free Pascal run time library, written by
     271     Sebastian Guenther.  Except that
     272       * we use the "value" as msgid, not the "sourcebytes",
     273       * the locations are output as "ModuleName.ConstName",
     274         not "ModuleName:ConstName",
     275       * we add the flag '#, object-pascal-format' where appropriate.
     276   */
     277  
     278  /* For the JSON syntax, refer to RFC 8259.  */
     279  
     280  /* ======================== Reading of characters.  ======================== */
     281  
/* The input file stream.  Set by extract_rsj for the duration of a run and
   read by phase1_getc and phase1_ungetc.  */
static FILE *fp;
     284  
     285  
     286  /* 1. line_number handling.  */
     287  
     288  static int
     289  phase1_getc ()
     290  {
     291    int c = getc (fp);
     292  
     293    if (c == EOF)
     294      {
     295        if (ferror (fp))
     296          error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
     297                 real_file_name);
     298        return EOF;
     299      }
     300  
     301    if (c == '\n')
     302      line_number++;
     303  
     304    return c;
     305  }
     306  
     307  /* Supports only one pushback character.  */
     308  static void
     309  phase1_ungetc (int c)
     310  {
     311    if (c != EOF)
     312      {
     313        if (c == '\n')
     314          --line_number;
     315  
     316        ungetc (c, fp);
     317      }
     318  }
     319  
     320  
     321  /* 2. Skipping whitespace.  */
     322  
     323  /* Tests whether a phase1_getc() result is JSON whitespace.  */
     324  static inline bool
     325  is_whitespace (int c)
     326  {
     327    return (c == ' ' || c == '\t' || c == '\n' || c == '\r');
     328  }
     329  
     330  static int
     331  phase2_getc ()
     332  {
     333    int c;
     334  
     335    do
     336      c = phase1_getc ();
     337    while (is_whitespace (c));
     338  
     339    return c;
     340  }
     341  
/* Pushes back a character C obtained from phase2_getc.  The whitespace that
   phase2_getc skipped is not restored; C is simply handed to
   phase1_ungetc.  */
static void
phase2_ungetc (int c)
{
  phase1_ungetc (c);
}
     347  
     348  
     349  /* ========================== Reading of tokens.  ========================== */
     350  
/* Result of parsing a token.  Returned by parse_integer and parse_string.  */

enum parse_result
{
  pr_parsed, /* successfully parsed */
  pr_none,   /* the next token is of a different type */
  pr_syntax  /* syntax error inside the token */
};
     359  
/* Growable buffer in which parse_integer stores the digits it has read, as
   a NUL-terminated string.  bufmax is the number of bytes allocated.  */
static char *buffer;
static int bufmax;
     362  
     363  /* Parses an integer.  Returns it in buffer, of length bufmax.
     364     Returns pr_parsed or pr_none.  */
     365  static enum parse_result
     366  parse_integer ()
     367  {
     368    int c;
     369    int bufpos;
     370  
     371    c = phase2_getc ();
     372    bufpos = 0;
     373    for (;;)
     374      {
     375        if (bufpos >= bufmax)
     376          {
     377            bufmax = 2 * bufmax + 10;
     378            buffer = xrealloc (buffer, bufmax);
     379          }
     380        if (!(c >= '0' && c <= '9'))
     381          break;
     382        buffer[bufpos++] = c;
     383        c = phase1_getc ();
     384      }
     385    phase1_ungetc (c);
     386    buffer[bufpos] = '\0';
     387    return (bufpos == 0 ? pr_none : pr_parsed);
     388  }
     389  
/* Accumulator in which parse_string builds the string it is reading.  */
static struct mixed_string_buffer stringbuf;
     391  
     392  /* Parses a string.  Returns it in stringbuf, in UTF-8 encoding.
     393     Returns a parse_result.  */
     394  static enum parse_result
     395  parse_string ()
     396  {
     397    int c;
     398  
     399    c = phase2_getc ();
     400    if (c != '"')
     401      {
     402        phase2_ungetc (c);
     403        return pr_none;
     404      }
     405    mixed_string_buffer_init (&stringbuf, lc_string,
     406                              logical_file_name, line_number);
     407    for (;;)
     408      {
     409        c = phase1_getc ();
     410        /* Keep line_number in sync.  */
     411        stringbuf.line_number = line_number;
     412        if (c == EOF || (c >= 0 && c < 0x20))
     413          return pr_syntax;
     414        if (c == '"')
     415          break;
     416        if (c == '\\')
     417          {
     418            c = phase1_getc ();
     419            if (c == 'u')
     420              {
     421                unsigned int n = 0;
     422                int i;
     423  
     424                for (i = 0; i < 4; i++)
     425                  {
     426                    c = phase1_getc ();
     427  
     428                    if (c >= '0' && c <= '9')
     429                      n = (n << 4) + (c - '0');
     430                    else if (c >= 'A' && c <= 'F')
     431                      n = (n << 4) + (c - 'A' + 10);
     432                    else if (c >= 'a' && c <= 'f')
     433                      n = (n << 4) + (c - 'a' + 10);
     434                    else
     435                      return pr_syntax;
     436                  }
     437                mixed_string_buffer_append_unicode (&stringbuf, n);
     438              }
     439            else
     440              {
     441                switch (c)
     442                  {
     443                  case '"':
     444                  case '\\':
     445                  case '/':
     446                    break;
     447                  case 'b':
     448                    c = '\b';
     449                    break;
     450                  case 'f':
     451                    c = '\f';
     452                    break;
     453                  case 'n':
     454                    c = '\n';
     455                    break;
     456                  case 'r':
     457                    c = '\r';
     458                    break;
     459                  case 't':
     460                    c = '\t';
     461                    break;
     462                  default:
     463                    return pr_syntax;
     464                  }
     465                mixed_string_buffer_append_char (&stringbuf, c);
     466              }
     467          }
     468        else
     469          mixed_string_buffer_append_char (&stringbuf, c);
     470      }
     471    return pr_parsed;
     472  }
     473  
     474  void
     475  extract_rsj (FILE *f,
     476               const char *real_filename, const char *logical_filename,
     477               flag_context_list_table_ty *flag_table,
     478               msgdomain_list_ty *mdlp)
     479  {
     480    message_list_ty *mlp = mdlp->item[0]->messages;
     481    int c;
     482  
     483    fp = f;
     484    real_file_name = real_filename;
     485    logical_file_name = xstrdup (logical_filename);
     486    line_number = 1;
     487  
     488    /* JSON is always in UTF-8.  */
     489    xgettext_current_source_encoding = po_charset_utf8;
     490  
     491    /* Parse the initial opening brace.  */
     492    c = phase2_getc ();
     493    if (c != '{')
     494      goto invalid_json;
     495  
     496    c = phase2_getc ();
     497    if (c != '}')
     498      {
     499        phase2_ungetc (c);
     500        for (;;)
     501          {
     502            /* Parse a string.  */
     503            char *s1;
     504            if (parse_string () != pr_parsed)
     505              goto invalid_json;
     506            s1 = mixed_string_contents_free1 (
     507                   mixed_string_buffer_result (&stringbuf));
     508  
     509            /* Parse a colon.  */
     510            c = phase2_getc ();
     511            if (c != ':')
     512              goto invalid_json;
     513  
     514            if (strcmp (s1, "version") == 0)
     515              {
     516                /* Parse an integer.  */
     517                if (parse_integer () != pr_parsed)
     518                  goto invalid_rsj;
     519                if (strcmp (buffer, "1") != 0)
     520                  goto invalid_rsj_version;
     521              }
     522            else if (strcmp (s1, "strings") == 0)
     523              {
     524                /* Parse an array.  */
     525                c = phase2_getc ();
     526                if (c != '[')
     527                  goto invalid_rsj;
     528  
     529                c = phase2_getc ();
     530                if (c != ']')
     531                  {
     532                    phase2_ungetc (c);
     533                    for (;;)
     534                      {
     535                        char *location = NULL;
     536                        char *msgid = NULL;
     537                        lex_pos_ty pos;
     538  
     539                        /* Parse an object.  */
     540                        c = phase2_getc ();
     541                        if (c != '{')
     542                          goto invalid_rsj;
     543  
     544                        c = phase2_getc ();
     545                        if (c != '}')
     546                          {
     547                            phase2_ungetc (c);
     548                            for (;;)
     549                              {
     550                                /* Parse a string.  */
     551                                char *s2;
     552                                if (parse_string () != pr_parsed)
     553                                  goto invalid_json;
     554                                s2 = mixed_string_contents_free1 (
     555                                       mixed_string_buffer_result (&stringbuf));
     556  
     557                                /* Parse a colon.  */
     558                                c = phase2_getc ();
     559                                if (c != ':')
     560                                  goto invalid_json;
     561  
     562                                if (strcmp (s2, "hash") == 0)
     563                                  {
     564                                    /* Parse an integer.  */
     565                                    if (parse_integer () != pr_parsed)
     566                                      goto invalid_rsj;
     567                                  }
     568                                else if (strcmp (s2, "name") == 0)
     569                                  {
     570                                    /* Parse a string.  */
     571                                    enum parse_result r = parse_string ();
     572                                    if (r == pr_none)
     573                                      goto invalid_rsj;
     574                                    if (r == pr_syntax || location != NULL)
     575                                      goto invalid_json;
     576                                    location =
     577                                      mixed_string_contents_free1 (
     578                                        mixed_string_buffer_result (&stringbuf));
     579                                  }
     580                                else if (strcmp (s2, "sourcebytes") == 0)
     581                                  {
     582                                    /* Parse an array.  */
     583                                    c = phase2_getc ();
     584                                    if (c != '[')
     585                                      goto invalid_rsj;
     586  
     587                                    c = phase2_getc ();
     588                                    if (c != ']')
     589                                      {
     590                                        phase2_ungetc (c);
     591                                        for (;;)
     592                                          {
     593                                            /* Parse an integer.  */
     594                                            if (parse_integer () != pr_parsed)
     595                                              goto invalid_rsj;
     596  
     597                                            /* Parse a comma.  */
     598                                            c = phase2_getc ();
     599                                            if (c == ']')
     600                                              break;
     601                                            if (c != ',')
     602                                              goto invalid_json;
     603                                          }
     604                                      }
     605                                  }
     606                                else if (strcmp (s2, "value") == 0)
     607                                  {
     608                                    /* Parse a string.  */
     609                                    enum parse_result r = parse_string ();
     610                                    if (r == pr_none)
     611                                      goto invalid_rsj;
     612                                    if (r == pr_syntax || msgid != NULL)
     613                                      goto invalid_json;
     614                                    msgid =
     615                                      mixed_string_contents_free1 (
     616                                        mixed_string_buffer_result (&stringbuf));
     617                                  }
     618                                else
     619                                  goto invalid_rsj;
     620  
     621                                free (s2);
     622  
     623                                /* Parse a comma.  */
     624                                c = phase2_getc ();
     625                                if (c == '}')
     626                                  break;
     627                                if (c != ',')
     628                                  goto invalid_json;
     629                              }
     630                          }
     631  
     632                        if (location == NULL || msgid == NULL)
     633                          goto invalid_rsj;
     634  
     635                        pos.file_name = location;
     636                        pos.line_number = (size_t)(-1);
     637  
     638                        remember_a_message (mlp, NULL, msgid, true, false,
     639                                            null_context, &pos,
     640                                            NULL, NULL, false);
     641  
     642                        /* Parse a comma.  */
     643                        c = phase2_getc ();
     644                        if (c == ']')
     645                          break;
     646                        if (c != ',')
     647                          goto invalid_json;
     648                      }
     649                  }
     650              }
     651            else
     652              goto invalid_rsj;
     653  
     654            /* Parse a comma.  */
     655            c = phase2_getc ();
     656            if (c == '}')
     657              break;
     658            if (c != ',')
     659              goto invalid_json;
     660          }
     661      }
     662  
     663    /* Seen the closing brace.  */
     664    c = phase2_getc ();
     665    if (c != EOF)
     666      goto invalid_json;
     667  
     668    fp = NULL;
     669    real_file_name = NULL;
     670    logical_file_name = NULL;
     671    line_number = 0;
     672  
     673    return;
     674  
     675   invalid_json:
     676    error_with_progname = false;
     677    error (EXIT_FAILURE, 0, _("%s:%d: invalid JSON syntax"),
     678           logical_filename, line_number);
     679    error_with_progname = true;
     680    return;
     681  
     682   invalid_rsj:
     683    error_with_progname = false;
     684    error (EXIT_FAILURE, 0, _("%s:%d: invalid RSJ syntax"),
     685           logical_filename, line_number);
     686    error_with_progname = true;
     687    return;
     688  
     689   invalid_rsj_version:
     690    error_with_progname = false;
     691    error (EXIT_FAILURE, 0,
     692           _("%s:%d: invalid RSJ version. Only version 1 is supported."),
     693           logical_filename, line_number);
     694    error_with_progname = true;
     695    return;
     696  }