1  /* Reading Java .properties files.
       2     Copyright (C) 2003, 2005-2007, 2009, 2018, 2020, 2023 Free Software Foundation, Inc.
       3     Written by Bruno Haible <bruno@clisp.org>, 2003.
       4  
       5     This program is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU General Public License as published by
       7     the Free Software Foundation; either version 3 of the License, or
       8     (at your option) any later version.
       9  
      10     This program is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU General Public License for more details.
      14  
      15     You should have received a copy of the GNU General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      17  
      18  #ifdef HAVE_CONFIG_H
      19  # include <config.h>
      20  #endif
      21  
      22  /* Specification.  */
      23  #include "read-properties.h"
      24  
      25  #include <assert.h>
      26  #include <errno.h>
      27  #include <stdbool.h>
      28  #include <stdio.h>
      29  #include <stdlib.h>
      30  #include <string.h>
      31  
      32  #include "error.h"
      33  #include "error-progname.h"
      34  #include "message.h"
      35  #include "read-catalog-abstract.h"
      36  #include "xalloc.h"
      37  #include "xvasprintf.h"
      38  #include "po-xerror.h"
      39  #include "msgl-ascii.h"
      40  #include "read-file.h"
      41  #include "unistr.h"
      42  #include "gettext.h"
      43  
      44  #define _(str) gettext (str)
      45  
      46  /* For compiling this file in C++ mode.  */
      47  #ifdef __cplusplus
      48  # define this thiss
      49  #endif
      50  
      51  
      52  /* The format of the Java .properties files is documented in the JDK
      53     documentation for class java.util.Properties.  In the case of .properties
      54     files for PropertyResourceBundle, each non-comment line contains a
      55     key/value pair in the form "key = value" or "key : value" or "key value",
      56     where the key is the msgid and the value is the msgstr.  Messages with
      57     plurals are not supported in this format.
      58  
      59     The encoding of Java .properties files is:
      60       - ASCII with Java \uxxxx escape sequences,
     - ISO-8859-1 if non-ASCII bytes are encountered,
      62       - UTF-8 if non-ASCII bytes are encountered and the entire file is
      63         valid UTF-8 (in Java 9 or newer), see
      64         https://docs.oracle.com/javase/9/intl/internationalization-enhancements-jdk-9.htm */
      65  
      66  /* Handling of comments: We copy all comments from the .properties file to
      67     the PO file. This is not really needed; it's a service for translators
      68     who don't like PO files and prefer to maintain the .properties file.  */
      69  
/* Real filename, used in error messages about the input file.
   properties_parse sets it at the start of a parse and resets it to NULL
   when the parse is finished.  */
static const char *real_file_name;

/* File name and line number of the current input position.
   Only declared here; phase2_getc and phase2_ungetc adjust its line_number
   whenever a newline is consumed or pushed back.  */
extern lex_pos_ty gram_pos;

/* The contents of the input file, read into memory as a whole by
   properties_parse, and its length in bytes.  */
static char *contents;
static size_t contents_length;

/* True if the input file is assumed to be in UTF-8 encoding.
   False if it is assumed to be in ISO-8859-1 encoding.
   properties_parse decides this once per file, by checking whether the
   entire contents are valid UTF-8.  */
static bool assume_utf8;

/* Current position in contents, i.e. the index of the next byte that
   phase1_getc will return.  */
static size_t position;
      86  
      87  /* Phase 1: Read an input byte.
      88     Max. 1 pushback byte.  */
      89  
      90  static int
      91  phase1_getc ()
      92  {
      93    if (position == contents_length)
      94      return EOF;
      95  
      96    return (unsigned char) contents[position++];
      97  }
      98  
      99  static inline void
     100  phase1_ungetc (int c)
     101  {
     102    if (c != EOF)
     103      position--;
     104  }
     105  
     106  
     107  /* Phase 2: Read an input byte, treating CR/LF like a single LF.
     108     Max. 2 pushback bytes.  */
     109  
     110  static unsigned char phase2_pushback[2];
     111  static int phase2_pushback_length;
     112  
     113  static int
     114  phase2_getc ()
     115  {
     116    int c;
     117  
     118    if (phase2_pushback_length)
     119      c = phase2_pushback[--phase2_pushback_length];
     120    else
     121      {
     122        c = phase1_getc ();
     123  
     124        if (c == '\r')
     125          {
     126            int c2 = phase1_getc ();
     127            if (c2 == '\n')
     128              c = c2;
     129            else
     130              phase1_ungetc (c2);
     131          }
     132      }
     133  
     134    if (c == '\n')
     135      gram_pos.line_number++;
     136  
     137    return c;
     138  }
     139  
     140  static void
     141  phase2_ungetc (int c)
     142  {
     143    if (c == '\n')
     144      --gram_pos.line_number;
     145    if (c != EOF)
     146      phase2_pushback[phase2_pushback_length++] = c;
     147  }
     148  
     149  
     150  /* Phase 3: Read an input byte, treating CR/LF like a single LF,
     151     with handling of continuation lines.
     152     Max. 1 pushback character.  */
     153  
     154  static int
     155  phase3_getc ()
     156  {
     157    int c = phase2_getc ();
     158  
     159    for (;;)
     160      {
     161        if (c != '\\')
     162          return c;
     163  
     164        c = phase2_getc ();
     165        if (c != '\n')
     166          {
     167            phase2_ungetc (c);
     168            return '\\';
     169          }
     170  
     171        /* Skip the backslash-newline and all whitespace that follows it.  */
     172        do
     173          c = phase2_getc ();
     174        while (c == ' ' || c == '\t' || c == '\r' || c == '\f');
     175      }
     176  }
     177  
/* Pushes C back so that the next phase3_getc call returns it again.
   Phase 3 keeps no pushback storage of its own; the character is simply
   handed to the phase 2 pushback stack (which also ignores EOF and fixes up
   the line number for a newline).  */
static inline void
phase3_ungetc (int c)
{
  phase2_ungetc (c);
}
     183  
     184  
     185  /* Converts a string from ISO-8859-1 encoding to UTF-8 encoding.  */
     186  static char *
     187  conv_from_iso_8859_1 (char *string)
     188  {
     189    if (is_ascii_string (string))
     190      return string;
     191    else
     192      {
     193        size_t length = strlen (string);
     194        /* Each ISO-8859-1 character needs 2 bytes at worst.  */
     195        unsigned char *utf8_string = XNMALLOC (2 * length + 1, unsigned char);
     196        unsigned char *q = utf8_string;
     197        const char *str = string;
     198        const char *str_limit = str + length;
     199  
     200        while (str < str_limit)
     201          {
     202            unsigned int uc = (unsigned char) *str++;
     203            int n = u8_uctomb (q, uc, 6);
     204            assert (n > 0);
     205            q += n;
     206          }
     207        *q = '\0';
     208        assert (q - utf8_string <= 2 * length);
     209  
     210        return (char *) utf8_string;
     211      }
     212  }
     213  
     214  
     215  /* Converts a string from JAVA encoding (with \uxxxx sequences) to UTF-8
     216     encoding.  May destructively modify the argument string.  */
     217  static char *
     218  conv_from_java (char *string)
     219  {
     220    /* This conversion can only shrink the string, never increase its size.
     221       So there is no need to xmalloc the result freshly.  */
     222    const char *p = string;
     223    unsigned char *q = (unsigned char *) string;
     224  
     225    while (*p != '\0')
     226      {
     227        if (p[0] == '\\' && p[1] == 'u')
     228          {
     229            unsigned int n = 0;
     230            int i;
     231  
     232            for (i = 0; i < 4; i++)
     233              {
     234                int c1 = (unsigned char) p[2 + i];
     235  
     236                if (c1 >= '0' && c1 <= '9')
     237                  n = (n << 4) + (c1 - '0');
     238                else if (c1 >= 'A' && c1 <= 'F')
     239                  n = (n << 4) + (c1 - 'A' + 10);
     240                else if (c1 >= 'a' && c1 <= 'f')
     241                  n = (n << 4) + (c1 - 'a' + 10);
     242                else
     243                  goto just_one_byte;
     244              }
     245  
     246            if (i == 4)
     247              {
     248                unsigned int uc;
     249  
     250                if (n >= 0xd800 && n < 0xdc00)
     251                  {
     252                    if (p[6] == '\\' && p[7] == 'u')
     253                      {
     254                        unsigned int m = 0;
     255  
     256                        for (i = 0; i < 4; i++)
     257                          {
     258                            int c1 = (unsigned char) p[8 + i];
     259  
     260                            if (c1 >= '0' && c1 <= '9')
     261                              m = (m << 4) + (c1 - '0');
     262                            else if (c1 >= 'A' && c1 <= 'F')
     263                              m = (m << 4) + (c1 - 'A' + 10);
     264                            else if (c1 >= 'a' && c1 <= 'f')
     265                              m = (m << 4) + (c1 - 'a' + 10);
     266                            else
     267                              goto just_one_byte;
     268                          }
     269  
     270                        if (i == 4 && (m >= 0xdc00 && m < 0xe000))
     271                          {
     272                            /* Combine two UTF-16 words to a character.  */
     273                            uc = 0x10000 + ((n - 0xd800) << 10) + (m - 0xdc00);
     274                            p += 12;
     275                          }
     276                        else
     277                          goto just_one_byte;
     278                      }
     279                    else
     280                      goto just_one_byte;
     281                  }
     282                else
     283                  {
     284                    uc = n;
     285                    p += 6;
     286                  }
     287  
     288                q += u8_uctomb (q, uc, 6);
     289                continue;
     290              }
     291          }
     292        just_one_byte:
     293          *q++ = (unsigned char) *p++;
     294      }
     295    *q = '\0';
     296    return string;
     297  }
     298  
     299  
/* Phase 4: Read the next single byte or UTF-16 code point,
   treating CR/LF like a single LF, with handling of continuation lines
   and of \uxxxx sequences.
   The macros below define how the int result of phase 4 keeps single
   bytes, the end-of-file marker and \uxxxx values apart.  */

/* Return value of phase 4 when EOF is reached.
   0xffff is larger than any single byte and smaller than any UNICODE ()
   value.  */
#define P4_EOF 0xffff

/* Convert an UTF-16 code point to a return value that can be distinguished
   from a single-byte return value.  The result is always >= 0x10000.  */
#define UNICODE(code) (0x10000 + (code))

/* Test a return value of phase 4 whether it designates an UTF-16 code
   point.  */
#define IS_UNICODE(p4_result) ((p4_result) >= 0x10000)

/* Extract the UTF-16 code of a return value that satisfies IS_UNICODE.
   This is the inverse of UNICODE (), truncated to 16 bits.  */
#define UTF16_VALUE(p4_result) ((unsigned short) ((p4_result) - 0x10000))
     317  
     318  static int
     319  phase4_getuc ()
     320  {
     321    int c = phase3_getc ();
     322  
     323    if (c == EOF)
     324      return P4_EOF;
     325    if (c == '\\')
     326      {
     327        int c2 = phase3_getc ();
     328  
     329        if (c2 == 't')
     330          return '\t';
     331        if (c2 == 'n')
     332          return '\n';
     333        if (c2 == 'r')
     334          return '\r';
     335        if (c2 == 'f')
     336          return '\f';
     337        if (c2 == 'u')
     338          {
     339            unsigned int n = 0;
     340            int i;
     341  
     342            for (i = 0; i < 4; i++)
     343              {
     344                int c1 = phase3_getc ();
     345  
     346                if (c1 >= '0' && c1 <= '9')
     347                  n = (n << 4) + (c1 - '0');
     348                else if (c1 >= 'A' && c1 <= 'F')
     349                  n = (n << 4) + (c1 - 'A' + 10);
     350                else if (c1 >= 'a' && c1 <= 'f')
     351                  n = (n << 4) + (c1 - 'a' + 10);
     352                else
     353                  {
     354                    phase3_ungetc (c1);
     355                    po_xerror (PO_SEVERITY_ERROR, NULL,
     356                               real_file_name, gram_pos.line_number, (size_t)(-1),
     357                               false, _("warning: invalid \\uxxxx syntax for Unicode character"));
     358                    return 'u';
     359                  }
     360              }
     361            return UNICODE (n);
     362          }
     363  
     364        return c2;
     365      }
     366    else
     367      return c;
     368  }
     369  
     370  
/* Reads a key or value string.
   IN_KEY is true when reading a key (the string then ends at unescaped
   whitespace or at an unescaped '=' or ':'), false when reading a value
   (the string then extends to the end of the logical line).
   Returns the string in UTF-8 encoding, or NULL if the end of the logical
   line is reached before any non-whitespace character.  A non-NULL result
   is a separate copy obtained through XNMALLOC; the static accumulation
   buffer is reused by the next call.
   Parsing ends:
     - when returning NULL, after the end of the logical line,
     - otherwise, if in_key is true, after the whitespace and possibly the
       separator that follows after the string,
     - otherwise, if in_key is false, after the end of the logical line. */

static char *
read_escaped_string (bool in_key)
{
  /* The part of the string that has already been converted to UTF-8.
     The buffer is static, so its allocation is kept from one call to the
     next; only utf8_buflen is reset for each string.  */
  static unsigned char *utf8_buffer;
  static size_t utf8_buflen;
  static size_t utf8_allocated;
  /* The first half of an UTF-16 surrogate character, or 0 if no high
     surrogate is pending.  */
  unsigned short utf16_surr;
  /* Line in which this surrogate character occurred.  */
  size_t utf16_surr_line;

  /* Ensures utf8_buffer has room for N bytes.  N must be <= 10, because a
     single growth step only guarantees 10 additional bytes.  */
  #define utf8_buffer_ensure_available(n)  \
    do                                                                        \
      {                                                                       \
        if (utf8_buflen + (n) > utf8_allocated)                               \
          {                                                                   \
            utf8_allocated = 2 * utf8_allocated + 10;                         \
            utf8_buffer =                                                     \
              (unsigned char *) xrealloc (utf8_buffer, utf8_allocated);       \
          }                                                                   \
      }                                                                       \
    while (0)

  /* Appends a lone surrogate to utf8_buffer: reports it for line LINE and
     appends the bytes EF BF BD (the UTF-8 encoding of U+FFFD) in its
     place.  */
  /* Note: A half surrogate is invalid in UTF-8:
     - RFC 3629 says
         "The definition of UTF-8 prohibits encoding character
          numbers between U+D800 and U+DFFF".
     - Unicode 4.0 chapter 3
       <https://www.unicode.org/versions/Unicode4.0.0/ch03.pdf>
       section 3.9, p.77, says
         "Because surrogate code points are not Unicode scalar
          values, any UTF-8 byte sequence that would otherwise
          map to code points D800..DFFF is ill-formed."
       and in table 3-6, p. 78, does not mention D800..DFFF.
     - The unicode.org FAQ question "How do I convert an unpaired
       UTF-16 surrogate to UTF-8?" has the answer
         "By representing such an unpaired surrogate on its own
          as a 3-byte sequence, the resulting UTF-8 data stream
          would become ill-formed."
     So use U+FFFD instead.  */
  #define utf8_buffer_append_lone_surrogate(uc, line) \
    do                                                                        \
      {                                                                       \
        error_with_progname = false;                                          \
        po_xerror (PO_SEVERITY_ERROR, NULL,                                   \
                   real_file_name, (line), (size_t)(-1), false,               \
                   xasprintf (_("warning: lone surrogate U+%04X"), (uc)));    \
        error_with_progname = true;                                           \
        utf8_buffer_ensure_available (3);                                     \
        utf8_buffer[utf8_buflen++] = 0xef;                                    \
        utf8_buffer[utf8_buflen++] = 0xbf;                                    \
        utf8_buffer[utf8_buflen++] = 0xbd;                                    \
      }                                                                       \
    while (0)

  int c;

  /* Skip whitespace before the string.  */
  do
    c = phase3_getc ();
  while (c == ' ' || c == '\t' || c == '\r' || c == '\f');

  if (c == EOF || c == '\n')
    /* Empty string.  */
    return NULL;

  /* Start accumulating the string.  */
  utf8_buflen = 0;
  utf16_surr = 0;
  utf16_surr_line = 0;
  /* Loop invariant: C is the next character as delivered by phase 3, i.e.
     before escape processing.  Looking at it in this raw form is what lets
     an escaped separator or escaped whitespace be part of a key.  */
  for (;;)
    {
      if (in_key && (c == '=' || c == ':'
                     || c == ' ' || c == '\t' || c == '\r' || c == '\f'))
        {
          /* Skip whitespace after the string.  */
          while (c == ' ' || c == '\t' || c == '\r' || c == '\f')
            c = phase3_getc ();
          /* Skip '=' or ':' separator.  */
          if (!(c == '=' || c == ':'))
            phase3_ungetc (c);
          break;
        }

      /* Hand the raw character back, so that phase 4 can read it again
         together with whatever escape sequence it starts.  */
      phase3_ungetc (c);

      /* Read the next byte or UTF-16 code point.  */
      c = phase4_getuc ();
      if (c == P4_EOF)
        break;

      /* Append it to the buffer.  */
      if (IS_UNICODE (c))
        {
          /* Append an UTF-16 code point.  */
          /* Test whether this character and the previous one form a Unicode
             surrogate pair.  */
          if (utf16_surr != 0
              && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
            {
              unsigned short utf16buf[2];
              ucs4_t uc;
              int len;

              utf16buf[0] = utf16_surr;
              utf16buf[1] = UTF16_VALUE (c);
              if (u16_mbtouc (&uc, utf16buf, 2) != 2)
                abort ();

              utf8_buffer_ensure_available (6);
              len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 6);
              if (len < 0)
                {
                  error_with_progname = false;
                  po_xerror (PO_SEVERITY_ERROR, NULL,
                             real_file_name, gram_pos.line_number, (size_t)(-1),
                             false, _("warning: invalid Unicode character"));
                  error_with_progname = true;
                }
              else
                utf8_buflen += len;

              utf16_surr = 0;
            }
          else
            {
              /* A pending high surrogate was not followed by a low
                 surrogate: flush it as a lone surrogate first.  */
              if (utf16_surr != 0)
                {
                  utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);
                  utf16_surr = 0;
                }

              if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
                {
                  /* High surrogate: remember it and wait for the next
                     character before producing any output.  */
                  utf16_surr = UTF16_VALUE (c);
                  utf16_surr_line = gram_pos.line_number;
                }
              else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))
                /* Low surrogate without a preceding high surrogate.  */
                utf8_buffer_append_lone_surrogate (UTF16_VALUE (c), gram_pos.line_number);
              else
                {
                  ucs4_t uc = UTF16_VALUE (c);
                  int len;

                  utf8_buffer_ensure_available (3);
                  len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 3);
                  if (len < 0)
                    {
                      error_with_progname = false;
                      po_xerror (PO_SEVERITY_ERROR, NULL,
                                 real_file_name, gram_pos.line_number, (size_t)(-1),
                                 false, _("warning: invalid Unicode character"));
                      error_with_progname = true;
                    }
                  else
                    utf8_buflen += len;
                }
            }
        }
      else
        {
          /* Append a single byte.  */
          if (utf16_surr != 0)
            {
              utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);
              utf16_surr = 0;
            }

          if (assume_utf8)
            {
              /* No conversion needed.  */
              utf8_buffer_ensure_available (1);
              utf8_buffer[utf8_buflen++] = c;
            }
          else
            {
              /* Convert the byte from ISO-8859-1 to UTF-8 on the fly.  */
              ucs4_t uc = c;
              int len;

              utf8_buffer_ensure_available (2);
              len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 2);
              if (len < 0)
                abort ();
              utf8_buflen += len;
            }
        }

      c = phase3_getc ();
      if (c == EOF || c == '\n')
        {
          /* End of the logical line.  After a key, push the terminator
             back, so that the following call for the value sees it and
             returns NULL.  */
          if (in_key)
            phase3_ungetc (c);
          break;
        }
    }
  /* A high surrogate at the very end of the string stays unpaired.  */
  if (utf16_surr != 0)
    utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);

  /* Return the result.  */
  {
    unsigned char *utf8_string = XNMALLOC (utf8_buflen + 1, unsigned char);
    /* utf8_buffer may still be NULL when nothing was ever appended.  */
    if (utf8_buflen > 0)
      memcpy (utf8_string, utf8_buffer, utf8_buflen);
    utf8_string[utf8_buflen] = '\0';

    return (char *) utf8_string;
  }
  #undef utf8_buffer_append_lone_surrogate
  #undef utf8_buffer_ensure_available
}
     594  
     595  
     596  /* Read a .properties file from a stream, and dispatch to the various
     597     abstract_catalog_reader_class_ty methods.  */
     598  static void
     599  properties_parse (abstract_catalog_reader_ty *this, FILE *file,
     600                    const char *real_filename, const char *logical_filename)
     601  {
     602    /* Read the file into memory.  */
     603    contents = fread_file (file, 0, &contents_length);
     604    if (contents == NULL)
     605      {
     606        const char *errno_description = strerror (errno);
     607        po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
     608                   xasprintf ("%s: %s",
     609                              xasprintf (_("error while reading \"%s\""),
     610                                         real_filename),
     611                              errno_description));
     612        return;
     613      }
     614  
     615    /* Test whether it's valid UTF-8.  */
     616    assume_utf8 = (u8_check ((uint8_t *) contents, contents_length) == NULL);
     617  
     618    position = 0;
     619    real_file_name = real_filename;
     620    gram_pos.file_name = xstrdup (real_file_name);
     621    gram_pos.line_number = 1;
     622  
     623    for (;;)
     624      {
     625        int c;
     626        bool comment;
     627        bool hidden;
     628  
     629        c = phase2_getc ();
     630  
     631        if (c == EOF)
     632          break;
     633  
     634        comment = false;
     635        hidden = false;
     636        if (c == '#')
     637          comment = true;
     638        else if (c == '!')
     639          {
     640            /* For compatibility with write-properties.c, we treat '!' not
     641               followed by space as a fuzzy or untranslated message.  */
     642            int c2 = phase2_getc ();
     643            if (c2 == ' ' || c2 == '\n' || c2 == EOF)
     644              comment = true;
     645            else
     646              hidden = true;
     647            phase2_ungetc (c2);
     648          }
     649        else
     650          phase2_ungetc (c);
     651  
     652        if (comment)
     653          {
     654            /* A comment line.  */
     655            static char *buffer;
     656            static size_t bufmax;
     657            static size_t buflen;
     658  
     659            buflen = 0;
     660            for (;;)
     661              {
     662                c = phase2_getc ();
     663  
     664                if (buflen >= bufmax)
     665                  {
     666                    bufmax += 100;
     667                    buffer = xrealloc (buffer, bufmax);
     668                  }
     669  
     670                if (c == EOF || c == '\n')
     671                  break;
     672  
     673                buffer[buflen++] = c;
     674              }
     675            buffer[buflen] = '\0';
     676  
     677            po_callback_comment_dispatcher (
     678              conv_from_java (
     679                assume_utf8 ? buffer : conv_from_iso_8859_1 (buffer)));
     680          }
     681        else
     682          {
     683            /* A key/value pair.  */
     684            char *msgid;
     685            lex_pos_ty msgid_pos;
     686  
     687            msgid_pos = gram_pos;
     688            msgid = read_escaped_string (true);
     689            if (msgid == NULL)
     690              /* Skip blank line.  */
     691              ;
     692            else
     693              {
     694                char *msgstr;
     695                lex_pos_ty msgstr_pos;
     696                bool force_fuzzy;
     697  
     698                msgstr_pos = gram_pos;
     699                msgstr = read_escaped_string (false);
     700                if (msgstr == NULL)
     701                  msgstr = xstrdup ("");
     702  
     703                /* Be sure to make the message fuzzy if it was commented out
     704                   and if it is not already header/fuzzy/untranslated.  */
     705                force_fuzzy = (hidden && msgid[0] != '\0' && msgstr[0] != '\0');
     706  
     707                po_callback_message (NULL, msgid, &msgid_pos, NULL,
     708                                     msgstr, strlen (msgstr) + 1, &msgstr_pos,
     709                                     NULL, NULL, NULL,
     710                                     force_fuzzy, false);
     711              }
     712          }
     713      }
     714  
     715    free (contents);
     716    contents = NULL;
     717    real_file_name = NULL;
     718    gram_pos.line_number = 0;
     719  }
     720  
/* Describes the Java .properties input format: the parser entry point is
   properties_parse, and the strings it delivers are in UTF-8 encoding
   (read_escaped_string and the comment conversion both produce UTF-8).  */
const struct catalog_input_format input_format_properties =
{
  properties_parse,                     /* parse */
  true                                  /* produces_utf8 */
};