(root)/
gettext-0.22.4/
gettext-tools/
src/
read-mo.c
       1  /* Reading binary .mo files.
       2     Copyright (C) 1995-1998, 2000-2007, 2014-2015, 2017, 2020 Free Software Foundation, Inc.
       3     Written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, April 1995.
       4  
       5     This program is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU General Public License as published by
       7     the Free Software Foundation; either version 3 of the License, or
       8     (at your option) any later version.
       9  
      10     This program is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU General Public License for more details.
      14  
      15     You should have received a copy of the GNU General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      17  
      18  #ifdef HAVE_CONFIG_H
      19  # include <config.h>
      20  #endif
      21  
      22  /* Specification.  */
      23  #include "read-mo.h"
      24  
      25  #include <errno.h>
      26  #include <stdbool.h>
      27  #include <stdio.h>
      28  #include <stddef.h>
      29  #include <stdlib.h>
      30  #include <string.h>
      31  
      32  /* These two include files describe the binary .mo format.  */
      33  #include "gmo.h"
      34  #include "hash-string.h"
      35  
      36  #include "error.h"
      37  #include "xalloc.h"
      38  #include "binary-io.h"
      39  #include "message.h"
      40  #include "format.h"
      41  #include "gettext.h"
      42  #include "xsize.h"
      43  
/* Shorthand used on message strings in this file: _(str) expands to
   gettext (str).  */
#define _(str) gettext (str)
      45  
      46  
      47  enum mo_endianness
      48  {
      49    MO_LITTLE_ENDIAN,
      50    MO_BIG_ENDIAN
      51  };
      52  
/* We read the file completely into memory.  This is more efficient than
   lots of lseek().  This struct represents the .mo file in memory.
   read_binary_mo_file fills in filename, data and size; endian is set
   separately by read_mo_file once the magic number has been probed.  */
struct binary_mo_file
{
  /* Name of the file, used in diagnostics.  */
  const char *filename;
  /* The complete file contents.  */
  char *data;
  /* Number of bytes stored in data.  */
  size_t size;
  /* Byte order used by get_uint32 when decoding numbers from data.  */
  enum mo_endianness endian;
};
      62  
      63  
      64  /* Read the contents of the given input stream.  */
      65  static void
      66  read_binary_mo_file (struct binary_mo_file *bfp,
      67                       FILE *fp, const char *filename)
      68  {
      69    char *buf = NULL;
      70    size_t alloc = 0;
      71    size_t size = 0;
      72    size_t count;
      73  
      74    while (!feof (fp))
      75      {
      76        const size_t increment = 4096;
      77        if (size + increment > alloc)
      78          {
      79            alloc = alloc + alloc / 2;
      80            if (alloc < size + increment)
      81              alloc = size + increment;
      82            buf = (char *) xrealloc (buf, alloc);
      83          }
      84        count = fread (buf + size, 1, increment, fp);
      85        if (count == 0)
      86          {
      87            if (ferror (fp))
      88              error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
      89                     filename);
      90          }
      91        else
      92          size += count;
      93      }
      94    buf = (char *) xrealloc (buf, size);
      95    bfp->filename = filename;
      96    bfp->data = buf;
      97    bfp->size = size;
      98  }
      99  
     100  /* Get a 32-bit number from the file, at the given file position.  */
     101  static nls_uint32
     102  get_uint32 (const struct binary_mo_file *bfp, size_t offset)
     103  {
     104    nls_uint32 b0, b1, b2, b3;
     105    size_t end = xsum (offset, 4);
     106  
     107    if (size_overflow_p (end) || end > bfp->size)
     108      error (EXIT_FAILURE, 0, _("file \"%s\" is truncated"), bfp->filename);
     109  
     110    b0 = *(unsigned char *) (bfp->data + offset + 0);
     111    b1 = *(unsigned char *) (bfp->data + offset + 1);
     112    b2 = *(unsigned char *) (bfp->data + offset + 2);
     113    b3 = *(unsigned char *) (bfp->data + offset + 3);
     114    if (bfp->endian == MO_LITTLE_ENDIAN)
     115      return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
     116    else
     117      return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
     118  }
     119  
     120  /* Get a static string from the file, at the given file position.  */
     121  static char *
     122  get_string (const struct binary_mo_file *bfp, size_t offset, size_t *lengthp)
     123  {
     124    /* See 'struct string_desc'.  */
     125    nls_uint32 s_length = get_uint32 (bfp, offset);
     126    nls_uint32 s_offset = get_uint32 (bfp, offset + 4);
     127    size_t s_end = xsum3 (s_offset, s_length, 1);
     128  
     129    if (size_overflow_p (s_end) || s_end > bfp->size)
     130      error (EXIT_FAILURE, 0, _("file \"%s\" is truncated"), bfp->filename);
     131    if (bfp->data[s_offset + s_length] != '\0')
     132      error (EXIT_FAILURE, 0,
     133             _("file \"%s\" contains a not NUL terminated string"),
     134             bfp->filename);
     135  
     136    *lengthp = s_length + 1;
     137    return bfp->data + s_offset;
     138  }
     139  
     140  /* Get a system dependent string from the file, at the given file position.  */
     141  static char *
     142  get_sysdep_string (const struct binary_mo_file *bfp, size_t offset,
     143                     const struct mo_file_header *header, size_t *lengthp)
     144  {
     145    /* See 'struct sysdep_string'.  */
     146    size_t length;
     147    char *string;
     148    size_t i;
     149    char *p;
     150    nls_uint32 s_offset;
     151  
     152    /* Compute the length.  */
     153    s_offset = get_uint32 (bfp, offset);
     154    length = 0;
     155    for (i = 4; ; i += 8)
     156      {
     157        nls_uint32 segsize = get_uint32 (bfp, offset + i);
     158        nls_uint32 sysdepref = get_uint32 (bfp, offset + i + 4);
     159        nls_uint32 sysdep_segment_offset;
     160        nls_uint32 ss_length;
     161        nls_uint32 ss_offset;
     162        size_t ss_end;
     163        size_t s_end;
     164        size_t n;
     165  
     166        s_end = xsum (s_offset, segsize);
     167        if (size_overflow_p (s_end) || s_end > bfp->size)
     168          error (EXIT_FAILURE, 0, _("file \"%s\" is truncated"), bfp->filename);
     169        length += segsize;
     170        s_offset += segsize;
     171  
     172        if (sysdepref == SEGMENTS_END)
     173          {
     174            /* The last static segment must end in a NUL.  */
     175            if (!(segsize > 0 && bfp->data[s_offset - 1] == '\0'))
     176              /* Invalid.  */
     177              error (EXIT_FAILURE, 0,
     178                     _("file \"%s\" contains a not NUL terminated system dependent string"),
     179                     bfp->filename);
     180            break;
     181          }
     182        if (sysdepref >= header->n_sysdep_segments)
     183          /* Invalid.  */
     184          error (EXIT_FAILURE, 0, _("file \"%s\" is not in GNU .mo format"),
     185                 bfp->filename);
     186        /* See 'struct sysdep_segment'.  */
     187        sysdep_segment_offset = header->sysdep_segments_offset + sysdepref * 8;
     188        ss_length = get_uint32 (bfp, sysdep_segment_offset);
     189        ss_offset = get_uint32 (bfp, sysdep_segment_offset + 4);
     190        ss_end = xsum (ss_offset, ss_length);
     191        if (size_overflow_p (ss_end) || ss_end > bfp->size)
     192          error (EXIT_FAILURE, 0, _("file \"%s\" is truncated"), bfp->filename);
     193        if (!(ss_length > 0 && bfp->data[ss_end - 1] == '\0'))
     194          {
     195            char location[30];
     196            sprintf (location, "sysdep_segment[%u]", (unsigned int) sysdepref);
     197            error (EXIT_FAILURE, 0,
     198                   _("file \"%s\" contains a not NUL terminated string, at %s"),
     199                   bfp->filename, location);
     200          }
     201        n = strlen (bfp->data + ss_offset);
     202        length += (n > 1 ? 1 + n + 1 : n);
     203      }
     204  
     205    /* Allocate and fill the string.  */
     206    string = XNMALLOC (length, char);
     207    p = string;
     208    s_offset = get_uint32 (bfp, offset);
     209    for (i = 4; ; i += 8)
     210      {
     211        nls_uint32 segsize = get_uint32 (bfp, offset + i);
     212        nls_uint32 sysdepref = get_uint32 (bfp, offset + i + 4);
     213        nls_uint32 sysdep_segment_offset;
     214        nls_uint32 ss_length;
     215        nls_uint32 ss_offset;
     216        size_t n;
     217  
     218        memcpy (p, bfp->data + s_offset, segsize);
     219        p += segsize;
     220        s_offset += segsize;
     221  
     222        if (sysdepref == SEGMENTS_END)
     223          break;
     224        if (sysdepref >= header->n_sysdep_segments)
     225          abort ();
     226        /* See 'struct sysdep_segment'.  */
     227        sysdep_segment_offset = header->sysdep_segments_offset + sysdepref * 8;
     228        ss_length = get_uint32 (bfp, sysdep_segment_offset);
     229        ss_offset = get_uint32 (bfp, sysdep_segment_offset + 4);
     230        if (ss_offset + ss_length > bfp->size)
     231          abort ();
     232        if (!(ss_length > 0 && bfp->data[ss_offset + ss_length - 1] == '\0'))
     233          abort ();
     234        n = strlen (bfp->data + ss_offset);
     235        if (n > 1)
     236          *p++ = '<';
     237        memcpy (p, bfp->data + ss_offset, n);
     238        p += n;
     239        if (n > 1)
     240          *p++ = '>';
     241      }
     242  
     243    if (p != string + length)
     244      abort ();
     245  
     246    *lengthp = length;
     247    return string;
     248  }
     249  
     250  /* Reads an existing .mo file and adds the messages to mlp.  */
     251  void
     252  read_mo_file (message_list_ty *mlp, const char *filename)
     253  {
     254    FILE *fp;
     255    struct binary_mo_file bf;
     256    struct mo_file_header header;
     257    unsigned int i;
     258    static lex_pos_ty pos = { __FILE__, __LINE__ };
     259  
     260    if (strcmp (filename, "-") == 0 || strcmp (filename, "/dev/stdin") == 0)
     261      {
     262        fp = stdin;
     263        SET_BINARY (fileno (fp));
     264      }
     265    else
     266      {
     267        fp = fopen (filename, "rb");
     268        if (fp == NULL)
     269          error (EXIT_FAILURE, errno,
     270                 _("error while opening \"%s\" for reading"), filename);
     271      }
     272  
     273    /* Read the file contents into memory.  */
     274    read_binary_mo_file (&bf, fp, filename);
     275  
     276    /* Get a 32-bit number from the file header.  */
     277  # define GET_HEADER_FIELD(field) \
     278      get_uint32 (&bf, offsetof (struct mo_file_header, field))
     279  
     280    /* We must grope the file to determine which endian it is.
     281       Perversity of the universe tends towards maximum, so it will
     282       probably not match the currently executing architecture.  */
     283    bf.endian = MO_BIG_ENDIAN;
     284    header.magic = GET_HEADER_FIELD (magic);
     285    if (header.magic != _MAGIC)
     286      {
     287        bf.endian = MO_LITTLE_ENDIAN;
     288        header.magic = GET_HEADER_FIELD (magic);
     289        if (header.magic != _MAGIC)
     290          {
     291          unrecognised:
     292            error (EXIT_FAILURE, 0, _("file \"%s\" is not in GNU .mo format"),
     293                   filename);
     294          }
     295      }
     296  
     297    header.revision = GET_HEADER_FIELD (revision);
     298  
     299    /* We support only the major revisions 0 and 1.  */
     300    switch (header.revision >> 16)
     301      {
     302      case 0:
     303      case 1:
     304        /* Fill the header parts that apply to major revisions 0 and 1.  */
     305        header.nstrings = GET_HEADER_FIELD (nstrings);
     306        header.orig_tab_offset = GET_HEADER_FIELD (orig_tab_offset);
     307        header.trans_tab_offset = GET_HEADER_FIELD (trans_tab_offset);
     308        header.hash_tab_size = GET_HEADER_FIELD (hash_tab_size);
     309        header.hash_tab_offset = GET_HEADER_FIELD (hash_tab_offset);
     310  
     311        /* The following verifications attempt to ensure that 'msgunfmt' complains
     312           about a .mo file that may make libintl crash at run time.  */
     313  
     314        /* Verify that the array of messages is sorted.  */
     315        {
     316          char *prev_msgid = NULL;
     317  
     318          for (i = 0; i < header.nstrings; i++)
     319            {
     320              char *msgid;
     321              size_t msgid_len;
     322  
     323              msgid = get_string (&bf, header.orig_tab_offset + i * 8,
     324                                  &msgid_len);
     325              if (i == 0)
     326                prev_msgid = msgid;
     327              else
     328                {
     329                  if (!(strcmp (prev_msgid, msgid) < 0))
     330                    error (EXIT_FAILURE, 0,
     331                           _("file \"%s\" is not in GNU .mo format: The array of messages is not sorted."),
     332                           filename);
     333                }
     334            }
     335        }
     336  
     337        /* Verify the hash table.  */
     338        if (header.hash_tab_size > 0)
     339          {
     340            char *seen;
     341            unsigned int j;
     342  
     343            /* Verify the hash table's size.  */
     344            if (!(header.hash_tab_size > 2))
     345              error (EXIT_FAILURE, 0,
     346                     _("file \"%s\" is not in GNU .mo format: The hash table size is invalid."),
     347                     filename);
     348  
     349            /* Verify that the non-empty hash table entries contain the values
     350               1, ..., nstrings, each exactly once.  */
     351            seen = (char *) xcalloc (header.nstrings, 1);
     352            for (j = 0; j < header.hash_tab_size; j++)
     353              {
     354                nls_uint32 entry =
     355                  get_uint32 (&bf, header.hash_tab_offset + j * 4);
     356  
     357                if (entry != 0)
     358                  {
     359                    i = entry - 1;
     360                    if (!(i < header.nstrings && seen[i] == 0))
     361                      error (EXIT_FAILURE, 0,
     362                             _("file \"%s\" is not in GNU .mo format: The hash table contains invalid entries."),
     363                             filename);
     364                    seen[i] = 1;
     365                  }
     366              }
     367            for (i = 0; i < header.nstrings; i++)
     368              if (seen[i] == 0)
     369                error (EXIT_FAILURE, 0, _("file \"%s\" is not in GNU .mo format: Some messages are not present in the hash table."),
     370                       filename);
     371            free (seen);
     372  
     373            /* Verify that the hash table lookup algorithm finds the entry for
     374               each message.  */
     375            for (i = 0; i < header.nstrings; i++)
     376              {
     377                size_t msgid_len;
     378                char *msgid = get_string (&bf, header.orig_tab_offset + i * 8,
     379                                          &msgid_len);
     380                nls_uint32 hash_val = hash_string (msgid);
     381                nls_uint32 idx = hash_val % header.hash_tab_size;
     382                nls_uint32 incr = 1 + (hash_val % (header.hash_tab_size - 2));
     383                for (;;)
     384                  {
     385                    nls_uint32 entry =
     386                      get_uint32 (&bf, header.hash_tab_offset + idx * 4);
     387  
     388                    if (entry == 0)
     389                      error (EXIT_FAILURE, 0,
     390                             _("file \"%s\" is not in GNU .mo format: Some messages are at a wrong index in the hash table."),
     391                             filename);
     392                    if (entry == i + 1)
     393                      break;
     394  
     395                    if (idx >= header.hash_tab_size - incr)
     396                      idx -= header.hash_tab_size - incr;
     397                    else
     398                      idx += incr;
     399                  }
     400              }
     401          }
     402  
     403        for (i = 0; i < header.nstrings; i++)
     404          {
     405            message_ty *mp;
     406            char *msgctxt;
     407            char *msgid;
     408            size_t msgid_len;
     409            char *separator;
     410            char *msgstr;
     411            size_t msgstr_len;
     412  
     413            /* Read the msgctxt and msgid.  */
     414            msgid = get_string (&bf, header.orig_tab_offset + i * 8,
     415                                &msgid_len);
     416            /* Split into msgctxt and msgid.  */
     417            separator = strchr (msgid, MSGCTXT_SEPARATOR);
     418            if (separator != NULL)
     419              {
     420                /* The part before the MSGCTXT_SEPARATOR is the msgctxt.  */
     421                *separator = '\0';
     422                msgctxt = msgid;
     423                msgid = separator + 1;
     424                msgid_len -= msgid - msgctxt;
     425              }
     426            else
     427              msgctxt = NULL;
     428  
     429            /* Read the msgstr.  */
     430            msgstr = get_string (&bf, header.trans_tab_offset + i * 8,
     431                                 &msgstr_len);
     432  
     433            mp = message_alloc (msgctxt,
     434                                msgid,
     435                                (strlen (msgid) + 1 < msgid_len
     436                                 ? msgid + strlen (msgid) + 1
     437                                 : NULL),
     438                                msgstr, msgstr_len,
     439                                &pos);
     440            message_list_append (mlp, mp);
     441          }
     442  
     443        switch (header.revision & 0xffff)
     444          {
     445          case 0:
     446            break;
     447          case 1:
     448          default:
     449            /* Fill the header parts that apply to minor revision >= 1.  */
     450            header.n_sysdep_segments = GET_HEADER_FIELD (n_sysdep_segments);
     451            header.sysdep_segments_offset =
     452              GET_HEADER_FIELD (sysdep_segments_offset);
     453            header.n_sysdep_strings = GET_HEADER_FIELD (n_sysdep_strings);
     454            header.orig_sysdep_tab_offset =
     455              GET_HEADER_FIELD (orig_sysdep_tab_offset);
     456            header.trans_sysdep_tab_offset =
     457              GET_HEADER_FIELD (trans_sysdep_tab_offset);
     458  
     459            for (i = 0; i < header.n_sysdep_strings; i++)
     460              {
     461                message_ty *mp;
     462                char *msgctxt;
     463                char *msgid;
     464                size_t msgid_len;
     465                char *separator;
     466                char *msgstr;
     467                size_t msgstr_len;
     468                nls_uint32 offset;
     469                size_t f;
     470  
     471                /* Read the msgctxt and msgid.  */
     472                offset = get_uint32 (&bf, header.orig_sysdep_tab_offset + i * 4);
     473                msgid = get_sysdep_string (&bf, offset, &header, &msgid_len);
     474                /* Split into msgctxt and msgid.  */
     475                separator = strchr (msgid, MSGCTXT_SEPARATOR);
     476                if (separator != NULL)
     477                  {
     478                    /* The part before the MSGCTXT_SEPARATOR is the msgctxt.  */
     479                    *separator = '\0';
     480                    msgctxt = msgid;
     481                    msgid = separator + 1;
     482                    msgid_len -= msgid - msgctxt;
     483                  }
     484                else
     485                  msgctxt = NULL;
     486  
     487                /* Read the msgstr.  */
     488                offset = get_uint32 (&bf, header.trans_sysdep_tab_offset + i * 4);
     489                msgstr = get_sysdep_string (&bf, offset, &header, &msgstr_len);
     490  
     491                mp = message_alloc (msgctxt,
     492                                    msgid,
     493                                    (strlen (msgid) + 1 < msgid_len
     494                                     ? msgid + strlen (msgid) + 1
     495                                     : NULL),
     496                                    msgstr, msgstr_len,
     497                                    &pos);
     498  
     499                /* Only messages with c-format or objc-format annotation are
     500                   recognized as having system-dependent strings by msgfmt.
     501                   Which one of the two, we don't know.  We have to guess,
     502                   assuming that c-format is more probable than objc-format and
     503                   that the .mo was likely produced by "msgfmt -c".  */
     504                for (f = format_c; ; f = format_objc)
     505                  {
     506                    bool valid = true;
     507                    struct formatstring_parser *parser = formatstring_parsers[f];
     508                    const char *str_end;
     509                    const char *str;
     510  
     511                    str_end = msgid + msgid_len;
     512                    for (str = msgid; str < str_end; str += strlen (str) + 1)
     513                      {
     514                        char *invalid_reason = NULL;
     515                        void *descr =
     516                          parser->parse (str, false, NULL, &invalid_reason);
     517  
     518                        if (descr != NULL)
     519                          parser->free (descr);
     520                        else
     521                          {
     522                            free (invalid_reason);
     523                            valid = false;
     524                            break;
     525                          }
     526                      }
     527                    if (valid)
     528                      {
     529                        str_end = msgstr + msgstr_len;
     530                        for (str = msgstr; str < str_end; str += strlen (str) + 1)
     531                          {
     532                            char *invalid_reason = NULL;
     533                            void *descr =
     534                              parser->parse (str, true, NULL, &invalid_reason);
     535  
     536                            if (descr != NULL)
     537                              parser->free (descr);
     538                            else
     539                              {
     540                                free (invalid_reason);
     541                                valid = false;
     542                                break;
     543                              }
     544                          }
     545                      }
     546  
     547                    if (valid)
     548                      {
     549                        /* Found the most likely among c-format, objc-format.  */
     550                        mp->is_format[f] = yes;
     551                        break;
     552                      }
     553  
     554                    /* Try next f.  */
     555                    if (f == format_objc)
     556                      break;
     557                  }
     558  
     559                message_list_append (mlp, mp);
     560              }
     561            break;
     562          }
     563        break;
     564  
     565      default:
     566        goto unrecognised;
     567      }
     568  
     569    if (fp != stdin)
     570      fclose (fp);
     571  }