(root)/
gettext-0.22.4/
gettext-tools/
src/
msgl-iconv.c
       1  /* Message list charset and locale charset handling.
       2     Copyright (C) 2001-2003, 2005-2009, 2019-2023 Free Software Foundation, Inc.
       3     Written by Bruno Haible <haible@clisp.cons.org>, 2001.
       4  
       5     This program is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU General Public License as published by
       7     the Free Software Foundation; either version 3 of the License, or
       8     (at your option) any later version.
       9  
      10     This program is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU General Public License for more details.
      14  
      15     You should have received a copy of the GNU General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      17  
      18  
      19  #ifdef HAVE_CONFIG_H
      20  # include "config.h"
      21  #endif
      22  #include <alloca.h>
      23  
      24  /* Specification.  */
      25  #include "msgl-iconv.h"
      26  
      27  #include <stdbool.h>
      28  #include <stdlib.h>
      29  #include <string.h>
      30  
      31  #if HAVE_ICONV
      32  # include <iconv.h>
      33  #endif
      34  
      35  #include "noreturn.h"
      36  #include "progname.h"
      37  #include "basename-lgpl.h"
      38  #include "string-desc.h"
      39  #include "message.h"
      40  #include "po-charset.h"
      41  #include "xstriconv.h"
      42  #include "xstriconveh.h"
      43  #include "msgl-ascii.h"
      44  #include "msgl-ofn.h"
      45  #include "xalloc.h"
      46  #include "xmalloca.h"
      47  #include "c-strstr.h"
      48  #include "xvasprintf.h"
      49  #include "po-xerror.h"
      50  #include "gettext.h"
      51  
      52  #define _(str) gettext (str)
      53  
      54  
      55  #if HAVE_ICONV
      56  
      57  _GL_NORETURN_FUNC static void conversion_error (const struct conversion_context* context);
      58  static void
      59  conversion_error (const struct conversion_context* context)
      60  {
      61    if (context->to_code == po_charset_utf8)
      62      /* If a conversion to UTF-8 fails, the problem lies in the input.  */
      63      po_xerror (PO_SEVERITY_FATAL_ERROR, context->message, NULL, 0, 0, false,
      64                 xasprintf (_("%s: input is not valid in \"%s\" encoding"),
      65                            context->from_filename, context->from_code));
      66    else
      67      po_xerror (PO_SEVERITY_FATAL_ERROR, context->message, NULL, 0, 0, false,
      68                 xasprintf (_("%s: error while converting from \"%s\" encoding to \"%s\" encoding"),
      69                            context->from_filename, context->from_code,
      70                            context->to_code));
      71    /* NOTREACHED */
      72    abort ();
      73  }
      74  
      75  char *
      76  convert_string_directly (iconv_t cd, const char *string,
      77                           const struct conversion_context* context)
      78  {
      79    size_t len = strlen (string) + 1;
      80    char *result = NULL;
      81    size_t resultlen = 0;
      82  
      83    if (xmem_cd_iconv (string, len, cd, &result, &resultlen) == 0)
      84      /* Verify the result has exactly one NUL byte, at the end.  */
      85      if (resultlen > 0 && result[resultlen - 1] == '\0'
      86          && strlen (result) == resultlen - 1)
      87        return result;
      88  
      89    conversion_error (context);
      90    /* NOTREACHED */
      91    return NULL;
      92  }
      93  
      94  string_desc_t
      95  convert_string_desc_directly (iconv_t cd, string_desc_t string,
      96                                const struct conversion_context* context)
      97  {
      98    char *result = NULL;
      99    size_t resultlen = 0;
     100  
     101    if (xmem_cd_iconv (string_desc_data (string), string_desc_length (string),
     102                       cd, &result, &resultlen) == 0)
     103      return string_desc_new_addr (resultlen, result);
     104  
     105    conversion_error (context);
     106    /* NOTREACHED */
     107    return string_desc_new_empty ();
     108  }
     109  
     110  static char *
     111  convert_string (const iconveh_t *cd, const char *string,
     112                  const struct conversion_context* context)
     113  {
     114    size_t len = strlen (string) + 1;
     115    char *result = NULL;
     116    size_t resultlen = 0;
     117  
     118    if (xmem_cd_iconveh (string, len, cd, iconveh_error, NULL,
     119                         &result, &resultlen) == 0)
     120      /* Verify the result has exactly one NUL byte, at the end.  */
     121      if (resultlen > 0 && result[resultlen - 1] == '\0'
     122          && strlen (result) == resultlen - 1)
     123        return result;
     124  
     125    conversion_error (context);
     126    /* NOTREACHED */
     127    return NULL;
     128  }
     129  
     130  static void
     131  convert_string_list (const iconveh_t *cd, string_list_ty *slp,
     132                       const struct conversion_context* context)
     133  {
     134    size_t i;
     135  
     136    if (slp != NULL)
     137      for (i = 0; i < slp->nitems; i++)
     138        slp->item[i] = convert_string (cd, slp->item[i], context);
     139  }
     140  
     141  static void
     142  convert_prev_msgid (const iconveh_t *cd, message_ty *mp,
     143                      const struct conversion_context* context)
     144  {
     145    if (mp->prev_msgctxt != NULL)
     146      mp->prev_msgctxt = convert_string (cd, mp->prev_msgctxt, context);
     147    if (mp->prev_msgid != NULL)
     148      mp->prev_msgid = convert_string (cd, mp->prev_msgid, context);
     149    if (mp->prev_msgid_plural != NULL)
     150      mp->prev_msgid_plural = convert_string (cd, mp->prev_msgid_plural, context);
     151  }
     152  
     153  static void
     154  convert_msgid (const iconveh_t *cd, message_ty *mp,
     155                 const struct conversion_context* context)
     156  {
     157    if (mp->msgctxt != NULL)
     158      mp->msgctxt = convert_string (cd, mp->msgctxt, context);
     159    mp->msgid = convert_string (cd, mp->msgid, context);
     160    if (mp->msgid_plural != NULL)
     161      mp->msgid_plural = convert_string (cd, mp->msgid_plural, context);
     162  }
     163  
     164  static void
     165  convert_msgstr (const iconveh_t *cd, message_ty *mp,
     166                  const struct conversion_context* context)
     167  {
     168    char *result = NULL;
     169    size_t resultlen = 0;
     170  
     171    if (!(mp->msgstr_len > 0 && mp->msgstr[mp->msgstr_len - 1] == '\0'))
     172      abort ();
     173  
     174    if (xmem_cd_iconveh (mp->msgstr, mp->msgstr_len, cd, iconveh_error, NULL,
     175                         &result, &resultlen) == 0)
     176      /* Verify the result has a NUL byte at the end.  */
     177      if (resultlen > 0 && result[resultlen - 1] == '\0')
     178        /* Verify the result has the same number of NUL bytes.  */
     179        {
     180          const char *p;
     181          const char *pend;
     182          int nulcount1;
     183          int nulcount2;
     184  
     185          for (p = mp->msgstr, pend = p + mp->msgstr_len, nulcount1 = 0;
     186               p < pend;
     187               p += strlen (p) + 1, nulcount1++);
     188          for (p = result, pend = p + resultlen, nulcount2 = 0;
     189               p < pend;
     190               p += strlen (p) + 1, nulcount2++);
     191  
     192          if (nulcount1 == nulcount2)
     193            {
     194              mp->msgstr = result;
     195              mp->msgstr_len = resultlen;
     196              return;
     197            }
     198        }
     199  
     200    conversion_error (context);
     201  }
     202  
     203  #endif
     204  
     205  
     206  static bool
     207  iconv_message_list_internal (message_list_ty *mlp,
     208                               const char *canon_from_code,
     209                               const char *canon_to_code,
     210                               bool update_header,
     211                               const char *from_filename)
     212  {
     213    bool canon_from_code_overridden = (canon_from_code != NULL);
     214    bool msgids_changed;
     215    size_t j;
     216  
     217    /* If the list is empty, nothing to do.  */
     218    if (mlp->nitems == 0)
     219      return false;
     220  
     221    /* Search the header entry, and extract and replace the charset name.  */
     222    for (j = 0; j < mlp->nitems; j++)
     223      if (is_header (mlp->item[j]) && !mlp->item[j]->obsolete)
     224        {
     225          const char *header = mlp->item[j]->msgstr;
     226  
     227          if (header != NULL)
     228            {
     229              const char *charsetstr = c_strstr (header, "charset=");
     230  
     231              if (charsetstr != NULL)
     232                {
     233                  size_t len;
     234                  char *charset;
     235                  const char *canon_charset;
     236  
     237                  charsetstr += strlen ("charset=");
     238                  len = strcspn (charsetstr, " \t\n");
     239                  charset = (char *) xmalloca (len + 1);
     240                  memcpy (charset, charsetstr, len);
     241                  charset[len] = '\0';
     242  
     243                  canon_charset = po_charset_canonicalize (charset);
     244                  if (canon_charset == NULL)
     245                    {
     246                      if (!canon_from_code_overridden)
     247                        {
     248                          /* Don't give an error for POT files, because
     249                             POT files usually contain only ASCII msgids.
     250                             Also don't give an error for disguised POT
     251                             files that actually contain only ASCII msgids.  */
     252                          const char *filename = from_filename;
     253                          size_t filenamelen;
     254  
     255                          if (strcmp (charset, "CHARSET") == 0
     256                              && ((filename != NULL
     257                                   && (filenamelen = strlen (filename)) >= 4
     258                                   && memcmp (filename + filenamelen - 4, ".pot", 4)
     259                                      == 0)
     260                                  || is_ascii_message_list (mlp)))
     261                            canon_charset = po_charset_ascii;
     262                          else
     263                            po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0,
     264                                       false,
     265                                       xasprintf (_("present charset \"%s\" is not a portable encoding name"),
     266                                                  charset));
     267                        }
     268                    }
     269                  else
     270                    {
     271                      if (canon_from_code == NULL)
     272                        canon_from_code = canon_charset;
     273                      else if (canon_from_code != canon_charset)
     274                        po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0,  0,
     275                                   false,
     276                                   xasprintf (_("two different charsets \"%s\" and \"%s\" in input file"),
     277                                              canon_from_code, canon_charset));
     278                    }
     279                  freea (charset);
     280  
     281                  if (update_header)
     282                    {
     283                      size_t len1, len2, len3;
     284                      char *new_header;
     285  
     286                      len1 = charsetstr - header;
     287                      len2 = strlen (canon_to_code);
     288                      len3 = (header + strlen (header)) - (charsetstr + len);
     289                      new_header = XNMALLOC (len1 + len2 + len3 + 1, char);
     290                      memcpy (new_header, header, len1);
     291                      memcpy (new_header + len1, canon_to_code, len2);
     292                      memcpy (new_header + len1 + len2, charsetstr + len,
     293                              len3 + 1);
     294                      mlp->item[j]->msgstr = new_header;
     295                      mlp->item[j]->msgstr_len = len1 + len2 + len3 + 1;
     296                    }
     297                }
     298            }
     299        }
     300    if (canon_from_code == NULL)
     301      {
     302        if (is_ascii_message_list (mlp))
     303          canon_from_code = po_charset_ascii;
     304        else
     305          po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
     306                     _("input file doesn't contain a header entry with a charset specification"));
     307      }
     308  
     309    msgids_changed = false;
     310  
     311    /* If the two encodings are the same, nothing to do.  */
     312    if (canon_from_code != canon_to_code)
     313      {
     314  #if HAVE_ICONV
     315        iconveh_t cd;
     316        struct conversion_context context;
     317  
     318        if (iconveh_open (canon_to_code, canon_from_code, &cd) < 0)
     319          po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
     320                     xasprintf (_("Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), and iconv() does not support this conversion."),
     321                                canon_from_code, canon_to_code,
     322                                last_component (program_name)));
     323  
     324        context.from_code = canon_from_code;
     325        context.to_code = canon_to_code;
     326        context.from_filename = from_filename;
     327  
     328        for (j = 0; j < mlp->nitems; j++)
     329          {
     330            message_ty *mp = mlp->item[j];
     331  
     332            if ((mp->msgctxt != NULL && !is_ascii_string (mp->msgctxt))
     333                || !is_ascii_string (mp->msgid))
     334              msgids_changed = true;
     335            context.message = mp;
     336            convert_string_list (&cd, mp->comment, &context);
     337            convert_string_list (&cd, mp->comment_dot, &context);
     338            convert_prev_msgid (&cd, mp, &context);
     339            convert_msgid (&cd, mp, &context);
     340            convert_msgstr (&cd, mp, &context);
     341          }
     342  
     343        iconveh_close (&cd);
     344  
     345        if (msgids_changed)
     346          if (message_list_msgids_changed (mlp))
     347            po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
     348                       xasprintf (_("Conversion from \"%s\" to \"%s\" introduces duplicates: some different msgids become equal."),
     349                                  canon_from_code, canon_to_code));
     350  #else
     351            po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
     352                       xasprintf (_("Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). This version was built without iconv()."),
     353                                  canon_from_code, canon_to_code,
     354                                  last_component (program_name)));
     355  #endif
     356      }
     357  
     358    return msgids_changed;
     359  }
     360  
     361  bool
     362  iconv_message_list (message_list_ty *mlp,
     363                      const char *canon_from_code, const char *canon_to_code,
     364                      const char *from_filename)
     365  {
     366    return iconv_message_list_internal (mlp,
     367                                        canon_from_code, canon_to_code, true,
     368                                        from_filename);
     369  }
     370  
     371  msgdomain_list_ty *
     372  iconv_msgdomain_list (msgdomain_list_ty *mdlp,
     373                        const char *to_code,
     374                        bool update_header,
     375                        const char *from_filename)
     376  {
     377    const char *canon_to_code;
     378    size_t k;
     379  
     380    /* Canonicalize target encoding.  */
     381    canon_to_code = po_charset_canonicalize (to_code);
     382    if (canon_to_code == NULL)
     383      po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
     384                 xasprintf (_("target charset \"%s\" is not a portable encoding name."),
     385                            to_code));
     386  
     387    /* Test whether the control characters required for escaping file names with
     388       spaces are present in the target encoding.  */
     389    if (msgdomain_list_has_filenames_with_spaces (mdlp)
     390        && !(canon_to_code == po_charset_utf8
     391             || strcmp (canon_to_code, "GB18030") == 0))
     392      po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
     393                 xasprintf (_("Cannot write the control characters that protect file names with spaces in the %s encoding"),
     394                            canon_to_code));
     395  
     396    for (k = 0; k < mdlp->nitems; k++)
     397      iconv_message_list_internal (mdlp->item[k]->messages,
     398                                   mdlp->encoding, canon_to_code, update_header,
     399                                   from_filename);
     400  
     401    mdlp->encoding = canon_to_code;
     402    return mdlp;
     403  }
     404  
     405  #if HAVE_ICONV
     406  
     407  static bool
     408  iconvable_string (const iconveh_t *cd, const char *string)
     409  {
     410    size_t len = strlen (string) + 1;
     411    char *result = NULL;
     412    size_t resultlen = 0;
     413  
     414    if (xmem_cd_iconveh (string, len, cd, iconveh_error, NULL,
     415                         &result, &resultlen) == 0)
     416      {
     417        /* Test if the result has exactly one NUL byte, at the end.  */
     418        bool ok = (resultlen > 0 && result[resultlen - 1] == '\0'
     419                   && strlen (result) == resultlen - 1);
     420        free (result);
     421        return ok;
     422      }
     423    return false;
     424  }
     425  
     426  static bool
     427  iconvable_string_list (const iconveh_t *cd, string_list_ty *slp)
     428  {
     429    size_t i;
     430  
     431    if (slp != NULL)
     432      for (i = 0; i < slp->nitems; i++)
     433        if (!iconvable_string (cd, slp->item[i]))
     434          return false;
     435    return true;
     436  }
     437  
     438  static bool
     439  iconvable_prev_msgid (const iconveh_t *cd, message_ty *mp)
     440  {
     441    if (mp->prev_msgctxt != NULL)
     442      if (!iconvable_string (cd, mp->prev_msgctxt))
     443        return false;
     444    if (mp->prev_msgid != NULL)
     445      if (!iconvable_string (cd, mp->prev_msgid))
     446        return false;
     447    if (mp->prev_msgid_plural != NULL)
     448      if (!iconvable_string (cd, mp->prev_msgid_plural))
     449        return false;
     450    return true;
     451  }
     452  
     453  static bool
     454  iconvable_msgid (const iconveh_t *cd, message_ty *mp)
     455  {
     456    if (mp->msgctxt != NULL)
     457      if (!iconvable_string (cd, mp->msgctxt))
     458        return false;
     459    if (!iconvable_string (cd, mp->msgid))
     460      return false;
     461    if (mp->msgid_plural != NULL)
     462      if (!iconvable_string (cd, mp->msgid_plural))
     463        return false;
     464    return true;
     465  }
     466  
     467  static bool
     468  iconvable_msgstr (const iconveh_t *cd, message_ty *mp)
     469  {
     470    char *result = NULL;
     471    size_t resultlen = 0;
     472  
     473    if (!(mp->msgstr_len > 0 && mp->msgstr[mp->msgstr_len - 1] == '\0'))
     474      abort ();
     475  
     476    if (xmem_cd_iconveh (mp->msgstr, mp->msgstr_len, cd, iconveh_error, NULL,
     477                         &result, &resultlen) == 0)
     478      {
     479        bool ok = false;
     480  
     481        /* Test if the result has a NUL byte at the end.  */
     482        if (resultlen > 0 && result[resultlen - 1] == '\0')
     483          /* Test if the result has the same number of NUL bytes.  */
     484          {
     485            const char *p;
     486            const char *pend;
     487            int nulcount1;
     488            int nulcount2;
     489  
     490            for (p = mp->msgstr, pend = p + mp->msgstr_len, nulcount1 = 0;
     491                 p < pend;
     492                 p += strlen (p) + 1, nulcount1++);
     493            for (p = result, pend = p + resultlen, nulcount2 = 0;
     494                 p < pend;
     495                 p += strlen (p) + 1, nulcount2++);
     496  
     497            if (nulcount1 == nulcount2)
     498              ok = true;
     499          }
     500  
     501        free (result);
     502        return ok;
     503      }
     504    return false;
     505  }
     506  
     507  #endif
     508  
     509  bool
     510  is_message_list_iconvable (message_list_ty *mlp,
     511                             const char *canon_from_code,
     512                             const char *canon_to_code)
     513  {
     514    bool canon_from_code_overridden = (canon_from_code != NULL);
     515    size_t j;
     516  
     517    /* If the list is empty, nothing to check.  */
     518    if (mlp->nitems == 0)
     519      return true;
     520  
     521    /* Search the header entry, and extract the charset name.  */
     522    for (j = 0; j < mlp->nitems; j++)
     523      if (is_header (mlp->item[j]) && !mlp->item[j]->obsolete)
     524        {
     525          const char *header = mlp->item[j]->msgstr;
     526  
     527          if (header != NULL)
     528            {
     529              const char *charsetstr = c_strstr (header, "charset=");
     530  
     531              if (charsetstr != NULL)
     532                {
     533                  size_t len;
     534                  char *charset;
     535                  const char *canon_charset;
     536  
     537                  charsetstr += strlen ("charset=");
     538                  len = strcspn (charsetstr, " \t\n");
     539                  charset = (char *) xmalloca (len + 1);
     540                  memcpy (charset, charsetstr, len);
     541                  charset[len] = '\0';
     542  
     543                  canon_charset = po_charset_canonicalize (charset);
     544                  if (canon_charset == NULL)
     545                    {
     546                      if (!canon_from_code_overridden)
     547                        {
     548                          /* Don't give an error for POT files, because POT
     549                             files usually contain only ASCII msgids.  */
     550                          if (strcmp (charset, "CHARSET") == 0)
     551                            canon_charset = po_charset_ascii;
     552                          else
     553                            {
     554                              /* charset is not a portable encoding name.  */
     555                              freea (charset);
     556                              return false;
     557                            }
     558                        }
     559                    }
     560                  else
     561                    {
     562                      if (canon_from_code == NULL)
     563                        canon_from_code = canon_charset;
     564                      else if (canon_from_code != canon_charset)
     565                        {
     566                          /* Two different charsets in input file.  */
     567                          freea (charset);
     568                          return false;
     569                        }
     570                    }
     571                  freea (charset);
     572                }
     573            }
     574        }
     575    if (canon_from_code == NULL)
     576      {
     577        if (is_ascii_message_list (mlp))
     578          canon_from_code = po_charset_ascii;
     579        else
     580          /* Input file lacks a header entry with a charset specification.  */
     581          return false;
     582      }
     583  
     584    /* If the two encodings are the same, nothing to check.  */
     585    if (canon_from_code != canon_to_code)
     586      {
     587  #if HAVE_ICONV
     588        iconveh_t cd;
     589  
     590        if (iconveh_open (canon_to_code, canon_from_code, &cd) < 0)
     591          /* iconv() doesn't support this conversion.  */
     592          return false;
     593  
     594        for (j = 0; j < mlp->nitems; j++)
     595          {
     596            message_ty *mp = mlp->item[j];
     597  
     598            if (!(iconvable_string_list (&cd, mp->comment)
     599                  && iconvable_string_list (&cd, mp->comment_dot)
     600                  && iconvable_prev_msgid (&cd, mp)
     601                  && iconvable_msgid (&cd, mp)
     602                  && iconvable_msgstr (&cd, mp)))
     603              return false;
     604          }
     605  
     606        iconveh_close (&cd);
     607  #else
     608        /* This version was built without iconv().  */
     609        return false;
     610  #endif
     611      }
     612  
     613    return true;
     614  }