(root)/
gettext-0.22.4/
gettext-tools/
gnulib-lib/
striconveha.c
       1  /* Character set conversion with error handling and autodetection.
       2     Copyright (C) 2002, 2005, 2007, 2009-2023 Free Software Foundation, Inc.
       3     Written by Bruno Haible.
       4  
       5     This file is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU Lesser General Public License as
       7     published by the Free Software Foundation; either version 2.1 of the
       8     License, or (at your option) any later version.
       9  
      10     This file is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU Lesser General Public License for more details.
      14  
      15     You should have received a copy of the GNU Lesser General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      17  
      18  #include <config.h>
      19  
      20  /* Specification.  */
      21  #include "striconveha.h"
      22  
      23  #include <errno.h>
      24  #include <stdlib.h>
      25  #include <string.h>
      26  
      27  #include "malloca.h"
      28  #include "c-strcase.h"
      29  #include "striconveh.h"
      30  
      31  #define SIZEOF(a) (sizeof(a)/sizeof(a[0]))
      32  
      33  
      34  /* Autodetection list.  */
      35  
      36  struct autodetect_alias
      37  {
      38    struct autodetect_alias *next;
      39    const char *name;
      40    const char * const *encodings_to_try;
      41  };
      42  
      43  static const char * const autodetect_utf8_try[] =
      44  {
      45    /* Try UTF-8 first. There are very few ISO-8859-1 inputs that would
      46       be valid UTF-8, but many UTF-8 inputs are valid ISO-8859-1.  */
      47    "UTF-8", "ISO-8859-1",
      48    NULL
      49  };
      50  static const char * const autodetect_jp_try[] =
      51  {
      52    /* Try 7-bit encoding first. If the input contains bytes >= 0x80,
      53       it will fail.
      54       Try EUC-JP next. Short SHIFT_JIS inputs may come out wrong. This
      55       is unavoidable. People will condemn SHIFT_JIS.
      56       If we tried SHIFT_JIS first, then some short EUC-JP inputs would
      57       come out wrong, and people would condemn EUC-JP and Unix, which
      58       would not be good.
      59       Finally try SHIFT_JIS.  */
      60    "ISO-2022-JP-2", "EUC-JP", "SHIFT_JIS",
      61    NULL
      62  };
      63  static const char * const autodetect_kr_try[] =
      64  {
      65    /* Try 7-bit encoding first. If the input contains bytes >= 0x80,
      66       it will fail.
      67       Finally try EUC-KR.  */
      68    "ISO-2022-KR", "EUC-KR",
      69    NULL
      70  };
      71  
      72  static struct autodetect_alias autodetect_predefined[] =
      73  {
      74    { &autodetect_predefined[1], "autodetect_utf8", autodetect_utf8_try },
      75    { &autodetect_predefined[2], "autodetect_jp",   autodetect_jp_try },
      76    { NULL,                      "autodetect_kr",   autodetect_kr_try }
      77  };
      78  
      79  static struct autodetect_alias *autodetect_list = &autodetect_predefined[0];
      80  static struct autodetect_alias **autodetect_list_end =
      81    &autodetect_predefined[SIZEOF(autodetect_predefined)-1].next;
      82  
      83  int
      84  uniconv_register_autodetect (const char *name,
      85                               const char * const *try_in_order)
      86  {
      87    size_t namelen;
      88    size_t listlen;
      89    size_t memneed;
      90    size_t i;
      91  
      92    /* The TRY_IN_ORDER list must not be empty.  */
      93    if (try_in_order[0] == NULL)
      94      {
      95        errno = EINVAL;
      96        return -1;
      97      }
      98  
      99    /* We must deep-copy NAME and TRY_IN_ORDER, because they may be allocated
     100       with dynamic extent.  */
     101    namelen = strlen (name) + 1;
     102    memneed = sizeof (struct autodetect_alias) + namelen + sizeof (char *);
     103    for (i = 0; try_in_order[i] != NULL; i++)
     104      memneed += sizeof (char *) + strlen (try_in_order[i]) + 1;
     105    listlen = i;
     106  
     107    void *memory = malloc (memneed);
     108    if (memory != NULL)
     109      {
     110        struct autodetect_alias *new_alias = memory;
     111        memory = new_alias + 1;
     112  
     113        char const **new_try_in_order = memory;
     114        memory = new_try_in_order + listlen + 1;
     115  
     116        char *new_name = memcpy (memory, name, namelen);
     117        memory = new_name + namelen;
     118  
     119        for (i = 0; i < listlen; i++)
     120          {
     121            size_t len = strlen (try_in_order[i]) + 1;
     122            char *copy = memcpy (memory, try_in_order[i], len);
     123            new_try_in_order[i] = copy;
     124            memory = copy + len;
     125          }
     126        new_try_in_order[i] = NULL;
     127  
     128        /* Now insert the new alias.  */
     129        new_alias->name = new_name;
     130        new_alias->encodings_to_try = new_try_in_order;
     131        new_alias->next = NULL;
     132        /* FIXME: Not multithread-safe.  */
     133        *autodetect_list_end = new_alias;
     134        autodetect_list_end = &new_alias->next;
     135        return 0;
     136      }
     137    else
     138      {
     139        errno = ENOMEM;
     140        return -1;
     141      }
     142  }
     143  
     144  /* Like mem_iconveha, except no handling of transliteration.  */
     145  static int
     146  mem_iconveha_notranslit (const char *src, size_t srclen,
     147                           const char *from_codeset, const char *to_codeset,
     148                           enum iconv_ilseq_handler handler,
     149                           size_t *offsets,
     150                           char **resultp, size_t *lengthp)
     151  {
     152    int retval = mem_iconveh (src, srclen, from_codeset, to_codeset, handler,
     153                              offsets, resultp, lengthp);
     154    if (retval >= 0 || errno != EINVAL)
     155      return retval;
     156    else
     157      {
     158        struct autodetect_alias *alias;
     159  
     160        /* Unsupported from_codeset or to_codeset. Check whether the caller
     161           requested autodetection.  */
     162        for (alias = autodetect_list; alias != NULL; alias = alias->next)
     163          if (strcmp (from_codeset, alias->name) == 0)
     164            {
     165              const char * const *encodings;
     166  
     167              if (handler != iconveh_error)
     168                {
     169                  /* First try all encodings without any forgiving.  */
     170                  encodings = alias->encodings_to_try;
     171                  do
     172                    {
     173                      retval = mem_iconveha_notranslit (src, srclen,
     174                                                        *encodings, to_codeset,
     175                                                        iconveh_error, offsets,
     176                                                        resultp, lengthp);
     177                      if (!(retval < 0 && errno == EILSEQ))
     178                        return retval;
     179                      encodings++;
     180                    }
     181                  while (*encodings != NULL);
     182                }
     183  
     184              encodings = alias->encodings_to_try;
     185              do
     186                {
     187                  retval = mem_iconveha_notranslit (src, srclen,
     188                                                    *encodings, to_codeset,
     189                                                    handler, offsets,
     190                                                    resultp, lengthp);
     191                  if (!(retval < 0 && errno == EILSEQ))
     192                    return retval;
     193                  encodings++;
     194                }
     195              while (*encodings != NULL);
     196  
     197              /* Return the last call's result.  */
     198              return -1;
     199            }
     200  
     201        /* It wasn't an autodetection name.  */
     202        errno = EINVAL;
     203        return -1;
     204      }
     205  }
     206  
     207  int
     208  mem_iconveha (const char *src, size_t srclen,
     209                const char *from_codeset, const char *to_codeset,
     210                bool transliterate,
     211                enum iconv_ilseq_handler handler,
     212                size_t *offsets,
     213                char **resultp, size_t *lengthp)
     214  {
     215    if (srclen == 0)
     216      {
     217        /* Nothing to convert.  */
     218        *lengthp = 0;
     219        return 0;
     220      }
     221  
     222    /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5,
     223       we want to use transliteration.  */
     224  #if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
     225       && !defined __UCLIBC__) \
     226      || _LIBICONV_VERSION >= 0x0105
     227    if (transliterate)
     228      {
     229        int retval;
     230        size_t len = strlen (to_codeset);
     231        char *to_codeset_suffixed = (char *) malloca (len + 10 + 1);
     232        if (to_codeset_suffixed == NULL)
     233          {
     234            errno = ENOMEM;
     235            return -1;
     236          }
     237        memcpy (to_codeset_suffixed, to_codeset, len);
     238        memcpy (to_codeset_suffixed + len, "//TRANSLIT", 10 + 1);
     239  
     240        retval = mem_iconveha_notranslit (src, srclen,
     241                                          from_codeset, to_codeset_suffixed,
     242                                          handler, offsets, resultp, lengthp);
     243  
     244        freea (to_codeset_suffixed);
     245  
     246        return retval;
     247      }
     248    else
     249  #endif
     250      return mem_iconveha_notranslit (src, srclen,
     251                                      from_codeset, to_codeset,
     252                                      handler, offsets, resultp, lengthp);
     253  }
     254  
     255  /* Like str_iconveha, except no handling of transliteration.  */
     256  static char *
     257  str_iconveha_notranslit (const char *src,
     258                           const char *from_codeset, const char *to_codeset,
     259                           enum iconv_ilseq_handler handler)
     260  {
     261    char *result = str_iconveh (src, from_codeset, to_codeset, handler);
     262  
     263    if (result != NULL || errno != EINVAL)
     264      return result;
     265    else
     266      {
     267        struct autodetect_alias *alias;
     268  
     269        /* Unsupported from_codeset or to_codeset. Check whether the caller
     270           requested autodetection.  */
     271        for (alias = autodetect_list; alias != NULL; alias = alias->next)
     272          if (strcmp (from_codeset, alias->name) == 0)
     273            {
     274              const char * const *encodings;
     275  
     276              if (handler != iconveh_error)
     277                {
     278                  /* First try all encodings without any forgiving.  */
     279                  encodings = alias->encodings_to_try;
     280                  do
     281                    {
     282                      result = str_iconveha_notranslit (src,
     283                                                        *encodings, to_codeset,
     284                                                        iconveh_error);
     285                      if (!(result == NULL && errno == EILSEQ))
     286                        return result;
     287                      encodings++;
     288                    }
     289                  while (*encodings != NULL);
     290                }
     291  
     292              encodings = alias->encodings_to_try;
     293              do
     294                {
     295                  result = str_iconveha_notranslit (src,
     296                                                    *encodings, to_codeset,
     297                                                    handler);
     298                  if (!(result == NULL && errno == EILSEQ))
     299                    return result;
     300                  encodings++;
     301                }
     302              while (*encodings != NULL);
     303  
     304              /* Return the last call's result.  */
     305              return NULL;
     306            }
     307  
     308        /* It wasn't an autodetection name.  */
     309        errno = EINVAL;
     310        return NULL;
     311      }
     312  }
     313  
     314  char *
     315  str_iconveha (const char *src,
     316                const char *from_codeset, const char *to_codeset,
     317                bool transliterate,
     318                enum iconv_ilseq_handler handler)
     319  {
     320    if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
     321      {
     322        char *result = strdup (src);
     323  
     324        if (result == NULL)
     325          errno = ENOMEM;
     326        return result;
     327      }
     328  
     329    /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5,
     330       we want to use transliteration.  */
     331  #if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
     332       && !defined __UCLIBC__) \
     333      || _LIBICONV_VERSION >= 0x0105
     334    if (transliterate)
     335      {
     336        char *result;
     337        size_t len = strlen (to_codeset);
     338        char *to_codeset_suffixed = (char *) malloca (len + 10 + 1);
     339        if (to_codeset_suffixed == NULL)
     340          {
     341            errno = ENOMEM;
     342            return NULL;
     343          }
     344        memcpy (to_codeset_suffixed, to_codeset, len);
     345        memcpy (to_codeset_suffixed + len, "//TRANSLIT", 10 + 1);
     346  
     347        result = str_iconveha_notranslit (src, from_codeset, to_codeset_suffixed,
     348                                          handler);
     349  
     350        freea (to_codeset_suffixed);
     351  
     352        return result;
     353      }
     354    else
     355  #endif
     356      return str_iconveha_notranslit (src, from_codeset, to_codeset, handler);
     357  }