1  /* Character set conversion with error handling and autodetection.
       2     Copyright (C) 2002, 2005, 2007, 2009-2023 Free Software Foundation, Inc.
       3     Written by Bruno Haible.
       4  
       5     This file is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU Lesser General Public License as
       7     published by the Free Software Foundation; either version 2.1 of the
       8     License, or (at your option) any later version.
       9  
      10     This file is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU Lesser General Public License for more details.
      14  
      15     You should have received a copy of the GNU Lesser General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      17  
      18  #include <config.h>
      19  
      20  /* Specification.  */
      21  #include "striconveha.h"
      22  
      23  #include <errno.h>
      24  #include <stdlib.h>
      25  #include <string.h>
      26  
      27  #include "malloca.h"
      28  #include "c-strcase.h"
      29  #include "striconveh.h"
      30  
      31  #define SIZEOF(a) (sizeof(a)/sizeof(a[0]))
      32  
      33  
      34  /* Autodetection list.  */
      35  
      36  struct autodetect_alias
      37  {
      38    struct autodetect_alias *next;
      39    const char *name;
      40    const char * const *encodings_to_try;
      41  };
      42  
      43  static const char * const autodetect_utf8_try[] =
      44  {
      45    /* Try UTF-8 first. There are very few ISO-8859-1 inputs that would
      46       be valid UTF-8, but many UTF-8 inputs are valid ISO-8859-1.  */
      47    "UTF-8", "ISO-8859-1",
      48    NULL
      49  };
      50  static const char * const autodetect_jp_try[] =
      51  {
      52    /* Try 7-bit encoding first. If the input contains bytes >= 0x80,
      53       it will fail.
      54       Try EUC-JP next. Short SHIFT_JIS inputs may come out wrong. This
      55       is unavoidable. People will condemn SHIFT_JIS.
      56       If we tried SHIFT_JIS first, then some short EUC-JP inputs would
      57       come out wrong, and people would condemn EUC-JP and Unix, which
      58       would not be good.
      59       Finally try SHIFT_JIS.  */
      60    "ISO-2022-JP-2", "EUC-JP", "SHIFT_JIS",
      61    NULL
      62  };
      63  static const char * const autodetect_kr_try[] =
      64  {
      65    /* Try 7-bit encoding first. If the input contains bytes >= 0x80,
      66       it will fail.
      67       Finally try EUC-KR.  */
      68    "ISO-2022-KR", "EUC-KR",
      69    NULL
      70  };
      71  
      72  static struct autodetect_alias autodetect_predefined[] =
      73  {
      74    { &autodetect_predefined[1], "autodetect_utf8", autodetect_utf8_try },
      75    { &autodetect_predefined[2], "autodetect_jp",   autodetect_jp_try },
      76    { NULL,                      "autodetect_kr",   autodetect_kr_try }
      77  };
      78  
      79  static struct autodetect_alias *autodetect_list = &autodetect_predefined[0];
      80  static struct autodetect_alias **autodetect_list_end =
      81    &autodetect_predefined[SIZEOF(autodetect_predefined)-1].next;
      82  
      83  int
      84  uniconv_register_autodetect (const char *name,
      85                               const char * const *try_in_order)
      86  {
      87    size_t namelen;
      88    size_t listlen;
      89    size_t memneed;
      90    size_t i;
      91    char *memory;
      92    struct autodetect_alias *new_alias;
      93    char *new_name;
      94    const char **new_try_in_order;
      95  
      96    /* The TRY_IN_ORDER list must not be empty.  */
      97    if (try_in_order[0] == NULL)
      98      {
      99        errno = EINVAL;
     100        return -1;
     101      }
     102  
     103    /* We must deep-copy NAME and TRY_IN_ORDER, because they may be allocated
     104       with dynamic extent.  */
     105    namelen = strlen (name) + 1;
     106    memneed = sizeof (struct autodetect_alias) + namelen + sizeof (char *);
     107    for (i = 0; try_in_order[i] != NULL; i++)
     108      memneed += sizeof (char *) + strlen (try_in_order[i]) + 1;
     109    listlen = i;
     110  
     111    memory = (char *) malloc (memneed);
     112    if (memory != NULL)
     113      {
     114        new_alias = (struct autodetect_alias *) memory;
     115        memory += sizeof (struct autodetect_alias);
     116  
     117        new_try_in_order = (const char **) memory;
     118        memory += (listlen + 1) * sizeof (char *);
     119  
     120        new_name = (char *) memory;
     121        memcpy (new_name, name, namelen);
     122        memory += namelen;
     123  
     124        for (i = 0; i < listlen; i++)
     125          {
     126            size_t len = strlen (try_in_order[i]) + 1;
     127            memcpy (memory, try_in_order[i], len);
     128            new_try_in_order[i] = (const char *) memory;
     129            memory += len;
     130          }
     131        new_try_in_order[i] = NULL;
     132  
     133        /* Now insert the new alias.  */
     134        new_alias->name = new_name;
     135        new_alias->encodings_to_try = new_try_in_order;
     136        new_alias->next = NULL;
     137        /* FIXME: Not multithread-safe.  */
     138        *autodetect_list_end = new_alias;
     139        autodetect_list_end = &new_alias->next;
     140        return 0;
     141      }
     142    else
     143      {
     144        errno = ENOMEM;
     145        return -1;
     146      }
     147  }
     148  
     149  /* Like mem_iconveha, except no handling of transliteration.  */
     150  static int
     151  mem_iconveha_notranslit (const char *src, size_t srclen,
     152                           const char *from_codeset, const char *to_codeset,
     153                           enum iconv_ilseq_handler handler,
     154                           size_t *offsets,
     155                           char **resultp, size_t *lengthp)
     156  {
     157    int retval = mem_iconveh (src, srclen, from_codeset, to_codeset, handler,
     158                              offsets, resultp, lengthp);
     159    if (retval >= 0 || errno != EINVAL)
     160      return retval;
     161    else
     162      {
     163        struct autodetect_alias *alias;
     164  
     165        /* Unsupported from_codeset or to_codeset. Check whether the caller
     166           requested autodetection.  */
     167        for (alias = autodetect_list; alias != NULL; alias = alias->next)
     168          if (strcmp (from_codeset, alias->name) == 0)
     169            {
     170              const char * const *encodings;
     171  
     172              if (handler != iconveh_error)
     173                {
     174                  /* First try all encodings without any forgiving.  */
     175                  encodings = alias->encodings_to_try;
     176                  do
     177                    {
     178                      retval = mem_iconveha_notranslit (src, srclen,
     179                                                        *encodings, to_codeset,
     180                                                        iconveh_error, offsets,
     181                                                        resultp, lengthp);
     182                      if (!(retval < 0 && errno == EILSEQ))
     183                        return retval;
     184                      encodings++;
     185                    }
     186                  while (*encodings != NULL);
     187                }
     188  
     189              encodings = alias->encodings_to_try;
     190              do
     191                {
     192                  retval = mem_iconveha_notranslit (src, srclen,
     193                                                    *encodings, to_codeset,
     194                                                    handler, offsets,
     195                                                    resultp, lengthp);
     196                  if (!(retval < 0 && errno == EILSEQ))
     197                    return retval;
     198                  encodings++;
     199                }
     200              while (*encodings != NULL);
     201  
     202              /* Return the last call's result.  */
     203              return -1;
     204            }
     205  
     206        /* It wasn't an autodetection name.  */
     207        errno = EINVAL;
     208        return -1;
     209      }
     210  }
     211  
     212  int
     213  mem_iconveha (const char *src, size_t srclen,
     214                const char *from_codeset, const char *to_codeset,
     215                bool transliterate,
     216                enum iconv_ilseq_handler handler,
     217                size_t *offsets,
     218                char **resultp, size_t *lengthp)
     219  {
     220    if (srclen == 0)
     221      {
     222        /* Nothing to convert.  */
     223        *lengthp = 0;
     224        return 0;
     225      }
     226  
     227    /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5,
     228       we want to use transliteration.  */
     229  #if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
     230       && !defined __UCLIBC__) \
     231      || _LIBICONV_VERSION >= 0x0105
     232    if (transliterate)
     233      {
     234        int retval;
     235        size_t len = strlen (to_codeset);
     236        char *to_codeset_suffixed = (char *) malloca (len + 10 + 1);
     237        if (to_codeset_suffixed == NULL)
     238          {
     239            errno = ENOMEM;
     240            return -1;
     241          }
     242        memcpy (to_codeset_suffixed, to_codeset, len);
     243        memcpy (to_codeset_suffixed + len, "//TRANSLIT", 10 + 1);
     244  
     245        retval = mem_iconveha_notranslit (src, srclen,
     246                                          from_codeset, to_codeset_suffixed,
     247                                          handler, offsets, resultp, lengthp);
     248  
     249        freea (to_codeset_suffixed);
     250  
     251        return retval;
     252      }
     253    else
     254  #endif
     255      return mem_iconveha_notranslit (src, srclen,
     256                                      from_codeset, to_codeset,
     257                                      handler, offsets, resultp, lengthp);
     258  }
     259  
     260  /* Like str_iconveha, except no handling of transliteration.  */
     261  static char *
     262  str_iconveha_notranslit (const char *src,
     263                           const char *from_codeset, const char *to_codeset,
     264                           enum iconv_ilseq_handler handler)
     265  {
     266    char *result = str_iconveh (src, from_codeset, to_codeset, handler);
     267  
     268    if (result != NULL || errno != EINVAL)
     269      return result;
     270    else
     271      {
     272        struct autodetect_alias *alias;
     273  
     274        /* Unsupported from_codeset or to_codeset. Check whether the caller
     275           requested autodetection.  */
     276        for (alias = autodetect_list; alias != NULL; alias = alias->next)
     277          if (strcmp (from_codeset, alias->name) == 0)
     278            {
     279              const char * const *encodings;
     280  
     281              if (handler != iconveh_error)
     282                {
     283                  /* First try all encodings without any forgiving.  */
     284                  encodings = alias->encodings_to_try;
     285                  do
     286                    {
     287                      result = str_iconveha_notranslit (src,
     288                                                        *encodings, to_codeset,
     289                                                        iconveh_error);
     290                      if (!(result == NULL && errno == EILSEQ))
     291                        return result;
     292                      encodings++;
     293                    }
     294                  while (*encodings != NULL);
     295                }
     296  
     297              encodings = alias->encodings_to_try;
     298              do
     299                {
     300                  result = str_iconveha_notranslit (src,
     301                                                    *encodings, to_codeset,
     302                                                    handler);
     303                  if (!(result == NULL && errno == EILSEQ))
     304                    return result;
     305                  encodings++;
     306                }
     307              while (*encodings != NULL);
     308  
     309              /* Return the last call's result.  */
     310              return NULL;
     311            }
     312  
     313        /* It wasn't an autodetection name.  */
     314        errno = EINVAL;
     315        return NULL;
     316      }
     317  }
     318  
     319  char *
     320  str_iconveha (const char *src,
     321                const char *from_codeset, const char *to_codeset,
     322                bool transliterate,
     323                enum iconv_ilseq_handler handler)
     324  {
     325    if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
     326      {
     327        char *result = strdup (src);
     328  
     329        if (result == NULL)
     330          errno = ENOMEM;
     331        return result;
     332      }
     333  
     334    /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5,
     335       we want to use transliteration.  */
     336  #if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
     337       && !defined __UCLIBC__) \
     338      || _LIBICONV_VERSION >= 0x0105
     339    if (transliterate)
     340      {
     341        char *result;
     342        size_t len = strlen (to_codeset);
     343        char *to_codeset_suffixed = (char *) malloca (len + 10 + 1);
     344        if (to_codeset_suffixed == NULL)
     345          {
     346            errno = ENOMEM;
     347            return NULL;
     348          }
     349        memcpy (to_codeset_suffixed, to_codeset, len);
     350        memcpy (to_codeset_suffixed + len, "//TRANSLIT", 10 + 1);
     351  
     352        result = str_iconveha_notranslit (src, from_codeset, to_codeset_suffixed,
     353                                          handler);
     354  
     355        freea (to_codeset_suffixed);
     356  
     357        return result;
     358      }
     359    else
     360  #endif
     361      return str_iconveha_notranslit (src, from_codeset, to_codeset, handler);
     362  }