1  /* Character set conversion with error handling.
       2     Copyright (C) 2001-2023 Free Software Foundation, Inc.
       3     Written by Bruno Haible and Simon Josefsson.
       4  
       5     This file is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU Lesser General Public License as
       7     published by the Free Software Foundation; either version 2.1 of the
       8     License, or (at your option) any later version.
       9  
      10     This file is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU Lesser General Public License for more details.
      14  
      15     You should have received a copy of the GNU Lesser General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      17  
      18  #include <config.h>
      19  
      20  /* Specification.  */
      21  #include "striconveh.h"
      22  
      23  #include <errno.h>
      24  #include <stdlib.h>
      25  #include <string.h>
      26  
      27  #if HAVE_ICONV
      28  # include <iconv.h>
      29  # include "unistr.h"
      30  #endif
      31  
      32  #include "c-strcase.h"
      33  #include "c-strcaseeq.h"
      34  
      35  #ifndef SIZE_MAX
      36  # define SIZE_MAX ((size_t) -1)
      37  #endif
      38  
      39  
      40  #if HAVE_ICONV
      41  
      42  /* The caller must provide an iconveh_t, not just an iconv_t, because when a
      43     conversion error occurs, we may have to determine the Unicode representation
      44     of the inconvertible character.  */
      45  
      46  int
      47  iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
      48  {
      49    iconv_t cd;
      50    iconv_t cd1;
      51    iconv_t cd2;
      52  
      53    /* Avoid glibc-2.1 bug with EUC-KR.  */
      54  # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
      55       && !defined _LIBICONV_VERSION
      56    if (c_strcasecmp (from_codeset, "EUC-KR") == 0
      57        || c_strcasecmp (to_codeset, "EUC-KR") == 0)
      58      {
      59        errno = EINVAL;
      60        return -1;
      61      }
      62  # endif
      63  
      64    cd = iconv_open (to_codeset, from_codeset);
      65  
      66    if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
      67      cd1 = (iconv_t)(-1);
      68    else
      69      {
      70        cd1 = iconv_open ("UTF-8", from_codeset);
      71        if (cd1 == (iconv_t)(-1))
      72          {
      73            int saved_errno = errno;
      74            if (cd != (iconv_t)(-1))
      75              iconv_close (cd);
      76            errno = saved_errno;
      77            return -1;
      78          }
      79      }
      80  
      81    if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
      82  # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
      83        && !defined __UCLIBC__) \
      84       || _LIBICONV_VERSION >= 0x0105
      85        || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
      86  # endif
      87       )
      88      cd2 = (iconv_t)(-1);
      89    else
      90      {
      91        cd2 = iconv_open (to_codeset, "UTF-8");
      92        if (cd2 == (iconv_t)(-1))
      93          {
      94            int saved_errno = errno;
      95            if (cd1 != (iconv_t)(-1))
      96              iconv_close (cd1);
      97            if (cd != (iconv_t)(-1))
      98              iconv_close (cd);
      99            errno = saved_errno;
     100            return -1;
     101          }
     102      }
     103  
     104    cdp->cd = cd;
     105    cdp->cd1 = cd1;
     106    cdp->cd2 = cd2;
     107    return 0;
     108  }
     109  
     110  int
     111  iconveh_close (const iconveh_t *cd)
     112  {
     113    if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
     114      {
     115        /* Return -1, but preserve the errno from iconv_close.  */
     116        int saved_errno = errno;
     117        if (cd->cd1 != (iconv_t)(-1))
     118          iconv_close (cd->cd1);
     119        if (cd->cd != (iconv_t)(-1))
     120          iconv_close (cd->cd);
     121        errno = saved_errno;
     122        return -1;
     123      }
     124    if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
     125      {
     126        /* Return -1, but preserve the errno from iconv_close.  */
     127        int saved_errno = errno;
     128        if (cd->cd != (iconv_t)(-1))
     129          iconv_close (cd->cd);
     130        errno = saved_errno;
     131        return -1;
     132      }
     133    if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
     134      return -1;
     135    return 0;
     136  }
     137  
     138  /* iconv_carefully is like iconv, except that it stops as soon as it encounters
     139     a conversion error, and it returns in *INCREMENTED a boolean telling whether
     140     it has incremented the input pointers past the error location.  */
     141  # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
     142  /* Irix iconv() inserts a NUL byte if it cannot convert.
     143     NetBSD iconv() inserts a question mark if it cannot convert.
     144     Only GNU libiconv and GNU libc are known to prefer to fail rather
     145     than doing a lossy conversion.  */
     146  static size_t
     147  iconv_carefully (iconv_t cd,
     148                   const char **inbuf, size_t *inbytesleft,
     149                   char **outbuf, size_t *outbytesleft,
     150                   bool *incremented)
     151  {
     152    const char *inptr = *inbuf;
     153    const char *inptr_end = inptr + *inbytesleft;
     154    char *outptr = *outbuf;
     155    size_t outsize = *outbytesleft;
     156    const char *inptr_before;
     157    size_t res;
     158  
     159    do
     160      {
     161        size_t insize;
     162  
     163        inptr_before = inptr;
     164        res = (size_t)(-1);
     165  
     166        for (insize = 1; inptr + insize <= inptr_end; insize++)
     167          {
     168            res = iconv (cd,
     169                         (ICONV_CONST char **) &inptr, &insize,
     170                         &outptr, &outsize);
     171            if (!(res == (size_t)(-1) && errno == EINVAL))
     172              break;
     173            /* iconv can eat up a shift sequence but give EINVAL while attempting
     174               to convert the first character.  E.g. libiconv does this.  */
     175            if (inptr > inptr_before)
     176              {
     177                res = 0;
     178                break;
     179              }
     180          }
     181  
     182        if (res == 0)
     183          {
     184            *outbuf = outptr;
     185            *outbytesleft = outsize;
     186          }
     187      }
     188    while (res == 0 && inptr < inptr_end);
     189  
     190    *inbuf = inptr;
     191    *inbytesleft = inptr_end - inptr;
     192    if (res != (size_t)(-1) && res > 0)
     193      {
     194        /* iconv() has already incremented INPTR.  We cannot go back to a
     195           previous INPTR, otherwise the state inside CD would become invalid,
     196           if FROM_CODESET is a stateful encoding.  So, tell the caller that
     197           *INBUF has already been incremented.  */
     198        *incremented = (inptr > inptr_before);
     199        errno = EILSEQ;
     200        return (size_t)(-1);
     201      }
     202    else
     203      {
     204        *incremented = false;
     205        return res;
     206      }
     207  }
     208  # else
     209  #  define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
     210       (*(incremented) = false, \
     211        iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
     212  # endif
     213  
     214  /* iconv_carefully_1 is like iconv_carefully, except that it stops after
     215     converting one character or one shift sequence.  */
     216  static size_t
     217  iconv_carefully_1 (iconv_t cd,
     218                     const char **inbuf, size_t *inbytesleft,
     219                     char **outbuf, size_t *outbytesleft,
     220                     bool *incremented)
     221  {
     222    const char *inptr_before = *inbuf;
     223    const char *inptr = inptr_before;
     224    const char *inptr_end = inptr_before + *inbytesleft;
     225    char *outptr = *outbuf;
     226    size_t outsize = *outbytesleft;
     227    size_t res = (size_t)(-1);
     228    size_t insize;
     229  
     230    for (insize = 1; inptr_before + insize <= inptr_end; insize++)
     231      {
     232        inptr = inptr_before;
     233        res = iconv (cd,
     234                     (ICONV_CONST char **) &inptr, &insize,
     235                     &outptr, &outsize);
     236        if (!(res == (size_t)(-1) && errno == EINVAL))
     237          break;
     238        /* iconv can eat up a shift sequence but give EINVAL while attempting
     239           to convert the first character.  E.g. libiconv does this.  */
     240        if (inptr > inptr_before)
     241          {
     242            res = 0;
     243            break;
     244          }
     245      }
     246  
     247    *inbuf = inptr;
     248    *inbytesleft = inptr_end - inptr;
     249  # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
     250    /* Irix iconv() inserts a NUL byte if it cannot convert.
     251       NetBSD iconv() inserts a question mark if it cannot convert.
     252       Only GNU libiconv and GNU libc are known to prefer to fail rather
     253       than doing a lossy conversion.  */
     254    if (res != (size_t)(-1) && res > 0)
     255      {
     256        /* iconv() has already incremented INPTR.  We cannot go back to a
     257           previous INPTR, otherwise the state inside CD would become invalid,
     258           if FROM_CODESET is a stateful encoding.  So, tell the caller that
     259           *INBUF has already been incremented.  */
     260        *incremented = (inptr > inptr_before);
     261        errno = EILSEQ;
     262        return (size_t)(-1);
     263      }
     264  # endif
     265  
     266    if (res != (size_t)(-1))
     267      {
     268        *outbuf = outptr;
     269        *outbytesleft = outsize;
     270      }
     271    *incremented = false;
     272    return res;
     273  }
     274  
     275  /* utf8conv_carefully is like iconv, except that
     276       - it converts from UTF-8 to UTF-8,
     277       - it stops as soon as it encounters a conversion error, and it returns
     278         in *INCREMENTED a boolean telling whether it has incremented the input
     279         pointers past the error location,
     280       - if one_character_only is true, it stops after converting one
     281         character.  */
     282  static size_t
     283  utf8conv_carefully (bool one_character_only,
     284                      const char **inbuf, size_t *inbytesleft,
     285                      char **outbuf, size_t *outbytesleft,
     286                      bool *incremented)
     287  {
     288    const char *inptr = *inbuf;
     289    size_t insize = *inbytesleft;
     290    char *outptr = *outbuf;
     291    size_t outsize = *outbytesleft;
     292    size_t res;
     293  
     294    res = 0;
     295    do
     296      {
     297        ucs4_t uc;
     298        int n;
     299        int m;
     300  
     301        n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
     302        if (n < 0)
     303          {
     304            errno = (n == -2 ? EINVAL : EILSEQ);
     305            n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
     306            inptr += n;
     307            insize -= n;
     308            res = (size_t)(-1);
     309            *incremented = true;
     310            break;
     311          }
     312        if (outsize == 0)
     313          {
     314            errno = E2BIG;
     315            res = (size_t)(-1);
     316            *incremented = false;
     317            break;
     318          }
     319        m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
     320        if (m == -2)
     321          {
     322            errno = E2BIG;
     323            res = (size_t)(-1);
     324            *incremented = false;
     325            break;
     326          }
     327        inptr += n;
     328        insize -= n;
     329        if (m == -1)
     330          {
     331            errno = EILSEQ;
     332            res = (size_t)(-1);
     333            *incremented = true;
     334            break;
     335          }
     336        outptr += m;
     337        outsize -= m;
     338      }
     339    while (!one_character_only && insize > 0);
     340  
     341    *inbuf = inptr;
     342    *inbytesleft = insize;
     343    *outbuf = outptr;
     344    *outbytesleft = outsize;
     345    return res;
     346  }
     347  
     348  static int
     349  mem_cd_iconveh_internal (const char *src, size_t srclen,
     350                           iconv_t cd, iconv_t cd1, iconv_t cd2,
     351                           enum iconv_ilseq_handler handler,
     352                           size_t extra_alloc,
     353                           size_t *offsets,
     354                           char **resultp, size_t *lengthp)
     355  {
     356    /* When a conversion error occurs, we cannot start using CD1 and CD2 at
     357       this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
     358       Instead, we have to start afresh from the beginning of SRC.  */
     359    /* Use a temporary buffer, so that for small strings, a single malloc()
     360       call will be sufficient.  */
     361  # define tmpbufsize 4096
     362    /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
     363       libiconv's UCS-4-INTERNAL encoding.  */
     364    union { unsigned int align; char buf[tmpbufsize]; } tmp;
     365  # define tmpbuf tmp.buf
     366  
     367    char *initial_result;
     368    char *result;
     369    size_t allocated;
     370    size_t length;
     371    size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
     372  
     373    if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
     374      {
     375        initial_result = *resultp;
     376        allocated = *lengthp;
     377      }
     378    else
     379      {
     380        initial_result = tmpbuf;
     381        allocated = sizeof (tmpbuf);
     382      }
     383    result = initial_result;
     384  
     385    /* Test whether a direct conversion is possible at all.  */
     386    if (cd == (iconv_t)(-1))
     387      goto indirectly;
     388  
     389    if (offsets != NULL)
     390      {
     391        size_t i;
     392  
     393        for (i = 0; i < srclen; i++)
     394          offsets[i] = (size_t)(-1);
     395  
     396        last_length = (size_t)(-1);
     397      }
     398    length = 0;
     399  
     400    /* First, try a direct conversion, and see whether a conversion error
     401       occurs at all.  */
     402    {
     403      const char *inptr = src;
     404      size_t insize = srclen;
     405  
     406      /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
     407  # if defined _LIBICONV_VERSION \
     408       || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
     409            || defined __sun)
     410      /* Set to the initial state.  */
     411      iconv (cd, NULL, NULL, NULL, NULL);
     412  # endif
     413  
     414      while (insize > 0)
     415        {
     416          char *outptr = result + length;
     417          size_t outsize = allocated - extra_alloc - length;
     418          bool incremented;
     419          size_t res;
     420          bool grow;
     421  
     422          if (offsets != NULL)
     423            {
     424              if (length != last_length) /* ensure that offset[] be increasing */
     425                {
     426                  offsets[inptr - src] = length;
     427                  last_length = length;
     428                }
     429              res = iconv_carefully_1 (cd,
     430                                       &inptr, &insize,
     431                                       &outptr, &outsize,
     432                                       &incremented);
     433            }
     434          else
     435            /* Use iconv_carefully instead of iconv here, because:
     436               - If TO_CODESET is UTF-8, we can do the error handling in this
     437                 loop, no need for a second loop,
     438               - With iconv() implementations other than GNU libiconv and GNU
     439                 libc, if we use iconv() in a big swoop, checking for an E2BIG
     440                 return, we lose the number of irreversible conversions.  */
     441            res = iconv_carefully (cd,
     442                                   &inptr, &insize,
     443                                   &outptr, &outsize,
     444                                   &incremented);
     445  
     446          length = outptr - result;
     447          grow = (length + extra_alloc > allocated / 2);
     448          if (res == (size_t)(-1))
     449            {
     450              if (errno == E2BIG)
     451                grow = true;
     452              else if (errno == EINVAL)
     453                break;
     454              else if (errno == EILSEQ && handler != iconveh_error)
     455                {
     456                  if (cd2 == (iconv_t)(-1))
     457                    {
     458                      /* TO_CODESET is UTF-8.  */
     459                      /* Error handling can produce up to 1 or 3 bytes of
     460                         output.  */
     461                      size_t extra_need =
     462                        (handler == iconveh_replacement_character ? 3 : 1);
     463                      if (length + extra_need + extra_alloc > allocated)
     464                        {
     465                          char *memory;
     466  
     467                          allocated = 2 * allocated;
     468                          if (length + extra_need + extra_alloc > allocated)
     469                            allocated = 2 * allocated;
     470                          if (length + extra_need + extra_alloc > allocated)
     471                            abort ();
     472                          if (result == initial_result)
     473                            memory = (char *) malloc (allocated);
     474                          else
     475                            memory = (char *) realloc (result, allocated);
     476                          if (memory == NULL)
     477                            {
     478                              if (result != initial_result)
     479                                free (result);
     480                              errno = ENOMEM;
     481                              return -1;
     482                            }
     483                          if (result == initial_result)
     484                            memcpy (memory, initial_result, length);
     485                          result = memory;
     486                          grow = false;
     487                        }
     488                      /* The input is invalid in FROM_CODESET.  Eat up one byte
     489                         and emit a replacement character or a question mark.  */
     490                      if (!incremented)
     491                        {
     492                          if (insize == 0)
     493                            abort ();
     494                          inptr++;
     495                          insize--;
     496                        }
     497                      if (handler == iconveh_replacement_character)
     498                        {
     499                          /* U+FFFD in UTF-8 encoding.  */
     500                          result[length+0] = '\357';
     501                          result[length+1] = '\277';
     502                          result[length+2] = '\275';
     503                          length += 3;
     504                        }
     505                      else
     506                        {
     507                          result[length] = '?';
     508                          length++;
     509                        }
     510                    }
     511                  else
     512                    goto indirectly;
     513                }
     514              else
     515                {
     516                  if (result != initial_result)
     517                    free (result);
     518                  return -1;
     519                }
     520            }
     521          if (insize == 0)
     522            break;
     523          if (grow)
     524            {
     525              char *memory;
     526  
     527              allocated = 2 * allocated;
     528              if (result == initial_result)
     529                memory = (char *) malloc (allocated);
     530              else
     531                memory = (char *) realloc (result, allocated);
     532              if (memory == NULL)
     533                {
     534                  if (result != initial_result)
     535                    free (result);
     536                  errno = ENOMEM;
     537                  return -1;
     538                }
     539              if (result == initial_result)
     540                memcpy (memory, initial_result, length);
     541              result = memory;
     542            }
     543        }
     544    }
     545  
     546    /* Now get the conversion state back to the initial state.
     547       But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
     548  #if defined _LIBICONV_VERSION \
     549      || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
     550           || defined __sun)
     551    for (;;)
     552      {
     553        char *outptr = result + length;
     554        size_t outsize = allocated - extra_alloc - length;
     555        size_t res;
     556  
     557        res = iconv (cd, NULL, NULL, &outptr, &outsize);
     558        length = outptr - result;
     559        if (res == (size_t)(-1))
     560          {
     561            if (errno == E2BIG)
     562              {
     563                char *memory;
     564  
     565                allocated = 2 * allocated;
     566                if (result == initial_result)
     567                  memory = (char *) malloc (allocated);
     568                else
     569                  memory = (char *) realloc (result, allocated);
     570                if (memory == NULL)
     571                  {
     572                    if (result != initial_result)
     573                      free (result);
     574                    errno = ENOMEM;
     575                    return -1;
     576                  }
     577                if (result == initial_result)
     578                  memcpy (memory, initial_result, length);
     579                result = memory;
     580              }
     581            else
     582              {
     583                if (result != initial_result)
     584                  free (result);
     585                return -1;
     586              }
     587          }
     588        else
     589          break;
     590      }
     591  #endif
     592  
     593    /* The direct conversion succeeded.  */
     594    goto done;
     595  
     596   indirectly:
     597    /* The direct conversion failed.
     598       Use a conversion through UTF-8.  */
     599    if (offsets != NULL)
     600      {
     601        size_t i;
     602  
     603        for (i = 0; i < srclen; i++)
     604          offsets[i] = (size_t)(-1);
     605  
     606        last_length = (size_t)(-1);
     607      }
     608    length = 0;
     609    {
     610      const bool slowly = (offsets != NULL || handler == iconveh_error);
     611  # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
     612      char utf8buf[utf8bufsize + 3];
     613      size_t utf8len = 0;
     614      const char *in1ptr = src;
     615      size_t in1size = srclen;
     616      bool do_final_flush1 = true;
     617      bool do_final_flush2 = true;
     618  
     619      /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
     620  # if defined _LIBICONV_VERSION \
     621       || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
     622            || defined __sun)
     623      /* Set to the initial state.  */
     624      if (cd1 != (iconv_t)(-1))
     625        iconv (cd1, NULL, NULL, NULL, NULL);
     626      if (cd2 != (iconv_t)(-1))
     627        iconv (cd2, NULL, NULL, NULL, NULL);
     628  # endif
     629  
     630      while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
     631        {
     632          char *out1ptr = utf8buf + utf8len;
     633          size_t out1size = utf8bufsize - utf8len;
     634          bool incremented1;
     635          size_t res1;
     636          int errno1;
     637  
     638          /* Conversion step 1: from FROM_CODESET to UTF-8.  */
     639          if (in1size > 0)
     640            {
     641              if (offsets != NULL
     642                  && length != last_length) /* ensure that offset[] be increasing */
     643                {
     644                  offsets[in1ptr - src] = length;
     645                  last_length = length;
     646                }
     647              if (cd1 != (iconv_t)(-1))
     648                {
     649                  if (slowly)
     650                    res1 = iconv_carefully_1 (cd1,
     651                                              &in1ptr, &in1size,
     652                                              &out1ptr, &out1size,
     653                                              &incremented1);
     654                  else
     655                    res1 = iconv_carefully (cd1,
     656                                            &in1ptr, &in1size,
     657                                            &out1ptr, &out1size,
     658                                            &incremented1);
     659                }
     660              else
     661                {
     662                  /* FROM_CODESET is UTF-8.  */
     663                  res1 = utf8conv_carefully (slowly,
     664                                             &in1ptr, &in1size,
     665                                             &out1ptr, &out1size,
     666                                             &incremented1);
     667                }
     668            }
     669          else if (do_final_flush1)
     670            {
     671              /* Now get the conversion state of CD1 back to the initial state.
     672                 But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
     673  # if defined _LIBICONV_VERSION \
     674       || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
     675            || defined __sun)
     676              if (cd1 != (iconv_t)(-1))
     677                res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
     678              else
     679  # endif
     680                res1 = 0;
     681              do_final_flush1 = false;
     682              incremented1 = true;
     683            }
     684          else
     685            {
     686              res1 = 0;
     687              incremented1 = true;
     688            }
     689          if (res1 == (size_t)(-1)
     690              && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
     691            {
     692              if (result != initial_result)
     693                free (result);
     694              return -1;
     695            }
     696          if (res1 == (size_t)(-1)
     697              && errno == EILSEQ && handler != iconveh_error)
     698            {
     699              /* The input is invalid in FROM_CODESET.  Eat up one byte and
     700                 emit a U+FFFD character or a question mark.  Room for this
     701                 character was allocated at the end of utf8buf.  */
     702              if (!incremented1)
     703                {
     704                  if (in1size == 0)
     705                    abort ();
     706                  in1ptr++;
     707                  in1size--;
     708                }
     709              if (handler == iconveh_replacement_character)
     710                {
     711                  /* U+FFFD in UTF-8 encoding.  */
     712                  out1ptr[0] = '\357';
     713                  out1ptr[1] = '\277';
     714                  out1ptr[2] = '\275';
     715                  out1ptr += 3;
     716                }
     717              else
     718                *out1ptr++ = '?';
     719              res1 = 0;
     720            }
     721          errno1 = errno;
     722          utf8len = out1ptr - utf8buf;
     723  
     724          if (offsets != NULL
     725              || in1size == 0
     726              || utf8len > utf8bufsize / 2
     727              || (res1 == (size_t)(-1) && errno1 == E2BIG))
     728            {
     729              /* Conversion step 2: from UTF-8 to TO_CODESET.  */
     730              const char *in2ptr = utf8buf;
     731              size_t in2size = utf8len;
     732  
     733              while (in2size > 0
     734                     || (in1size == 0 && !do_final_flush1 && do_final_flush2))
     735                {
     736                  char *out2ptr = result + length;
     737                  size_t out2size = allocated - extra_alloc - length;
     738                  bool incremented2;
     739                  size_t res2;
     740                  bool grow;
     741  
     742                  if (in2size > 0)
     743                    {
     744                      if (cd2 != (iconv_t)(-1))
     745                        res2 = iconv_carefully (cd2,
     746                                                &in2ptr, &in2size,
     747                                                &out2ptr, &out2size,
     748                                                &incremented2);
     749                      else
     750                        /* TO_CODESET is UTF-8.  */
     751                        res2 = utf8conv_carefully (false,
     752                                                   &in2ptr, &in2size,
     753                                                   &out2ptr, &out2size,
     754                                                   &incremented2);
     755                    }
     756                  else /* in1size == 0 && !do_final_flush1
     757                          && in2size == 0 && do_final_flush2 */
     758                    {
                    /* Now get the conversion state of CD2 back to the initial
                       state.  But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
     761  # if defined _LIBICONV_VERSION \
     762       || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
     763            || defined __sun)
     764                      if (cd2 != (iconv_t)(-1))
     765                        res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
     766                      else
     767  # endif
     768                        res2 = 0;
     769                      do_final_flush2 = false;
     770                      incremented2 = true;
     771                    }
     772  
     773                  length = out2ptr - result;
     774                  grow = (length + extra_alloc > allocated / 2);
     775                  if (res2 == (size_t)(-1))
     776                    {
     777                      if (errno == E2BIG)
     778                        grow = true;
     779                      else if (errno == EINVAL)
     780                        break;
     781                      else if (errno == EILSEQ && handler != iconveh_error)
     782                        {
     783                          /* Error handling can produce up to 10 bytes of UTF-8
     784                             output.  But TO_CODESET may be UCS-2, UTF-16 or
     785                             UCS-4, so use CD2 here as well.  */
     786                          char scratchbuf[10];
     787                          size_t scratchlen;
     788                          ucs4_t uc;
     789                          const char *inptr;
     790                          size_t insize;
     791                          size_t res;
     792  
     793                          if (incremented2)
     794                            {
     795                              if (u8_prev (&uc, (const uint8_t *) in2ptr,
     796                                           (const uint8_t *) utf8buf)
     797                                  == NULL)
     798                                abort ();
     799                            }
     800                          else
     801                            {
     802                              int n;
     803                              if (in2size == 0)
     804                                abort ();
     805                              n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
     806                                                    in2size);
     807                              in2ptr += n;
     808                              in2size -= n;
     809                            }
     810  
     811                          if (handler == iconveh_escape_sequence)
     812                            {
     813                              static char const hex[16] = "0123456789ABCDEF";
     814                              scratchlen = 0;
     815                              scratchbuf[scratchlen++] = '\\';
     816                              if (uc < 0x10000)
     817                                scratchbuf[scratchlen++] = 'u';
     818                              else
     819                                {
     820                                  scratchbuf[scratchlen++] = 'U';
     821                                  scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
     822                                  scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
     823                                  scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
     824                                  scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
     825                                }
     826                              scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
     827                              scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
     828                              scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
     829                              scratchbuf[scratchlen++] = hex[uc & 15];
     830                            }
     831                          else if (handler == iconveh_replacement_character)
     832                            {
     833                              /* U+FFFD in UTF-8 encoding.  */
     834                              scratchbuf[0] = '\357';
     835                              scratchbuf[1] = '\277';
     836                              scratchbuf[2] = '\275';
     837                              scratchlen = 3;
     838                            }
     839                          else
     840                            {
     841                              scratchbuf[0] = '?';
     842                              scratchlen = 1;
     843                            }
     844  
     845                          inptr = scratchbuf;
     846                          insize = scratchlen;
     847                          if (cd2 != (iconv_t)(-1))
     848                            {
     849                              char *out2ptr_try = out2ptr;
     850                              size_t out2size_try = out2size;
     851                              res = iconv (cd2,
     852                                           (ICONV_CONST char **) &inptr, &insize,
     853                                           &out2ptr_try, &out2size_try);
     854                              if (handler == iconveh_replacement_character
     855                                  && (res == (size_t)(-1)
     856                                      ? errno == EILSEQ
     857                                      /* FreeBSD iconv(), NetBSD iconv(), and
     858                                         Solaris 11 iconv() insert a '?' if they
     859                                         cannot convert.  This is what we want.
     860                                         But IRIX iconv() inserts a NUL byte if it
     861                                         cannot convert.
     862                                         And musl libc iconv() inserts a '*' if it
     863                                         cannot convert.  */
     864                                      : (res > 0
     865                                         && !(out2ptr_try - out2ptr == 1
     866                                              && *out2ptr == '?'))))
     867                                {
     868                                  /* The iconv() call failed.
     869                                     U+FFFD can't be converted to TO_CODESET.
     870                                     Use '?' instead.  */
     871                                  scratchbuf[0] = '?';
     872                                  scratchlen = 1;
     873                                  inptr = scratchbuf;
     874                                  insize = scratchlen;
     875                                  res = iconv (cd2,
     876                                               (ICONV_CONST char **) &inptr, &insize,
     877                                               &out2ptr, &out2size);
     878                                }
     879                              else
     880                                {
     881                                  /* Accept the results of the iconv() call.  */
     882                                  out2ptr = out2ptr_try;
     883                                  out2size = out2size_try;
     884                                  res = 0;
     885                                }
     886                            }
     887                          else
     888                            {
     889                              /* TO_CODESET is UTF-8.  */
     890                              if (out2size >= insize)
     891                                {
     892                                  memcpy (out2ptr, inptr, insize);
     893                                  out2ptr += insize;
     894                                  out2size -= insize;
     895                                  inptr += insize;
     896                                  insize = 0;
     897                                  res = 0;
     898                                }
     899                              else
     900                                {
     901                                  errno = E2BIG;
     902                                  res = (size_t)(-1);
     903                                }
     904                            }
     905                          length = out2ptr - result;
     906                          if (res == (size_t)(-1) && errno == E2BIG)
     907                            {
     908                              char *memory;
     909  
     910                              allocated = 2 * allocated;
     911                              if (length + 1 + extra_alloc > allocated)
     912                                abort ();
     913                              if (result == initial_result)
     914                                memory = (char *) malloc (allocated);
     915                              else
     916                                memory = (char *) realloc (result, allocated);
     917                              if (memory == NULL)
     918                                {
     919                                  if (result != initial_result)
     920                                    free (result);
     921                                  errno = ENOMEM;
     922                                  return -1;
     923                                }
     924                              if (result == initial_result)
     925                                memcpy (memory, initial_result, length);
     926                              result = memory;
     927                              grow = false;
     928  
     929                              out2ptr = result + length;
     930                              out2size = allocated - extra_alloc - length;
     931                              if (cd2 != (iconv_t)(-1))
     932                                res = iconv (cd2,
     933                                             (ICONV_CONST char **) &inptr,
     934                                             &insize,
     935                                             &out2ptr, &out2size);
     936                              else
     937                                {
     938                                  /* TO_CODESET is UTF-8.  */
     939                                  if (!(out2size >= insize))
     940                                    abort ();
     941                                  memcpy (out2ptr, inptr, insize);
     942                                  out2ptr += insize;
     943                                  out2size -= insize;
     944                                  inptr += insize;
     945                                  insize = 0;
     946                                  res = 0;
     947                                }
     948                              length = out2ptr - result;
     949                            }
     950  # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
     951                          /* IRIX iconv() inserts a NUL byte if it cannot convert.
     952                             FreeBSD iconv(), NetBSD iconv(), and Solaris 11
     953                             iconv() insert a '?' if they cannot convert.
     954                             musl libc iconv() inserts a '*' if it cannot convert.
     955                             Only GNU libiconv and GNU libc are known to prefer
     956                             to fail rather than doing a lossy conversion.  */
     957                          if (res != (size_t)(-1) && res > 0)
     958                            {
     959                              errno = EILSEQ;
     960                              res = (size_t)(-1);
     961                            }
     962  # endif
     963                          if (res == (size_t)(-1))
     964                            {
     965                              /* Failure converting the ASCII replacement.  */
     966                              if (result != initial_result)
     967                                free (result);
     968                              return -1;
     969                            }
     970                        }
     971                      else
     972                        {
     973                          if (result != initial_result)
     974                            free (result);
     975                          return -1;
     976                        }
     977                    }
     978                  if (!(in2size > 0
     979                        || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
     980                    break;
     981                  if (grow)
     982                    {
     983                      char *memory;
     984  
     985                      allocated = 2 * allocated;
     986                      if (result == initial_result)
     987                        memory = (char *) malloc (allocated);
     988                      else
     989                        memory = (char *) realloc (result, allocated);
     990                      if (memory == NULL)
     991                        {
     992                          if (result != initial_result)
     993                            free (result);
     994                          errno = ENOMEM;
     995                          return -1;
     996                        }
     997                      if (result == initial_result)
     998                        memcpy (memory, initial_result, length);
     999                      result = memory;
    1000                    }
    1001                }
    1002  
    1003              /* Move the remaining bytes to the beginning of utf8buf.  */
    1004              if (in2size > 0)
    1005                memmove (utf8buf, in2ptr, in2size);
    1006              utf8len = in2size;
    1007            }
    1008  
    1009          if (res1 == (size_t)(-1))
    1010            {
    1011              if (errno1 == EINVAL)
    1012                in1size = 0;
    1013              else if (errno1 == EILSEQ)
    1014                {
    1015                  if (result != initial_result)
    1016                    free (result);
    1017                  errno = errno1;
    1018                  return -1;
    1019                }
    1020            }
    1021        }
    1022  # undef utf8bufsize
    1023    }
    1024  
    1025   done:
    1026    /* Now the final memory allocation.  */
    1027    if (result == tmpbuf)
    1028      {
    1029        size_t memsize = length + extra_alloc;
    1030  
    1031        if (*resultp != NULL && *lengthp >= memsize)
    1032          result = *resultp;
    1033        else
    1034          {
    1035            char *memory;
    1036  
    1037            memory = (char *) malloc (memsize > 0 ? memsize : 1);
    1038            if (memory != NULL)
    1039              result = memory;
    1040            else
    1041              {
    1042                errno = ENOMEM;
    1043                return -1;
    1044              }
    1045          }
    1046        memcpy (result, tmpbuf, length);
    1047      }
    1048    else if (result != *resultp && length + extra_alloc < allocated)
    1049      {
    1050        /* Shrink the allocated memory if possible.  */
    1051        size_t memsize = length + extra_alloc;
    1052        char *memory;
    1053  
    1054        memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
    1055        if (memory != NULL)
    1056          result = memory;
    1057      }
    1058    *resultp = result;
    1059    *lengthp = length;
    1060    return 0;
    1061  # undef tmpbuf
    1062  # undef tmpbufsize
    1063  }
    1064  
    1065  int
    1066  mem_cd_iconveh (const char *src, size_t srclen,
    1067                  const iconveh_t *cd,
    1068                  enum iconv_ilseq_handler handler,
    1069                  size_t *offsets,
    1070                  char **resultp, size_t *lengthp)
    1071  {
    1072    return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
    1073                                    handler, 0, offsets, resultp, lengthp);
    1074  }
    1075  
    1076  char *
    1077  str_cd_iconveh (const char *src,
    1078                  const iconveh_t *cd,
    1079                  enum iconv_ilseq_handler handler)
    1080  {
    1081    /* For most encodings, a trailing NUL byte in the input will be converted
    1082       to a trailing NUL byte in the output.  But not for UTF-7.  So that this
    1083       function is usable for UTF-7, we have to exclude the NUL byte from the
    1084       conversion and add it by hand afterwards.  */
    1085    char *result = NULL;
    1086    size_t length = 0;
    1087    int retval = mem_cd_iconveh_internal (src, strlen (src),
    1088                                          cd->cd, cd->cd1, cd->cd2, handler, 1,
    1089                                          NULL, &result, &length);
    1090  
    1091    if (retval < 0)
    1092      {
    1093        free (result);
    1094        return NULL;
    1095      }
    1096  
    1097    /* Add the terminating NUL byte.  */
    1098    result[length] = '\0';
    1099  
    1100    return result;
    1101  }
    1102  
    1103  #endif
    1104  
    1105  int
    1106  mem_iconveh (const char *src, size_t srclen,
    1107               const char *from_codeset, const char *to_codeset,
    1108               enum iconv_ilseq_handler handler,
    1109               size_t *offsets,
    1110               char **resultp, size_t *lengthp)
    1111  {
    1112    if (srclen == 0)
    1113      {
    1114        /* Nothing to convert.  */
    1115        *lengthp = 0;
    1116        return 0;
    1117      }
    1118    else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
    1119      {
    1120        char *result;
    1121  
    1122        if (*resultp != NULL && *lengthp >= srclen)
    1123          result = *resultp;
    1124        else
    1125          {
    1126            result = (char *) malloc (srclen);
    1127            if (result == NULL)
    1128              {
    1129                errno = ENOMEM;
    1130                return -1;
    1131              }
    1132          }
    1133        memcpy (result, src, srclen);
    1134        *resultp = result;
    1135        *lengthp = srclen;
    1136        return 0;
    1137      }
    1138    else
    1139      {
    1140  #if HAVE_ICONV
    1141        iconveh_t cd;
    1142        char *result;
    1143        size_t length;
    1144        int retval;
    1145  
    1146        if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
    1147          return -1;
    1148  
    1149        result = *resultp;
    1150        length = *lengthp;
    1151        retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
    1152                                 &result, &length);
    1153  
    1154        if (retval < 0)
    1155          {
    1156            /* Close cd, but preserve the errno from str_cd_iconv.  */
    1157            int saved_errno = errno;
    1158            iconveh_close (&cd);
    1159            errno = saved_errno;
    1160          }
    1161        else
    1162          {
    1163            if (iconveh_close (&cd) < 0)
    1164              {
    1165                if (result != *resultp)
    1166                  free (result);
    1167                return -1;
    1168              }
    1169            *resultp = result;
    1170            *lengthp = length;
    1171          }
    1172        return retval;
    1173  #else
    1174        /* This is a different error code than if iconv_open existed but didn't
    1175           support from_codeset and to_codeset, so that the caller can emit
    1176           an error message such as
    1177             "iconv() is not supported. Installing GNU libiconv and
    1178              then reinstalling this package would fix this."  */
    1179        errno = ENOSYS;
    1180        return -1;
    1181  #endif
    1182      }
    1183  }
    1184  
    1185  char *
    1186  str_iconveh (const char *src,
    1187               const char *from_codeset, const char *to_codeset,
    1188               enum iconv_ilseq_handler handler)
    1189  {
    1190    if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
    1191      {
    1192        char *result = strdup (src);
    1193  
    1194        if (result == NULL)
    1195          errno = ENOMEM;
    1196        return result;
    1197      }
    1198    else
    1199      {
    1200  #if HAVE_ICONV
    1201        iconveh_t cd;
    1202        char *result;
    1203  
    1204        if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
    1205          return NULL;
    1206  
    1207        result = str_cd_iconveh (src, &cd, handler);
    1208  
    1209        if (result == NULL)
    1210          {
    1211            /* Close cd, but preserve the errno from str_cd_iconv.  */
    1212            int saved_errno = errno;
    1213            iconveh_close (&cd);
    1214            errno = saved_errno;
    1215          }
    1216        else
    1217          {
    1218            if (iconveh_close (&cd) < 0)
    1219              {
    1220                free (result);
    1221                return NULL;
    1222              }
    1223          }
    1224        return result;
    1225  #else
    1226        /* This is a different error code than if iconv_open existed but didn't
    1227           support from_codeset and to_codeset, so that the caller can emit
    1228           an error message such as
    1229             "iconv() is not supported. Installing GNU libiconv and
    1230              then reinstalling this package would fix this."  */
    1231        errno = ENOSYS;
    1232        return NULL;
    1233  #endif
    1234      }
    1235  }