(root)/
grep-3.11/
lib/
iconv.c
       1  /* Character set conversion.
       2     Copyright (C) 1999-2001, 2007, 2009-2023 Free Software Foundation, Inc.
       3  
       4     This file is free software: you can redistribute it and/or modify
       5     it under the terms of the GNU Lesser General Public License as
       6     published by the Free Software Foundation; either version 2.1 of the
       7     License, or (at your option) any later version.
       8  
       9     This file is distributed in the hope that it will be useful,
      10     but WITHOUT ANY WARRANTY; without even the implied warranty of
      11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12     GNU Lesser General Public License for more details.
      13  
      14     You should have received a copy of the GNU Lesser General Public License
      15     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      16  
      17  #include <config.h>
      18  
      19  /* Specification.  */
      20  #include <iconv.h>
      21  
      22  #include <stddef.h>
      23  
      24  #if REPLACE_ICONV_UTF
      25  # include <errno.h>
      26  # include <stdint.h>
      27  # include <stdlib.h>
      28  # include "unistr.h"
      29  #endif
      30  
      31  #if REPLACE_ICONV_UTF
      32  
      33  /* UTF-{16,32}{BE,LE} converters taken from GNU libiconv 1.11.  */
      34  
      35  /* Return code if invalid. (xxx_mbtowc) */
      36  # define RET_ILSEQ      -1
      37  /* Return code if no bytes were read. (xxx_mbtowc) */
      38  # define RET_TOOFEW     -2
      39  
      40  /* Return code if invalid. (xxx_wctomb) */
      41  # define RET_ILUNI      -1
      42  /* Return code if output buffer is too small. (xxx_wctomb, xxx_reset) */
      43  # define RET_TOOSMALL   -2
      44  
      45  /*
      46   * UTF-16BE
      47   */
      48  
      49  /* Specification: RFC 2781 */
      50  
      51  static int
      52  utf16be_mbtowc (ucs4_t *pwc, const unsigned char *s, size_t n)
      53  {
      54    if (n >= 2)
      55      {
      56        ucs4_t wc = (s[0] << 8) + s[1];
      57        if (wc >= 0xd800 && wc < 0xdc00)
      58          {
      59            if (n >= 4)
      60              {
      61                ucs4_t wc2 = (s[2] << 8) + s[3];
      62                if (!(wc2 >= 0xdc00 && wc2 < 0xe000))
      63                  return RET_ILSEQ;
      64                *pwc = 0x10000 + ((wc - 0xd800) << 10) + (wc2 - 0xdc00);
      65                return 4;
      66              }
      67          }
      68        else if (wc >= 0xdc00 && wc < 0xe000)
      69          {
      70            return RET_ILSEQ;
      71          }
      72        else
      73          {
      74            *pwc = wc;
      75            return 2;
      76          }
      77      }
      78    return RET_TOOFEW;
      79  }
      80  
      81  static int
      82  utf16be_wctomb (unsigned char *r, ucs4_t wc, size_t n)
      83  {
      84    if (!(wc >= 0xd800 && wc < 0xe000))
      85      {
      86        if (wc < 0x10000)
      87          {
      88            if (n >= 2)
      89              {
      90                r[0] = (unsigned char) (wc >> 8);
      91                r[1] = (unsigned char) wc;
      92                return 2;
      93              }
      94            else
      95              return RET_TOOSMALL;
      96          }
      97        else if (wc < 0x110000)
      98          {
      99            if (n >= 4)
     100              {
     101                ucs4_t wc1 = 0xd800 + ((wc - 0x10000) >> 10);
     102                ucs4_t wc2 = 0xdc00 + ((wc - 0x10000) & 0x3ff);
     103                r[0] = (unsigned char) (wc1 >> 8);
     104                r[1] = (unsigned char) wc1;
     105                r[2] = (unsigned char) (wc2 >> 8);
     106                r[3] = (unsigned char) wc2;
     107                return 4;
     108              }
     109            else
     110              return RET_TOOSMALL;
     111          }
     112      }
     113    return RET_ILUNI;
     114  }
     115  
     116  /*
     117   * UTF-16LE
     118   */
     119  
     120  /* Specification: RFC 2781 */
     121  
     122  static int
     123  utf16le_mbtowc (ucs4_t *pwc, const unsigned char *s, size_t n)
     124  {
     125    if (n >= 2)
     126      {
     127        ucs4_t wc = s[0] + (s[1] << 8);
     128        if (wc >= 0xd800 && wc < 0xdc00)
     129          {
     130            if (n >= 4)
     131              {
     132                ucs4_t wc2 = s[2] + (s[3] << 8);
     133                if (!(wc2 >= 0xdc00 && wc2 < 0xe000))
     134                  return RET_ILSEQ;
     135                *pwc = 0x10000 + ((wc - 0xd800) << 10) + (wc2 - 0xdc00);
     136                return 4;
     137              }
     138          }
     139        else if (wc >= 0xdc00 && wc < 0xe000)
     140          {
     141            return RET_ILSEQ;
     142          }
     143        else
     144          {
     145            *pwc = wc;
     146            return 2;
     147          }
     148      }
     149    return RET_TOOFEW;
     150  }
     151  
     152  static int
     153  utf16le_wctomb (unsigned char *r, ucs4_t wc, size_t n)
     154  {
     155    if (!(wc >= 0xd800 && wc < 0xe000))
     156      {
     157        if (wc < 0x10000)
     158          {
     159            if (n >= 2)
     160              {
     161                r[0] = (unsigned char) wc;
     162                r[1] = (unsigned char) (wc >> 8);
     163                return 2;
     164              }
     165            else
     166              return RET_TOOSMALL;
     167          }
     168        else if (wc < 0x110000)
     169          {
     170            if (n >= 4)
     171              {
     172                ucs4_t wc1 = 0xd800 + ((wc - 0x10000) >> 10);
     173                ucs4_t wc2 = 0xdc00 + ((wc - 0x10000) & 0x3ff);
     174                r[0] = (unsigned char) wc1;
     175                r[1] = (unsigned char) (wc1 >> 8);
     176                r[2] = (unsigned char) wc2;
     177                r[3] = (unsigned char) (wc2 >> 8);
     178                return 4;
     179              }
     180            else
     181              return RET_TOOSMALL;
     182          }
     183      }
     184    return RET_ILUNI;
     185  }
     186  
     187  /*
     188   * UTF-32BE
     189   */
     190  
     191  /* Specification: Unicode 3.1 Standard Annex #19 */
     192  
     193  static int
     194  utf32be_mbtowc (ucs4_t *pwc, const unsigned char *s, size_t n)
     195  {
     196    if (n >= 4)
     197      {
     198        ucs4_t wc = (s[0] << 24) + (s[1] << 16) + (s[2] << 8) + s[3];
     199        if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000))
     200          {
     201            *pwc = wc;
     202            return 4;
     203          }
     204        else
     205          return RET_ILSEQ;
     206      }
     207    return RET_TOOFEW;
     208  }
     209  
     210  static int
     211  utf32be_wctomb (unsigned char *r, ucs4_t wc, size_t n)
     212  {
     213    if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000))
     214      {
     215        if (n >= 4)
     216          {
     217            r[0] = 0;
     218            r[1] = (unsigned char) (wc >> 16);
     219            r[2] = (unsigned char) (wc >> 8);
     220            r[3] = (unsigned char) wc;
     221            return 4;
     222          }
     223        else
     224          return RET_TOOSMALL;
     225      }
     226    return RET_ILUNI;
     227  }
     228  
     229  /*
     230   * UTF-32LE
     231   */
     232  
     233  /* Specification: Unicode 3.1 Standard Annex #19 */
     234  
     235  static int
     236  utf32le_mbtowc (ucs4_t *pwc, const unsigned char *s, size_t n)
     237  {
     238    if (n >= 4)
     239      {
     240        ucs4_t wc = s[0] + (s[1] << 8) + (s[2] << 16) + (s[3] << 24);
     241        if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000))
     242          {
     243            *pwc = wc;
     244            return 4;
     245          }
     246        else
     247          return RET_ILSEQ;
     248      }
     249    return RET_TOOFEW;
     250  }
     251  
     252  static int
     253  utf32le_wctomb (unsigned char *r, ucs4_t wc, size_t n)
     254  {
     255    if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000))
     256      {
     257        if (n >= 4)
     258          {
     259            r[0] = (unsigned char) wc;
     260            r[1] = (unsigned char) (wc >> 8);
     261            r[2] = (unsigned char) (wc >> 16);
     262            r[3] = 0;
     263            return 4;
     264          }
     265        else
     266          return RET_TOOSMALL;
     267      }
     268    return RET_ILUNI;
     269  }
     270  
     271  #endif
     272  
     273  size_t
     274  rpl_iconv (iconv_t cd,
     275             ICONV_CONST char **inbuf, size_t *inbytesleft,
     276             char **outbuf, size_t *outbytesleft)
     277  #undef iconv
     278  {
     279  #if REPLACE_ICONV_UTF
     280    switch ((uintptr_t) cd)
     281      {
     282        {
     283          int (*xxx_wctomb) (unsigned char *, ucs4_t, size_t);
     284  
     285          case (uintptr_t) _ICONV_UTF8_UTF16BE:
     286            xxx_wctomb = utf16be_wctomb;
     287            goto loop_from_utf8;
     288          case (uintptr_t) _ICONV_UTF8_UTF16LE:
     289            xxx_wctomb = utf16le_wctomb;
     290            goto loop_from_utf8;
     291          case (uintptr_t) _ICONV_UTF8_UTF32BE:
     292            xxx_wctomb = utf32be_wctomb;
     293            goto loop_from_utf8;
     294          case (uintptr_t) _ICONV_UTF8_UTF32LE:
     295            xxx_wctomb = utf32le_wctomb;
     296            goto loop_from_utf8;
     297  
     298         loop_from_utf8:
     299          if (inbuf == NULL || *inbuf == NULL)
     300            return 0;
     301          {
     302            ICONV_CONST char *inptr = *inbuf;
     303            size_t inleft = *inbytesleft;
     304            char *outptr = *outbuf;
     305            size_t outleft = *outbytesleft;
     306            size_t res = 0;
     307            while (inleft > 0)
     308              {
     309                ucs4_t uc;
     310                int m = u8_mbtoucr (&uc, (const uint8_t *) inptr, inleft);
     311                if (m <= 0)
     312                  {
     313                    if (m == -1)
     314                      {
     315                        errno = EILSEQ;
     316                        res = (size_t)(-1);
     317                        break;
     318                      }
     319                    if (m == -2)
     320                      {
     321                        errno = EINVAL;
     322                        res = (size_t)(-1);
     323                        break;
     324                      }
     325                    abort ();
     326                  }
     327                else
     328                  {
     329                    int n = xxx_wctomb ((uint8_t *) outptr, uc, outleft);
     330                    if (n < 0)
     331                      {
     332                        if (n == RET_ILUNI)
     333                          {
     334                            errno = EILSEQ;
     335                            res = (size_t)(-1);
     336                            break;
     337                          }
     338                        if (n == RET_TOOSMALL)
     339                          {
     340                            errno = E2BIG;
     341                            res = (size_t)(-1);
     342                            break;
     343                          }
     344                        abort ();
     345                      }
     346                    else
     347                      {
     348                        inptr += m;
     349                        inleft -= m;
     350                        outptr += n;
     351                        outleft -= n;
     352                      }
     353                  }
     354              }
     355            *inbuf = inptr;
     356            *inbytesleft = inleft;
     357            *outbuf = outptr;
     358            *outbytesleft = outleft;
     359            return res;
     360          }
     361        }
     362  
     363        {
     364          int (*xxx_mbtowc) (ucs4_t *, const unsigned char *, size_t);
     365  
     366          case (uintptr_t) _ICONV_UTF16BE_UTF8:
     367            xxx_mbtowc = utf16be_mbtowc;
     368            goto loop_to_utf8;
     369          case (uintptr_t) _ICONV_UTF16LE_UTF8:
     370            xxx_mbtowc = utf16le_mbtowc;
     371            goto loop_to_utf8;
     372          case (uintptr_t) _ICONV_UTF32BE_UTF8:
     373            xxx_mbtowc = utf32be_mbtowc;
     374            goto loop_to_utf8;
     375          case (uintptr_t) _ICONV_UTF32LE_UTF8:
     376            xxx_mbtowc = utf32le_mbtowc;
     377            goto loop_to_utf8;
     378  
     379         loop_to_utf8:
     380          if (inbuf == NULL || *inbuf == NULL)
     381            return 0;
     382          {
     383            ICONV_CONST char *inptr = *inbuf;
     384            size_t inleft = *inbytesleft;
     385            char *outptr = *outbuf;
     386            size_t outleft = *outbytesleft;
     387            size_t res = 0;
     388            while (inleft > 0)
     389              {
     390                ucs4_t uc;
     391                int m = xxx_mbtowc (&uc, (const uint8_t *) inptr, inleft);
     392                if (m <= 0)
     393                  {
     394                    if (m == RET_ILSEQ)
     395                      {
     396                        errno = EILSEQ;
     397                        res = (size_t)(-1);
     398                        break;
     399                      }
     400                    if (m == RET_TOOFEW)
     401                      {
     402                        errno = EINVAL;
     403                        res = (size_t)(-1);
     404                        break;
     405                      }
     406                    abort ();
     407                  }
     408                else
     409                  {
     410                    int n = u8_uctomb ((uint8_t *) outptr, uc, outleft);
     411                    if (n < 0)
     412                      {
     413                        if (n == -1)
     414                          {
     415                            errno = EILSEQ;
     416                            res = (size_t)(-1);
     417                            break;
     418                          }
     419                        if (n == -2)
     420                          {
     421                            errno = E2BIG;
     422                            res = (size_t)(-1);
     423                            break;
     424                          }
     425                        abort ();
     426                      }
     427                    else
     428                      {
     429                        inptr += m;
     430                        inleft -= m;
     431                        outptr += n;
     432                        outleft -= n;
     433                      }
     434                  }
     435              }
     436            *inbuf = inptr;
     437            *inbytesleft = inleft;
     438            *outbuf = outptr;
     439            *outbytesleft = outleft;
     440            return res;
     441          }
     442        }
     443      }
     444  #endif
     445    return iconv (cd, inbuf, inbytesleft, outbuf, outbytesleft);
     446  }