(root)/
tar-1.35/
gnu/
mbrtowc-impl.h
       1  /* Convert multibyte character to wide character.
       2     Copyright (C) 1999-2002, 2005-2023 Free Software Foundation, Inc.
       3  
       4     This file is free software: you can redistribute it and/or modify
       5     it under the terms of the GNU Lesser General Public License as
       6     published by the Free Software Foundation; either version 2.1 of the
       7     License, or (at your option) any later version.
       8  
       9     This file is distributed in the hope that it will be useful,
      10     but WITHOUT ANY WARRANTY; without even the implied warranty of
      11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12     GNU Lesser General Public License for more details.
      13  
      14     You should have received a copy of the GNU Lesser General Public License
      15     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      16  
      17  /* Written by Bruno Haible <bruno@clisp.org>, 2008.  */
      18  
      19  /* This file contains the body of the mbrtowc and mbrtoc32 functions,
      20     when GNULIB_defined_mbstate_t is defined.  */
      21  
      22    char *pstate = (char *)ps;  /* Byte-wise view of the caller's state; NULL is handled below.  */
      23    /* A NULL s is treated like the arguments (NULL, "", 1): nothing is stored through pwc.  */
      24    if (s == NULL)
      25      {
      26        pwc = NULL;
      27        s = "";
      28        n = 1;
      29      }
      30    /* With no input bytes to look at, report an incomplete character.  */
      31    if (n == 0)
      32      return (size_t)(-2);
      33  
      34    /* Here n > 0.  */
      35    /* Without a caller-supplied state, fall back to internal_state.  */
      36    if (pstate == NULL)
      37      pstate = internal_state;
      38  
      39    {
      40      size_t nstate = pstate[0];  /* pstate[0]: count of bytes saved in pstate[1..3].  */
      41      char buf[4];  /* Saved bytes followed by new input bytes.  */
      42      const char *p;  /* First of the m bytes to convert.  */
      43      size_t m;
      44      enc_t enc;
      45      int res;
      46      /* Set p and m: the input itself, or the saved bytes plus up to 3 input bytes.  */
      47      switch (nstate)
      48        {
      49        case 0:  /* Nothing saved: convert directly from s.  */
      50          p = s;
      51          m = n;
      52          break;
      53        case 3:
      54          buf[2] = pstate[3];
      55          FALLTHROUGH;
      56        case 2:
      57          buf[1] = pstate[2];
      58          FALLTHROUGH;
      59        case 1:
      60          buf[0] = pstate[1];
      61          p = buf;
      62          m = nstate;
      63          buf[m++] = s[0];  /* n > 0, so s[0] may be read.  */
      64          if (n >= 2 && m < 4)
      65            {
      66              buf[m++] = s[1];
      67              if (n >= 3 && m < 4)
      68                buf[m++] = s[2];
      69            }
      70          break;
      71        default:  /* pstate[0] is outside 0..3.  */
      72          errno = EINVAL;
      73          return (size_t)(-1);
      74        }
      75  
      76      /* Here m > 0.  */
      77      /* Choose the conversion path from the locale's encoding class.  */
      78      enc = locale_encoding_classification ();
      79      /* NOTE(review): the included UTF-8 code presumably uses p, m, pwc, res and the labels below; confirm.  */
      80      if (enc == enc_utf8) /* UTF-8 */
      81        {
      82          /* Achieve
      83               - multi-thread safety and
      84               - the ability to produce wide character values > WCHAR_MAX
      85             by not calling mbtowc() at all.  */
      86  #include "mbrtowc-impl-utf8.h"
      87        }
      88      else
      89        {
      90          /* The hidden internal state of mbtowc would make this function not
      91             multi-thread safe.  Achieve multi-thread safety through a lock.  */
      92          wchar_t wc;
      93          res = mbtowc_with_lock (&wc, p, m);
      94          /* res >= 0: a character was converted.  res < 0: invalid or incomplete, decided below.  */
      95          if (res >= 0)
      96            {
      97              if ((wc == 0) != (res == 0))
      98                abort ();  /* A zero result and a null wide character must coincide.  */
      99              if (pwc != NULL)
     100                *pwc = wc;
     101              goto success;
     102            }
     103  
     104          /* mbtowc does not distinguish between invalid and incomplete multibyte
     105             sequences.  But mbrtowc needs to make this distinction.
     106             There are two possible approaches:
     107               - Use iconv() and its return value.
     108               - Use built-in knowledge about the possible encodings.
     109             Given the low quality of implementation of iconv() on the systems
     110             that lack mbrtowc(), we use the second approach.
     111             The possible encodings are:
     112               - 8-bit encodings,
     113               - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
     114               - UTF-8 (already handled above).
     115             Use specialized code for each.  */
     116          if (m >= 4 || m >= MB_CUR_MAX)
     117            goto invalid;  /* Too long to be saved in pstate[1..3], or already at the locale's maximum length.  */
     118          /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
     119          switch (enc)
     120            {
     121            /* As a reference for this code, you can use the GNU libiconv
     122               implementation.  Look for uses of the RET_TOOFEW macro.  */
     123  
     124            case enc_eucjp: /* EUC-JP */
     125              {
     126                if (m == 1)
     127                  {
     128                    unsigned char c = (unsigned char) p[0];
     129                    /* A single byte from this set needs more input.  */
     130                    if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
     131                      goto incomplete;
     132                  }
     133                if (m == 2)
     134                  {
     135                    unsigned char c = (unsigned char) p[0];
     136                    /* Only sequences starting with 0x8f can still be incomplete at two bytes.  */
     137                    if (c == 0x8f)
     138                      {
     139                        unsigned char c2 = (unsigned char) p[1];
     140  
     141                        if (c2 >= 0xa1 && c2 < 0xff)
     142                          goto incomplete;
     143                      }
     144                  }
     145                goto invalid;
     146              }
     147  
     148            case enc_94: /* EUC-KR, GB2312, BIG5 */
     149              {
     150                if (m == 1)
     151                  {
     152                    unsigned char c = (unsigned char) p[0];
     153                    /* A single byte in 0xa1..0xfe needs more input.  */
     154                    if (c >= 0xa1 && c < 0xff)
     155                      goto incomplete;
     156                  }
     157                goto invalid;
     158              }
     159  
     160            case enc_euctw: /* EUC-TW */
     161              {
     162                if (m == 1)
     163                  {
     164                    unsigned char c = (unsigned char) p[0];
     165                    /* A single byte in 0xa1..0xfe, or 0x8e, needs more input.  */
     166                    if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
     167                      goto incomplete;
     168                  }
     169                else /* m == 2 || m == 3 */
     170                  {
     171                    unsigned char c = (unsigned char) p[0];
     172                    /* Sequences starting with 0x8e are still incomplete at two or three bytes.  */
     173                    if (c == 0x8e)
     174                      goto incomplete;
     175                  }
     176                goto invalid;
     177              }
     178  
     179            case enc_gb18030: /* GB18030 */
     180              {
     181                if (m == 1)
     182                  {
     183                    unsigned char c = (unsigned char) p[0];
     184                    /* A single byte in either of these ranges needs more input.  */
     185                    if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
     186                      goto incomplete;
     187                  }
     188                else /* m == 2 || m == 3 */
     189                  {
     190                    unsigned char c = (unsigned char) p[0];
     191                    /* A second byte in 0x30..0x39 marks a sequence longer than two bytes.  */
     192                    if (c >= 0x90 && c <= 0xe3)
     193                      {
     194                        unsigned char c2 = (unsigned char) p[1];
     195  
     196                        if (c2 >= 0x30 && c2 <= 0x39)
     197                          {
     198                            if (m == 2)
     199                              goto incomplete;
     200                            else /* m == 3 */
     201                              {
     202                                unsigned char c3 = (unsigned char) p[2];
     203  
     204                                if (c3 >= 0x81 && c3 <= 0xfe)
     205                                  goto incomplete;
     206                              }
     207                          }
     208                      }
     209                  }
     210                goto invalid;
     211              }
     212  
     213            case enc_sjis: /* SJIS */
     214              {
     215                if (m == 1)
     216                  {
     217                    unsigned char c = (unsigned char) p[0];
     218                    /* A single byte in any of these ranges needs more input.  */
     219                    if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
     220                        || (c >= 0xf0 && c <= 0xf9))
     221                      goto incomplete;
     222                  }
     223                goto invalid;
     224              }
     225  
     226            default:
     227              /* An unknown multibyte encoding.  */
     228              goto incomplete;  /* No rule is available here, so the bytes are reported as incomplete.  */
     229            }
     230        }
     231      /* A character was converted; compute the count of bytes consumed from s.  */
     232     success:
     233      /* res >= 0 is the corrected return value of
     234         mbtowc_with_lock (&wc, p, m).  */
     235      if (nstate >= (res > 0 ? res : 1))
     236        abort ();  /* The character must extend beyond the previously saved bytes.  */
     237      res -= nstate;  /* Do not count the bytes that came from the state.  */
     238      pstate[0] = 0;  /* No saved bytes remain.  */
     239      return res;
     240      /* Save all m pending bytes in the state and report an incomplete character.  */
     241     incomplete:
     242      {
     243        size_t k = nstate;  /* Index of the last byte already saved in pstate.  */
     244        /* Here 0 <= k < m < 4.  */
     245        pstate[++k] = s[0];
     246        if (k < m)
     247          {
     248            pstate[++k] = s[1];
     249            if (k < m)
     250              pstate[++k] = s[2];
     251          }
     252        if (k != m)
     253          abort ();
     254      }
     255      pstate[0] = m;  /* Record how many bytes are now saved.  */
     256      return (size_t)(-2);
     257  
     258     invalid:
     259      errno = EILSEQ;
     260      /* The conversion state is undefined, says POSIX.  */
     261      return (size_t)(-1);
     262    }