(root)/
gettext-0.22.4/
gettext-runtime/
gnulib-lib/
mbrtoc32.c
       1  /* Convert multibyte character to 32-bit wide character.
       2     Copyright (C) 2020-2023 Free Software Foundation, Inc.
       3  
       4     This file is free software: you can redistribute it and/or modify
       5     it under the terms of the GNU Lesser General Public License as
       6     published by the Free Software Foundation; either version 2.1 of the
       7     License, or (at your option) any later version.
       8  
       9     This file is distributed in the hope that it will be useful,
      10     but WITHOUT ANY WARRANTY; without even the implied warranty of
      11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12     GNU Lesser General Public License for more details.
      13  
      14     You should have received a copy of the GNU Lesser General Public License
      15     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      16  
      17  /* Written by Bruno Haible <bruno@clisp.org>, 2020.  */
      18  
      19  #include <config.h>
      20  
      21  /* Specification.  */
      22  #include <uchar.h>
      23  
      24  #include "attribute.h"
      25  
      26  #include <errno.h>
      27  #include <stdlib.h>
      28  
      29  #if GL_CHAR32_T_IS_UNICODE
      30  # include "lc-charset-unicode.h"
      31  #endif
      32  
      33  #if GNULIB_defined_mbstate_t /* AIX, IRIX */
      34  /* Implement mbrtoc32() on top of mbtowc() for the non-UTF-8 locales
      35     and directly for the UTF-8 locales.  */
      36  
      37  /* Note: On AIX (64-bit) we can implement mbrtoc32 in two equivalent ways:
      38     - in a way that parallels the override of mbrtowc; this is the code branch
      39       here;
      40     - in a way that invokes the overridden mbrtowc; this would be the #else
      41       branch below.
      42     They are equivalent.  */
      43  
      44  # if AVOID_ANY_THREADS
      45  
      46  /* The option '--disable-threads' explicitly requests no locking.  */
      47  
      48  # elif defined _WIN32 && !defined __CYGWIN__
      49  
      50  #  define WIN32_LEAN_AND_MEAN  /* avoid including junk */
      51  #  include <windows.h>
      52  
      53  # elif HAVE_PTHREAD_API
      54  
      55  #  include <pthread.h>
      56  #  if HAVE_THREADS_H && HAVE_WEAK_SYMBOLS
      57  #   include <threads.h>
          /* Declare thrd_exit weak and define c11_threads_in_use() as a test of
             whether its address is non-null.
             NOTE(review): presumably this detects at run time whether the C11
             threads implementation is linked in, without creating a link
             dependency on it.  The macro is not used in this file; presumably
             "mbtowc-lock.h" consumes it -- confirm.  */
      58  #   pragma weak thrd_exit
      59  #   define c11_threads_in_use() (thrd_exit != NULL)
      60  #  else
          /* Without <threads.h> or without weak symbols, c11_threads_in_use() is
             the constant 0.  */
      61  #   define c11_threads_in_use() 0
      62  #  endif
      63  
      64  # elif HAVE_THREADS_H
      65  
      66  #  include <threads.h>
      67  
      68  # endif
      69  
      70  # include "lc-charset-dispatch.h"
      71  # include "mbtowc-lock.h"
      72  
      73  static_assert (sizeof (mbstate_t) >= 4);
          /* A 4-byte file-static buffer; the static_assert above guarantees that an
             mbstate_t is at least that large.
             NOTE(review): internal_state is not referenced by name in the lines
             visible here; presumably "mbrtowc-impl.h" uses it as the conversion
             state when ps == NULL -- confirm.  */
      74  static char internal_state[4];
      75  
          /* mbrtoc32() for platforms where GNULIB_defined_mbstate_t is true.
             The whole function body is the text of "mbrtowc-impl.h", included
             below; that header is not visible here, so its behavior is not
             described.  It is compiled with the parameter names pwc, s, n, ps and
             the file-static internal_state in scope.
             FITS_IN_CHAR_TYPE (wc) is defined as the constant 1 for that header.
             NOTE(review): presumably because every decoded value fits in a
             char32_t -- confirm against the header.  */
      76  size_t
      77  mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps)
      78  {
      79  # define FITS_IN_CHAR_TYPE(wc)  1
      80  # include "mbrtowc-impl.h"
      81  }
      82  
      83  #else /* glibc, macOS, FreeBSD, NetBSD, OpenBSD, HP-UX, Solaris, Cygwin, mingw, MSVC, Minix, Android */
      84  
      85  /* Implement mbrtoc32() based on the original mbrtoc32() or on mbrtowc().  */
      86  
      87  # include <wchar.h>
      88  
      89  # include "localcharset.h"
      90  # include "streq.h"
      91  
      92  # if MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ
      93  #  include "hard-locale.h"
      94  #  include <locale.h>
      95  # endif
      96  
      97  static mbstate_t internal_state;  /* State used when ps == NULL.  */
      98  
          /* Convert the multibyte character that starts at S (at most N bytes are
             examined) to a 32-bit wide character.
             PWC  if non-null, receives the converted character.
             S    the input bytes; NULL is handled like the call
                  mbrtoc32 (NULL, "", 1, ps).
             N    the number of input bytes available.
             PS   the conversion state; NULL selects the file-static
                  internal_state.
             One of three implementations is selected at compile time:
               - HAVE_WORKING_MBRTOC32: call the underlying mbrtoc32() and
                 post-process its result.  The '# undef mbrtoc32' directly after
                 the function header is what lets the calls in the body name that
                 function.  NOTE(review): presumably <uchar.h> defines mbrtoc32
                 as a macro for the replacement's name -- confirm.
               - _GL_SMALL_WCHAR_T: in a UTF-8 locale, decode here with
                 "mbrtowc-impl-utf8.h", keeping the bytes of an incomplete
                 character in *ps; in other locales, use mbrtowc().
               - otherwise: use mbrtowc(), optionally converting the result with
                 locale_encoding_to_unicode().
             In the code visible here, (size_t) -2 is returned for empty or
             incomplete input and (size_t) -1 with errno set to EILSEQ or EINVAL
             on failure; other results come from the underlying function or the
             included decoder.  */
      99  size_t
     100  mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps)
     101  # undef mbrtoc32
     102  {
     103    /* It's simpler to handle the case s == NULL upfront, than to worry about
     104       this case later, before every test of pwc and n.  */
     105    if (s == NULL)
     106      {
     107        pwc = NULL;
     108        s = "";
     109        n = 1;
     110      }
     111  
     112  # if MBRTOC32_EMPTY_INPUT_BUG || _GL_SMALL_WCHAR_T
            /* Empty input is reported as "incomplete" here, without consulting the
               underlying function.  The UTF-8 code below relies on n > 0.  */
     113    if (n == 0)
     114      return (size_t) -2;
     115  # endif
     116  
     117    if (ps == NULL)
     118      ps = &internal_state;
     119  
     120  # if HAVE_WORKING_MBRTOC32
     121    /* mbrtoc32() may produce different values for wc than mbrtowc().  Therefore
     122       use mbrtoc32().  */
     123  
     124  #  if defined _WIN32 && !defined __CYGWIN__
            /* On native Windows the result is received in a local variable and
               copied to *pwc only when a character was converted and pwc is
               non-null.
               NOTE(review): presumably the native function cannot be passed
               pwc == NULL -- confirm.  */
     125    char32_t wc;
     126    size_t ret = mbrtoc32 (&wc, s, n, ps);
     127    if (ret < (size_t) -2 && pwc != NULL)
     128      *pwc = wc;
     129  #  else
     130    size_t ret = mbrtoc32 (pwc, s, n, ps);
     131  #  endif
     132  
     133  #  if GNULIB_MBRTOC32_REGULAR
     134    /* Verify that mbrtoc32 is regular.  */
            /* After a converted character, force *ps back to the initial state if
               it is not already there.  A result of (size_t) -3 is not tolerated
               here: the process is aborted.  */
     135    if (ret < (size_t) -3 && ! mbsinit (ps))
     136      /* This occurs on glibc 2.36.  */
     137      mbszero (ps);
     138    if (ret == (size_t) -3)
     139      abort ();
     140  #  endif
     141  
     142  #  if MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ
            /* If the result is (size_t) -1 or (size_t) -2 for non-empty input and
               hard_locale (LC_CTYPE) is false, treat the first byte as a complete
               character whose value is that byte.
               NOTE(review): per the macro name, this targets the "C" locale;
               confirm what hard_locale() reports for other locales.  */
     143    if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE))
     144      {
     145        if (pwc != NULL)
     146          *pwc = (unsigned char) *s;
     147        return 1;
     148      }
     149  #  endif
     150  
     151    return ret;
     152  
     153  # elif _GL_SMALL_WCHAR_T
     154  
     155    /* Special-case all encodings that may produce wide character values
     156       > WCHAR_MAX.  */
     157    const char *encoding = locale_charset ();
     158    if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
     159      {
     160        /* Special-case the UTF-8 encoding.  Assume that the wide-character
     161           encoding in a UTF-8 locale is UCS-2 or, equivalently, UTF-16.  */
     162        /* Here n > 0.  */
                /* Layout of *ps as used below: byte 0 holds the number (0..3) of
                   bytes of an incomplete character saved by an earlier call, and
                   bytes 1..3 hold those bytes.  */
     163        char *pstate = (char *)ps;
     164        size_t nstate = pstate[0];
     165        char buf[4];
     166        const char *p;
     167        size_t m;
     168        int res;
     169  
                /* Choose the bytes to decode: p points to them and m is their
                   count.  With no saved bytes, decode s directly.  Otherwise
                   cases 3, 2, 1 fall through so that all saved bytes are copied
                   into buf, followed by up to 3 bytes of s, never more than 4
                   bytes in total.  */
     170        switch (nstate)
     171          {
     172          case 0:
     173            p = s;
     174            m = n;
     175            break;
     176          case 3:
     177            buf[2] = pstate[3];
     178            FALLTHROUGH;
     179          case 2:
     180            buf[1] = pstate[2];
     181            FALLTHROUGH;
     182          case 1:
     183            buf[0] = pstate[1];
     184            p = buf;
     185            m = nstate;
     186            buf[m++] = s[0];
     187            if (n >= 2 && m < 4)
     188              {
     189                buf[m++] = s[1];
     190                if (n >= 3 && m < 4)
     191                  buf[m++] = s[2];
     192              }
     193            break;
     194          default:
                    /* A saved-byte count outside 0..3 is rejected.  */
     195            errno = EINVAL;
     196            return (size_t)(-1);
     197          }
     198  
     199        /* Here m > 0.  */
     200  
                /* The decoder proper is included textually.
                   NOTE(review): it is not visible here; presumably it reads p, m
                   and pwc, sets res, and jumps to one of the labels success,
                   incomplete, invalid below (no goto to them appears in this
                   file) -- confirm against "mbrtowc-impl-utf8.h".  */
     201        {
     202  #  define FITS_IN_CHAR_TYPE(wc)  1
     203  #  include "mbrtowc-impl-utf8.h"
     204        }
     205  
     206       success:
                /* Abort if the nstate saved bytes alone would account for the whole
                   result, i.e. if no byte of s was needed.  Then subtract nstate,
                   so that the value returned does not count the saved bytes.
                   NOTE(review): res is presumably the byte length of the decoded
                   character (0 for a null character) -- confirm.  */
     207        if (nstate >= (res > 0 ? res : 1))
     208          abort ();
     209        res -= nstate;
     210        /* Set *ps to an initial state.  */
     211  #  if defined _WIN32 && !defined __CYGWIN__
     212        /* Native Windows.  */
     213        /* MSVC defines 'mbstate_t' as an 8-byte struct; the first 4 bytes matter.
     214           On mingw, 'mbstate_t' is sometimes defined as 'int', sometimes defined
     215           as an 8-byte struct, of which the first 4 bytes matter.  */
     216        *(unsigned int *)pstate = 0;
     217  #  elif defined __CYGWIN__
     218        /* Cygwin defines 'mbstate_t' as an 8-byte struct; the first 4 bytes
     219           matter.  */
     220        ps->__count = 0;
     221  #  else
     222        pstate[0] = 0;
     223  #  endif
     224        return res;
     225  
     226       incomplete:
                /* Append to the bytes already saved in *ps the bytes of s that were
                   examined, so that pstate[1..m] holds all m pending bytes, then
                   record the count m in pstate[0].  */
     227        {
     228          size_t k = nstate;
     229          /* Here 0 <= k < m < 4.  */
     230          pstate[++k] = s[0];
     231          if (k < m)
     232            {
     233              pstate[++k] = s[1];
     234              if (k < m)
     235                pstate[++k] = s[2];
     236            }
     237          if (k != m)
     238            abort ();
     239        }
     240        pstate[0] = m;
     241        return (size_t)(-2);
     242  
     243       invalid:
     244        errno = EILSEQ;
     245        /* The conversion state is undefined, says POSIX.  */
     246        return (size_t)(-1);
     247      }
     248    else
     249      {
                /* Any other encoding: convert with mbrtowc() and store the wchar_t
                   result into *pwc when a character was converted.  */
     250        wchar_t wc;
     251        size_t ret = mbrtowc (&wc, s, n, ps);
     252        if (ret < (size_t) -2 && pwc != NULL)
     253          *pwc = wc;
     254        return ret;
     255      }
     256  
     257  # else
     258  
     259    /* char32_t and wchar_t are equivalent.  Use mbrtowc().  */
     260    wchar_t wc;
     261    size_t ret = mbrtowc (&wc, s, n, ps);
     262  
     263  #  if GNULIB_MBRTOC32_REGULAR
     264    /* Ensure that mbrtoc32 is regular.  */
     265    if (ret < (size_t) -2 && ! mbsinit (ps))
     266      /* This occurs on glibc 2.12.  */
     267      mbszero (ps);
     268  #  endif
     269  
     270  #  if GL_CHAR32_T_IS_UNICODE && GL_CHAR32_T_VS_WCHAR_T_NEEDS_CONVERSION
            /* Map a converted non-null character through
               locale_encoding_to_unicode(); a result of 0 from that function is
               turned into an EILSEQ failure.  */
     271    if (ret < (size_t) -2 && wc != 0)
     272      {
     273        wc = locale_encoding_to_unicode (wc);
     274        if (wc == 0)
     275          {
     276            ret = (size_t) -1;
     277            errno = EILSEQ;
     278          }
     279      }
     280  #  endif
            /* Store the character only if one was converted and the caller asked
               for it.  */
     281    if (ret < (size_t) -2 && pwc != NULL)
     282      *pwc = wc;
     283    return ret;
     284  
     285  # endif
     286  }
     287  
     288  #endif