1 /* Convert multibyte character to 32-bit wide character.
2 Copyright (C) 2020-2023 Free Software Foundation, Inc.
3
4 This file is free software: you can redistribute it and/or modify
5 it under the terms of the GNU Lesser General Public License as
6 published by the Free Software Foundation; either version 2.1 of the
7 License, or (at your option) any later version.
8
9 This file is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU Lesser General Public License for more details.
13
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
16
17 /* Written by Bruno Haible <bruno@clisp.org>, 2020. */
18
19 #include <config.h>
20
21 /* Specification. */
22 #include <uchar.h>
23
24 #include "attribute.h"
25
26 #include <errno.h>
27 #include <stdlib.h>
28
29 #if GL_CHAR32_T_IS_UNICODE
30 # include "lc-charset-unicode.h"
31 #endif
32
33 #if GNULIB_defined_mbstate_t /* AIX, IRIX */
34 /* Implement mbrtoc32() on top of mbtowc() for the non-UTF-8 locales
35 and directly for the UTF-8 locales. */
36
37 /* Note: On AIX (64-bit) we can implement mbrtoc32 in two equivalent ways:
38 - in a way that parallels the override of mbrtowc; this is the code branch
39 here;
40 - in a way that invokes the overridden mbrtowc; this would be the #else
41 branch below.
42 They are equivalent. */
43
44 # if defined _WIN32 && !defined __CYGWIN__
45
46 # define WIN32_LEAN_AND_MEAN /* avoid including junk */
47 # include <windows.h>
48
49 # elif HAVE_PTHREAD_API
50
51 # include <pthread.h>
/* Define c11_threads_in_use(), a run-time test for whether the C11 thread
   functions are present in the program.  thrd_exit is declared weak, and the
   macro compares its address against NULL.
   NOTE(review): the user of this macro is not visible in this file;
   presumably it is "mbtowc-lock.h", included further down -- confirm.  */
52 # if HAVE_THREADS_H && HAVE_WEAK_SYMBOLS
53 # include <threads.h>
54 # pragma weak thrd_exit
55 # define c11_threads_in_use() (thrd_exit != NULL)
56 # else
/* No <threads.h> or no weak symbols: the test is a constant 0.  */
57 # define c11_threads_in_use() 0
58 # endif
59
60 # elif HAVE_THREADS_H
61
62 # include <threads.h>
63
64 # endif
65
66 # include "lc-charset-dispatch.h"
67 # include "mbtowc-lock.h"
68
/* Compile-time check that mbstate_t is at least 4 bytes wide.
   NOTE(review): presumably this guarantees that the 4-byte state layout used
   for internal_state also fits into a caller-supplied mbstate_t -- confirm
   against "mbrtowc-impl.h".  */
69 static_assert (sizeof (mbstate_t) >= 4);
/* File-local 4-byte conversion state.  It is not referenced by name in the
   code visible here; presumably "mbrtowc-impl.h" falls back to it when the
   caller passes ps == NULL -- confirm.  */
70 static char internal_state[4];
71
/* mbrtoc32 for platforms where gnulib defines mbstate_t itself
   (GNULIB_defined_mbstate_t).
   The whole function body is supplied by "mbrtowc-impl.h", which is
   instantiated here for char32_t results; the parameters PWC, S, N and PS
   are consumed by that header.
   NOTE(review): the exact return-value contract is determined by the included
   header, which is not visible in this file.  */
72 size_t
73 mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps)
74 {
/* Parameter of the included implementation: the predicate is constantly
   true here.  Presumably it asks whether a decoded value fits into the
   result character type -- confirm in "mbrtowc-impl.h".  */
75 # define FITS_IN_CHAR_TYPE(wc) 1
76 # include "mbrtowc-impl.h"
77 }
78
79 #else /* glibc, macOS, FreeBSD, NetBSD, OpenBSD, HP-UX, Solaris, Cygwin, mingw, MSVC, Minix, Android */
80
81 /* Implement mbrtoc32() based on the original mbrtoc32() or on mbrtowc(). */
82
83 # include <wchar.h>
84
85 # include "localcharset.h"
86 # include "streq.h"
87
88 # if MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ
89 # include "hard-locale.h"
90 # include <locale.h>
91 # endif
92
/* Conversion state that mbrtoc32 below uses when the caller passes
   ps == NULL.  Being static, it starts out zero-initialized.  */
93 static mbstate_t internal_state;
94
/* Convert the multibyte character that starts at S, looking at no more than
   N bytes, to a 32-bit wide character.

   PWC  where to store the resulting character; may be NULL, in which case
        nothing is stored.
   S    the input bytes; if NULL, the call is treated as
        mbrtoc32 (NULL, "", 1, PS).
   N    the maximum number of bytes of S to examine.
   PS   the conversion state; if NULL, the file-local internal_state is used.

   Return value: whatever the underlying mbrtoc32() or mbrtowc() call
   returns, subject to the adjustments below.  In the hand-written UTF-8
   path: the value computed by the decoder minus the number of bytes that
   were already buffered in *PS, or (size_t) -2 for an incomplete character,
   or (size_t) -1 with errno set to EILSEQ (invalid input) or EINVAL
   (unrecognized state).  With MBRTOC32_EMPTY_INPUT_BUG or _GL_SMALL_WCHAR_T,
   N == 0 yields (size_t) -2 immediately.

   One of three strategies is selected at compile time:
     - HAVE_WORKING_MBRTOC32: delegate to the system's mbrtoc32();
     - _GL_SMALL_WCHAR_T: decode UTF-8 by hand, use mbrtowc() otherwise;
     - else: use mbrtowc(), optionally mapping the result to Unicode.  */
95 size_t
96 mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps)
/* NOTE(review): presumably mbrtoc32 is a macro that renames this definition;
   after the #undef, the calls to mbrtoc32 inside the body reach the original
   function mentioned in the comment above -- confirm against <uchar.h>.  */
97 # undef mbrtoc32
98 {
99 /* It's simpler to handle the case s == NULL upfront, than to worry about
100 this case later, before every test of pwc and n. */
101 if (s == NULL)
102 {
103 pwc = NULL;
104 s = "";
105 n = 1;
106 }
107
/* On the affected configurations, report empty input as "incomplete"
   without calling the underlying function.  */
108 # if MBRTOC32_EMPTY_INPUT_BUG || _GL_SMALL_WCHAR_T
109 if (n == 0)
110 return (size_t) -2;
111 # endif
112
113 if (ps == NULL)
114 ps = &internal_state;
115
116 # if HAVE_WORKING_MBRTOC32
117 /* mbrtoc32() may produce different values for wc than mbrtowc(). Therefore
118 use mbrtoc32(). */
119
/* On native Windows the result goes through a local variable, so that *pwc
   is written only when a character was actually produced
   (ret < (size_t) -2).  */
120 # if defined _WIN32 && !defined __CYGWIN__
121 char32_t wc;
122 size_t ret = mbrtoc32 (&wc, s, n, ps);
123 if (ret < (size_t) -2 && pwc != NULL)
124 *pwc = wc;
125 # else
126 size_t ret = mbrtoc32 (pwc, s, n, ps);
127 # endif
128
/* When the caller requested a "regular" mbrtoc32: after a character has been
   produced, force *ps back to the initial state; a (size_t) -3 return is
   treated as a fatal error.  */
129 # if GNULIB_MBRTOC32_REGULAR
130 /* Verify that mbrtoc32 is regular. */
131 if (ret < (size_t) -3 && ! mbsinit (ps))
132 /* This occurs on glibc 2.36. */
133 mbszero (ps);
134 if (ret == (size_t) -3)
135 abort ();
136 # endif
137
/* If the call did not produce a character (ret >= (size_t) -2) and
   hard_locale (LC_CTYPE) is false, return the first byte of S as a
   one-byte character instead of the failure code.  */
138 # if MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ
139 if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE))
140 {
141 if (pwc != NULL)
142 *pwc = (unsigned char) *s;
143 return 1;
144 }
145 # endif
146
147 return ret;
148
149 # elif _GL_SMALL_WCHAR_T
150
151 /* Special-case all encodings that may produce wide character values
152 > WCHAR_MAX. */
153 const char *encoding = locale_charset ();
154 if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
155 {
156 /* Special-case the UTF-8 encoding. Assume that the wide-character
157 encoding in a UTF-8 locale is UCS-2 or, equivalently, UTF-16. */
158 /* Here n > 0. */
/* Layout of *ps as used by this branch: byte 0 holds the number of bytes
   buffered from earlier calls (0..3), bytes 1..3 hold those bytes.  */
159 char *pstate = (char *)ps;
160 size_t nstate = pstate[0];
161 char buf[4];
/* p and m designate the bytes to decode and their count; res receives the
   decoder's result.  NOTE(review): these names, and the labels success,
   incomplete and invalid below, are presumably referenced by
   "mbrtowc-impl-utf8.h" -- do not rename without checking that header.  */
162 const char *p;
163 size_t m;
164 int res;
165
166 switch (nstate)
167 {
168 case 0:
/* Nothing buffered: decode directly from the caller's bytes.  */
169 p = s;
170 m = n;
171 break;
/* 1 to 3 bytes buffered: copy them into buf, then append up to 3 bytes from
   S, without exceeding the 4-byte capacity of buf.  */
172 case 3:
173 buf[2] = pstate[3];
174 FALLTHROUGH;
175 case 2:
176 buf[1] = pstate[2];
177 FALLTHROUGH;
178 case 1:
179 buf[0] = pstate[1];
180 p = buf;
181 m = nstate;
182 buf[m++] = s[0];
183 if (n >= 2 && m < 4)
184 {
185 buf[m++] = s[1];
186 if (n >= 3 && m < 4)
187 buf[m++] = s[2];
188 }
189 break;
190 default:
/* Byte 0 of *ps is not a count this branch could have stored.  */
191 errno = EINVAL;
192 return (size_t)(-1);
193 }
194
195 /* Here m > 0. */
196
197 {
198 # define FITS_IN_CHAR_TYPE(wc) 1
199 # include "mbrtowc-impl-utf8.h"
200 }
201
202 success:
/* res is relative to the start of p, which includes the nstate buffered
   bytes.  A result that does not extend beyond those buffered bytes is
   treated as a fatal inconsistency; otherwise subtract them, so that the
   return value counts only bytes taken from S.  */
203 if (nstate >= (res > 0 ? res : 1))
204 abort ();
205 res -= nstate;
206 /* Set *ps to an initial state. */
207 # if defined _WIN32 && !defined __CYGWIN__
208 /* Native Windows. */
209 /* MSVC defines 'mbstate_t' as an 8-byte struct; the first 4 bytes matter.
210 On mingw, 'mbstate_t' is sometimes defined as 'int', sometimes defined
211 as an 8-byte struct, of which the first 4 bytes matter. */
212 *(unsigned int *)pstate = 0;
213 # elif defined __CYGWIN__
214 /* Cygwin defines 'mbstate_t' as an 8-byte struct; the first 4 bytes
215 matter. */
216 ps->__count = 0;
217 # else
218 pstate[0] = 0;
219 # endif
220 return res;
221
222 incomplete:
/* Append the bytes newly taken from S to the ones already stored in *ps,
   then record the new total count m in byte 0.  */
223 {
224 size_t k = nstate;
225 /* Here 0 <= k < m < 4. */
226 pstate[++k] = s[0];
227 if (k < m)
228 {
229 pstate[++k] = s[1];
230 if (k < m)
231 pstate[++k] = s[2];
232 }
233 if (k != m)
234 abort ();
235 }
236 pstate[0] = m;
237 return (size_t)(-2);
238
239 invalid:
240 errno = EILSEQ;
241 /* The conversion state is undefined, says POSIX. */
242 return (size_t)(-1);
243 }
244 else
245 {
/* Any other encoding: use mbrtowc() and widen its result, storing it only
   when a character was produced.  */
246 wchar_t wc;
247 size_t ret = mbrtowc (&wc, s, n, ps);
248 if (ret < (size_t) -2 && pwc != NULL)
249 *pwc = wc;
250 return ret;
251 }
252
253 # else
254
255 /* char32_t and wchar_t are equivalent. Use mbrtowc(). */
256 wchar_t wc;
257 size_t ret = mbrtowc (&wc, s, n, ps);
258
259 # if GNULIB_MBRTOC32_REGULAR
260 /* Ensure that mbrtoc32 is regular. */
261 if (ret < (size_t) -2 && ! mbsinit (ps))
262 /* This occurs on glibc 2.12. */
263 mbszero (ps);
264 # endif
265
/* Map a non-null result through locale_encoding_to_unicode(); a mapping
   result of 0 is reported as an invalid sequence (EILSEQ).  */
266 # if GL_CHAR32_T_IS_UNICODE && GL_CHAR32_T_VS_WCHAR_T_NEEDS_CONVERSION
267 if (ret < (size_t) -2 && wc != 0)
268 {
269 wc = locale_encoding_to_unicode (wc);
270 if (wc == 0)
271 {
272 ret = (size_t) -1;
273 errno = EILSEQ;
274 }
275 }
276 # endif
/* Store the character only when one was produced and the caller wants it.  */
277 if (ret < (size_t) -2 && pwc != NULL)
278 *pwc = wc;
279 return ret;
280
281 # endif
282 }
283
284 #endif