1 /* Convert multibyte character to 32-bit wide character.
2 Copyright (C) 2020-2023 Free Software Foundation, Inc.
3
4 This file is free software: you can redistribute it and/or modify
5 it under the terms of the GNU Lesser General Public License as
6 published by the Free Software Foundation; either version 2.1 of the
7 License, or (at your option) any later version.
8
9 This file is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU Lesser General Public License for more details.
13
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
16
17 /* Written by Bruno Haible <bruno@clisp.org>, 2020. */
18
19 #include <config.h>
20
21 /* Specification. */
22 #include <uchar.h>
23
24 #include "attribute.h"
25
26 #include <errno.h>
27 #include <stdlib.h>
28
29 #if GL_CHAR32_T_IS_UNICODE
30 # include "lc-charset-unicode.h"
31 #endif
32
33 #if GNULIB_defined_mbstate_t /* AIX, IRIX */
34 /* Implement mbrtoc32() on top of mbtowc() for the non-UTF-8 locales
35 and directly for the UTF-8 locales. */
36
37 /* Note: On AIX (64-bit) we can implement mbrtoc32 in two equivalent ways:
38 - in a way that parallels the override of mbrtowc; this is the code branch
39 here;
40 - in a way that invokes the overridden mbrtowc; this would be the #else
41 branch below.
42 They are equivalent. */
43
44 # if defined _WIN32 && !defined __CYGWIN__
45
46 # define WIN32_LEAN_AND_MEAN /* avoid including junk */
47 # include <windows.h>
48
49 # elif HAVE_PTHREAD_API
50
51 # include <pthread.h>
52 # if HAVE_THREADS_H && HAVE_WEAK_SYMBOLS
53 # include <threads.h>
54 # pragma weak thrd_exit
55 # define c11_threads_in_use() (thrd_exit != NULL)
56 # else
57 # define c11_threads_in_use() 0
58 # endif
59
60 # elif HAVE_THREADS_H
61
62 # include <threads.h>
63
64 # endif
65
66 # include "lc-charset-dispatch.h"
67 # include "mbtowc-lock.h"
68
/* Compile-time check that an mbstate_t object is at least 4 bytes large,
   i.e. at least as large as the internal_state array declared below. */
69 static_assert (sizeof (mbstate_t) >= 4);
/* File-local 4-byte state buffer.
   NOTE(review): it is not referenced in the visible code of this branch;
   presumably mbrtowc-impl.h uses it as the conversion state when the caller
   passes ps == NULL - confirm against that header. */
70 static char internal_state[4];
71
/* mbrtoc32 variant compiled when GNULIB_defined_mbstate_t is true.
   The parameters are the output pointer PWC, the input bytes S, the input
   length N and the conversion state PS; the result is a size_t.
   The complete function body is supplied by the included file
   "mbrtowc-impl.h", which is not visible here, so the parameter and variable
   names above must stay exactly as they are. */
72 size_t
73 mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps)
74 {
/* FITS_IN_CHAR_TYPE is defined to the constant 1 before the inclusion.
   NOTE(review): presumably the included code uses it to test whether a
   decoded value fits into the destination type, which would always hold for
   char32_t - confirm in mbrtowc-impl.h. */
75 # define FITS_IN_CHAR_TYPE(wc) 1
76 # include "mbrtowc-impl.h"
77 }
78
79 #else /* glibc, macOS, FreeBSD, NetBSD, OpenBSD, HP-UX, Solaris, Cygwin, mingw, MSVC, Minix, Android */
80
81 /* Implement mbrtoc32() based on the original mbrtoc32() or on mbrtowc(). */
82
83 # include <wchar.h>
84
85 # include "localcharset.h"
86 # include "streq.h"
87
88 # if MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ
89 # include "hard-locale.h"
90 # include <locale.h>
91 # endif
92
/* Conversion state used by mbrtoc32 below when the caller passes
   ps == NULL. */
93 static mbstate_t internal_state;
94
/* Convert the multibyte sequence starting at S, looking at no more than N
   bytes, into one 32-bit wide character.

   PWC  where to store the converted character; may be NULL, in which case
        nothing is stored.
   S    the input bytes; NULL is handled like the empty string "" with
        PWC == NULL and N == 1.
   N    the number of input bytes that may be examined.
   PS   the conversion state; NULL selects the file-local internal_state.

   Return values produced directly by the code in this file:
     (size_t) -2  the input is empty (in some configurations) or ends in an
                  incomplete UTF-8 sequence,
     (size_t) -1  with errno set to EILSEQ (invalid input) or EINVAL
                  (unexpected contents of the conversion state),
     1            in the non-hard-locale fallback of the first branch.
   In all other cases the result of the underlying mbrtoc32 or mbrtowc call,
   or the byte count computed by the UTF-8 decoder, is returned.

   Exactly one of three implementations is compiled, selected by
   HAVE_WORKING_MBRTOC32 and _GL_SMALL_WCHAR_T. */
95 size_t
96 mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps)
/* NOTE(review): presumably mbrtoc32 is a macro at this point (defined
   outside this file) that renames this definition; undefining it here makes
   the mbrtoc32 calls in the body refer to the plain identifier - confirm
   against the <uchar.h> replacement header. */
97 # undef mbrtoc32
98 {
99 /* It's simpler to handle the case s == NULL upfront, than to worry about
100 this case later, before every test of pwc and n. */
101 if (s == NULL)
102 {
103 pwc = NULL;
104 s = "";
105 n = 1;
106 }
107
/* In these two configurations an empty input is reported as "incomplete"
   right away, without calling any conversion function. The UTF-8 code in
   the _GL_SMALL_WCHAR_T branch below relies on n > 0. */
108 # if MBRTOC32_EMPTY_INPUT_BUG || _GL_SMALL_WCHAR_T
109 if (n == 0)
110 return (size_t) -2;
111 # endif
112
113 if (ps == NULL)
114 ps = &internal_state;
115
116 # if HAVE_WORKING_MBRTOC32
117 /* mbrtoc32() may produce different values for wc than mbrtowc(). Therefore
118 use mbrtoc32(). */
119
/* On native Windows the conversion goes through a local variable, and *pwc
   is written only when the result is neither (size_t) -1 nor (size_t) -2.
   On other platforms pwc is passed through unchanged. */
120 # if defined _WIN32 && !defined __CYGWIN__
121 char32_t wc;
122 size_t ret = mbrtoc32 (&wc, s, n, ps);
123 if (ret < (size_t) -2 && pwc != NULL)
124 *pwc = wc;
125 # else
126 size_t ret = mbrtoc32 (pwc, s, n, ps);
127 # endif
128
/* Fallback: when the call above returned (size_t) -2 or (size_t) -1, the
   input is not empty, and hard_locale (LC_CTYPE) is false, the first input
   byte is delivered as the character value and one byte counts as
   consumed. */
129 # if MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ
130 if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE))
131 {
132 if (pwc != NULL)
133 *pwc = (unsigned char) *s;
134 return 1;
135 }
136 # endif
137
138 return ret;
139
140 # elif _GL_SMALL_WCHAR_T
141
142 /* Special-case all encodings that may produce wide character values
143 > WCHAR_MAX. */
144 const char *encoding = locale_charset ();
145 if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
146 {
147 /* Special-case the UTF-8 encoding. Assume that the wide-character
148 encoding in a UTF-8 locale is UCS-2 or, equivalently, UTF-16. */
149 /* Here n > 0. */
/* The conversion state is accessed as raw bytes: pstate[0] is the number of
   bytes remembered from earlier calls (0..3), and pstate[1..3] are those
   bytes, as can be seen from the switch and the 'incomplete' code below. */
150 char *pstate = (char *)ps;
151 size_t nstate = pstate[0];
152 char buf[4];
153 const char *p;
154 size_t m;
155 int res;
156
/* Set up p and m, the bytes to decode and their count. Without remembered
   bytes the caller's input is used directly. Otherwise the remembered bytes
   are copied into buf, followed by s[0] and, as far as n permits and buf has
   room, s[1] and s[2]. Any other value of nstate is rejected with EINVAL. */
157 switch (nstate)
158 {
159 case 0:
160 p = s;
161 m = n;
162 break;
163 case 3:
164 buf[2] = pstate[3];
165 FALLTHROUGH;
166 case 2:
167 buf[1] = pstate[2];
168 FALLTHROUGH;
169 case 1:
170 buf[0] = pstate[1];
171 p = buf;
172 m = nstate;
173 buf[m++] = s[0];
174 if (n >= 2 && m < 4)
175 {
176 buf[m++] = s[1];
177 if (n >= 3 && m < 4)
178 buf[m++] = s[2];
179 }
180 break;
181 default:
182 errno = EINVAL;
183 return (size_t)(-1);
184 }
185
186 /* Here m > 0. */
187
/* The decoder itself is textually included. No goto in this file targets
   the labels success, incomplete and invalid below, and res is not assigned
   in this file either.
   NOTE(review): presumably mbrtowc-impl-utf8.h decodes m bytes at p, stores
   the character through pwc, sets res and jumps to one of these labels -
   confirm in that header. Because of this the local names and labels here
   must not be renamed. */
188 {
189 # define FITS_IN_CHAR_TYPE(wc) 1
190 # include "mbrtowc-impl-utf8.h"
191 }
192
193 success:
/* Consistency check: the decoded length (counted as at least 1) must be
   larger than the number of remembered bytes; otherwise abort. Afterwards
   the remembered bytes are subtracted, so that the return value only counts
   bytes taken from s in this call. */
194 if (nstate >= (res > 0 ? res : 1))
195 abort ();
196 res -= nstate;
197 /* Set *ps to the initial state. */
198 # if defined _WIN32 && !defined __CYGWIN__
199 /* Native Windows. */
200 /* MSVC defines 'mbstate_t' as an 8-byte struct; the first 4 bytes matter.
201 On mingw, 'mbstate_t' is sometimes defined as 'int', sometimes defined
202 as an 8-byte struct, of which the first 4 bytes matter. */
203 *(unsigned int *)pstate = 0;
204 # elif defined __CYGWIN__
205 /* Cygwin defines 'mbstate_t' as an 8-byte struct; the first 4 bytes
206 matter. */
207 ps->__count = 0;
208 # else
209 pstate[0] = 0;
210 # endif
211 return res;
212
213 incomplete:
/* Append the bytes newly taken from s to the remembered bytes in
   pstate[1..], until m bytes are stored in total; abort if the count does
   not come out as m. Then record m as the new number of remembered bytes
   and report an incomplete character. */
214 {
215 size_t k = nstate;
216 /* Here 0 <= k < m < 4. */
217 pstate[++k] = s[0];
218 if (k < m)
219 {
220 pstate[++k] = s[1];
221 if (k < m)
222 pstate[++k] = s[2];
223 }
224 if (k != m)
225 abort ();
226 }
227 pstate[0] = m;
228 return (size_t)(-2);
229
230 invalid:
231 errno = EILSEQ;
232 /* The conversion state is undefined, says POSIX. */
233 return (size_t)(-1);
234 }
235 else
236 {
/* Any other encoding: convert with mbrtowc into a wchar_t and copy the value
   to *pwc, unless the result is (size_t) -1 or (size_t) -2 or pwc is
   NULL. */
237 wchar_t wc;
238 size_t ret = mbrtowc (&wc, s, n, ps);
239 if (ret < (size_t) -2 && pwc != NULL)
240 *pwc = wc;
241 return ret;
242 }
243
244 # else
245
246 /* char32_t and wchar_t are equivalent. Use mbrtowc(). */
247 wchar_t wc;
248 size_t ret = mbrtowc (&wc, s, n, ps);
/* Optional post-processing of a successfully converted, non-null character
   through locale_encoding_to_unicode. A result of 0 from that function is
   turned into a conversion failure: (size_t) -1 with errno set to EILSEQ. */
249 # if GL_CHAR32_T_IS_UNICODE && GL_CHAR32_T_VS_WCHAR_T_NEEDS_CONVERSION
250 if (ret < (size_t) -2 && wc != 0)
251 {
252 wc = locale_encoding_to_unicode (wc);
253 if (wc == 0)
254 {
255 ret = (size_t) -1;
256 errno = EILSEQ;
257 }
258 }
259 # endif
/* Store the character only when the (possibly updated) result is neither
   (size_t) -1 nor (size_t) -2 and the caller supplied an output pointer. */
260 if (ret < (size_t) -2 && pwc != NULL)
261 *pwc = wc;
262 return ret;
263
264 # endif
265 }
266
267 #endif