/* Test of determining the length of a multibyte character (mbrlen) on native Windows.
2 Copyright (C) 2008-2023 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
16
17 #include <config.h>
18
19 #include <wchar.h>
20
21 #include <errno.h>
22 #include <locale.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26
27 #include "localcharset.h"
28 #include "macros.h"
29
30 #if defined _WIN32 && !defined __CYGWIN__
31
/* Runs the mbrlen() checks in the locale "NAME.CODEPAGE".

   NAME is the locale name without a code page suffix; CODEPAGE is the
   numeric Windows code page that is appended to it and that selects which
   encoding-specific checks are run.

   Returns
     0   if all checks for CODEPAGE were executed,
     77  if the test has to be skipped for this locale (the locale could not
         be set, the combined locale name does not fit into the local
         buffer, or locale_charset() does not report the expected encoding),
     1   if CODEPAGE is not one of the code pages handled below.
   Individual expectations are verified through ASSERT from "macros.h".  */
static int
test_one_locale (const char *name, int codepage)
{
  mbstate_t state;
  size_t ret;

# if 1
  /* Portable code to set the locale.  */
  {
    char name_with_codepage[1024];
    int len;

    /* NAME is supplied by the caller (main passes command-line arguments),
       so format with a bound.  A name that does not fit cannot designate a
       usable locale here; treat it like a locale that cannot be set.  */
    len = snprintf (name_with_codepage, sizeof (name_with_codepage),
                    "%s.%d", name, codepage);
    if (len < 0 || (size_t) len >= sizeof (name_with_codepage))
      return 77;

    /* Set the locale.  */
    if (setlocale (LC_ALL, name_with_codepage) == NULL)
      return 77;
  }
# else
  /* Disabled alternative (the '# if 1' above selects the portable branch).
     Hacky way to set a locale.codepage combination that setlocale() refuses
     to set.  */
  {
    /* Codepage of the current locale, set with setlocale().
       Not necessarily the same as GetACP().  */
    extern __declspec(dllimport) unsigned int __lc_codepage;

    /* Set the locale.  */
    if (setlocale (LC_ALL, name) == NULL)
      return 77;

    /* Clobber the codepage and MB_CUR_MAX, both set by setlocale().  */
    __lc_codepage = codepage;
    switch (codepage)
      {
      case 1252:
      case 1256:
        MB_CUR_MAX = 1;
        break;
      case 932:
      case 950:
      case 936:
        MB_CUR_MAX = 2;
        break;
      case 54936:
      case 65001:
        MB_CUR_MAX = 4;
        break;
      }

    /* Test whether the codepage is really available.  */
    memset (&state, '\0', sizeof (mbstate_t));
    if (mbrlen (" ", 1, &state) == (size_t)(-1))
      return 77;
  }
# endif

  /* Test zero-length input.  */
  {
    memset (&state, '\0', sizeof (mbstate_t));
    ret = mbrlen ("x", 0, &state);
    /* gnulib's implementation returns (size_t)(-2).
       The AIX 5.1 implementation returns (size_t)(-1).
       glibc's implementation returns 0.  */
    ASSERT (ret == (size_t)(-2) || ret == (size_t)(-1) || ret == 0);
    ASSERT (mbsinit (&state));
  }

  /* Test NUL byte input.  */
  {
    memset (&state, '\0', sizeof (mbstate_t));
    ret = mbrlen ("", 1, &state);
    ASSERT (ret == 0);
    ASSERT (mbsinit (&state));
  }

  /* Test single-byte input.  */
  {
    int c;
    char buf[1];

    memset (&state, '\0', sizeof (mbstate_t));
    /* Walk over all byte values, but only check those listed below; every
       other value falls through the switch without being tested.  */
    for (c = 0; c < 0x100; c++)
      switch (c)
        {
        case '\t': case '\v': case '\f':
        case ' ': case '!': case '"': case '#': case '%':
        case '&': case '\'': case '(': case ')': case '*':
        case '+': case ',': case '-': case '.': case '/':
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
        case ':': case ';': case '<': case '=': case '>':
        case '?':
        case 'A': case 'B': case 'C': case 'D': case 'E':
        case 'F': case 'G': case 'H': case 'I': case 'J':
        case 'K': case 'L': case 'M': case 'N': case 'O':
        case 'P': case 'Q': case 'R': case 'S': case 'T':
        case 'U': case 'V': case 'W': case 'X': case 'Y':
        case 'Z':
        case '[': case '\\': case ']': case '^': case '_':
        case 'a': case 'b': case 'c': case 'd': case 'e':
        case 'f': case 'g': case 'h': case 'i': case 'j':
        case 'k': case 'l': case 'm': case 'n': case 'o':
        case 'p': case 'q': case 'r': case 's': case 't':
        case 'u': case 'v': case 'w': case 'x': case 'y':
        case 'z': case '{': case '|': case '}': case '~':
          /* c is in the ISO C "basic character set".  */
          buf[0] = c;
          ret = mbrlen (buf, 1, &state);
          ASSERT (ret == 1);
          ASSERT (mbsinit (&state));
          break;
        }
  }

  /* Test special calling convention, passing a NULL pointer.  */
  {
    memset (&state, '\0', sizeof (mbstate_t));
    ret = mbrlen (NULL, 5, &state);
    ASSERT (ret == 0);
    ASSERT (mbsinit (&state));
  }

  /* Encoding-specific checks.  In each case the input is consumed piece by
     piece with a single conversion state; after every step the bytes that
     were just consumed are overwritten with NUL (presumably so that a later
     call cannot succeed by re-reading bytes it should already have
     consumed).  */
  switch (codepage)
    {
    case 1252:
      /* Locale encoding is CP1252, an extension of ISO-8859-1.  */
      {
        char input[] = "B\374\337er"; /* "Büßer" */
        memset (&state, '\0', sizeof (mbstate_t));

        ret = mbrlen (input, 1, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
        input[0] = '\0';

        ret = mbrlen (input + 1, 1, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
        input[1] = '\0';

        ret = mbrlen (input + 2, 3, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
        input[2] = '\0';

        ret = mbrlen (input + 3, 2, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
        input[3] = '\0';

        ret = mbrlen (input + 4, 1, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
      }
      return 0;

    case 1256:
      /* Locale encoding is CP1256, not the same as ISO-8859-6.  */
      {
        char input[] = "x\302\341\346y"; /* "xآلوy" */
        memset (&state, '\0', sizeof (mbstate_t));

        ret = mbrlen (input, 1, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
        input[0] = '\0';

        ret = mbrlen (input + 1, 1, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
        input[1] = '\0';

        ret = mbrlen (input + 2, 3, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
        input[2] = '\0';

        ret = mbrlen (input + 3, 2, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
        input[3] = '\0';

        ret = mbrlen (input + 4, 1, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
      }
      return 0;

    case 932:
      /* Locale encoding is CP932, similar to Shift_JIS.  */
      {
        char input[] = "<\223\372\226\173\214\352>"; /* "<日本語>" */
        memset (&state, '\0', sizeof (mbstate_t));

        ret = mbrlen (input, 1, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
        input[0] = '\0';

        ret = mbrlen (input + 1, 2, &state);
        ASSERT (ret == 2);
        ASSERT (mbsinit (&state));
        input[1] = '\0';
        input[2] = '\0';

        /* Only the first byte of a two-byte character is supplied:
           expect "incomplete" and a non-initial state.  */
        ret = mbrlen (input + 3, 1, &state);
        ASSERT (ret == (size_t)(-2));
        ASSERT (!mbsinit (&state));
        input[3] = '\0';

        /* The next byte completes the pending character.  */
        ret = mbrlen (input + 4, 4, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
        input[4] = '\0';

        ret = mbrlen (input + 5, 3, &state);
        ASSERT (ret == 2);
        ASSERT (mbsinit (&state));
        input[5] = '\0';
        input[6] = '\0';

        ret = mbrlen (input + 7, 1, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));

        /* Test some invalid input.
           Either an EILSEQ failure or the lenient result named in each
           assertion is accepted.  */
        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\377", 1, &state); /* 0xFF */
        ASSERT ((ret == (size_t)-1 && errno == EILSEQ) || ret == (size_t)-2);

        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\225\377", 2, &state); /* 0x95 0xFF */
        ASSERT ((ret == (size_t)-1 && errno == EILSEQ) || ret == 2);
      }
      return 0;

    case 950:
      /* Locale encoding is CP950, similar to Big5.  */
      {
        char input[] = "<\244\351\245\273\273\171>"; /* "<日本語>" */
        memset (&state, '\0', sizeof (mbstate_t));

        ret = mbrlen (input, 1, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
        input[0] = '\0';

        ret = mbrlen (input + 1, 2, &state);
        ASSERT (ret == 2);
        ASSERT (mbsinit (&state));
        input[1] = '\0';
        input[2] = '\0';

        /* Only the first byte of a two-byte character is supplied:
           expect "incomplete" and a non-initial state.  */
        ret = mbrlen (input + 3, 1, &state);
        ASSERT (ret == (size_t)(-2));
        ASSERT (!mbsinit (&state));
        input[3] = '\0';

        /* The next byte completes the pending character.  */
        ret = mbrlen (input + 4, 4, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
        input[4] = '\0';

        ret = mbrlen (input + 5, 3, &state);
        ASSERT (ret == 2);
        ASSERT (mbsinit (&state));
        input[5] = '\0';
        input[6] = '\0';

        ret = mbrlen (input + 7, 1, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));

        /* Test some invalid input.
           Either an EILSEQ failure or the lenient result named in each
           assertion is accepted.  */
        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\377", 1, &state); /* 0xFF */
        ASSERT ((ret == (size_t)-1 && errno == EILSEQ) || ret == (size_t)-2);

        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\225\377", 2, &state); /* 0x95 0xFF */
        ASSERT ((ret == (size_t)-1 && errno == EILSEQ) || ret == 2);
      }
      return 0;

    case 936:
      /* Locale encoding is CP936 = GBK, an extension of GB2312.  */
      {
        char input[] = "<\310\325\261\276\325\132>"; /* "<日本語>" */
        memset (&state, '\0', sizeof (mbstate_t));

        ret = mbrlen (input, 1, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
        input[0] = '\0';

        ret = mbrlen (input + 1, 2, &state);
        ASSERT (ret == 2);
        ASSERT (mbsinit (&state));
        input[1] = '\0';
        input[2] = '\0';

        /* Only the first byte of a two-byte character is supplied:
           expect "incomplete" and a non-initial state.  */
        ret = mbrlen (input + 3, 1, &state);
        ASSERT (ret == (size_t)(-2));
        ASSERT (!mbsinit (&state));
        input[3] = '\0';

        /* The next byte completes the pending character.  */
        ret = mbrlen (input + 4, 4, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
        input[4] = '\0';

        ret = mbrlen (input + 5, 3, &state);
        ASSERT (ret == 2);
        ASSERT (mbsinit (&state));
        input[5] = '\0';
        input[6] = '\0';

        ret = mbrlen (input + 7, 1, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));

        /* Test some invalid input.
           Either an EILSEQ failure or the lenient result named in each
           assertion is accepted.  */
        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\377", 1, &state); /* 0xFF */
        ASSERT ((ret == (size_t)-1 && errno == EILSEQ) || ret == (size_t)-2);

        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\225\377", 2, &state); /* 0x95 0xFF */
        ASSERT ((ret == (size_t)-1 && errno == EILSEQ) || ret == 2);
      }
      return 0;

    case 54936:
      /* Locale encoding is CP54936 = GB18030.  */
      /* Skip unless locale_charset() confirms the encoding.  */
      if (strcmp (locale_charset (), "GB18030") != 0)
        return 77;
      {
        char input[] = "B\250\271\201\060\211\070er"; /* "Büßer" */
        memset (&state, '\0', sizeof (mbstate_t));

        ret = mbrlen (input, 1, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
        input[0] = '\0';

        /* Only the first byte of a two-byte character is supplied:
           expect "incomplete" and a non-initial state.  */
        ret = mbrlen (input + 1, 1, &state);
        ASSERT (ret == (size_t)(-2));
        ASSERT (!mbsinit (&state));
        input[1] = '\0';

        /* The next byte completes the pending character.  */
        ret = mbrlen (input + 2, 7, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
        input[2] = '\0';

        /* A four-byte character, available in full.  */
        ret = mbrlen (input + 3, 6, &state);
        ASSERT (ret == 4);
        ASSERT (mbsinit (&state));
        input[3] = '\0';
        input[4] = '\0';
        input[5] = '\0';
        input[6] = '\0';

        ret = mbrlen (input + 7, 2, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
        input[7] = '\0';

        ret = mbrlen (input + 8, 1, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));

        /* Test some invalid input.  */
        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\377", 1, &state); /* 0xFF */
        ASSERT (ret == (size_t)-1);
        ASSERT (errno == EILSEQ);

        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\225\377", 2, &state); /* 0x95 0xFF */
        ASSERT (ret == (size_t)-1);
        ASSERT (errno == EILSEQ);

        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\201\045", 2, &state); /* 0x81 0x25 */
        ASSERT (ret == (size_t)-1);
        ASSERT (errno == EILSEQ);

        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\201\060\377", 3, &state); /* 0x81 0x30 0xFF */
        ASSERT (ret == (size_t)-1);
        ASSERT (errno == EILSEQ);

        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\201\060\377\064", 4, &state); /* 0x81 0x30 0xFF 0x34 */
        ASSERT (ret == (size_t)-1);
        ASSERT (errno == EILSEQ);

        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\201\060\211\072", 4, &state); /* 0x81 0x30 0x89 0x3A */
        ASSERT (ret == (size_t)-1);
        ASSERT (errno == EILSEQ);
      }
      return 0;

    case 65001:
      /* Locale encoding is CP65001 = UTF-8.  */
      /* Skip unless locale_charset() confirms the encoding.  */
      if (strcmp (locale_charset (), "UTF-8") != 0)
        return 77;
      {
        char input[] = "B\303\274\303\237er"; /* "Büßer" */
        memset (&state, '\0', sizeof (mbstate_t));

        ret = mbrlen (input, 1, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
        input[0] = '\0';

        /* Only the first byte of a two-byte character is supplied:
           expect "incomplete" and a non-initial state.  */
        ret = mbrlen (input + 1, 1, &state);
        ASSERT (ret == (size_t)(-2));
        ASSERT (!mbsinit (&state));
        input[1] = '\0';

        /* The next byte completes the pending character.  */
        ret = mbrlen (input + 2, 5, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
        input[2] = '\0';

        ret = mbrlen (input + 3, 4, &state);
        ASSERT (ret == 2);
        ASSERT (mbsinit (&state));
        input[3] = '\0';
        input[4] = '\0';

        ret = mbrlen (input + 5, 2, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));
        input[5] = '\0';

        ret = mbrlen (input + 6, 1, &state);
        ASSERT (ret == 1);
        ASSERT (mbsinit (&state));

        /* Test some invalid input.  */
        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\377", 1, &state); /* 0xFF */
        ASSERT (ret == (size_t)-1);
        ASSERT (errno == EILSEQ);

        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\303\300", 2, &state); /* 0xC3 0xC0 */
        ASSERT (ret == (size_t)-1);
        ASSERT (errno == EILSEQ);

        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\343\300", 2, &state); /* 0xE3 0xC0 */
        ASSERT (ret == (size_t)-1);
        ASSERT (errno == EILSEQ);

        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\343\300\200", 3, &state); /* 0xE3 0xC0 0x80 */
        ASSERT (ret == (size_t)-1);
        ASSERT (errno == EILSEQ);

        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\343\200\300", 3, &state); /* 0xE3 0x80 0xC0 */
        ASSERT (ret == (size_t)-1);
        ASSERT (errno == EILSEQ);

        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\363\300", 2, &state); /* 0xF3 0xC0 */
        ASSERT (ret == (size_t)-1);
        ASSERT (errno == EILSEQ);

        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\363\300\200\200", 4, &state); /* 0xF3 0xC0 0x80 0x80 */
        ASSERT (ret == (size_t)-1);
        ASSERT (errno == EILSEQ);

        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\363\200\300", 3, &state); /* 0xF3 0x80 0xC0 */
        ASSERT (ret == (size_t)-1);
        ASSERT (errno == EILSEQ);

        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\363\200\300\200", 4, &state); /* 0xF3 0x80 0xC0 0x80 */
        ASSERT (ret == (size_t)-1);
        ASSERT (errno == EILSEQ);

        memset (&state, '\0', sizeof (mbstate_t));
        ret = mbrlen ("\363\200\200\300", 4, &state); /* 0xF3 0x80 0x80 0xC0 */
        ASSERT (ret == (size_t)-1);
        ASSERT (errno == EILSEQ);
      }
      return 0;

    default:
      /* No checks are defined for this code page.  */
      return 1;
    }
}
531
/* Entry point on native Windows.

   Usage: PROGRAM LOCALE... CODEPAGE

   The last argument is the decimal code page number; each argument before
   it is a locale name that is passed, together with that code page, to
   test_one_locale.

   Exit status:
     77  if every locale was skipped (or no locale name was given),
     1   if the command line is malformed,
     otherwise the result of the last locale that was not skipped.  */
int
main (int argc, char *argv[])
{
  const char *codepage_arg;
  char *end;
  long value;
  int codepage;
  int result;
  int i;

  /* Without at least one argument besides the program name there is no
     codepage to parse; argv[argc - 1] would be the program name, or out of
     bounds when argc is 0.  */
  if (argc < 2)
    {
      fprintf (stderr, "Usage: %s LOCALE... CODEPAGE\n",
               argc > 0 ? argv[0] : "test");
      return 1;
    }

  /* Parse the codepage strictly: the whole argument must be a decimal
     number that fits in an int.  */
  codepage_arg = argv[argc - 1];
  errno = 0;
  value = strtol (codepage_arg, &end, 10);
  if (end == codepage_arg || *end != '\0' || errno == ERANGE
      || value != (long) (int) value)
    {
      fprintf (stderr, "Invalid codepage argument: %s\n", codepage_arg);
      return 1;
    }
  codepage = (int) value;

  /* Try every locale name; a result of 77 means "skipped" and does not
     override the result of a locale that actually ran.  */
  result = 77;
  for (i = 1; i < argc - 1; i++)
    {
      int ret = test_one_locale (argv[i], codepage);

      if (ret != 77)
        result = ret;
    }

  if (result == 77)
    {
      fprintf (stderr, "Skipping test: found no locale with codepage %d\n",
               codepage);
    }
  return result;
}
555
556 #else
557
/* Fallback entry point for systems that are not native Windows: reports on
   stderr that the test is skipped and exits with status 77.  The
   command-line arguments are not used.  */
int
main (int argc, char *argv[])
{
  static const char skip_message[] =
    "Skipping test: not a native Windows system\n";

  (void) argc;
  (void) argv;

  fprintf (stderr, "%s", skip_message);
  return 77;
}
564
565 #endif