1 /* Charset handling while reading PO files.
2 Copyright (C) 2001-2007, 2010, 2019-2021 Free Software Foundation, Inc.
3 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
18
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22 #include <alloca.h>
23
24 /* Specification. */
25 #include "po-charset.h"
26
27 #include <stdlib.h>
28 #include <string.h>
29
30 #include "xmalloca.h"
31 #include "xvasprintf.h"
32 #include "po-xerror.h"
33 #if !IN_LIBGETTEXTPO
34 # include "basename-lgpl.h"
35 # include "progname.h"
36 #endif
37 #include "c-strstr.h"
38 #include "c-strcase.h"
39 #include "gettext.h"
40
41 #define _(str) gettext (str)
42
/* Number of elements of the array A.  A must be an actual array object,
   not a pointer, because the count is derived from sizeof.  */
#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))

/* Storage for the canonical ASCII name.  It is also the first entry of the
   table of standard charsets in po_charset_canonicalize.  */
static const char ascii[] = "ASCII";

/* The canonicalized encoding name for ASCII.  */
const char *po_charset_ascii = ascii;

/* Storage for the canonical UTF-8 name.  po_charset_character_iterator
   recognizes UTF-8 by comparing against the address of this array.  */
static const char utf8[] = "UTF-8";

/* The canonicalized encoding name for UTF-8.  */
const char *po_charset_utf8 = utf8;
54
55 /* Canonicalize an encoding name. */
56 const char *
57 po_charset_canonicalize (const char *charset)
58 {
59 /* The list of charsets supported by glibc's iconv() and by the portable
60 iconv() across platforms. Taken from intl/localcharset.h. */
61 static const char *standard_charsets[] =
62 {
63 ascii, "ANSI_X3.4-1968", "US-ASCII", /* i = 0..2 */
64 "ISO-8859-1", "ISO_8859-1", /* i = 3, 4 */
65 "ISO-8859-2", "ISO_8859-2",
66 "ISO-8859-3", "ISO_8859-3",
67 "ISO-8859-4", "ISO_8859-4",
68 "ISO-8859-5", "ISO_8859-5",
69 "ISO-8859-6", "ISO_8859-6",
70 "ISO-8859-7", "ISO_8859-7",
71 "ISO-8859-8", "ISO_8859-8",
72 "ISO-8859-9", "ISO_8859-9",
73 "ISO-8859-13", "ISO_8859-13",
74 "ISO-8859-14", "ISO_8859-14",
75 "ISO-8859-15", "ISO_8859-15", /* i = 25, 26 */
76 "KOI8-R",
77 "KOI8-U",
78 "KOI8-T",
79 "CP850",
80 "CP866",
81 "CP874",
82 "CP932",
83 "CP949",
84 "CP950",
85 "CP1250",
86 "CP1251",
87 "CP1252",
88 "CP1253",
89 "CP1254",
90 "CP1255",
91 "CP1256",
92 "CP1257",
93 "GB2312",
94 "EUC-JP",
95 "EUC-KR",
96 "EUC-TW",
97 "BIG5",
98 "BIG5-HKSCS",
99 "GBK",
100 "GB18030",
101 "SHIFT_JIS",
102 "JOHAB",
103 "TIS-620",
104 "VISCII",
105 "GEORGIAN-PS",
106 utf8
107 };
108 size_t i;
109
110 for (i = 0; i < SIZEOF (standard_charsets); i++)
111 if (c_strcasecmp (charset, standard_charsets[i]) == 0)
112 return standard_charsets[i < 3 ? 0 : i < 27 ? ((i - 3) & ~1) + 3 : i];
113 return NULL;
114 }
115
/* Test for ASCII compatibility.
   CANON_CHARSET is compared exactly (case-sensitively) against the few
   encodings treated as exceptions; every other name yields true.  */
bool
po_charset_ascii_compatible (const char *canon_charset)
{
  /* There are only a few exceptions to ASCII compatibility.  */
  bool is_exception = (strcmp (canon_charset, "SHIFT_JIS") == 0
                       || strcmp (canon_charset, "JOHAB") == 0
                       || strcmp (canon_charset, "VISCII") == 0);

  return !is_exception;
}
128
/* Test for a weird encoding, i.e. an encoding which has double-byte
   characters ending in 0x5C.
   CANON_CHARSET is matched exactly against the list below.  */
bool po_is_charset_weird (const char *canon_charset)
{
  static const char *weird_charsets[] =
  {
    "BIG5",
    "BIG5-HKSCS",
    "GBK",
    "GB18030",
    "SHIFT_JIS",
    "JOHAB"
  };
  const char **entry = weird_charsets;
  const char **end = weird_charsets + SIZEOF (weird_charsets);

  for (; entry < end; entry++)
    {
      if (strcmp (canon_charset, *entry) == 0)
        return true;
    }
  return false;
}
149
/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
   An encoding has CJK structure if every valid character stream is composed
   of single bytes in the range 0x{00..7F} and of byte pairs in the range
   0x{80..FF}{30..FF}.
   CANON_CHARSET is matched exactly against the list below.  */
bool po_is_charset_weird_cjk (const char *canon_charset)
{
  static const char *weird_cjk_charsets[] =
  {                     /* single bytes   double bytes       */
    "BIG5",             /* 0x{00..7F},    0x{A1..F9}{40..FE} */
    "BIG5-HKSCS",       /* 0x{00..7F},    0x{88..FE}{40..FE} */
    "GBK",              /* 0x{00..7F},    0x{81..FE}{40..FE} */
    "GB18030",          /* 0x{00..7F},    0x{81..FE}{30..FE} */
    "SHIFT_JIS",        /* 0x{00..7F},    0x{81..F9}{40..FC} */
    "JOHAB"             /* 0x{00..7F},    0x{84..F9}{31..FE} */
  };
  bool found = false;
  size_t k = 0;

  while (!found && k < SIZEOF (weird_cjk_charsets))
    {
      found = (strcmp (canon_charset, weird_cjk_charsets[k]) == 0);
      k++;
    }
  return found;
}
172
173 /* Hardcoded iterator functions for all kinds of encodings.
174 We could also implement a general iterator function with iconv(),
175 but we need a fast one. */
176
/* Character iterator for 8-bit encodings.
   Every character occupies exactly one byte, so S is never inspected.  */
static size_t
char_iterator (const char *s)
{
  (void) s;
  return 1;
}
183
/* Character iterator for GB2312.  See libiconv/lib/euc_cn.h.  */
/* Character iterator for EUC-KR.  See libiconv/lib/euc_kr.h.  */
/* Returns 2 when both the byte at S and the following byte lie in
   0xA1..0xFE, and 1 otherwise.  The second byte is only read when the
   first one qualifies.  */
static size_t
euc_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;

  if (p[0] < 0xa1 || p[0] == 0xff)
    return 1;
  return (p[1] >= 0xa1 && p[1] != 0xff) ? 2 : 1;
}
198
/* Character iterator for EUC-JP.  See libiconv/lib/euc_jp.h.
   Recognized forms, by lead byte:
     0xA1..0xFE  followed by one byte in 0xA1..0xFE   -> 2 bytes
     0x8E        followed by one byte in 0xA1..0xDF   -> 2 bytes
     0x8F        followed by two bytes in 0xA1..0xFE  -> 3 bytes
   Anything else counts as a single byte.  Trailing bytes are read one at a
   time and only while the preceding bytes are acceptable.  */
static size_t
euc_jp_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  unsigned char lead = p[0];

  if (lead == 0x8e)
    return (p[1] >= 0xa1 && p[1] <= 0xdf) ? 2 : 1;

  if (lead == 0x8f)
    {
      if (p[1] >= 0xa1 && p[1] <= 0xfe
          && p[2] >= 0xa1 && p[2] <= 0xfe)
        return 3;
      return 1;
    }

  if (lead >= 0xa1 && lead <= 0xfe
      && p[1] >= 0xa1 && p[1] <= 0xfe)
    return 2;

  return 1;
}
228
/* Character iterator for EUC-TW.  See libiconv/lib/euc_tw.h.
   Recognized forms, by lead byte:
     0xA1..0xFE  followed by one byte in 0xA1..0xFE              -> 2 bytes
     0x8E        followed by a byte in 0xA1..0xB0 and then two
                 bytes in 0xA1..0xFE                             -> 4 bytes
   Anything else counts as a single byte.  Trailing bytes are read one at a
   time and only while the preceding bytes are acceptable.  */
static size_t
euc_tw_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  unsigned char lead = p[0];

  if (lead == 0x8e)
    {
      if (p[1] >= 0xa1 && p[1] <= 0xb0
          && p[2] >= 0xa1 && p[2] <= 0xfe
          && p[3] >= 0xa1 && p[3] <= 0xfe)
        return 4;
      return 1;
    }

  if (lead >= 0xa1 && lead <= 0xfe
      && p[1] >= 0xa1 && p[1] <= 0xfe)
    return 2;

  return 1;
}
256
/* Character iterator for BIG5.  See libiconv/lib/ces_big5.h.
   Returns 2 for a lead byte in 0xA1..0xFE followed by a trail byte in
   0x40..0x7E or 0xA1..0xFE, and 1 otherwise.  The trail byte is only read
   when the lead byte qualifies.  */
static size_t
big5_character_iterator (const char *s)
{
  unsigned char lead = (unsigned char) s[0];
  unsigned char trail;

  if (lead < 0xa1 || lead > 0xfe)
    return 1;

  trail = (unsigned char) s[1];
  if (trail >= 0x40 && trail <= 0x7e)
    return 2;
  if (trail >= 0xa1 && trail <= 0xfe)
    return 2;
  return 1;
}
270
/* Character iterator for BIG5-HKSCS.  See libiconv/lib/big5hkscs.h.
   Returns 2 for a lead byte in 0x88..0xFE followed by a trail byte in
   0x40..0x7E or 0xA1..0xFE, and 1 otherwise.  The trail byte is only read
   when the lead byte qualifies.  */
static size_t
big5hkscs_character_iterator (const char *s)
{
  unsigned char lead = (unsigned char) s[0];

  if (lead >= 0x88 && lead != 0xff)
    {
      unsigned char trail = (unsigned char) s[1];
      int in_low_range = (trail >= 0x40 && trail <= 0x7e);
      int in_high_range = (trail >= 0xa1 && trail <= 0xfe);

      if (in_low_range || in_high_range)
        return 2;
    }
  return 1;
}
284
/* Character iterator for GBK.  See libiconv/lib/ces_gbk.h and
   libiconv/lib/gbk.h.
   Returns 2 for a lead byte in 0x81..0xFE followed by a trail byte in
   0x40..0xFE other than 0x7F, and 1 otherwise.  The trail byte is only read
   when the lead byte qualifies.  */
static size_t
gbk_character_iterator (const char *s)
{
  unsigned char lead = (unsigned char) s[0];
  unsigned char trail;

  if (lead < 0x81 || lead == 0xff)
    return 1;

  trail = (unsigned char) s[1];
  if (trail < 0x40 || trail == 0x7f || trail == 0xff)
    return 1;
  return 2;
}
299
/* Character iterator for GB18030.  See libiconv/lib/gb18030.h.
   Recognized forms:
     - two bytes: lead byte in 0x81..0xFE, trail byte in 0x40..0xFE other
       than 0x7F;
     - four bytes: lead byte in 0x81..0x84 or 0x90..0xE3, then a byte in
       0x30..0x39, a byte in 0x81..0xFE and a byte in 0x30..0x39.
   The lead byte range 0x90..0xE3 holds the four-byte codes for characters
   outside the Basic Multilingual Plane; without it such a character would
   be split into single bytes and the following bytes misparsed.
   Anything else counts as a single byte.  Trailing bytes are read one at a
   time and only while the preceding bytes are acceptable.  */
static size_t
gb18030_character_iterator (const char *s)
{
  unsigned char c = *s;

  if (c >= 0x81 && c < 0xff)
    {
      unsigned char c2 = s[1];
      if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff))
        return 2;
    }
  if ((c >= 0x81 && c <= 0x84) || (c >= 0x90 && c <= 0xe3))
    {
      unsigned char c2 = s[1];
      if (c2 >= 0x30 && c2 <= 0x39)
        {
          unsigned char c3 = s[2];
          if (c3 >= 0x81 && c3 < 0xff)
            {
              unsigned char c4 = s[3];
              if (c4 >= 0x30 && c4 <= 0x39)
                return 4;
            }
        }
    }
  return 1;
}
327
/* Character iterator for SHIFT_JIS.  See libiconv/lib/sjis.h.
   Returns 2 for a lead byte in 0x81..0x9F or 0xE0..0xF9 followed by a trail
   byte in 0x40..0xFC other than 0x7F, and 1 otherwise.  The trail byte is
   only read when the lead byte qualifies.  */
static size_t
shift_jis_character_iterator (const char *s)
{
  unsigned char lead = (unsigned char) s[0];
  int lead_ok = (lead >= 0x81 && lead <= 0x9f)
                || (lead >= 0xe0 && lead <= 0xf9);

  if (lead_ok)
    {
      unsigned char trail = (unsigned char) s[1];

      if (trail >= 0x40 && trail <= 0xfc && trail != 0x7f)
        return 2;
    }
  return 1;
}
341
/* Character iterator for JOHAB.  See libiconv/lib/johab.h and
   libiconv/lib/johab_hangul.h.
   Recognized two-byte forms, by lead byte:
     0x84..0xD3  followed by a byte in 0x41..0x7E or 0x81..0xFE
     0xD9..0xF9  followed by a byte in 0x31..0x7E or 0x91..0xFE
   Anything else counts as a single byte.  The trail byte is only read when
   the lead byte falls into one of the two ranges.  */
static size_t
johab_character_iterator (const char *s)
{
  unsigned char lead = (unsigned char) s[0];
  unsigned char trail;

  if (lead >= 0x84 && lead <= 0xd3)
    {
      trail = (unsigned char) s[1];
      /* 0x41..0xFE with the gap 0x7F..0x80 removed.  */
      if (trail >= 0x41 && trail <= 0xfe && trail != 0x7f && trail != 0x80)
        return 2;
      return 1;
    }

  if (lead >= 0xd9 && lead <= 0xf9)
    {
      trail = (unsigned char) s[1];
      if (trail >= 0x31 && trail <= 0x7e)
        return 2;
      if (trail >= 0x91 && trail <= 0xfe)
        return 2;
    }
  return 1;
}
362
/* Character iterator for UTF-8.  See libiconv/lib/utf8.h.
   The lead byte selects the expected length: 0xC2..0xDF -> 2,
   0xE0..0xEF -> 3, 0xF0..0xF7 -> 4.  That length is returned only if every
   following byte of the sequence lies in 0x80..0xBF; in all other cases the
   result is 1.  Checking stops at the first unacceptable byte, so no byte
   beyond it is read.  */
static size_t
utf8_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  size_t expected;
  size_t k;

  if (p[0] < 0xc2 || p[0] >= 0xf8)
    return 1;

  if (p[0] < 0xe0)
    expected = 2;
  else if (p[0] < 0xf0)
    expected = 3;
  else
    expected = 4;

  for (k = 1; k < expected; k++)
    {
      if (p[k] < 0x80 || p[k] >= 0xc0)
        return 1;
    }
  return expected;
}
403
404 /* Returns a character iterator for a given encoding.
405 Given a pointer into a string, it returns the number occupied by the next
406 single character. If the piece of string is not valid or if the *s == '\0',
407 it returns 1. */
408 character_iterator_t
409 po_charset_character_iterator (const char *canon_charset)
410 {
411 if (canon_charset == utf8)
412 return utf8_character_iterator;
413 if (strcmp (canon_charset, "GB2312") == 0
414 || strcmp (canon_charset, "EUC-KR") == 0)
415 return euc_character_iterator;
416 if (strcmp (canon_charset, "EUC-JP") == 0)
417 return euc_jp_character_iterator;
418 if (strcmp (canon_charset, "EUC-TW") == 0)
419 return euc_tw_character_iterator;
420 if (strcmp (canon_charset, "BIG5") == 0)
421 return big5_character_iterator;
422 if (strcmp (canon_charset, "BIG5-HKSCS") == 0)
423 return big5hkscs_character_iterator;
424 if (strcmp (canon_charset, "GBK") == 0)
425 return gbk_character_iterator;
426 if (strcmp (canon_charset, "GB18030") == 0)
427 return gb18030_character_iterator;
428 if (strcmp (canon_charset, "SHIFT_JIS") == 0)
429 return shift_jis_character_iterator;
430 if (strcmp (canon_charset, "JOHAB") == 0)
431 return johab_character_iterator;
432 return char_iterator;
433 }
434
435
/* The PO file's encoding, as specified in the header entry.
   NULL after po_lex_charset_init and po_lex_charset_close; set by
   po_lex_charset_set to a canonical name once the header names a charset
   that po_charset_canonicalize accepts.  */
const char *po_lex_charset;

/* Representation of U+2068 FIRST STRONG ISOLATE (FSI) in the PO file's
   encoding, or NULL if not available.  */
const char *po_lex_isolate_start;
/* Representation of U+2069 POP DIRECTIONAL ISOLATE (PDI) in the PO file's
   encoding, or NULL if not available.  */
const char *po_lex_isolate_end;

#if HAVE_ICONV
/* Converter from the PO file's encoding to UTF-8.
   The value (iconv_t)(-1) means that no converter is open.  */
iconv_t po_lex_iconv;
#endif
/* If no converter is available, some information about the structure of the
   PO file's encoding.  Holds the result of po_is_charset_weird_cjk for the
   current charset when that has been determined, false otherwise.  */
bool po_lex_weird_cjk;
453
454 void
455 po_lex_charset_init ()
456 {
457 po_lex_charset = NULL;
458 po_lex_isolate_start = NULL;
459 po_lex_isolate_end = NULL;
460 #if HAVE_ICONV
461 po_lex_iconv = (iconv_t)(-1);
462 #endif
463 po_lex_weird_cjk = false;
464 }
465
/* Determine the PO file's encoding from its header entry and set up the
   lexer's charset state for it.
   HEADER_ENTRY is searched for "charset="; the name that follows, up to the
   next space, tab or newline, is passed to po_charset_canonicalize.
   FILENAME is used in warnings and to recognize POT files by their ".pot"
   suffix.
   If no charset is given, or its name is not a portable one, a warning is
   issued through po_xerror (except in the POT file cases noted below) and
   the lexer state is left as it was.  Otherwise po_lex_charset and the
   isolate strings are set, any previously opened converter is closed, and
   po_lex_iconv (if HAVE_ICONV) and po_lex_weird_cjk are set up as described
   in the comments below.  */
void
po_lex_charset_set (const char *header_entry, const char *filename)
{
  /* Verify the validity of CHARSET.  It is necessary
     1. for the correct treatment of multibyte characters containing
        0x5C bytes in the PO lexer,
     2. so that at run time, gettext() can call iconv() to convert
        msgstr.  */
  const char *charsetstr = c_strstr (header_entry, "charset=");

  if (charsetstr != NULL)
    {
      size_t len;
      char *charset;
      const char *canon_charset;

      /* Copy the charset name into a NUL-terminated temporary buffer.
         The buffer is released with freea at the end of this block.  */
      charsetstr += strlen ("charset=");
      len = strcspn (charsetstr, " \t\n");
      charset = (char *) xmalloca (len + 1);
      memcpy (charset, charsetstr, len);
      charset[len] = '\0';

      canon_charset = po_charset_canonicalize (charset);
      if (canon_charset == NULL)
        {
          /* Don't warn for POT files, because POT files usually contain
             only ASCII msgids.  */
          size_t filenamelen = strlen (filename);

          /* The warning is suppressed only for the combination of a ".pot"
             file name and the literal placeholder name "CHARSET".  */
          if (!(filenamelen >= 4
                && memcmp (filename + filenamelen - 4, ".pot", 4) == 0
                && strcmp (charset, "CHARSET") == 0))
            {
              char *warning_message =
                xasprintf (_("\
Charset \"%s\" is not a portable encoding name.\n\
Message conversion to user's charset might not work.\n"),
                           charset);
              po_xerror (PO_SEVERITY_WARNING, NULL,
                         filename, (size_t)(-1), (size_t)(-1), true,
                         warning_message);
              free (warning_message);
            }
        }
      else
        {
          const char *envval;

          po_lex_charset = canon_charset;

          /* Byte sequences for U+2068 and U+2069 in the two encodings
             handled here.  */
          if (strcmp (canon_charset, "UTF-8") == 0)
            {
              po_lex_isolate_start = "\xE2\x81\xA8";
              po_lex_isolate_end = "\xE2\x81\xA9";
            }
          else if (strcmp (canon_charset, "GB18030") == 0)
            {
              po_lex_isolate_start = "\x81\x36\xAC\x34";
              po_lex_isolate_end = "\x81\x36\xAC\x35";
            }
          else
            {
              /* The other encodings don't contain U+2068, U+2069.  */
              po_lex_isolate_start = NULL;
              po_lex_isolate_end = NULL;
            }

#if HAVE_ICONV
          /* Close a converter that is still open from an earlier call;
             po_lex_iconv is assigned anew on every path below.  */
          if (po_lex_iconv != (iconv_t)(-1))
            iconv_close (po_lex_iconv);
#endif

          /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35
             don't know about multibyte encodings, and require a spurious
             backslash after every multibyte character whose last byte is
             0x5C.  Some programs, like vim, distribute PO files in this
             broken format.  GNU msgfmt must continue to support this old
             PO file format when the Makefile requests it.  */
          envval = getenv ("OLD_PO_FILE_INPUT");
          if (envval != NULL && *envval != '\0')
            {
              /* Assume the PO file is in old format, with extraneous
                 backslashes.  */
#if HAVE_ICONV
              po_lex_iconv = (iconv_t)(-1);
#endif
              po_lex_weird_cjk = false;
            }
          else
            {
              /* Use iconv() to parse multibyte characters.  */
#if HAVE_ICONV
              /* Avoid glibc-2.1 bug with EUC-KR.  */
# if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
     && !defined _LIBICONV_VERSION
              if (strcmp (po_lex_charset, "EUC-KR") == 0)
                po_lex_iconv = (iconv_t)(-1);
              else
# endif
              /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS,
                 GBK, GB18030.  */
# if defined __sun && !defined _LIBICONV_VERSION
              if (   strcmp (po_lex_charset, "GB2312") == 0
                  || strcmp (po_lex_charset, "EUC-TW") == 0
                  || strcmp (po_lex_charset, "BIG5") == 0
                  || strcmp (po_lex_charset, "BIG5-HKSCS") == 0
                  || strcmp (po_lex_charset, "GBK") == 0
                  || strcmp (po_lex_charset, "GB18030") == 0)
                po_lex_iconv = (iconv_t)(-1);
              else
# endif
              /* When one of the workarounds above is compiled in, the
                 following statement is the body of its trailing 'else'.  */
              po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
              if (po_lex_iconv == (iconv_t)(-1))
                {
                  const char *progname;
                  char *warning_message;
                  const char *recommendation;
                  const char *note;
                  char *whole_message;

# if IN_LIBGETTEXTPO
                  progname = "libgettextpo";
# else
                  progname = last_component (program_name);
# endif

                  warning_message =
                    xasprintf (_("\
Charset \"%s\" is not supported. %s relies on iconv(),\n\
and iconv() does not support \"%s\".\n"),
                               po_lex_charset, progname, po_lex_charset);

# if !defined _LIBICONV_VERSION
                  recommendation = _("\
Installing GNU libiconv and then reinstalling GNU gettext\n\
would fix this problem.\n");
# else
                  recommendation = "";
# endif

                  /* Test for a charset which has double-byte characters
                     ending in 0x5C.  For these encodings, the string parser
                     is likely to be confused if it can't see the character
                     boundaries.  */
                  po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
                  if (po_is_charset_weird (po_lex_charset)
                      && !po_lex_weird_cjk)
                    note = _("Continuing anyway, expect parse errors.");
                  else
                    note = _("Continuing anyway.");

                  /* Only the two xasprintf results are freed below;
                     RECOMMENDATION and NOTE are not.  */
                  whole_message =
                    xasprintf ("%s%s%s\n",
                               warning_message, recommendation, note);

                  po_xerror (PO_SEVERITY_WARNING, NULL,
                             filename, (size_t)(-1), (size_t)(-1), true,
                             whole_message);

                  free (whole_message);
                  free (warning_message);
                }
#else
              /* Test for a charset which has double-byte characters
                 ending in 0x5C.  For these encodings, the string parser
                 is likely to be confused if it can't see the character
                 boundaries.  */
              po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
              if (po_is_charset_weird (po_lex_charset) && !po_lex_weird_cjk)
                {
                  const char *progname;
                  char *warning_message;
                  const char *recommendation;
                  const char *note;
                  char *whole_message;

# if IN_LIBGETTEXTPO
                  progname = "libgettextpo";
# else
                  progname = last_component (program_name);
# endif

                  warning_message =
                    xasprintf (_("\
Charset \"%s\" is not supported. %s relies on iconv().\n\
This version was built without iconv().\n"),
                               po_lex_charset, progname);

                  recommendation = _("\
Installing GNU libiconv and then reinstalling GNU gettext\n\
would fix this problem.\n");

                  note = _("Continuing anyway, expect parse errors.");

                  whole_message =
                    xasprintf ("%s%s%s\n",
                               warning_message, recommendation, note);

                  po_xerror (PO_SEVERITY_WARNING, NULL,
                             filename, (size_t)(-1), (size_t)(-1), true,
                             whole_message);

                  free (whole_message);
                  free (warning_message);
                }
#endif
            }
        }
      freea (charset);
    }
  else
    {
      /* Don't warn for POT files, because POT files usually contain
         only ASCII msgids.  */
      size_t filenamelen = strlen (filename);

      if (!(filenamelen >= 4
            && memcmp (filename + filenamelen - 4, ".pot", 4) == 0))
        po_xerror (PO_SEVERITY_WARNING,
                   NULL, filename, (size_t)(-1), (size_t)(-1), true,
                   _("\
Charset missing in header.\n\
Message conversion to user's charset will not work.\n"));
    }
}
691
692 void
693 po_lex_charset_close ()
694 {
695 po_lex_charset = NULL;
696 po_lex_isolate_start = NULL;
697 po_lex_isolate_end = NULL;
698 #if HAVE_ICONV
699 if (po_lex_iconv != (iconv_t)(-1))
700 {
701 iconv_close (po_lex_iconv);
702 po_lex_iconv = (iconv_t)(-1);
703 }
704 #endif
705 po_lex_weird_cjk = false;
706 }