1 /* Message list charset and locale charset handling.
2 Copyright (C) 2001-2003, 2005-2009, 2019-2023 Free Software Foundation, Inc.
3 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
18
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22 #include <alloca.h>
23
24 /* Specification. */
25 #include "msgl-iconv.h"
26
27 #include <stdbool.h>
28 #include <stdlib.h>
29 #include <string.h>
30
31 #if HAVE_ICONV
32 # include <iconv.h>
33 #endif
34
35 #include "noreturn.h"
36 #include "progname.h"
37 #include "basename-lgpl.h"
38 #include "string-desc.h"
39 #include "message.h"
40 #include "po-charset.h"
41 #include "xstriconv.h"
42 #include "xstriconveh.h"
43 #include "msgl-ascii.h"
44 #include "msgl-ofn.h"
45 #include "xalloc.h"
46 #include "xmalloca.h"
47 #include "c-strstr.h"
48 #include "xvasprintf.h"
49 #include "po-xerror.h"
50 #include "gettext.h"
51
52 #define _(str) gettext (str)
53
54
55 #if HAVE_ICONV
56
57 _GL_NORETURN_FUNC static void conversion_error (const struct conversion_context* context);
58 static void
59 conversion_error (const struct conversion_context* context)
60 {
61 if (context->to_code == po_charset_utf8)
62 /* If a conversion to UTF-8 fails, the problem lies in the input. */
63 po_xerror (PO_SEVERITY_FATAL_ERROR, context->message, NULL, 0, 0, false,
64 xasprintf (_("%s: input is not valid in \"%s\" encoding"),
65 context->from_filename, context->from_code));
66 else
67 po_xerror (PO_SEVERITY_FATAL_ERROR, context->message, NULL, 0, 0, false,
68 xasprintf (_("%s: error while converting from \"%s\" encoding to \"%s\" encoding"),
69 context->from_filename, context->from_code,
70 context->to_code));
71 /* NOTREACHED */
72 abort ();
73 }
74
/* Convert the NUL-terminated STRING through the conversion descriptor CD.
   Returns the converted string as obtained from xmem_cd_iconv.
   If the conversion fails, or if the output is not a single NUL-terminated
   string, conversion_error (CONTEXT) is invoked.  */
char *
convert_string_directly (iconv_t cd, const char *string,
                         const struct conversion_context* context)
{
  char *converted = NULL;
  size_t converted_size = 0;
  /* The terminating NUL is converted along with the text.  */
  size_t input_size = strlen (string) + 1;

  if (xmem_cd_iconv (string, input_size, cd, &converted, &converted_size) == 0
      /* Accept the output only if it has exactly one NUL byte, at the
         end.  */
      && converted_size > 0
      && converted[converted_size - 1] == '\0'
      && strlen (converted) == converted_size - 1)
    return converted;

  conversion_error (context);
  /* NOTREACHED */
  return NULL;
}
93
94 string_desc_t
95 convert_string_desc_directly (iconv_t cd, string_desc_t string,
96 const struct conversion_context* context)
97 {
98 char *result = NULL;
99 size_t resultlen = 0;
100
101 if (xmem_cd_iconv (string_desc_data (string), string_desc_length (string),
102 cd, &result, &resultlen) == 0)
103 return string_desc_new_addr (resultlen, result);
104
105 conversion_error (context);
106 /* NOTREACHED */
107 return string_desc_new_empty ();
108 }
109
110 static char *
111 convert_string (const iconveh_t *cd, const char *string,
112 const struct conversion_context* context)
113 {
114 size_t len = strlen (string) + 1;
115 char *result = NULL;
116 size_t resultlen = 0;
117
118 if (xmem_cd_iconveh (string, len, cd, iconveh_error, NULL,
119 &result, &resultlen) == 0)
120 /* Verify the result has exactly one NUL byte, at the end. */
121 if (resultlen > 0 && result[resultlen - 1] == '\0'
122 && strlen (result) == resultlen - 1)
123 return result;
124
125 conversion_error (context);
126 /* NOTREACHED */
127 return NULL;
128 }
129
130 static void
131 convert_string_list (const iconveh_t *cd, string_list_ty *slp,
132 const struct conversion_context* context)
133 {
134 size_t i;
135
136 if (slp != NULL)
137 for (i = 0; i < slp->nitems; i++)
138 slp->item[i] = convert_string (cd, slp->item[i], context);
139 }
140
141 static void
142 convert_prev_msgid (const iconveh_t *cd, message_ty *mp,
143 const struct conversion_context* context)
144 {
145 if (mp->prev_msgctxt != NULL)
146 mp->prev_msgctxt = convert_string (cd, mp->prev_msgctxt, context);
147 if (mp->prev_msgid != NULL)
148 mp->prev_msgid = convert_string (cd, mp->prev_msgid, context);
149 if (mp->prev_msgid_plural != NULL)
150 mp->prev_msgid_plural = convert_string (cd, mp->prev_msgid_plural, context);
151 }
152
153 static void
154 convert_msgid (const iconveh_t *cd, message_ty *mp,
155 const struct conversion_context* context)
156 {
157 if (mp->msgctxt != NULL)
158 mp->msgctxt = convert_string (cd, mp->msgctxt, context);
159 mp->msgid = convert_string (cd, mp->msgid, context);
160 if (mp->msgid_plural != NULL)
161 mp->msgid_plural = convert_string (cd, mp->msgid_plural, context);
162 }
163
164 static void
165 convert_msgstr (const iconveh_t *cd, message_ty *mp,
166 const struct conversion_context* context)
167 {
168 char *result = NULL;
169 size_t resultlen = 0;
170
171 if (!(mp->msgstr_len > 0 && mp->msgstr[mp->msgstr_len - 1] == '\0'))
172 abort ();
173
174 if (xmem_cd_iconveh (mp->msgstr, mp->msgstr_len, cd, iconveh_error, NULL,
175 &result, &resultlen) == 0)
176 /* Verify the result has a NUL byte at the end. */
177 if (resultlen > 0 && result[resultlen - 1] == '\0')
178 /* Verify the result has the same number of NUL bytes. */
179 {
180 const char *p;
181 const char *pend;
182 int nulcount1;
183 int nulcount2;
184
185 for (p = mp->msgstr, pend = p + mp->msgstr_len, nulcount1 = 0;
186 p < pend;
187 p += strlen (p) + 1, nulcount1++);
188 for (p = result, pend = p + resultlen, nulcount2 = 0;
189 p < pend;
190 p += strlen (p) + 1, nulcount2++);
191
192 if (nulcount1 == nulcount2)
193 {
194 mp->msgstr = result;
195 mp->msgstr_len = resultlen;
196 return;
197 }
198 }
199
200 conversion_error (context);
201 }
202
203 #endif
204
205
206 static bool
207 iconv_message_list_internal (message_list_ty *mlp,
208 const char *canon_from_code,
209 const char *canon_to_code,
210 bool update_header,
211 const char *from_filename)
212 {
213 bool canon_from_code_overridden = (canon_from_code != NULL);
214 bool msgids_changed;
215 size_t j;
216
217 /* If the list is empty, nothing to do. */
218 if (mlp->nitems == 0)
219 return false;
220
221 /* Search the header entry, and extract and replace the charset name. */
222 for (j = 0; j < mlp->nitems; j++)
223 if (is_header (mlp->item[j]) && !mlp->item[j]->obsolete)
224 {
225 const char *header = mlp->item[j]->msgstr;
226
227 if (header != NULL)
228 {
229 const char *charsetstr = c_strstr (header, "charset=");
230
231 if (charsetstr != NULL)
232 {
233 size_t len;
234 char *charset;
235 const char *canon_charset;
236
237 charsetstr += strlen ("charset=");
238 len = strcspn (charsetstr, " \t\n");
239 charset = (char *) xmalloca (len + 1);
240 memcpy (charset, charsetstr, len);
241 charset[len] = '\0';
242
243 canon_charset = po_charset_canonicalize (charset);
244 if (canon_charset == NULL)
245 {
246 if (!canon_from_code_overridden)
247 {
248 /* Don't give an error for POT files, because
249 POT files usually contain only ASCII msgids.
250 Also don't give an error for disguised POT
251 files that actually contain only ASCII msgids. */
252 const char *filename = from_filename;
253 size_t filenamelen;
254
255 if (strcmp (charset, "CHARSET") == 0
256 && ((filename != NULL
257 && (filenamelen = strlen (filename)) >= 4
258 && memcmp (filename + filenamelen - 4, ".pot", 4)
259 == 0)
260 || is_ascii_message_list (mlp)))
261 canon_charset = po_charset_ascii;
262 else
263 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0,
264 false,
265 xasprintf (_("present charset \"%s\" is not a portable encoding name"),
266 charset));
267 }
268 }
269 else
270 {
271 if (canon_from_code == NULL)
272 canon_from_code = canon_charset;
273 else if (canon_from_code != canon_charset)
274 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0,
275 false,
276 xasprintf (_("two different charsets \"%s\" and \"%s\" in input file"),
277 canon_from_code, canon_charset));
278 }
279 freea (charset);
280
281 if (update_header)
282 {
283 size_t len1, len2, len3;
284 char *new_header;
285
286 len1 = charsetstr - header;
287 len2 = strlen (canon_to_code);
288 len3 = (header + strlen (header)) - (charsetstr + len);
289 new_header = XNMALLOC (len1 + len2 + len3 + 1, char);
290 memcpy (new_header, header, len1);
291 memcpy (new_header + len1, canon_to_code, len2);
292 memcpy (new_header + len1 + len2, charsetstr + len,
293 len3 + 1);
294 mlp->item[j]->msgstr = new_header;
295 mlp->item[j]->msgstr_len = len1 + len2 + len3 + 1;
296 }
297 }
298 }
299 }
300 if (canon_from_code == NULL)
301 {
302 if (is_ascii_message_list (mlp))
303 canon_from_code = po_charset_ascii;
304 else
305 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
306 _("input file doesn't contain a header entry with a charset specification"));
307 }
308
309 msgids_changed = false;
310
311 /* If the two encodings are the same, nothing to do. */
312 if (canon_from_code != canon_to_code)
313 {
314 #if HAVE_ICONV
315 iconveh_t cd;
316 struct conversion_context context;
317
318 if (iconveh_open (canon_to_code, canon_from_code, &cd) < 0)
319 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
320 xasprintf (_("Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), and iconv() does not support this conversion."),
321 canon_from_code, canon_to_code,
322 last_component (program_name)));
323
324 context.from_code = canon_from_code;
325 context.to_code = canon_to_code;
326 context.from_filename = from_filename;
327
328 for (j = 0; j < mlp->nitems; j++)
329 {
330 message_ty *mp = mlp->item[j];
331
332 if ((mp->msgctxt != NULL && !is_ascii_string (mp->msgctxt))
333 || !is_ascii_string (mp->msgid))
334 msgids_changed = true;
335 context.message = mp;
336 convert_string_list (&cd, mp->comment, &context);
337 convert_string_list (&cd, mp->comment_dot, &context);
338 convert_prev_msgid (&cd, mp, &context);
339 convert_msgid (&cd, mp, &context);
340 convert_msgstr (&cd, mp, &context);
341 }
342
343 iconveh_close (&cd);
344
345 if (msgids_changed)
346 if (message_list_msgids_changed (mlp))
347 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
348 xasprintf (_("Conversion from \"%s\" to \"%s\" introduces duplicates: some different msgids become equal."),
349 canon_from_code, canon_to_code));
350 #else
351 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
352 xasprintf (_("Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). This version was built without iconv()."),
353 canon_from_code, canon_to_code,
354 last_component (program_name)));
355 #endif
356 }
357
358 return msgids_changed;
359 }
360
361 bool
362 iconv_message_list (message_list_ty *mlp,
363 const char *canon_from_code, const char *canon_to_code,
364 const char *from_filename)
365 {
366 return iconv_message_list_internal (mlp,
367 canon_from_code, canon_to_code, true,
368 from_filename);
369 }
370
371 msgdomain_list_ty *
372 iconv_msgdomain_list (msgdomain_list_ty *mdlp,
373 const char *to_code,
374 bool update_header,
375 const char *from_filename)
376 {
377 const char *canon_to_code;
378 size_t k;
379
380 /* Canonicalize target encoding. */
381 canon_to_code = po_charset_canonicalize (to_code);
382 if (canon_to_code == NULL)
383 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
384 xasprintf (_("target charset \"%s\" is not a portable encoding name."),
385 to_code));
386
387 /* Test whether the control characters required for escaping file names with
388 spaces are present in the target encoding. */
389 if (msgdomain_list_has_filenames_with_spaces (mdlp)
390 && !(canon_to_code == po_charset_utf8
391 || strcmp (canon_to_code, "GB18030") == 0))
392 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
393 xasprintf (_("Cannot write the control characters that protect file names with spaces in the %s encoding"),
394 canon_to_code));
395
396 for (k = 0; k < mdlp->nitems; k++)
397 iconv_message_list_internal (mdlp->item[k]->messages,
398 mdlp->encoding, canon_to_code, update_header,
399 from_filename);
400
401 mdlp->encoding = canon_to_code;
402 return mdlp;
403 }
404
405 #if HAVE_ICONV
406
407 static bool
408 iconvable_string (const iconveh_t *cd, const char *string)
409 {
410 size_t len = strlen (string) + 1;
411 char *result = NULL;
412 size_t resultlen = 0;
413
414 if (xmem_cd_iconveh (string, len, cd, iconveh_error, NULL,
415 &result, &resultlen) == 0)
416 {
417 /* Test if the result has exactly one NUL byte, at the end. */
418 bool ok = (resultlen > 0 && result[resultlen - 1] == '\0'
419 && strlen (result) == resultlen - 1);
420 free (result);
421 return ok;
422 }
423 return false;
424 }
425
426 static bool
427 iconvable_string_list (const iconveh_t *cd, string_list_ty *slp)
428 {
429 size_t i;
430
431 if (slp != NULL)
432 for (i = 0; i < slp->nitems; i++)
433 if (!iconvable_string (cd, slp->item[i]))
434 return false;
435 return true;
436 }
437
438 static bool
439 iconvable_prev_msgid (const iconveh_t *cd, message_ty *mp)
440 {
441 if (mp->prev_msgctxt != NULL)
442 if (!iconvable_string (cd, mp->prev_msgctxt))
443 return false;
444 if (mp->prev_msgid != NULL)
445 if (!iconvable_string (cd, mp->prev_msgid))
446 return false;
447 if (mp->prev_msgid_plural != NULL)
448 if (!iconvable_string (cd, mp->prev_msgid_plural))
449 return false;
450 return true;
451 }
452
453 static bool
454 iconvable_msgid (const iconveh_t *cd, message_ty *mp)
455 {
456 if (mp->msgctxt != NULL)
457 if (!iconvable_string (cd, mp->msgctxt))
458 return false;
459 if (!iconvable_string (cd, mp->msgid))
460 return false;
461 if (mp->msgid_plural != NULL)
462 if (!iconvable_string (cd, mp->msgid_plural))
463 return false;
464 return true;
465 }
466
467 static bool
468 iconvable_msgstr (const iconveh_t *cd, message_ty *mp)
469 {
470 char *result = NULL;
471 size_t resultlen = 0;
472
473 if (!(mp->msgstr_len > 0 && mp->msgstr[mp->msgstr_len - 1] == '\0'))
474 abort ();
475
476 if (xmem_cd_iconveh (mp->msgstr, mp->msgstr_len, cd, iconveh_error, NULL,
477 &result, &resultlen) == 0)
478 {
479 bool ok = false;
480
481 /* Test if the result has a NUL byte at the end. */
482 if (resultlen > 0 && result[resultlen - 1] == '\0')
483 /* Test if the result has the same number of NUL bytes. */
484 {
485 const char *p;
486 const char *pend;
487 int nulcount1;
488 int nulcount2;
489
490 for (p = mp->msgstr, pend = p + mp->msgstr_len, nulcount1 = 0;
491 p < pend;
492 p += strlen (p) + 1, nulcount1++);
493 for (p = result, pend = p + resultlen, nulcount2 = 0;
494 p < pend;
495 p += strlen (p) + 1, nulcount2++);
496
497 if (nulcount1 == nulcount2)
498 ok = true;
499 }
500
501 free (result);
502 return ok;
503 }
504 return false;
505 }
506
507 #endif
508
509 bool
510 is_message_list_iconvable (message_list_ty *mlp,
511 const char *canon_from_code,
512 const char *canon_to_code)
513 {
514 bool canon_from_code_overridden = (canon_from_code != NULL);
515 size_t j;
516
517 /* If the list is empty, nothing to check. */
518 if (mlp->nitems == 0)
519 return true;
520
521 /* Search the header entry, and extract the charset name. */
522 for (j = 0; j < mlp->nitems; j++)
523 if (is_header (mlp->item[j]) && !mlp->item[j]->obsolete)
524 {
525 const char *header = mlp->item[j]->msgstr;
526
527 if (header != NULL)
528 {
529 const char *charsetstr = c_strstr (header, "charset=");
530
531 if (charsetstr != NULL)
532 {
533 size_t len;
534 char *charset;
535 const char *canon_charset;
536
537 charsetstr += strlen ("charset=");
538 len = strcspn (charsetstr, " \t\n");
539 charset = (char *) xmalloca (len + 1);
540 memcpy (charset, charsetstr, len);
541 charset[len] = '\0';
542
543 canon_charset = po_charset_canonicalize (charset);
544 if (canon_charset == NULL)
545 {
546 if (!canon_from_code_overridden)
547 {
548 /* Don't give an error for POT files, because POT
549 files usually contain only ASCII msgids. */
550 if (strcmp (charset, "CHARSET") == 0)
551 canon_charset = po_charset_ascii;
552 else
553 {
554 /* charset is not a portable encoding name. */
555 freea (charset);
556 return false;
557 }
558 }
559 }
560 else
561 {
562 if (canon_from_code == NULL)
563 canon_from_code = canon_charset;
564 else if (canon_from_code != canon_charset)
565 {
566 /* Two different charsets in input file. */
567 freea (charset);
568 return false;
569 }
570 }
571 freea (charset);
572 }
573 }
574 }
575 if (canon_from_code == NULL)
576 {
577 if (is_ascii_message_list (mlp))
578 canon_from_code = po_charset_ascii;
579 else
580 /* Input file lacks a header entry with a charset specification. */
581 return false;
582 }
583
584 /* If the two encodings are the same, nothing to check. */
585 if (canon_from_code != canon_to_code)
586 {
587 #if HAVE_ICONV
588 iconveh_t cd;
589
590 if (iconveh_open (canon_to_code, canon_from_code, &cd) < 0)
591 /* iconv() doesn't support this conversion. */
592 return false;
593
594 for (j = 0; j < mlp->nitems; j++)
595 {
596 message_ty *mp = mlp->item[j];
597
598 if (!(iconvable_string_list (&cd, mp->comment)
599 && iconvable_string_list (&cd, mp->comment_dot)
600 && iconvable_prev_msgid (&cd, mp)
601 && iconvable_msgid (&cd, mp)
602 && iconvable_msgstr (&cd, mp)))
603 return false;
604 }
605
606 iconveh_close (&cd);
607 #else
608 /* This version was built without iconv(). */
609 return false;
610 #endif
611 }
612
613 return true;
614 }