1 /* Extracting a message. Accumulating the message list.
2 Copyright (C) 2001-2020, 2023 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
16
17 #ifdef HAVE_CONFIG_H
18 # include <config.h>
19 #endif
20
21 /* Specification. */
22 #include "xg-message.h"
23
24 #include <stdio.h>
25
26 #include "c-strstr.h"
27 #include "error-progname.h"
28 #include "format.h"
29 #include "read-catalog-abstract.h"
30 #include "xalloc.h"
31 #include "xerror.h"
32 #include "xvasprintf.h"
33 #include "verify.h"
34
35 #include "xgettext.h"
36
37 #include "gettext.h"
38 #define _(str) gettext (str)
39
40
/* Pass STRING through from_current_source_encoding and store the result
   back into STRING.  LCONTEXT is forwarded unchanged as the second argument.
   The macro relies on a pointer variable named 'pos' being in scope at the
   point of use; its file_name and line_number members are forwarded too.
   The expansion is a single parenthesized expression without a trailing
   semicolon, so that an invocation followed by ';' is exactly one statement
   and can safely be used as the body of an if/else.  */
#define CONVERT_STRING(string, lcontext) \
  ((string) = from_current_source_encoding ((string), (lcontext), \
                                            pos->file_name, \
                                            pos->line_number))
44
45
46 /* Update the is_format[] flags depending on the information given in the
47 context. */
48 static void
49 set_format_flags_from_context (enum is_format is_format[NFORMATS],
50 flag_context_ty context, const char *string,
51 lex_pos_ty *pos, const char *pretty_msgstr)
52 {
53 size_t i;
54
55 if (context.is_format1 != undecided
56 || context.is_format2 != undecided
57 || context.is_format3 != undecided
58 || context.is_format4 != undecided)
59 for (i = 0; i < NFORMATS; i++)
60 {
61 if (is_format[i] == undecided)
62 {
63 if (formatstring_parsers[i] == current_formatstring_parser1
64 && context.is_format1 != undecided)
65 is_format[i] = (enum is_format) context.is_format1;
66 if (formatstring_parsers[i] == current_formatstring_parser2
67 && context.is_format2 != undecided)
68 is_format[i] = (enum is_format) context.is_format2;
69 if (formatstring_parsers[i] == current_formatstring_parser3
70 && context.is_format3 != undecided)
71 is_format[i] = (enum is_format) context.is_format3;
72 if (formatstring_parsers[i] == current_formatstring_parser4
73 && context.is_format4 != undecided)
74 is_format[i] = (enum is_format) context.is_format4;
75 }
76 if (possible_format_p (is_format[i]))
77 {
78 struct formatstring_parser *parser = formatstring_parsers[i];
79 char *invalid_reason = NULL;
80 void *descr = parser->parse (string, false, NULL, &invalid_reason);
81
82 if (descr != NULL)
83 parser->free (descr);
84 else
85 {
86 /* The string is not a valid format string. */
87 if (is_format[i] != possible)
88 {
89 char buffer[22];
90
91 error_with_progname = false;
92 if (pos->line_number == (size_t)(-1))
93 buffer[0] = '\0';
94 else
95 sprintf (buffer, ":%ld", (long) pos->line_number);
96 multiline_warning (xasprintf (_("%s%s: warning: "),
97 pos->file_name, buffer),
98 xasprintf (is_format[i] == yes_according_to_context
99 ? _("Although being used in a format string position, the %s is not a valid %s format string. Reason: %s\n")
100 : _("Although declared as such, the %s is not a valid %s format string. Reason: %s\n"),
101 pretty_msgstr,
102 format_language_pretty[i],
103 invalid_reason));
104 error_with_progname = true;
105 }
106
107 is_format[i] = impossible;
108 free (invalid_reason);
109 }
110 }
111 }
112 }
113
114
115 void
116 decide_is_format (message_ty *mp)
117 {
118 size_t i;
119
120 /* If it is not already decided, through programmer comments, whether the
121 msgid is a format string, examine the msgid. This is a heuristic. */
122 for (i = 0; i < NFORMATS; i++)
123 {
124 if (mp->is_format[i] == undecided
125 && (formatstring_parsers[i] == current_formatstring_parser1
126 || formatstring_parsers[i] == current_formatstring_parser2
127 || formatstring_parsers[i] == current_formatstring_parser3
128 || formatstring_parsers[i] == current_formatstring_parser4)
129 /* But avoid redundancy: objc-format is stronger than c-format. */
130 && !(i == format_c && possible_format_p (mp->is_format[format_objc]))
131 && !(i == format_objc && possible_format_p (mp->is_format[format_c]))
132 /* Avoid flagging a string as c-format when it's known to be a
133 qt-format or qt-plural-format or kde-format or boost-format
134 string. */
135 && !(i == format_c
136 && (possible_format_p (mp->is_format[format_qt])
137 || possible_format_p (mp->is_format[format_qt_plural])
138 || possible_format_p (mp->is_format[format_kde])
139 || possible_format_p (mp->is_format[format_kde_kuit])
140 || possible_format_p (mp->is_format[format_boost])))
141 /* Avoid flagging a string as kde-format when it's known to
142 be a kde-kuit-format string. */
143 && !(i == format_kde
144 && possible_format_p (mp->is_format[format_kde_kuit]))
145 /* Avoid flagging a string as kde-kuit-format when it's
146 known to be a kde-format string. Note that this relies
147 on the fact that format_kde < format_kde_kuit, so a
148 string will be marked as kde-format if both are
149 undecided. */
150 && !(i == format_kde_kuit
151 && possible_format_p (mp->is_format[format_kde])))
152 {
153 struct formatstring_parser *parser = formatstring_parsers[i];
154 char *invalid_reason = NULL;
155 void *descr = parser->parse (mp->msgid, false, NULL, &invalid_reason);
156
157 if (descr != NULL)
158 {
159 /* msgid is a valid format string. We mark only those msgids
160 as format strings which contain at least one format directive
161 and thus are format strings with a high probability. We
162 don't mark strings without directives as format strings,
163 because that would force the programmer to add
164 "xgettext: no-c-format" anywhere where a translator wishes
165 to use a percent sign. So, the msgfmt checking will not be
166 perfect. Oh well. */
167 if (parser->get_number_of_directives (descr) > 0
168 && !(parser->is_unlikely_intentional != NULL
169 && parser->is_unlikely_intentional (descr)))
170 mp->is_format[i] = possible;
171
172 parser->free (descr);
173 }
174 else
175 {
176 /* msgid is not a valid format string. */
177 mp->is_format[i] = impossible;
178 free (invalid_reason);
179 }
180 }
181 }
182 }
183
184 void
185 intersect_range (message_ty *mp, const struct argument_range *range)
186 {
187 if (has_range_p (*range))
188 {
189 if (has_range_p (mp->range))
190 {
191 if (range->min < mp->range.min)
192 mp->range.min = range->min;
193 if (range->max > mp->range.max)
194 mp->range.max = range->max;
195 }
196 else
197 mp->range = *range;
198 }
199 }
200
201 void
202 decide_do_wrap (message_ty *mp)
203 {
204 /* By default we wrap. */
205 mp->do_wrap = (mp->do_wrap == no ? no : yes);
206 }
207
208 void
209 decide_syntax_check (message_ty *mp)
210 {
211 size_t i;
212
213 for (i = 0; i < NSYNTAXCHECKS; i++)
214 if (mp->do_syntax_check[i] == undecided)
215 mp->do_syntax_check[i] = default_syntax_check[i] == yes ? yes : no;
216 }
217
218
219 static void
220 warn_format_string (enum is_format is_format[NFORMATS], const char *string,
221 lex_pos_ty *pos, const char *pretty_msgstr)
222 {
223 if (possible_format_p (is_format[format_python])
224 && get_python_format_unnamed_arg_count (string) > 1)
225 {
226 char buffer[22];
227
228 error_with_progname = false;
229 if (pos->line_number == (size_t)(-1))
230 buffer[0] = '\0';
231 else
232 sprintf (buffer, ":%ld", (long) pos->line_number);
233 multiline_warning (xasprintf (_("%s%s: warning: "),
234 pos->file_name, buffer),
235 xasprintf (_("\
236 '%s' format string with unnamed arguments cannot be properly localized:\n\
237 The translator cannot reorder the arguments.\n\
238 Please consider using a format string with named arguments,\n\
239 and a mapping instead of a tuple for the arguments.\n"),
240 pretty_msgstr));
241 error_with_progname = true;
242 }
243 }
244
245
246 message_ty *
247 remember_a_message (message_list_ty *mlp, char *msgctxt, char *msgid,
248 bool is_utf8, bool pluralp, flag_context_ty context,
249 lex_pos_ty *pos,
250 const char *extracted_comment,
251 refcounted_string_list_ty *comment, bool comment_is_utf8)
252 {
253 enum is_format is_format[NFORMATS];
254 struct argument_range range;
255 enum is_wrap do_wrap;
256 enum is_syntax_check do_syntax_check[NSYNTAXCHECKS];
257 message_ty *mp;
258 size_t i;
259
260 /* See whether we shall exclude this message. */
261 if (exclude != NULL && message_list_search (exclude, msgctxt, msgid) != NULL)
262 {
263 /* Tell the lexer to reset its comment buffer, so that the next
264 message gets the correct comments. */
265 xgettext_comment_reset ();
266 savable_comment_reset ();
267
268 if (msgctxt != NULL)
269 free (msgctxt);
270 free (msgid);
271
272 return NULL;
273 }
274
275 savable_comment_to_xgettext_comment (comment);
276
277 for (i = 0; i < NFORMATS; i++)
278 is_format[i] = undecided;
279 range.min = -1;
280 range.max = -1;
281 do_wrap = undecided;
282 for (i = 0; i < NSYNTAXCHECKS; i++)
283 do_syntax_check[i] = undecided;
284
285 if (!is_utf8)
286 {
287 if (msgctxt != NULL)
288 CONVERT_STRING (msgctxt, lc_string);
289 CONVERT_STRING (msgid, lc_string);
290 }
291
292 if (msgctxt == NULL && msgid[0] == '\0' && !xgettext_omit_header)
293 {
294 char buffer[22];
295
296 error_with_progname = false;
297 if (pos->line_number == (size_t)(-1))
298 buffer[0] = '\0';
299 else
300 sprintf (buffer, ":%ld", (long) pos->line_number);
301 multiline_warning (xasprintf (_("%s%s: warning: "), pos->file_name,
302 buffer),
303 xstrdup (_("\
304 Empty msgid. It is reserved by GNU gettext:\n\
305 gettext(\"\") returns the header entry with\n\
306 meta information, not the empty string.\n")));
307 error_with_progname = true;
308 }
309
310 /* See if we have seen this message before. */
311 mp = message_list_search (mlp, msgctxt, msgid);
312 if (mp != NULL)
313 {
314 if (pluralp != (mp->msgid_plural != NULL))
315 {
316 lex_pos_ty pos1;
317 lex_pos_ty pos2;
318 char buffer1[22];
319 char buffer2[22];
320
321 if (pluralp)
322 {
323 pos1 = mp->pos;
324 pos2 = *pos;
325 }
326 else
327 {
328 pos1 = *pos;
329 pos2 = mp->pos;
330 }
331
332 if (pos1.line_number == (size_t)(-1))
333 buffer1[0] = '\0';
334 else
335 sprintf (buffer1, ":%ld", (long) pos1.line_number);
336 if (pos2.line_number == (size_t)(-1))
337 buffer2[0] = '\0';
338 else
339 sprintf (buffer2, ":%ld", (long) pos2.line_number);
340 multiline_warning (xstrdup (_("warning: ")),
341 xasprintf ("%s\n%s\n%s\n%s\n",
342 xasprintf (_("msgid '%s' is used without plural and with plural."),
343 msgid),
344 xasprintf (_("%s%s: Here is the occurrence without plural."),
345 pos1.file_name, buffer1),
346 xasprintf (_("%s%s: Here is the occurrence with plural."),
347 pos2.file_name, buffer2),
348 xstrdup (_("Workaround: If the msgid is a sentence, change the wording of the sentence; otherwise, use contexts for disambiguation."))));
349 }
350
351 if (msgctxt != NULL)
352 free (msgctxt);
353 free (msgid);
354 for (i = 0; i < NFORMATS; i++)
355 is_format[i] = mp->is_format[i];
356 do_wrap = mp->do_wrap;
357 for (i = 0; i < NSYNTAXCHECKS; i++)
358 do_syntax_check[i] = mp->do_syntax_check[i];
359 }
360 else
361 {
362 const char *msgstr;
363
364 /* Construct the msgstr from the prefix and suffix, otherwise use the
365 empty string. */
366 if (msgstr_prefix)
367 {
368 msgstr = xasprintf ("%s%s%s", msgstr_prefix, msgid, msgstr_suffix);
369 assume (msgstr != NULL);
370 }
371 else
372 msgstr = "";
373
374 /* Allocate a new message and append the message to the list. */
375 mp = message_alloc (msgctxt, msgid, NULL, msgstr, strlen (msgstr) + 1,
376 pos);
377 /* Do not free msgctxt and msgid. */
378 message_list_append (mlp, mp);
379 }
380
381 /* Determine whether the context specifies that the msgid is a format
382 string. */
383 set_format_flags_from_context (is_format, context, mp->msgid, pos, "msgid");
384
385 /* Ask the lexer for the comments it has seen. */
386 {
387 size_t nitems_before;
388 size_t nitems_after;
389 int j;
390 bool add_all_remaining_comments;
391 /* The string before the comment tag. For example, If "** TRANSLATORS:"
392 is seen and the comment tag is "TRANSLATORS:",
393 then comment_tag_prefix is set to "** ". */
394 const char *comment_tag_prefix = "";
395 size_t comment_tag_prefix_length = 0;
396
397 nitems_before = (mp->comment_dot != NULL ? mp->comment_dot->nitems : 0);
398
399 if (extracted_comment != NULL)
400 {
401 char *copy = xstrdup (extracted_comment);
402 char *rest;
403
404 rest = copy;
405 while (*rest != '\0')
406 {
407 char *newline = strchr (rest, '\n');
408
409 if (newline != NULL)
410 {
411 *newline = '\0';
412 message_comment_dot_append (mp, rest);
413 rest = newline + 1;
414 }
415 else
416 {
417 message_comment_dot_append (mp, rest);
418 break;
419 }
420 }
421 free (copy);
422 }
423
424 add_all_remaining_comments = add_all_comments;
425 for (j = 0; ; ++j)
426 {
427 const char *s = xgettext_comment (j);
428 const char *t;
429 if (s == NULL)
430 break;
431
432 if (!comment_is_utf8)
433 CONVERT_STRING (s, lc_comment);
434
435 /* To reduce the possibility of unwanted matches we do a two
436 step match: the line must contain 'xgettext:' and one of
437 the possible format description strings. */
438 if ((t = c_strstr (s, "xgettext:")) != NULL)
439 {
440 bool tmp_fuzzy;
441 enum is_format tmp_format[NFORMATS];
442 struct argument_range tmp_range;
443 enum is_wrap tmp_wrap;
444 enum is_syntax_check tmp_syntax_check[NSYNTAXCHECKS];
445 bool interesting;
446
447 t += strlen ("xgettext:");
448
449 po_parse_comment_special (t, &tmp_fuzzy, tmp_format, &tmp_range,
450 &tmp_wrap, tmp_syntax_check);
451
452 interesting = false;
453 for (i = 0; i < NFORMATS; i++)
454 if (tmp_format[i] != undecided)
455 {
456 is_format[i] = tmp_format[i];
457 interesting = true;
458 }
459 if (has_range_p (tmp_range))
460 {
461 range = tmp_range;
462 interesting = true;
463 }
464 if (tmp_wrap != undecided)
465 {
466 do_wrap = tmp_wrap;
467 interesting = true;
468 }
469 for (i = 0; i < NSYNTAXCHECKS; i++)
470 if (tmp_syntax_check[i] != undecided)
471 {
472 do_syntax_check[i] = tmp_syntax_check[i];
473 interesting = true;
474 }
475
476 /* If the "xgettext:" marker was followed by an interesting
477 keyword, and we updated our is_format/do_wrap variables,
478 we don't print the comment as a #. comment. */
479 if (interesting)
480 continue;
481 }
482
483 if (!add_all_remaining_comments && comment_tag != NULL)
484 {
485 /* When the comment tag is seen, it drags in not only the line
486 which it starts, but all remaining comment lines. */
487 if ((t = c_strstr (s, comment_tag)) != NULL)
488 {
489 add_all_remaining_comments = true;
490 comment_tag_prefix = s;
491 comment_tag_prefix_length = t - s;
492 }
493 }
494
495 if (add_all_remaining_comments)
496 {
497 if (strncmp (s, comment_tag_prefix, comment_tag_prefix_length) == 0)
498 s += comment_tag_prefix_length;
499 message_comment_dot_append (mp, s);
500 }
501 }
502
503 nitems_after = (mp->comment_dot != NULL ? mp->comment_dot->nitems : 0);
504
505 /* Don't add the comments if they are a repetition of the tail of the
506 already present comments. This avoids unneeded duplication if the
507 same message appears several times, each time with the same comment. */
508 if (nitems_before < nitems_after)
509 {
510 size_t added = nitems_after - nitems_before;
511
512 if (added <= nitems_before)
513 {
514 bool repeated = true;
515
516 for (i = 0; i < added; i++)
517 if (strcmp (mp->comment_dot->item[nitems_before - added + i],
518 mp->comment_dot->item[nitems_before + i]) != 0)
519 {
520 repeated = false;
521 break;
522 }
523
524 if (repeated)
525 {
526 for (i = 0; i < added; i++)
527 free ((char *) mp->comment_dot->item[nitems_before + i]);
528 mp->comment_dot->nitems = nitems_before;
529 }
530 }
531 }
532 }
533
534 for (i = 0; i < NFORMATS; i++)
535 mp->is_format[i] = is_format[i];
536 decide_is_format (mp);
537
538 intersect_range (mp, &range);
539
540 mp->do_wrap = do_wrap;
541 decide_do_wrap (mp);
542
543 for (i = 0; i < NSYNTAXCHECKS; i++)
544 mp->do_syntax_check[i] = do_syntax_check[i];
545 decide_syntax_check (mp);
546
547 /* Warn about the use of non-reorderable format strings when the programming
548 language also provides reorderable format strings. */
549 warn_format_string (is_format, mp->msgid, pos, "msgid");
550
551 /* Remember where we saw this msgid. */
552 message_comment_filepos (mp, pos->file_name, pos->line_number);
553
554 /* Tell the lexer to reset its comment buffer, so that the next
555 message gets the correct comments. */
556 xgettext_comment_reset ();
557 savable_comment_reset ();
558
559 return mp;
560 }
561
562
563 void
564 remember_a_message_plural (message_ty *mp, char *string, bool is_utf8,
565 flag_context_ty context, lex_pos_ty *pos,
566 refcounted_string_list_ty *comment,
567 bool comment_is_utf8)
568 {
569 char *msgid_plural;
570
571 msgid_plural = string;
572
573 savable_comment_to_xgettext_comment (comment);
574
575 if (!is_utf8)
576 CONVERT_STRING (msgid_plural, lc_string);
577
578 /* See if the message is already a plural message. */
579 if (mp->msgid_plural == NULL)
580 {
581 char *msgstr1_malloc = NULL;
582 const char *msgstr1;
583 size_t msgstr1_len;
584 char *msgstr;
585 size_t i;
586
587 mp->msgid_plural = msgid_plural;
588
589 /* Construct the first plural form from the prefix and suffix,
590 otherwise use the empty string. The translator will have to
591 provide additional plural forms. */
592 if (msgstr_prefix)
593 {
594 msgstr1_malloc =
595 xasprintf ("%s%s%s", msgstr_prefix, msgid_plural, msgstr_suffix);
596 msgstr1 = msgstr1_malloc;
597 assume (msgstr1 != NULL);
598 }
599 else
600 msgstr1 = "";
601 msgstr1_len = strlen (msgstr1) + 1;
602 msgstr = XNMALLOC (mp->msgstr_len + msgstr1_len, char);
603 memcpy (msgstr, mp->msgstr, mp->msgstr_len);
604 memcpy (msgstr + mp->msgstr_len, msgstr1, msgstr1_len);
605 mp->msgstr = msgstr;
606 mp->msgstr_len = mp->msgstr_len + msgstr1_len;
607 free (msgstr1_malloc);
608
609 /* Determine whether the context specifies that the msgid_plural is a
610 format string. */
611 set_format_flags_from_context (mp->is_format, context, mp->msgid_plural,
612 pos, "msgid_plural");
613
614 /* If it is not already decided, through programmer comments or
615 the msgid, whether the msgid is a format string, examine the
616 msgid_plural. This is a heuristic. */
617 for (i = 0; i < NFORMATS; i++)
618 if ((formatstring_parsers[i] == current_formatstring_parser1
619 || formatstring_parsers[i] == current_formatstring_parser2
620 || formatstring_parsers[i] == current_formatstring_parser3
621 || formatstring_parsers[i] == current_formatstring_parser4)
622 && (mp->is_format[i] == undecided || mp->is_format[i] == possible)
623 /* But avoid redundancy: objc-format is stronger than c-format. */
624 && !(i == format_c
625 && possible_format_p (mp->is_format[format_objc]))
626 && !(i == format_objc
627 && possible_format_p (mp->is_format[format_c]))
628 /* Avoid flagging a string as c-format when it's known to be a
629 qt-format or qt-plural-format or boost-format string. */
630 && !(i == format_c
631 && (possible_format_p (mp->is_format[format_qt])
632 || possible_format_p (mp->is_format[format_qt_plural])
633 || possible_format_p (mp->is_format[format_kde])
634 || possible_format_p (mp->is_format[format_kde_kuit])
635 || possible_format_p (mp->is_format[format_boost])))
636 /* Avoid flagging a string as kde-format when it's known
637 to be a kde-kuit-format string. */
638 && !(i == format_kde
639 && possible_format_p (mp->is_format[format_kde_kuit]))
640 /* Avoid flagging a string as kde-kuit-format when it's
641 known to be a kde-format string. Note that this relies
642 on the fact that format_kde < format_kde_kuit, so a
643 string will be marked as kde-format if both are
644 undecided. */
645 && !(i == format_kde_kuit
646 && possible_format_p (mp->is_format[format_kde])))
647 {
648 struct formatstring_parser *parser = formatstring_parsers[i];
649 char *invalid_reason = NULL;
650 void *descr =
651 parser->parse (mp->msgid_plural, false, NULL, &invalid_reason);
652
653 if (descr != NULL)
654 {
655 /* Same heuristic as in remember_a_message. */
656 if (parser->get_number_of_directives (descr) > 0
657 && !(parser->is_unlikely_intentional != NULL
658 && parser->is_unlikely_intentional (descr)))
659 mp->is_format[i] = possible;
660
661 parser->free (descr);
662 }
663 else
664 {
665 /* msgid_plural is not a valid format string. */
666 mp->is_format[i] = impossible;
667 free (invalid_reason);
668 }
669 }
670
671 /* Warn about the use of non-reorderable format strings when the programming
672 language also provides reorderable format strings. */
673 warn_format_string (mp->is_format, mp->msgid_plural, pos, "msgid_plural");
674 }
675 else
676 free (msgid_plural);
677
678 /* Tell the lexer to reset its comment buffer, so that the next
679 message gets the correct comments. */
680 xgettext_comment_reset ();
681 savable_comment_reset ();
682 }