1 /* xgettext Python backend.
2 Copyright (C) 2002-2003, 2005-2011, 2013-2014, 2018-2023 Free Software Foundation, Inc.
3
4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22
23 /* Specification. */
24 #include "x-python.h"
25
26 #include <assert.h>
27 #include <errno.h>
28 #include <stdbool.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32
33 #include "attribute.h"
34 #include "message.h"
35 #include "rc-str-list.h"
36 #include "xgettext.h"
37 #include "xg-pos.h"
38 #include "xg-encoding.h"
39 #include "xg-mixed-string.h"
40 #include "xg-arglist-context.h"
41 #include "xg-arglist-callshape.h"
42 #include "xg-arglist-parser.h"
43 #include "xg-message.h"
44 #include "error.h"
45 #include "error-progname.h"
46 #include "progname.h"
47 #include "basename-lgpl.h"
48 #include "xerror.h"
49 #include "xvasprintf.h"
50 #include "xalloc.h"
51 #include "c-strstr.h"
52 #include "c-ctype.h"
53 #include "po-charset.h"
54 #include "uniname.h"
55 #include "unistr.h"
56 #include "gettext.h"
57
/* Shorthand: _(s) expands to gettext(s).  */
#define _(s) gettext(s)

#undef max /* clean up after MSVC's <stdlib.h> */
/* The larger of A and B.  The winning argument is evaluated twice, so do not
   pass expressions with side effects.  */
#define max(a,b) ((a) > (b) ? (a) : (b))

/* Number of elements of the array A.  Only meaningful for real arrays, not
   for pointers.  */
#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
64
65
66 /* The Python syntax is defined in the Python Reference Manual
67 /usr/share/doc/packages/python/html/ref/index.html.
68 See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c,
69 Python-2.0/Objects/unicodeobject.c. */
70
71
72 /* ====================== Keyword set customization. ====================== */
73
/* If true extract all strings.  */
static bool extract_all = false;

/* Table of keyword specifications registered through x_python_keyword.
   It is initialized lazily, on the first registration.  */
static hash_table keywords;
/* True as long as the built-in keywords still have to be installed by
   init_keywords.  Cleared by x_python_keyword (NULL) and by init_keywords
   itself once the built-in keywords have been registered.  */
static bool default_keywords = true;
79
80
/* Turn on the file-level 'extract_all' flag ("extract all strings").  */
void
x_python_extract_all ()
{
  extract_all = true;
}
86
87
88 void
89 x_python_keyword (const char *name)
90 {
91 if (name == NULL)
92 default_keywords = false;
93 else
94 {
95 const char *end;
96 struct callshape shape;
97 const char *colon;
98
99 if (keywords.table == NULL)
100 hash_init (&keywords, 100);
101
102 split_keywordspec (name, &end, &shape);
103
104 /* The characters between name and end should form a valid C identifier.
105 A colon means an invalid parse in split_keywordspec(). */
106 colon = strchr (name, ':');
107 if (colon == NULL || colon >= end)
108 insert_keyword_callshape (&keywords, name, end - name, &shape);
109 }
110 }
111
112 /* Finish initializing the keywords hash table.
113 Called after argument processing, before each file is processed. */
114 static void
115 init_keywords ()
116 {
117 if (default_keywords)
118 {
119 /* When adding new keywords here, also update the documentation in
120 xgettext.texi! */
121 x_python_keyword ("gettext");
122 x_python_keyword ("ugettext");
123 x_python_keyword ("dgettext:2");
124 x_python_keyword ("ngettext:1,2");
125 x_python_keyword ("ungettext:1,2");
126 x_python_keyword ("dngettext:2,3");
127 x_python_keyword ("_");
128 default_keywords = false;
129 }
130 }
131
/* Record the format-string flags of the built-in Python keywords.
   The specifications are passed to xgettext_record_flag one by one, in the
   order in which they are listed.  */
void
init_flag_table_python ()
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-python-format",
      "ugettext:1:pass-python-format",
      "dgettext:2:pass-python-format",
      "ngettext:1:pass-python-format",
      "ngettext:2:pass-python-format",
      "ungettext:1:pass-python-format",
      "ungettext:2:pass-python-format",
      "dngettext:2:pass-python-format",
      "dngettext:3:pass-python-format",
      "_:1:pass-python-format",
      /* "%:1:python-format" is not recorded: % is an infix operator!  */

      "gettext:1:pass-python-brace-format",
      "ugettext:1:pass-python-brace-format",
      "dgettext:2:pass-python-brace-format",
      "ngettext:1:pass-python-brace-format",
      "ngettext:2:pass-python-brace-format",
      "ungettext:1:pass-python-brace-format",
      "ungettext:2:pass-python-brace-format",
      "dngettext:2:pass-python-brace-format",
      "dngettext:3:pass-python-brace-format",
      "_:1:pass-python-brace-format"
      /* "format:1:python-brace-format" is not recorded either.  */
    };
  size_t i;

  for (i = 0; i < SIZEOF (flag_specs); i++)
    xgettext_record_flag (flag_specs[i]);
}
159
160
161 /* ======================== Reading of characters. ======================== */
162
163 /* The input file stream. */
164 static FILE *fp;
165
166
167 /* 0. Terminate line by \n, regardless whether the external
168 representation of a line terminator is CR (Mac), and CR/LF
169 (DOS/Windows), as Python treats them equally. */
170 static int
171 phase0_getc ()
172 {
173 int c;
174
175 c = getc (fp);
176 if (c == EOF)
177 {
178 if (ferror (fp))
179 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
180 real_file_name);
181 return EOF;
182 }
183
184 if (c == '\r')
185 {
186 int c1 = getc (fp);
187
188 if (c1 != EOF && c1 != '\n')
189 ungetc (c1, fp);
190
191 /* Seen line terminator CR or CR/LF. */
192 return '\n';
193 }
194
195 return c;
196 }
197
198 /* Supports only one pushback character, and not '\n'. */
199 static inline void
200 phase0_ungetc (int c)
201 {
202 if (c != EOF)
203 ungetc (c, fp);
204 }
205
206
207 /* 1. line_number handling. */
208
209 /* Maximum used, roughly a safer MB_LEN_MAX. */
210 #define MAX_PHASE1_PUSHBACK 16
211 static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
212 static int phase1_pushback_length;
213
214 /* Read the next single byte from the input file. */
215 static int
216 phase1_getc ()
217 {
218 int c;
219
220 if (phase1_pushback_length)
221 c = phase1_pushback[--phase1_pushback_length];
222 else
223 c = phase0_getc ();
224
225 if (c == '\n')
226 ++line_number;
227
228 return c;
229 }
230
231 /* Supports MAX_PHASE1_PUSHBACK characters of pushback. */
232 static void
233 phase1_ungetc (int c)
234 {
235 if (c != EOF)
236 {
237 if (c == '\n')
238 --line_number;
239
240 if (phase1_pushback_length == SIZEOF (phase1_pushback))
241 abort ();
242 phase1_pushback[phase1_pushback_length++] = c;
243 }
244 }
245
246
247 /* Phase 2: Conversion to Unicode.
248 This is done early because PEP 0263 specifies that conversion to Unicode
249 conceptually occurs before tokenization. A test case where it matters
250 is with encodings like BIG5: when a double-byte character ending in 0x5C
251 is followed by '\' or 'u0021', the tokenizer must not treat the second
252 half of the double-byte character as a backslash. */
253
/* End-of-file indicator for functions returning an UCS-4 character.  */
#define UEOF -1

/* The lexical context currently being read.  It is handed to
   non_ascii_error_message when a non-ASCII byte shows up in input that is
   declared as ASCII.  */
static lexical_context_ty lexical_context;

/* Stack of pushed-back UCS-4 characters of phase 2, and its fill level.  */
static int phase2_pushback[max (9, UNINAME_MAX + 3)];
static int phase2_pushback_length;
261
/* Read the next Unicode UCS-4 character from the input file.
   Characters pushed back through phase2_ungetc are delivered first.
   Otherwise bytes are fetched with phase1_getc and decoded according to
   xgettext_current_source_encoding:
     - ASCII: the byte is returned as is; a non-ASCII byte is a fatal error,
     - UTF-8: decoded inline, rejecting invalid sequences,
     - anything else: converted with iconv, feeding it one more byte at a
       time until it produces a character.
   Returns UEOF at end of input.  Invalid or incomplete input is reported
   through multiline_error, followed by exit (EXIT_FAILURE).  */
static int
phase2_getc ()
{
  if (phase2_pushback_length)
    return phase2_pushback[--phase2_pushback_length];

  if (xgettext_current_source_encoding == po_charset_ascii)
    {
      int c = phase1_getc ();
      if (c == EOF)
        return UEOF;
      if (!c_isascii (c))
        {
          multiline_error (xstrdup (""),
                           xasprintf ("%s\n%s\n",
                                      non_ascii_error_message (lexical_context,
                                                               real_file_name,
                                                               line_number),
                                      _("\
Please specify the source encoding through --from-code or through a comment\n\
as specified in https://www.python.org/peps/pep-0263.html.\n")));
          exit (EXIT_FAILURE);
        }
      return c;
    }
  else if (xgettext_current_source_encoding != po_charset_utf8)
    {
#if HAVE_ICONV
      /* Use iconv on an increasing number of bytes.  Read only as many bytes
         through phase1_getc as needed.  This is needed to give reasonable
         interactive behaviour when fp is connected to an interactive tty.  */
      unsigned char buf[MAX_PHASE1_PUSHBACK];
      size_t bufcount;

      {
        int c = phase1_getc ();
        if (c == EOF)
          return UEOF;
        buf[0] = (unsigned char) c;
        bufcount = 1;
      }

      for (;;)
        {
          /* Receives the UTF-8 form of the converted character.  */
          unsigned char scratchbuf[6];
          const char *inptr = (const char *) &buf[0];
          size_t insize = bufcount;
          char *outptr = (char *) &scratchbuf[0];
          size_t outsize = sizeof (scratchbuf);

          size_t res = iconv (xgettext_current_source_iconv,
                              (ICONV_CONST char **) &inptr, &insize,
                              &outptr, &outsize);
          /* We expect that a character has been produced if and only if
             some input bytes have been consumed.  */
          if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
            abort ();
          if (outsize == sizeof (scratchbuf))
            {
              /* No character has been produced.  Must be an error.  */
              if (res != (size_t)(-1))
                abort ();

              if (errno == EILSEQ)
                {
                  /* An invalid multibyte sequence was encountered.  */
                  goto invalid;
                }
              else if (errno == EINVAL)
                {
                  /* An incomplete multibyte character.  */
                  int c;

                  if (bufcount == MAX_PHASE1_PUSHBACK)
                    {
                      /* An overlong incomplete multibyte sequence was
                         encountered.  */
                      multiline_error (xstrdup (""),
                                       xasprintf (_("\
%s:%d: Long incomplete multibyte sequence.\n\
Please specify the correct source encoding through --from-code or through a\n\
comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
                                       real_file_name, line_number));
                      exit (EXIT_FAILURE);
                    }

                  /* Read one more byte and retry iconv.  */
                  c = phase1_getc ();
                  if (c == EOF)
                    goto incomplete_at_eof;
                  if (c == '\n')
                    goto incomplete_at_eol;
                  buf[bufcount++] = (unsigned char) c;
                }
              else
                error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
                       real_file_name, line_number);
            }
          else
            {
              size_t outbytes = sizeof (scratchbuf) - outsize;
              size_t bytes = bufcount - insize;
              ucs4_t uc;

              /* We expect that one character has been produced.  */
              if (bytes == 0)
                abort ();
              if (outbytes == 0)
                abort ();
              /* Push back the unused bytes.  */
              while (insize > 0)
                phase1_ungetc (buf[--insize]);
              /* Convert the character from UTF-8 to UCS-4.  */
              if (u8_mbtoucr (&uc, scratchbuf, outbytes) < (int) outbytes)
                {
                  /* scratchbuf contains an out-of-range Unicode character
                     (> 0x10ffff).  */
                  goto invalid;
                }
              return uc;
            }
        }
#else
      /* If we don't have iconv(), the only supported values for
         xgettext_global_source_encoding and thus also for
         xgettext_current_source_encoding are ASCII and UTF-8.  */
      abort ();
#endif
    }
  else
    {
      /* Read an UTF-8 encoded character.
         Reject invalid input, like u8_mbtouc does.  */
      int c;
      ucs4_t uc;

      c = phase1_getc ();
      if (c == EOF)
        return UEOF;
      if (c < 0x80)
        {
          /* Single byte.  */
          uc = c;
        }
      else if (c < 0xc2)
        /* 0x80..0xc1 cannot start a sequence here.  */
        goto invalid;
      else if (c < 0xe0)
        {
          /* Two-byte sequence.  */
          int c1 = phase1_getc ();
          if (c1 == EOF)
            goto incomplete_at_eof;
          if (c1 == '\n')
            goto incomplete_at_eol;
          if ((c1 ^ 0x80) < 0x40)
            uc = ((unsigned int) (c & 0x1f) << 6)
                 | (unsigned int) (c1 ^ 0x80);
          else
            goto invalid;
        }
      else if (c < 0xf0)
        {
          /* Three-byte sequence.  The extra conditions on c1 reject a second
             byte below 0xa0 after 0xe0, and a second byte of 0xa0 or above
             after 0xed.  */
          int c1 = phase1_getc ();
          if (c1 == EOF)
            goto incomplete_at_eof;
          if (c1 == '\n')
            goto incomplete_at_eol;
          if ((c1 ^ 0x80) < 0x40
              && (c >= 0xe1 || c1 >= 0xa0)
              && (c != 0xed || c1 < 0xa0))
            {
              int c2 = phase1_getc ();
              if (c2 == EOF)
                goto incomplete_at_eof;
              if (c2 == '\n')
                goto incomplete_at_eol;
              if ((c2 ^ 0x80) < 0x40)
                uc = ((unsigned int) (c & 0x0f) << 12)
                     | ((unsigned int) (c1 ^ 0x80) << 6)
                     | (unsigned int) (c2 ^ 0x80);
              else
                goto invalid;
            }
          else
            goto invalid;
        }
      else if (c < 0xf8)
        {
          /* Four-byte sequence.  The extra conditions on c and c1 reject a
             second byte below 0x90 after 0xf0, a second byte of 0x90 or
             above after 0xf4, and any lead byte above 0xf4.  */
          int c1 = phase1_getc ();
          if (c1 == EOF)
            goto incomplete_at_eof;
          if (c1 == '\n')
            goto incomplete_at_eol;
          if ((c1 ^ 0x80) < 0x40
              && (c >= 0xf1 || c1 >= 0x90)
              && (c < 0xf4 || (c == 0xf4 && c1 < 0x90)))
            {
              int c2 = phase1_getc ();
              if (c2 == EOF)
                goto incomplete_at_eof;
              if (c2 == '\n')
                goto incomplete_at_eol;
              if ((c2 ^ 0x80) < 0x40)
                {
                  int c3 = phase1_getc ();
                  if (c3 == EOF)
                    goto incomplete_at_eof;
                  if (c3 == '\n')
                    goto incomplete_at_eol;
                  if ((c3 ^ 0x80) < 0x40)
                    uc = ((unsigned int) (c & 0x07) << 18)
                         | ((unsigned int) (c1 ^ 0x80) << 12)
                         | ((unsigned int) (c2 ^ 0x80) << 6)
                         | (unsigned int) (c3 ^ 0x80);
                  else
                    goto invalid;
                }
              else
                goto invalid;
            }
          else
            goto invalid;
        }
      else
        goto invalid;

      return uc;
    }

 invalid:
  /* An invalid multibyte sequence was encountered.  */
  multiline_error (xstrdup (""),
                   xasprintf (_("\
%s:%d: Invalid multibyte sequence.\n\
Please specify the correct source encoding through --from-code or through a\n\
comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
                   real_file_name, line_number));
  exit (EXIT_FAILURE);

 incomplete_at_eof:
  /* The input ended in the middle of a multibyte sequence.  */
  multiline_error (xstrdup (""),
                   xasprintf (_("\
%s:%d: Incomplete multibyte sequence at end of file.\n\
Please specify the correct source encoding through --from-code or through a\n\
comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
                   real_file_name, line_number));
  exit (EXIT_FAILURE);

 incomplete_at_eol:
  /* A newline interrupted a multibyte sequence.  phase1_getc has already
     counted that newline, hence line_number - 1 designates the line that
     contains the sequence.  */
  multiline_error (xstrdup (""),
                   xasprintf (_("\
%s:%d: Incomplete multibyte sequence at end of line.\n\
Please specify the correct source encoding through --from-code or through a\n\
comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
                   real_file_name, line_number - 1));
  exit (EXIT_FAILURE);
}
518
519 /* Supports max (9, UNINAME_MAX + 3) pushback characters. */
520 static void
521 phase2_ungetc (int c)
522 {
523 if (c != UEOF)
524 {
525 if (phase2_pushback_length == SIZEOF (phase2_pushback))
526 abort ();
527 phase2_pushback[phase2_pushback_length++] = c;
528 }
529 }
530
531
532 /* ========================= Accumulating strings. ======================== */
533
534 /* See xg-mixed-string.h for the API. */
535
536
537 /* ======================== Accumulating comments. ======================== */
538
539
540 /* Accumulating a single comment line. */
541
542 static struct mixed_string_buffer comment_buffer;
543
544 static inline void
545 comment_start ()
546 {
547 mixed_string_buffer_init (&comment_buffer, lc_comment,
548 logical_file_name, line_number);
549 }
550
551 static inline bool
552 comment_at_start ()
553 {
554 return mixed_string_buffer_is_empty (&comment_buffer);
555 }
556
/* Append the Unicode character C to the comment line being accumulated.  */
static inline void
comment_add (int c)
{
  mixed_string_buffer_append_unicode (&comment_buffer, c);
}
562
563 static inline const char *
564 comment_line_end ()
565 {
566 char *buffer =
567 mixed_string_contents_free1 (mixed_string_buffer_result (&comment_buffer));
568 size_t buflen = strlen (buffer);
569
570 while (buflen >= 1
571 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
572 --buflen;
573 buffer[buflen] = '\0';
574 savable_comment_add (buffer);
575 lexical_context = lc_outside;
576 return buffer;
577 }
578
579
580 /* These are for tracking whether comments count as immediately before
581 keyword. */
582 static int last_comment_line;
583 static int last_non_comment_line;
584
585
586 /* ======================== Recognizing comments. ======================== */
587
588
/* Recognizing the "coding" comment.
   As specified in PEP 0263, it takes the form
     "coding" [":"|"="] {alphanumeric or "-" or "_" or "."}*
   or
     "set" "fileencoding" "=" {alphanumeric or "-" or "_" or "."}*
   and is located in a comment in a line that
     - is either the first or second line,
     - is not a continuation line,
     - in the first form, contains no other tokens except this comment.  */
598
599 /* Canonicalized encoding name for the current input file. */
600 static const char *xgettext_current_file_source_encoding;
601
602 #if HAVE_ICONV
603 /* Converter from xgettext_current_file_source_encoding to UTF-8 (except from
604 ASCII or UTF-8, when this conversion is a no-op). */
605 static iconv_t xgettext_current_file_source_iconv;
606 #endif
607
/* Make CANON_ENCODING the source encoding of the current input file.
   CANON_ENCODING is compared by pointer against po_charset_ascii and
   po_charset_utf8.  For any other encoding a converter to UTF-8 is opened
   with iconv_open; if that fails, or if this build has no iconv, the program
   exits with an error message.  Finally the per-file settings are copied
   into the xgettext_current_source_* variables.  */
static inline void
set_current_file_source_encoding (const char *canon_encoding)
{
  xgettext_current_file_source_encoding = canon_encoding;

  if (xgettext_current_file_source_encoding != po_charset_ascii
      && xgettext_current_file_source_encoding != po_charset_utf8)
    {
#if HAVE_ICONV
      iconv_t cd;

      /* Avoid glibc-2.1 bug with EUC-KR.  */
# if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
     && !defined _LIBICONV_VERSION
      if (strcmp (xgettext_current_file_source_encoding, "EUC-KR") == 0)
        cd = (iconv_t)(-1);
      else
# endif
      cd = iconv_open (po_charset_utf8, xgettext_current_file_source_encoding);
      /* The message is attributed to line_number - 1.  */
      if (cd == (iconv_t)(-1))
        error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1,
                       _("Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), and iconv() does not support this conversion."),
                       xgettext_current_file_source_encoding, po_charset_utf8,
                       last_component (program_name));
      xgettext_current_file_source_iconv = cd;
#else
      error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1,
                     _("Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). This version was built without iconv()."),
                     xgettext_current_file_source_encoding, po_charset_utf8,
                     last_component (program_name));
#endif
    }

  /* Activate the per-file encoding for the character reader.  */
  xgettext_current_source_encoding = xgettext_current_file_source_encoding;
#if HAVE_ICONV
  xgettext_current_source_iconv = xgettext_current_file_source_iconv;
#endif
}
646
647 static inline void
648 try_to_extract_coding (const char *comment)
649 {
650 const char *p = c_strstr (comment, "coding");
651
652 if (p != NULL)
653 {
654 p += 6;
655 if (*p == ':' || *p == '=')
656 {
657 p++;
658 while (*p == ' ' || *p == '\t')
659 p++;
660 {
661 const char *encoding_start = p;
662
663 while (c_isalnum (*p) || *p == '-' || *p == '_' || *p == '.')
664 p++;
665 {
666 const char *encoding_end = p;
667
668 if (encoding_end > encoding_start)
669 {
670 /* Extract the encoding string. */
671 size_t encoding_len = encoding_end - encoding_start;
672 char *encoding = XNMALLOC (encoding_len + 1, char);
673
674 memcpy (encoding, encoding_start, encoding_len);
675 encoding[encoding_len] = '\0';
676
677 {
678 /* Canonicalize it. */
679 const char *canon_encoding = po_charset_canonicalize (encoding);
680 if (canon_encoding == NULL)
681 {
682 error_at_line (0, 0,
683 logical_file_name, line_number - 1,
684 _("Unknown encoding \"%s\". Proceeding with ASCII instead."),
685 encoding);
686 canon_encoding = po_charset_ascii;
687 }
688
689 /* Activate it. */
690 set_current_file_source_encoding (canon_encoding);
691 }
692
693 free (encoding);
694 }
695 }
696 }
697 }
698 }
699 }
700
701 /* Tracking whether the current line is a continuation line or contains a
702 non-blank character. */
703 static bool continuation_or_nonblank_line;
704
705
706 /* Phase 3: Outside strings, replace backslash-newline with nothing and a
707 comment with nothing. */
708
709 static int
710 phase3_getc ()
711 {
712 int c;
713
714 for (;;)
715 {
716 c = phase2_getc ();
717 if (c == '\\')
718 {
719 c = phase2_getc ();
720 if (c != '\n')
721 {
722 phase2_ungetc (c);
723 /* This shouldn't happen usually, because "A backslash is
724 illegal elsewhere on a line outside a string literal." */
725 return '\\';
726 }
727 /* Eat backslash-newline. */
728 continuation_or_nonblank_line = true;
729 }
730 else if (c == '#')
731 {
732 /* Eat a comment. */
733 const char *comment;
734
735 last_comment_line = line_number;
736 comment_start ();
737 for (;;)
738 {
739 c = phase2_getc ();
740 if (c == UEOF || c == '\n')
741 break;
742 /* We skip all leading white space, but not EOLs. */
743 if (!(comment_at_start () && (c == ' ' || c == '\t')))
744 comment_add (c);
745 }
746 comment = comment_line_end ();
747 if (line_number - 1 <= 2 && !continuation_or_nonblank_line)
748 try_to_extract_coding (comment);
749 continuation_or_nonblank_line = false;
750 return c;
751 }
752 else
753 {
754 if (c == '\n')
755 continuation_or_nonblank_line = false;
756 else if (!(c == ' ' || c == '\t' || c == '\f'))
757 continuation_or_nonblank_line = true;
758 return c;
759 }
760 }
761 }
762
/* Push C back so that the next phase3_getc delivers it again.  The character
   is simply handed to the phase 2 pushback stack.
   Supports only one pushback character.  */
static void
phase3_ungetc (int c)
{
  phase2_ungetc (c);
}
769
770
771 /* ========================= Accumulating strings. ======================== */
772
/* Return value of phase7_getuc when EOF is reached.  */
#define P7_EOF (-1)
/* Return value of phase7_getuc when the string literal ends: at its closing
   delimiter, or after an "unterminated string" warning.  */
#define P7_STRING_END (-2)
776
777 /* Convert an UTF-16 or UTF-32 code point to a return value that can be
778 distinguished from a single-byte return value. */
779 #define UNICODE(code) (0x100 + (code))
780
781 /* Test a return value of phase7_getuc whether it designates an UTF-16 or
782 UTF-32 code point. */
783 #define IS_UNICODE(p7_result) ((p7_result) >= 0x100)
784
785 /* Extract the UTF-16 or UTF-32 code of a return value that satisfies
786 IS_UNICODE. */
787 #define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
788
789
790 /* ========================== Reading of tokens. ========================== */
791
792
/* The kinds of tokens distinguished by the tokenizer.  */
enum token_type_ty
{
  token_type_eof,               /* end of input */
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_comma,             /* , */
  token_type_lbracket,          /* [ */
  token_type_rbracket,          /* ] */
  token_type_string,            /* "abc", 'abc', """abc""", '''abc''' */
  token_type_symbol,            /* symbol, number */
  token_type_plus,              /* + */
  token_type_other              /* misc. operator; also a newline that is not
                                   inside parentheses/braces/brackets */
};
typedef enum token_type_ty token_type_ty;
807
/* A token.  Which of the payload fields is valid depends on 'type'; the
   payload is released by free_token.  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  char *string;                         /* for token_type_symbol */
  mixed_string_ty *mixed_string;        /* for token_type_string */
  refcounted_string_list_ty *comment;   /* for token_type_string */
  int line_number;                      /* value of line_number when the
                                           tokenizer started on this token */
};
817
818 /* Free the memory pointed to by a 'struct token_ty'. */
819 static inline void
820 free_token (token_ty *tp)
821 {
822 if (tp->type == token_type_symbol)
823 free (tp->string);
824 if (tp->type == token_type_string)
825 {
826 mixed_string_free (tp->mixed_string);
827 drop_reference (tp->comment);
828 }
829 }
830
831
832 /* There are two different input syntaxes for strings, "abc" and r"abc",
833 and two different input syntaxes for Unicode strings, u"abc" and ur"abc".
834 Which escape sequences are understood, i.e. what is interpreted specially
835 after backslash?
836 "abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn
837 r"abc"
838 u"abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...}
839 ur"abc" \unnnn
840 The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two
841 \unnnn items. The \ooo and \xnn values are in the current source encoding
842 for byte strings, and Unicode code points for Unicode strings.
843 */
844
/* Read the next element of the contents of a string literal.
   QUOTE_CHAR is the literal's delimiter character; TRIPLE is true when the
   literal only ends at three consecutive delimiters.
   INTERPRET_ANSIC enables the backslash escapes \<nl> \\ \' \" \a \b \f \n
   \r \t \v \ooo \xnn.  INTERPRET_UNICODE enables \unnnn and, in combination
   with INTERPRET_ANSIC, \Unnnnnnnn and \N{...}.
   *BACKSLASH_COUNTER is incremented each time a literal backslash is
   returned and reset to 0 when any other character is returned; when
   INTERPRET_ANSIC is false, its parity decides whether a delimiter or a
   newline terminates the literal.
   Returns
     - P7_EOF at end of input,
     - P7_STRING_END at the end of the literal (also after warning about an
       unterminated string),
     - UNICODE (code) for a character,
     - a plain value in the range 0..255 for an \ooo or \xnn escape when
       INTERPRET_UNICODE is false.
   An escape sequence that is not recognized or is malformed is pushed back
   (except for its backslash) and delivered as a literal backslash followed
   by the characters after it.  */
static int
phase7_getuc (int quote_char,
              bool triple, bool interpret_ansic, bool interpret_unicode,
              unsigned int *backslash_counter)
{
  int c;

  for (;;)
    {
      /* Use phase 2, because phase 3 elides comments.  */
      c = phase2_getc ();

      if (c == UEOF)
        return P7_EOF;

      /* A delimiter counts, unless escape processing is off and it follows
         an odd number of backslashes.  */
      if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0))
        {
          if (triple)
            {
              /* Only three delimiters in a row end the literal; fewer are
                 part of its contents.  */
              int c1 = phase2_getc ();
              if (c1 == quote_char)
                {
                  int c2 = phase2_getc ();
                  if (c2 == quote_char)
                    return P7_STRING_END;
                  phase2_ungetc (c2);
                }
              phase2_ungetc (c1);
              return UNICODE (c);
            }
          else
            return P7_STRING_END;
        }

      if (c == '\n')
        {
          if (triple)
            {
              *backslash_counter = 0;
              return UNICODE ('\n');
            }
          /* In r"..." and ur"..." strings, newline is only allowed
             immediately after an odd number of backslashes (although the
             backslashes are not interpreted!).  */
          if (!(interpret_ansic || (*backslash_counter & 1) == 0))
            {
              *backslash_counter = 0;
              return UNICODE ('\n');
            }
          /* Leave the newline for the tokenizer and end the literal.  */
          phase2_ungetc (c);
          error_with_progname = false;
          error (0, 0, _("%s:%d: warning: unterminated string"),
                 logical_file_name, line_number);
          error_with_progname = true;
          return P7_STRING_END;
        }

      if (c != '\\')
        {
          *backslash_counter = 0;
          return UNICODE (c);
        }

      /* Backslash handling.  */

      if (!interpret_ansic && !interpret_unicode)
        {
          ++*backslash_counter;
          return UNICODE ('\\');
        }

      /* Dispatch according to the character following the backslash.  */
      c = phase2_getc ();
      if (c == UEOF)
        {
          ++*backslash_counter;
          return UNICODE ('\\');
        }

      if (interpret_ansic)
        switch (c)
          {
          case '\n':
            /* Backslash-newline contributes nothing to the string.  */
            continue;
          case '\\':
            ++*backslash_counter;
            return UNICODE (c);
          case '\'': case '"':
            *backslash_counter = 0;
            return UNICODE (c);
          case 'a':
            *backslash_counter = 0;
            return UNICODE ('\a');
          case 'b':
            *backslash_counter = 0;
            return UNICODE ('\b');
          case 'f':
            *backslash_counter = 0;
            return UNICODE ('\f');
          case 'n':
            *backslash_counter = 0;
            return UNICODE ('\n');
          case 'r':
            *backslash_counter = 0;
            return UNICODE ('\r');
          case 't':
            *backslash_counter = 0;
            return UNICODE ('\t');
          case 'v':
            *backslash_counter = 0;
            return UNICODE ('\v');
          case '0': case '1': case '2': case '3': case '4':
          case '5': case '6': case '7':
            {
              /* Octal escape: one to three octal digits.  */
              int n = c - '0';

              c = phase2_getc ();
              if (c != UEOF)
                {
                  if (c >= '0' && c <= '7')
                    {
                      n = (n << 3) + (c - '0');
                      c = phase2_getc ();
                      if (c != UEOF)
                        {
                          if (c >= '0' && c <= '7')
                            n = (n << 3) + (c - '0');
                          else
                            phase2_ungetc (c);
                        }
                    }
                  else
                    phase2_ungetc (c);
                }
              *backslash_counter = 0;
              if (interpret_unicode)
                return UNICODE (n);
              else
                return (unsigned char) n;
            }
          case 'x':
            {
              /* Hexadecimal escape: exactly two hex digits are required,
                 otherwise the sequence is taken literally.  */
              int c1 = phase2_getc ();
              int n1;

              if (c1 >= '0' && c1 <= '9')
                n1 = c1 - '0';
              else if (c1 >= 'A' && c1 <= 'F')
                n1 = c1 - 'A' + 10;
              else if (c1 >= 'a' && c1 <= 'f')
                n1 = c1 - 'a' + 10;
              else
                n1 = -1;

              if (n1 >= 0)
                {
                  int c2 = phase2_getc ();
                  int n2;

                  if (c2 >= '0' && c2 <= '9')
                    n2 = c2 - '0';
                  else if (c2 >= 'A' && c2 <= 'F')
                    n2 = c2 - 'A' + 10;
                  else if (c2 >= 'a' && c2 <= 'f')
                    n2 = c2 - 'a' + 10;
                  else
                    n2 = -1;

                  if (n2 >= 0)
                    {
                      int n = (n1 << 4) + n2;
                      *backslash_counter = 0;
                      if (interpret_unicode)
                        return UNICODE (n);
                      else
                        return (unsigned char) n;
                    }

                  phase2_ungetc (c2);
                }
              /* Not a valid \xnn: push everything after the backslash back,
                 in reverse order of reading.  */
              phase2_ungetc (c1);
              phase2_ungetc (c);
              ++*backslash_counter;
              return UNICODE ('\\');
            }
          }

      if (interpret_unicode)
        {
          if (c == 'u')
            {
              /* \unnnn: exactly four hex digits.  buf remembers the digits
                 read so far, so that they can be pushed back on failure.  */
              unsigned char buf[4];
              unsigned int n = 0;
              int i;

              for (i = 0; i < 4; i++)
                {
                  int c1 = phase2_getc ();

                  if (c1 >= '0' && c1 <= '9')
                    n = (n << 4) + (c1 - '0');
                  else if (c1 >= 'A' && c1 <= 'F')
                    n = (n << 4) + (c1 - 'A' + 10);
                  else if (c1 >= 'a' && c1 <= 'f')
                    n = (n << 4) + (c1 - 'a' + 10);
                  else
                    {
                      phase2_ungetc (c1);
                      while (--i >= 0)
                        phase2_ungetc (buf[i]);
                      phase2_ungetc (c);
                      ++*backslash_counter;
                      return UNICODE ('\\');
                    }

                  buf[i] = c1;
                }
              *backslash_counter = 0;
              return UNICODE (n);
            }

          if (interpret_ansic)
            {
              if (c == 'U')
                {
                  /* \Unnnnnnnn: exactly eight hex digits, and the value must
                     be below 0x110000.  */
                  unsigned char buf[8];
                  unsigned int n = 0;
                  int i;

                  for (i = 0; i < 8; i++)
                    {
                      int c1 = phase2_getc ();

                      if (c1 >= '0' && c1 <= '9')
                        n = (n << 4) + (c1 - '0');
                      else if (c1 >= 'A' && c1 <= 'F')
                        n = (n << 4) + (c1 - 'A' + 10);
                      else if (c1 >= 'a' && c1 <= 'f')
                        n = (n << 4) + (c1 - 'a' + 10);
                      else
                        {
                          phase2_ungetc (c1);
                          while (--i >= 0)
                            phase2_ungetc (buf[i]);
                          phase2_ungetc (c);
                          ++*backslash_counter;
                          return UNICODE ('\\');
                        }

                      buf[i] = c1;
                    }
                  if (n < 0x110000)
                    {
                      *backslash_counter = 0;
                      return UNICODE (n);
                    }

                  error_with_progname = false;
                  error (0, 0, _("%s:%d: warning: invalid Unicode character"),
                         logical_file_name, line_number);
                  error_with_progname = true;

                  /* Out of range: deliver the sequence literally.  */
                  while (--i >= 0)
                    phase2_ungetc (buf[i]);
                  phase2_ungetc (c);
                  ++*backslash_counter;
                  return UNICODE ('\\');
                }

              if (c == 'N')
                {
                  /* \N{name}: look the character up by its Unicode name.  */
                  int c1 = phase2_getc ();
                  if (c1 == '{')
                    {
                      unsigned char buf[UNINAME_MAX + 1];
                      int i;
                      unsigned int n;

                      for (i = 0; i < UNINAME_MAX; i++)
                        {
                          int c2 = phase2_getc ();
                          /* Only printable ASCII may occur in a name.  */
                          if (!(c2 >= ' ' && c2 <= '~'))
                            {
                              phase2_ungetc (c2);
                              while (--i >= 0)
                                phase2_ungetc (buf[i]);
                              phase2_ungetc (c1);
                              phase2_ungetc (c);
                              ++*backslash_counter;
                              return UNICODE ('\\');
                            }
                          if (c2 == '}')
                            break;
                          buf[i] = c2;
                        }
                      buf[i] = '\0';

                      n = unicode_name_character ((char *) buf);
                      if (n != UNINAME_INVALID)
                        {
                          *backslash_counter = 0;
                          return UNICODE (n);
                        }

                      /* NOTE(review): when the loop above ends because
                         UNINAME_MAX characters were read, no '}' has been
                         consumed, yet one is pushed back here; confirm
                         whether that case needs separate handling.  */
                      phase2_ungetc ('}');
                      while (--i >= 0)
                        phase2_ungetc (buf[i]);
                    }
                  phase2_ungetc (c1);
                  phase2_ungetc (c);
                  ++*backslash_counter;
                  return UNICODE ('\\');
                }
            }
        }

      /* Unrecognized escape: the backslash stands for itself, and the
         character after it is read again on the next call.  */
      phase2_ungetc (c);
      ++*backslash_counter;
      return UNICODE ('\\');
    }
}
1166
1167
1168 /* Combine characters into tokens. Discard whitespace except newlines at
1169 the end of logical lines. */
1170
1171 /* Number of pending open parentheses/braces/brackets. */
1172 static int open_pbb;
1173
1174 static token_ty phase5_pushback[2];
1175 static int phase5_pushback_length;
1176
1177 static void
1178 phase5_get (token_ty *tp)
1179 {
1180 int c;
1181
1182 if (phase5_pushback_length)
1183 {
1184 *tp = phase5_pushback[--phase5_pushback_length];
1185 return;
1186 }
1187
1188 for (;;)
1189 {
1190 tp->line_number = line_number;
1191 c = phase3_getc ();
1192
1193 switch (c)
1194 {
1195 case UEOF:
1196 tp->type = token_type_eof;
1197 return;
1198
1199 case ' ':
1200 case '\t':
1201 case '\f':
1202 /* Ignore whitespace and comments. */
1203 continue;
1204
1205 case '\n':
1206 if (last_non_comment_line > last_comment_line)
1207 savable_comment_reset ();
1208 /* Ignore newline if and only if it is used for implicit line
1209 joining. */
1210 if (open_pbb > 0)
1211 continue;
1212 tp->type = token_type_other;
1213 return;
1214 }
1215
1216 last_non_comment_line = tp->line_number;
1217
1218 switch (c)
1219 {
1220 case '.':
1221 {
1222 int c1 = phase3_getc ();
1223 phase3_ungetc (c1);
1224 if (!(c1 >= '0' && c1 <= '9'))
1225 {
1226
1227 tp->type = token_type_other;
1228 return;
1229 }
1230 }
1231 FALLTHROUGH;
1232 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1233 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1234 case 'M': case 'N': case 'O': case 'P': case 'Q':
1235 case 'S': case 'T': case 'V': case 'W': case 'X':
1236 case 'Y': case 'Z':
1237 case '_':
1238 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1239 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1240 case 'm': case 'n': case 'o': case 'p': case 'q':
1241 case 's': case 't': case 'v': case 'w': case 'x':
1242 case 'y': case 'z':
1243 case '0': case '1': case '2': case '3': case '4':
1244 case '5': case '6': case '7': case '8': case '9':
1245 symbol:
1246 /* Symbol, or part of a number. */
1247 {
1248 static char *buffer;
1249 static int bufmax;
1250 int bufpos;
1251
1252 bufpos = 0;
1253 for (;;)
1254 {
1255 if (bufpos >= bufmax)
1256 {
1257 bufmax = 2 * bufmax + 10;
1258 buffer = xrealloc (buffer, bufmax);
1259 }
1260 buffer[bufpos++] = c;
1261 c = phase3_getc ();
1262 switch (c)
1263 {
1264 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1265 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1266 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1267 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1268 case 'Y': case 'Z':
1269 case '_':
1270 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1271 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1272 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1273 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1274 case 'y': case 'z':
1275 case '0': case '1': case '2': case '3': case '4':
1276 case '5': case '6': case '7': case '8': case '9':
1277 continue;
1278 default:
1279 phase3_ungetc (c);
1280 break;
1281 }
1282 break;
1283 }
1284 if (bufpos >= bufmax)
1285 {
1286 bufmax = 2 * bufmax + 10;
1287 buffer = xrealloc (buffer, bufmax);
1288 }
1289 buffer[bufpos] = '\0';
1290 tp->string = xstrdup (buffer);
1291 tp->type = token_type_symbol;
1292 return;
1293 }
1294
1295 /* Strings. */
1296 {
1297 int quote_char;
1298 bool interpret_ansic;
1299 bool interpret_unicode;
1300 bool triple;
1301 unsigned int backslash_counter;
1302
1303 case 'R': case 'r':
1304 {
1305 int c1 = phase2_getc ();
1306 if (c1 == '"' || c1 == '\'')
1307 {
1308 quote_char = c1;
1309 interpret_ansic = false;
1310 interpret_unicode = false;
1311 goto string;
1312 }
1313 phase2_ungetc (c1);
1314 goto symbol;
1315 }
1316
1317 case 'U': case 'u':
1318 {
1319 int c1 = phase2_getc ();
1320 if (c1 == '"' || c1 == '\'')
1321 {
1322 quote_char = c1;
1323 interpret_ansic = true;
1324 interpret_unicode = true;
1325 goto string;
1326 }
1327 if (c1 == 'R' || c1 == 'r')
1328 {
1329 int c2 = phase2_getc ();
1330 if (c2 == '"' || c2 == '\'')
1331 {
1332 quote_char = c2;
1333 interpret_ansic = false;
1334 interpret_unicode = true;
1335 goto string;
1336 }
1337 phase2_ungetc (c2);
1338 }
1339 phase2_ungetc (c1);
1340 goto symbol;
1341 }
1342
1343 case '"': case '\'':
1344 quote_char = c;
1345 interpret_ansic = true;
1346 interpret_unicode = false;
1347 string:
1348 triple = false;
1349 lexical_context = lc_string;
1350 {
1351 int c1 = phase2_getc ();
1352 if (c1 == quote_char)
1353 {
1354 int c2 = phase2_getc ();
1355 if (c2 == quote_char)
1356 triple = true;
1357 else
1358 {
1359 phase2_ungetc (c2);
1360 phase2_ungetc (c1);
1361 }
1362 }
1363 else
1364 phase2_ungetc (c1);
1365 }
1366 backslash_counter = 0;
1367 {
1368 struct mixed_string_buffer msb;
1369
1370 /* Start accumulating the string. */
1371 mixed_string_buffer_init (&msb, lexical_context,
1372 logical_file_name, line_number);
1373 for (;;)
1374 {
1375 int uc = phase7_getuc (quote_char, triple, interpret_ansic,
1376 interpret_unicode, &backslash_counter);
1377
1378 /* Keep line_number in sync. */
1379 msb.line_number = line_number;
1380
1381 if (uc == P7_EOF || uc == P7_STRING_END)
1382 break;
1383
1384 if (IS_UNICODE (uc))
1385 {
1386 assert (UNICODE_VALUE (uc) >= 0
1387 && UNICODE_VALUE (uc) < 0x110000);
1388 mixed_string_buffer_append_unicode (&msb,
1389 UNICODE_VALUE (uc));
1390 }
1391 else
1392 mixed_string_buffer_append_char (&msb, uc);
1393 }
1394 tp->mixed_string = mixed_string_buffer_result (&msb);
1395 tp->comment = add_reference (savable_comment);
1396 lexical_context = lc_outside;
1397 tp->type = token_type_string;
1398 }
1399 return;
1400 }
1401
1402 case '(':
1403 open_pbb++;
1404 tp->type = token_type_lparen;
1405 return;
1406
1407 case ')':
1408 if (open_pbb > 0)
1409 open_pbb--;
1410 tp->type = token_type_rparen;
1411 return;
1412
1413 case ',':
1414 tp->type = token_type_comma;
1415 return;
1416
1417 case '[': case '{':
1418 open_pbb++;
1419 tp->type = (c == '[' ? token_type_lbracket : token_type_other);
1420 return;
1421
1422 case ']': case '}':
1423 if (open_pbb > 0)
1424 open_pbb--;
1425 tp->type = (c == ']' ? token_type_rbracket : token_type_other);
1426 return;
1427
1428 case '+':
1429 tp->type = token_type_plus;
1430 return;
1431
1432 default:
1433 /* We could carefully recognize each of the 2 and 3 character
1434 operators, but it is not necessary, as we only need to recognize
1435 gettext invocations. Don't bother. */
1436 tp->type = token_type_other;
1437 return;
1438 }
1439 }
1440 }
1441
1442 /* Supports only one pushback token. */
1443 static void
1444 phase5_unget (token_ty *tp)
1445 {
1446 if (tp->type != token_type_eof)
1447 {
1448 if (phase5_pushback_length == SIZEOF (phase5_pushback))
1449 abort ();
1450 phase5_pushback[phase5_pushback_length++] = *tp;
1451 }
1452 }
1453
1454
1455 /* Combine adjacent strings to form a single string. Note that the end
1456 of a logical line appears as a token of its own, therefore strings that
1457 belong to different logical lines will not be concatenated. */
1458
1459 static void
1460 x_python_lex (token_ty *tp)
1461 {
1462 phase5_get (tp);
1463 if (tp->type == token_type_string)
1464 {
1465 mixed_string_ty *sum = tp->mixed_string;
1466
1467 for (;;)
1468 {
1469 token_ty token2;
1470 token_ty token3;
1471 token_ty *tp2 = NULL;
1472
1473 phase5_get (&token2);
1474 switch (token2.type)
1475 {
1476 case token_type_plus:
1477 {
1478 phase5_get (&token3);
1479 if (token3.type == token_type_string)
1480 {
1481 free_token (&token2);
1482 tp2 = &token3;
1483 }
1484 else
1485 phase5_unget (&token3);
1486 }
1487 break;
1488 case token_type_string:
1489 tp2 = &token2;
1490 break;
1491 default:
1492 break;
1493 }
1494
1495 if (tp2)
1496 {
1497 sum = mixed_string_concat_free1 (sum, tp2->mixed_string);
1498
1499 free_token (tp2);
1500 continue;
1501 }
1502 phase5_unget (&token2);
1503 break;
1504 }
1505 tp->mixed_string = sum;
1506 }
1507 }
1508
1509
1510 /* ========================= Extracting strings. ========================== */
1511
1512
1513 /* Context lookup table. */
1514 static flag_context_list_table_ty *flag_context_list_table;
1515
1516
1517 /* Maximum supported nesting depth. */
1518 #define MAX_NESTING_DEPTH 1000
1519
1520 /* Current nesting depths. */
1521 static int paren_nesting_depth;
1522 static int bracket_nesting_depth;
1523
1524
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid Python program, and leave the complaints about
   the grammar to the Python interpreter.

     Normal handling: Look for
       keyword ( ... msgid ... )
     Plural handling: Look for
       keyword ( ... msgid ... msgid_plural ... )

   We use recursion because the arguments before msgid or between msgid
   and msgid_plural can contain subexpressions of the same form.  */
1538
1539
1540 /* Extract messages until the next balanced closing parenthesis or bracket.
1541 Extracted messages are added to MLP.
1542 DELIM can be either token_type_rparen or token_type_rbracket, or
1543 token_type_eof to accept both.
1544 Return true upon eof, false upon closing parenthesis or bracket. */
1545 static bool
1546 extract_balanced (message_list_ty *mlp,
1547 token_type_ty delim,
1548 flag_context_ty outer_context,
1549 flag_context_list_iterator_ty context_iter,
1550 struct arglist_parser *argparser)
1551 {
1552 /* Current argument number. */
1553 int arg = 1;
1554 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1555 int state;
1556 /* Parameters of the keyword just seen. Defined only in state 1. */
1557 const struct callshapes *next_shapes = NULL;
1558 /* Context iterator that will be used if the next token is a '('. */
1559 flag_context_list_iterator_ty next_context_iter =
1560 passthrough_context_list_iterator;
1561 /* Current context. */
1562 flag_context_ty inner_context =
1563 inherited_context (outer_context,
1564 flag_context_list_iterator_advance (&context_iter));
1565
1566 /* Start state is 0. */
1567 state = 0;
1568
1569 for (;;)
1570 {
1571 token_ty token;
1572
1573 x_python_lex (&token);
1574 switch (token.type)
1575 {
1576 case token_type_symbol:
1577 {
1578 void *keyword_value;
1579
1580 if (hash_find_entry (&keywords, token.string, strlen (token.string),
1581 &keyword_value)
1582 == 0)
1583 {
1584 next_shapes = (const struct callshapes *) keyword_value;
1585 state = 1;
1586 }
1587 else
1588 state = 0;
1589 }
1590 next_context_iter =
1591 flag_context_list_iterator (
1592 flag_context_list_table_lookup (
1593 flag_context_list_table,
1594 token.string, strlen (token.string)));
1595 free (token.string);
1596 continue;
1597
1598 case token_type_lparen:
1599 if (++paren_nesting_depth > MAX_NESTING_DEPTH)
1600 {
1601 error_with_progname = false;
1602 error (EXIT_FAILURE, 0, _("%s:%d: error: too many open parentheses"),
1603 logical_file_name, line_number);
1604 }
1605 if (extract_balanced (mlp, token_type_rparen,
1606 inner_context, next_context_iter,
1607 arglist_parser_alloc (mlp,
1608 state ? next_shapes : NULL)))
1609 {
1610 arglist_parser_done (argparser, arg);
1611 return true;
1612 }
1613 paren_nesting_depth--;
1614 next_context_iter = null_context_list_iterator;
1615 state = 0;
1616 continue;
1617
1618 case token_type_rparen:
1619 if (delim == token_type_rparen || delim == token_type_eof)
1620 {
1621 arglist_parser_done (argparser, arg);
1622 return false;
1623 }
1624 next_context_iter = null_context_list_iterator;
1625 state = 0;
1626 continue;
1627
1628 case token_type_comma:
1629 arg++;
1630 inner_context =
1631 inherited_context (outer_context,
1632 flag_context_list_iterator_advance (
1633 &context_iter));
1634 next_context_iter = passthrough_context_list_iterator;
1635 state = 0;
1636 continue;
1637
1638 case token_type_lbracket:
1639 if (++bracket_nesting_depth > MAX_NESTING_DEPTH)
1640 {
1641 error_with_progname = false;
1642 error (EXIT_FAILURE, 0, _("%s:%d: error: too many open brackets"),
1643 logical_file_name, line_number);
1644 }
1645 if (extract_balanced (mlp, token_type_rbracket,
1646 null_context, null_context_list_iterator,
1647 arglist_parser_alloc (mlp, NULL)))
1648 {
1649 arglist_parser_done (argparser, arg);
1650 return true;
1651 }
1652 bracket_nesting_depth--;
1653 next_context_iter = null_context_list_iterator;
1654 state = 0;
1655 continue;
1656
1657 case token_type_rbracket:
1658 if (delim == token_type_rbracket || delim == token_type_eof)
1659 {
1660 arglist_parser_done (argparser, arg);
1661 return false;
1662 }
1663 next_context_iter = null_context_list_iterator;
1664 state = 0;
1665 continue;
1666
1667 case token_type_string:
1668 {
1669 lex_pos_ty pos;
1670
1671 pos.file_name = logical_file_name;
1672 pos.line_number = token.line_number;
1673
1674 if (extract_all)
1675 {
1676 char *string = mixed_string_contents (token.mixed_string);
1677 mixed_string_free (token.mixed_string);
1678 remember_a_message (mlp, NULL, string, true, false,
1679 inner_context, &pos,
1680 NULL, token.comment, true);
1681 }
1682 else
1683 arglist_parser_remember (argparser, arg, token.mixed_string,
1684 inner_context,
1685 pos.file_name, pos.line_number,
1686 token.comment, true);
1687 }
1688 drop_reference (token.comment);
1689 next_context_iter = null_context_list_iterator;
1690 state = 0;
1691 continue;
1692
1693 case token_type_eof:
1694 arglist_parser_done (argparser, arg);
1695 return true;
1696
1697 case token_type_plus:
1698 case token_type_other:
1699 next_context_iter = null_context_list_iterator;
1700 state = 0;
1701 continue;
1702
1703 default:
1704 abort ();
1705 }
1706 }
1707 }
1708
1709
1710 void
1711 extract_python (FILE *f,
1712 const char *real_filename, const char *logical_filename,
1713 flag_context_list_table_ty *flag_table,
1714 msgdomain_list_ty *mdlp)
1715 {
1716 message_list_ty *mlp = mdlp->item[0]->messages;
1717
1718 fp = f;
1719 real_file_name = real_filename;
1720 logical_file_name = xstrdup (logical_filename);
1721 line_number = 1;
1722
1723 phase1_pushback_length = 0;
1724
1725 lexical_context = lc_outside;
1726
1727 phase2_pushback_length = 0;
1728
1729 last_comment_line = -1;
1730 last_non_comment_line = -1;
1731
1732 /* For Python, the default source file encoding is UTF-8. This is specified
1733 in PEP 3120. */
1734 xgettext_current_file_source_encoding =
1735 (xgettext_global_source_encoding != NULL ? xgettext_global_source_encoding :
1736 po_charset_utf8);
1737 #if HAVE_ICONV
1738 xgettext_current_file_source_iconv = xgettext_global_source_iconv;
1739 #endif
1740
1741 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1742 #if HAVE_ICONV
1743 xgettext_current_source_iconv = xgettext_current_file_source_iconv;
1744 #endif
1745
1746 continuation_or_nonblank_line = false;
1747
1748 open_pbb = 0;
1749
1750 phase5_pushback_length = 0;
1751
1752 flag_context_list_table = flag_table;
1753 paren_nesting_depth = 0;
1754 bracket_nesting_depth = 0;
1755
1756 init_keywords ();
1757
1758 /* Eat tokens until eof is seen. When extract_balanced returns
1759 due to an unbalanced closing parenthesis, just restart it. */
1760 while (!extract_balanced (mlp, token_type_eof,
1761 null_context, null_context_list_iterator,
1762 arglist_parser_alloc (mlp, NULL)))
1763 ;
1764
1765 fp = NULL;
1766 real_file_name = NULL;
1767 logical_file_name = NULL;
1768 line_number = 0;
1769 }