1 /* xgettext Lisp backend.
2 Copyright (C) 2001-2003, 2005-2009, 2018-2023 Free Software Foundation, Inc.
3
4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22
23 /* Specification. */
24 #include "x-lisp.h"
25
26 #include <errno.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31
32 #include "attribute.h"
33 #include "message.h"
34 #include "xgettext.h"
35 #include "xg-pos.h"
36 #include "xg-mixed-string.h"
37 #include "xg-arglist-context.h"
38 #include "xg-arglist-callshape.h"
39 #include "xg-arglist-parser.h"
40 #include "xg-message.h"
41 #include "error.h"
42 #include "error-progname.h"
43 #include "xalloc.h"
44 #include "mem-hash-map.h"
45 #include "gettext.h"
46
47 #define _(s) gettext(s)
48
49
50 /* The Common Lisp syntax is described in the Common Lisp HyperSpec, chapter 2.
51 Since we are interested only in strings and in forms similar to
52 (gettext msgid ...)
53 or (ngettext msgid msgid_plural ...)
54 we make the following simplifications:
55
56 - Assume the keywords and strings are in an ASCII compatible encoding.
57 This means we can read the input file one byte at a time, instead of
58 one character at a time. No need to worry about multibyte characters:
59 If they occur as part of identifiers, they most probably act as
60 constituent characters, and the byte based approach will do the same.
61
62 - Assume the read table is the standard Common Lisp read table.
63 Non-standard read tables are mostly used to read data, not programs.
64
65 - Assume the read table case is :UPCASE, and *READ-BASE* is 10.
66
67 - Don't interpret #n= and #n#, they usually don't appear in programs.
68
69 - Don't interpret #+, #-, they are unlikely to appear in a gettext form.
70
71 The remaining syntax rules are:
72
73 - The syntax code assigned to each character, and how tokens are built
74 up from characters (single escape, multiple escape etc.).
75
76 - Comment syntax: ';' and '#| ... |#'.
77
78 - String syntax: "..." with single escapes.
79
80 - Read macros and dispatch macro character '#'. Needed to be able to
81 tell which is the n-th argument of a function call.
82
83 */
84
85
86 /* ========================= Lexer customization. ========================= */
87
88 /* 'readtable_case' is the case conversion that is applied to non-escaped
89 parts of symbol tokens. In Common Lisp: (readtable-case *readtable*). */
90
91 enum rtcase
92 {
93 case_upcase,
94 case_downcase,
95 case_preserve,
96 case_invert
97 };
98
99 static enum rtcase readtable_case = case_upcase;
100
101 /* 'read_base' is the assumed radix of integers and rational numbers.
102 In Common Lisp: *read-base*. */
103 static int read_base = 10;
104
105 /* 'read_preserve_whitespace' specifies whether a whitespace character
106 that terminates a token must be pushed back on the input stream.
107 We set it to true, because the special newline side effect in read_object()
108 requires that read_object() sees every newline not inside a token. */
109 static bool read_preserve_whitespace = true;
110
111
112 /* ====================== Keyword set customization. ====================== */
113
114 /* If true extract all strings. */
115 static bool extract_all = false;
116
117 static hash_table keywords;
118 static bool default_keywords = true;
119
120
121 void
122 x_lisp_extract_all ()
123 {
124 extract_all = true;
125 }
126
127
128 void
129 x_lisp_keyword (const char *name)
130 {
131 if (name == NULL)
132 default_keywords = false;
133 else
134 {
135 const char *end;
136 struct callshape shape;
137 const char *colon;
138 size_t len;
139 char *symname;
140 size_t i;
141
142 if (keywords.table == NULL)
143 hash_init (&keywords, 100);
144
145 split_keywordspec (name, &end, &shape);
146
147 /* The characters between name and end should form a valid Lisp symbol.
148 Extract the symbol name part. */
149 colon = strchr (name, ':');
150 if (colon != NULL && colon < end)
151 {
152 name = colon + 1;
153 if (name < end && *name == ':')
154 name++;
155 colon = strchr (name, ':');
156 if (colon != NULL && colon < end)
157 return;
158 }
159
160 /* Uppercase it. */
161 len = end - name;
162 symname = XNMALLOC (len, char);
163 for (i = 0; i < len; i++)
164 symname[i] =
165 (name[i] >= 'a' && name[i] <= 'z' ? name[i] - 'a' + 'A' : name[i]);
166
167 insert_keyword_callshape (&keywords, symname, len, &shape);
168 }
169 }
170
171 /* Finish initializing the keywords hash table.
172 Called after argument processing, before each file is processed. */
173 static void
174 init_keywords ()
175 {
176 if (default_keywords)
177 {
178 /* When adding new keywords here, also update the documentation in
179 xgettext.texi! */
180 x_lisp_keyword ("gettext"); /* I18N:GETTEXT */
181 x_lisp_keyword ("ngettext:1,2"); /* I18N:NGETTEXT */
182 x_lisp_keyword ("gettext-noop");
183 default_keywords = false;
184 }
185 }
186
187 void
188 init_flag_table_lisp ()
189 {
190 xgettext_record_flag ("gettext:1:pass-lisp-format");
191 xgettext_record_flag ("ngettext:1:pass-lisp-format");
192 xgettext_record_flag ("ngettext:2:pass-lisp-format");
193 xgettext_record_flag ("gettext-noop:1:pass-lisp-format");
194 xgettext_record_flag ("format:2:lisp-format");
195 }
196
197
198 /* ======================== Reading of characters. ======================== */
199
200 /* The input file stream. */
201 static FILE *fp;
202
203
204 /* Fetch the next character from the input file. */
205 static int
206 do_getc ()
207 {
208 int c = getc (fp);
209
210 if (c == EOF)
211 {
212 if (ferror (fp))
213 error (EXIT_FAILURE, errno,
214 _("error while reading \"%s\""), real_file_name);
215 }
216 else if (c == '\n')
217 line_number++;
218
219 return c;
220 }
221
222 /* Put back the last fetched character, not EOF. */
223 static void
224 do_ungetc (int c)
225 {
226 if (c == '\n')
227 line_number--;
228 ungetc (c, fp);
229 }
230
231
232 /* ========= Reading of tokens. See CLHS 2.2 "Reader Algorithm". ========= */
233
234
235 /* Syntax code. See CLHS 2.1.4 "Character Syntax Types". */
236
237 enum syntax_code
238 {
239 syntax_illegal, /* non-printable, except whitespace */
240 syntax_single_esc, /* '\' (single escape) */
241 syntax_multi_esc, /* '|' (multiple escape) */
242 syntax_constituent, /* everything else (constituent) */
243 syntax_whitespace, /* TAB,LF,FF,CR,' ' (whitespace) */
244 syntax_eof, /* EOF */
245 syntax_t_macro, /* '()'"' (terminating macro) */
246 syntax_nt_macro /* '#' (non-terminating macro) */
247 };
248
249 /* Returns the syntax code of a character. */
250 static enum syntax_code
251 syntax_code_of (unsigned char c)
252 {
253 switch (c)
254 {
255 case '\\':
256 return syntax_single_esc;
257 case '|':
258 return syntax_multi_esc;
259 case '\t': case '\n': case '\f': case '\r': case ' ':
260 return syntax_whitespace;
261 case '(': case ')': case '\'': case '"': case ',': case ';': case '`':
262 return syntax_t_macro;
263 case '#':
264 return syntax_nt_macro;
265 default:
266 if (c < ' ' && c != '\b')
267 return syntax_illegal;
268 else
269 return syntax_constituent;
270 }
271 }
272
273 struct char_syntax
274 {
275 int ch; /* character */
276 enum syntax_code scode; /* syntax code */
277 };
278
279 /* Returns the next character and its syntax code. */
280 static void
281 read_char_syntax (struct char_syntax *p)
282 {
283 int c = do_getc ();
284
285 p->ch = c;
286 p->scode = (c == EOF ? syntax_eof : syntax_code_of (c));
287 }
288
289 /* Every character in a token has an attribute assigned. The attributes
290 help during interpretation of the token. See
291 CLHS 2.3 "Interpretation of Tokens" for the possible interpretations,
292 and CLHS 2.1.4.2 "Constituent Traits". */
293
294 enum attribute
295 {
296 a_illg, /* invalid constituent */
297 a_pack_m, /* ':' package marker */
298 a_alpha, /* normal alphabetic */
299 a_escaped, /* alphabetic but not subject to case conversion */
300 a_ratio, /* '/' */
301 a_dot, /* '.' */
302 a_sign, /* '+-' */
303 a_extens, /* '_^' extension characters */
304 a_digit, /* '0123456789' */
305 a_letterdigit,/* 'A'-'Z','a'-'z' below base, except 'esfdlESFDL' */
306 a_expodigit, /* 'esfdlESFDL' below base */
307 a_letter, /* 'A'-'Z','a'-'z', except 'esfdlESFDL' */
308 a_expo /* 'esfdlESFDL' */
309 };
310
311 #define is_letter_attribute(a) ((a) >= a_letter)
312 #define is_number_attribute(a) ((a) >= a_ratio)
313
314 /* Returns the attribute of a character, assuming base 10. */
315 static enum attribute
316 attribute_of (unsigned char c)
317 {
318 switch (c)
319 {
320 case ':':
321 return a_pack_m;
322 case '/':
323 return a_ratio;
324 case '.':
325 return a_dot;
326 case '+': case '-':
327 return a_sign;
328 case '_': case '^':
329 return a_extens;
330 case '0': case '1': case '2': case '3': case '4':
331 case '5': case '6': case '7': case '8': case '9':
332 return a_digit;
333 case 'a': case 'b': case 'c': case 'g': case 'h': case 'i': case 'j':
334 case 'k': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
335 case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
336 case 'A': case 'B': case 'C': case 'G': case 'H': case 'I': case 'J':
337 case 'K': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
338 case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
339 return a_letter;
340 case 'e': case 's': case 'd': case 'f': case 'l':
341 case 'E': case 'S': case 'D': case 'F': case 'L':
342 return a_expo;
343 default:
344 /* Treat everything as valid. Never return a_illg. */
345 return a_alpha;
346 }
347 }
348
349 struct token_char
350 {
351 unsigned char ch; /* character */
352 unsigned char attribute; /* attribute */
353 };
354
355 /* A token consists of a sequence of characters with associated attribute. */
356 struct token
357 {
358 int allocated; /* number of allocated 'token_char's */
359 int charcount; /* number of used 'token_char's */
360 struct token_char *chars; /* the token's constituents */
361 bool with_escape; /* whether single-escape or multiple escape occurs */
362 };
363
364 /* Initialize a 'struct token'. */
365 static inline void
366 init_token (struct token *tp)
367 {
368 tp->allocated = 10;
369 tp->chars = XNMALLOC (tp->allocated, struct token_char);
370 tp->charcount = 0;
371 }
372
373 /* Free the memory pointed to by a 'struct token'. */
374 static inline void
375 free_token (struct token *tp)
376 {
377 free (tp->chars);
378 }
379
380 /* Ensure there is enough room in the token for one more character. */
381 static inline void
382 grow_token (struct token *tp)
383 {
384 if (tp->charcount == tp->allocated)
385 {
386 tp->allocated *= 2;
387 tp->chars = (struct token_char *) xrealloc (tp->chars, tp->allocated * sizeof (struct token_char));
388 }
389 }
390
391 /* Read the next token. If 'first' is given, it points to the first
392 character, which has already been read.
393 The algorithm follows CLHS 2.2 "Reader Algorithm". */
394 static void
395 read_token (struct token *tp, const struct char_syntax *first)
396 {
397 bool multiple_escape_flag;
398 struct char_syntax curr;
399
400 init_token (tp);
401 tp->with_escape = false;
402
403 multiple_escape_flag = false;
404 if (first)
405 curr = *first;
406 else
407 read_char_syntax (&curr);
408
409 for (;; read_char_syntax (&curr))
410 {
411 switch (curr.scode)
412 {
413 case syntax_illegal:
414 /* Invalid input. Be tolerant, no error message. */
415 do_ungetc (curr.ch);
416 return;
417
418 case syntax_single_esc:
419 tp->with_escape = true;
420 read_char_syntax (&curr);
421 if (curr.scode == syntax_eof)
422 /* Invalid input. Be tolerant, no error message. */
423 return;
424 grow_token (tp);
425 tp->chars[tp->charcount].ch = curr.ch;
426 tp->chars[tp->charcount].attribute = a_escaped;
427 tp->charcount++;
428 break;
429
430 case syntax_multi_esc:
431 multiple_escape_flag = !multiple_escape_flag;
432 tp->with_escape = true;
433 break;
434
435 case syntax_constituent:
436 case syntax_nt_macro:
437 grow_token (tp);
438 if (multiple_escape_flag)
439 {
440 tp->chars[tp->charcount].ch = curr.ch;
441 tp->chars[tp->charcount].attribute = a_escaped;
442 tp->charcount++;
443 }
444 else
445 {
446 tp->chars[tp->charcount].ch = curr.ch;
447 tp->chars[tp->charcount].attribute = attribute_of (curr.ch);
448 tp->charcount++;
449 }
450 break;
451
452 case syntax_whitespace:
453 case syntax_t_macro:
454 if (multiple_escape_flag)
455 {
456 grow_token (tp);
457 tp->chars[tp->charcount].ch = curr.ch;
458 tp->chars[tp->charcount].attribute = a_escaped;
459 tp->charcount++;
460 }
461 else
462 {
463 if (curr.scode != syntax_whitespace || read_preserve_whitespace)
464 do_ungetc (curr.ch);
465 return;
466 }
467 break;
468
469 case syntax_eof:
470 if (multiple_escape_flag)
471 /* Invalid input. Be tolerant, no error message. */
472 ;
473 return;
474 }
475 }
476 }
477
478 /* A potential number is a token which
479 1. consists only of digits, '+','-','/','^','_','.' and number markers.
480 The base for digits is context dependent, but always 10 if a dot '.'
481 occurs. A number marker is a non-digit letter which is not adjacent
482 to a non-digit letter.
483 2. has at least one digit.
484 3. starts with a digit, '+','-','.','^' or '_'.
485 4. does not end with '+' or '-'.
486 See CLHS 2.3.1.1 "Potential Numbers as Tokens".
487 */
488
489 static inline bool
490 has_a_dot (const struct token *tp)
491 {
492 int n = tp->charcount;
493 int i;
494
495 for (i = 0; i < n; i++)
496 if (tp->chars[i].attribute == a_dot)
497 return true;
498 return false;
499 }
500
501 static inline bool
502 all_a_number (const struct token *tp)
503 {
504 int n = tp->charcount;
505 int i;
506
507 for (i = 0; i < n; i++)
508 if (!is_number_attribute (tp->chars[i].attribute))
509 return false;
510 return true;
511 }
512
513 static inline void
514 a_letter_to_digit (const struct token *tp, int base)
515 {
516 int n = tp->charcount;
517 int i;
518
519 for (i = 0; i < n; i++)
520 if (is_letter_attribute (tp->chars[i].attribute))
521 {
522 int c = tp->chars[i].ch;
523
524 if (c >= 'a')
525 c -= 'a' - 'A';
526 if (c - 'A' + 10 < base)
527 tp->chars[i].attribute -= 2; /* a_letter -> a_letterdigit,
528 a_expo -> a_expodigit */
529 }
530 }
531
532 static inline bool
533 has_a_digit (const struct token *tp)
534 {
535 int n = tp->charcount;
536 int i;
537
538 for (i = 0; i < n; i++)
539 if (tp->chars[i].attribute == a_digit
540 || tp->chars[i].attribute == a_letterdigit
541 || tp->chars[i].attribute == a_expodigit)
542 return true;
543 return false;
544 }
545
546 static inline bool
547 has_adjacent_letters (const struct token *tp)
548 {
549 int n = tp->charcount;
550 int i;
551
552 for (i = 1; i < n; i++)
553 if (is_letter_attribute (tp->chars[i-1].attribute)
554 && is_letter_attribute (tp->chars[i].attribute))
555 return true;
556 return false;
557 }
558
559 static bool
560 is_potential_number (const struct token *tp, int *basep)
561 {
562 /* CLHS 2.3.1.1.1:
563 "A potential number cannot contain any escape characters." */
564 if (tp->with_escape)
565 return false;
566
567 if (has_a_dot (tp))
568 *basep = 10;
569
570 if (!all_a_number (tp))
571 return false;
572
573 a_letter_to_digit (tp, *basep);
574
575 if (!has_a_digit (tp))
576 return false;
577
578 if (has_adjacent_letters (tp))
579 return false;
580
581 if (!(tp->chars[0].attribute >= a_dot
582 && tp->chars[0].attribute <= a_expodigit))
583 return false;
584
585 if (tp->chars[tp->charcount - 1].attribute == a_sign)
586 return false;
587
588 return true;
589 }
590
591 /* A number is one of integer, ratio, float. Each has a particular syntax.
592 See CLHS 2.3.1 "Numbers as Tokens".
593 But note a mistake: The exponent rule should read:
594 exponent ::= exponent-marker [sign] {decimal-digit}+
595 (see 22.1.3.1.3 "Printing Floats"). */
596
597 enum number_type
598 {
599 n_none,
600 n_integer,
601 n_ratio,
602 n_float
603 };
604
605 static enum number_type
606 is_number (const struct token *tp, int *basep)
607 {
608 struct token_char *ptr_limit;
609 struct token_char *ptr1;
610
611 if (!is_potential_number (tp, basep))
612 return n_none;
613
614 /* is_potential_number guarantees
615 - all attributes are >= a_ratio,
616 - there is at least one a_digit or a_letterdigit or a_expodigit, and
617 - if there is an a_dot, then *basep = 10. */
618
619 ptr1 = &tp->chars[0];
620 ptr_limit = &tp->chars[tp->charcount];
621
622 if (ptr1->attribute == a_sign)
623 ptr1++;
624
625 /* Test for syntax
626 * { a_sign | }
627 * { a_digit < base }+ { a_ratio { a_digit < base }+ | }
628 */
629 {
630 bool seen_a_ratio = false;
631 bool seen_a_digit = false; /* seen a digit in last digit block? */
632 struct token_char *ptr;
633
634 for (ptr = ptr1;; ptr++)
635 {
636 if (ptr >= ptr_limit)
637 {
638 if (!seen_a_digit)
639 break;
640 if (seen_a_ratio)
641 return n_ratio;
642 else
643 return n_integer;
644 }
645 if (ptr->attribute == a_digit
646 || ptr->attribute == a_letterdigit
647 || ptr->attribute == a_expodigit)
648 {
649 int c = ptr->ch;
650
651 c = (c < 'A' ? c - '0' : c < 'a' ? c - 'A' + 10 : c - 'a' + 10);
652 if (c >= *basep)
653 break;
654 seen_a_digit = true;
655 }
656 else if (ptr->attribute == a_ratio)
657 {
658 if (seen_a_ratio || !seen_a_digit)
659 break;
660 seen_a_ratio = true;
661 seen_a_digit = false;
662 }
663 else
664 break;
665 }
666 }
667
668 /* Test for syntax
669 * { a_sign | }
670 * { a_digit }* { a_dot { a_digit }* | }
671 * { a_expo { a_sign | } { a_digit }+ | }
672 *
673 * If there is an exponent part, there must be digits before the dot or
674 * after the dot. The result is a float.
675 * If there is no exponen:
676 * If there is no dot, it would an integer in base 10, but is has already
677 * been verified to not be an integer in the current base.
678 * If there is a dot:
679 * If there are digits after the dot, it's a float.
680 * Otherwise, if there are digits before the dot, it's an integer.
681 */
682 *basep = 10;
683 {
684 bool seen_a_dot = false;
685 bool seen_a_dot_with_leading_digits = false;
686 bool seen_a_digit = false; /* seen a digit in last digit block? */
687 struct token_char *ptr;
688
689 for (ptr = ptr1;; ptr++)
690 {
691 if (ptr >= ptr_limit)
692 {
693 /* no exponent */
694 if (!seen_a_dot)
695 return n_none;
696 if (seen_a_digit)
697 return n_float;
698 if (seen_a_dot_with_leading_digits)
699 return n_integer;
700 else
701 return n_none;
702 }
703 if (ptr->attribute == a_digit)
704 {
705 seen_a_digit = true;
706 }
707 else if (ptr->attribute == a_dot)
708 {
709 if (seen_a_dot)
710 return n_none;
711 seen_a_dot = true;
712 if (seen_a_digit)
713 seen_a_dot_with_leading_digits = true;
714 seen_a_digit = false;
715 }
716 else if (ptr->attribute == a_expo || ptr->attribute == a_expodigit)
717 break;
718 else
719 return n_none;
720 }
721 ptr++;
722 if (!seen_a_dot_with_leading_digits || !seen_a_digit)
723 return n_none;
724 if (ptr >= ptr_limit)
725 return n_none;
726 if (ptr->attribute == a_sign)
727 ptr++;
728 seen_a_digit = false;
729 for (;; ptr++)
730 {
731 if (ptr >= ptr_limit)
732 break;
733 if (ptr->attribute != a_digit)
734 return n_none;
735 seen_a_digit = true;
736 }
737 if (!seen_a_digit)
738 return n_none;
739 return n_float;
740 }
741 }
742
743 /* A token representing a symbol must be case converted.
744 For portability, we convert only ASCII characters here. */
745
746 static void
747 upcase_token (struct token *tp)
748 {
749 int n = tp->charcount;
750 int i;
751
752 for (i = 0; i < n; i++)
753 if (tp->chars[i].attribute != a_escaped)
754 {
755 unsigned char c = tp->chars[i].ch;
756 if (c >= 'a' && c <= 'z')
757 tp->chars[i].ch = c - 'a' + 'A';
758 }
759 }
760
761 static void
762 downcase_token (struct token *tp)
763 {
764 int n = tp->charcount;
765 int i;
766
767 for (i = 0; i < n; i++)
768 if (tp->chars[i].attribute != a_escaped)
769 {
770 unsigned char c = tp->chars[i].ch;
771 if (c >= 'A' && c <= 'Z')
772 tp->chars[i].ch = c - 'A' + 'a';
773 }
774 }
775
776 static void
777 case_convert_token (struct token *tp)
778 {
779 int n = tp->charcount;
780 int i;
781
782 switch (readtable_case)
783 {
784 case case_upcase:
785 upcase_token (tp);
786 break;
787
788 case case_downcase:
789 downcase_token (tp);
790 break;
791
792 case case_preserve:
793 break;
794
795 case case_invert:
796 {
797 bool seen_uppercase = false;
798 bool seen_lowercase = false;
799 for (i = 0; i < n; i++)
800 if (tp->chars[i].attribute != a_escaped)
801 {
802 unsigned char c = tp->chars[i].ch;
803 if (c >= 'a' && c <= 'z')
804 seen_lowercase = true;
805 if (c >= 'A' && c <= 'Z')
806 seen_uppercase = true;
807 }
808 if (seen_uppercase)
809 {
810 if (!seen_lowercase)
811 downcase_token (tp);
812 }
813 else
814 {
815 if (seen_lowercase)
816 upcase_token (tp);
817 }
818 }
819 break;
820 }
821 }
822
823
824 /* ========================= Accumulating comments ========================= */
825
826
827 static char *buffer;
828 static size_t bufmax;
829 static size_t buflen;
830
831 static inline void
832 comment_start ()
833 {
834 buflen = 0;
835 }
836
837 static inline void
838 comment_add (int c)
839 {
840 if (buflen >= bufmax)
841 {
842 bufmax = 2 * bufmax + 10;
843 buffer = xrealloc (buffer, bufmax);
844 }
845 buffer[buflen++] = c;
846 }
847
848 static inline void
849 comment_line_end (size_t chars_to_remove)
850 {
851 buflen -= chars_to_remove;
852 while (buflen >= 1
853 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
854 --buflen;
855 if (chars_to_remove == 0 && buflen >= bufmax)
856 {
857 bufmax = 2 * bufmax + 10;
858 buffer = xrealloc (buffer, bufmax);
859 }
860 buffer[buflen] = '\0';
861 savable_comment_add (buffer);
862 }
863
864
865 /* These are for tracking whether comments count as immediately before
866 keyword. */
867 static int last_comment_line;
868 static int last_non_comment_line;
869
870
871 /* ========================= Accumulating messages ========================= */
872
873
874 static message_list_ty *mlp;
875
876
877 /* ============== Reading of objects. See CLHS 2 "Syntax". ============== */
878
879
880 /* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings.
881 Other objects need not to be represented precisely. */
882 enum object_type
883 {
884 t_symbol, /* symbol */
885 t_string, /* string */
886 t_other, /* other kind of real object */
887 t_dot, /* '.' pseudo object */
888 t_close, /* ')' pseudo object */
889 t_eof /* EOF marker */
890 };
891
892 struct object
893 {
894 enum object_type type;
895 struct token *token; /* for t_symbol and t_string */
896 int line_number_at_start; /* for t_string */
897 };
898
899 /* Free the memory pointed to by a 'struct object'. */
900 static inline void
901 free_object (struct object *op)
902 {
903 if (op->type == t_symbol || op->type == t_string)
904 {
905 free_token (op->token);
906 free (op->token);
907 }
908 }
909
910 /* Convert a t_symbol/t_string token to a char*. */
911 static char *
912 string_of_object (const struct object *op)
913 {
914 char *str;
915 const struct token_char *p;
916 char *q;
917 int n;
918
919 if (!(op->type == t_symbol || op->type == t_string))
920 abort ();
921 n = op->token->charcount;
922 str = XNMALLOC (n + 1, char);
923 q = str;
924 for (p = op->token->chars; n > 0; p++, n--)
925 *q++ = p->ch;
926 *q = '\0';
927 return str;
928 }
929
930
931 /* Context lookup table. */
932 static flag_context_list_table_ty *flag_context_list_table;
933
934
935 /* Maximum supported nesting depth. */
936 #define MAX_NESTING_DEPTH 1000
937
938 /* Current nesting depth. */
939 static int nesting_depth;
940
941
942 /* Read the next object. */
943 static void
944 read_object (struct object *op, flag_context_ty outer_context)
945 {
946 if (nesting_depth > MAX_NESTING_DEPTH)
947 {
948 error_with_progname = false;
949 error (EXIT_FAILURE, 0, _("%s:%d: error: too deeply nested objects"),
950 logical_file_name, line_number);
951 }
952 for (;;)
953 {
954 struct char_syntax curr;
955
956 read_char_syntax (&curr);
957
958 switch (curr.scode)
959 {
960 case syntax_eof:
961 op->type = t_eof;
962 return;
963
964 case syntax_whitespace:
965 if (curr.ch == '\n')
966 /* Comments assumed to be grouped with a message must immediately
967 precede it, with no non-whitespace token on a line between
968 both. */
969 if (last_non_comment_line > last_comment_line)
970 savable_comment_reset ();
971 continue;
972
973 case syntax_illegal:
974 op->type = t_other;
975 return;
976
977 case syntax_single_esc:
978 case syntax_multi_esc:
979 case syntax_constituent:
980 /* Start reading a token. */
981 op->token = XMALLOC (struct token);
982 read_token (op->token, &curr);
983 last_non_comment_line = line_number;
984
985 /* Interpret the token. */
986
987 /* Dots. */
988 if (!op->token->with_escape
989 && op->token->charcount == 1
990 && op->token->chars[0].attribute == a_dot)
991 {
992 free_token (op->token);
993 free (op->token);
994 op->type = t_dot;
995 return;
996 }
997 /* Tokens consisting entirely of dots are illegal, but be tolerant
998 here. */
999
1000 /* Number. */
1001 {
1002 int base = read_base;
1003
1004 if (is_number (op->token, &base) != n_none)
1005 {
1006 free_token (op->token);
1007 free (op->token);
1008 op->type = t_other;
1009 return;
1010 }
1011 }
1012
1013 /* We interpret all other tokens as symbols (including 'reserved
1014 tokens', i.e. potential numbers which are not numbers). */
1015 case_convert_token (op->token);
1016 op->type = t_symbol;
1017 return;
1018
1019 case syntax_t_macro:
1020 case syntax_nt_macro:
1021 /* Read a macro. */
1022 switch (curr.ch)
1023 {
1024 case '(':
1025 {
1026 int arg = 0; /* Current argument number. */
1027 flag_context_list_iterator_ty context_iter;
1028 const struct callshapes *shapes = NULL;
1029 struct arglist_parser *argparser = NULL;
1030
1031 for (;; arg++)
1032 {
1033 struct object inner;
1034 flag_context_ty inner_context;
1035
1036 if (arg == 0)
1037 inner_context = null_context;
1038 else
1039 inner_context =
1040 inherited_context (outer_context,
1041 flag_context_list_iterator_advance (
1042 &context_iter));
1043
1044 ++nesting_depth;
1045 read_object (&inner, inner_context);
1046 nesting_depth--;
1047
1048 /* Recognize end of list. */
1049 if (inner.type == t_close)
1050 {
1051 op->type = t_other;
1052 /* Don't bother converting "()" to "NIL". */
1053 last_non_comment_line = line_number;
1054 if (argparser != NULL)
1055 arglist_parser_done (argparser, arg);
1056 return;
1057 }
1058
1059 /* Dots are not allowed in every position.
1060 But be tolerant. */
1061
1062 /* EOF inside list is illegal.
1063 But be tolerant. */
1064 if (inner.type == t_eof)
1065 break;
1066
1067 if (arg == 0)
1068 {
1069 /* This is the function position. */
1070 if (inner.type == t_symbol)
1071 {
1072 char *symbol_name = string_of_object (&inner);
1073 int i;
1074 int prefix_len;
1075 void *keyword_value;
1076
1077 /* Omit any package name. */
1078 i = inner.token->charcount;
1079 while (i > 0
1080 && inner.token->chars[i-1].attribute != a_pack_m)
1081 i--;
1082 prefix_len = i;
1083
1084 if (hash_find_entry (&keywords,
1085 symbol_name + prefix_len,
1086 strlen (symbol_name + prefix_len),
1087 &keyword_value)
1088 == 0)
1089 shapes = (const struct callshapes *) keyword_value;
1090
1091 argparser = arglist_parser_alloc (mlp, shapes);
1092
1093 context_iter =
1094 flag_context_list_iterator (
1095 flag_context_list_table_lookup (
1096 flag_context_list_table,
1097 symbol_name, strlen (symbol_name)));
1098
1099 free (symbol_name);
1100 }
1101 else
1102 context_iter = null_context_list_iterator;
1103 }
1104 else
1105 {
1106 /* These are the argument positions. */
1107 if (argparser != NULL && inner.type == t_string)
1108 {
1109 char *s = string_of_object (&inner);
1110 mixed_string_ty *ms =
1111 mixed_string_alloc_simple (s, lc_string,
1112 logical_file_name,
1113 inner.line_number_at_start);
1114 free (s);
1115 arglist_parser_remember (argparser, arg, ms,
1116 inner_context,
1117 logical_file_name,
1118 inner.line_number_at_start,
1119 savable_comment, false);
1120 }
1121 }
1122
1123 free_object (&inner);
1124 }
1125
1126 if (argparser != NULL)
1127 arglist_parser_done (argparser, arg);
1128 }
1129 op->type = t_other;
1130 last_non_comment_line = line_number;
1131 return;
1132
1133 case ')':
1134 /* Tell the caller about the end of list.
1135 Unmatched closing parenthesis is illegal.
1136 But be tolerant. */
1137 op->type = t_close;
1138 last_non_comment_line = line_number;
1139 return;
1140
1141 case ',':
1142 {
1143 int c = do_getc ();
1144 /* The ,@ handling inside lists is wrong anyway, because
1145 ,@form expands to an unknown number of elements. */
1146 if (c != EOF && c != '@' && c != '.')
1147 do_ungetc (c);
1148 }
1149 FALLTHROUGH;
1150 case '\'':
1151 case '`':
1152 {
1153 struct object inner;
1154
1155 ++nesting_depth;
1156 read_object (&inner, null_context);
1157 nesting_depth--;
1158
1159 /* Dots and EOF are not allowed here. But be tolerant. */
1160
1161 free_object (&inner);
1162
1163 op->type = t_other;
1164 last_non_comment_line = line_number;
1165 return;
1166 }
1167
1168 case ';':
1169 {
1170 bool all_semicolons = true;
1171
1172 last_comment_line = line_number;
1173 comment_start ();
1174 for (;;)
1175 {
1176 int c = do_getc ();
1177 if (c == EOF || c == '\n')
1178 break;
1179 if (c != ';')
1180 all_semicolons = false;
1181 if (!all_semicolons)
1182 {
1183 /* We skip all leading white space, but not EOLs. */
1184 if (!(buflen == 0 && (c == ' ' || c == '\t')))
1185 comment_add (c);
1186 }
1187 }
1188 comment_line_end (0);
1189 continue;
1190 }
1191
1192 case '"':
1193 {
1194 op->token = XMALLOC (struct token);
1195 init_token (op->token);
1196 op->line_number_at_start = line_number;
1197 for (;;)
1198 {
1199 int c = do_getc ();
1200 if (c == EOF)
1201 /* Invalid input. Be tolerant, no error message. */
1202 break;
1203 if (c == '"')
1204 break;
1205 if (c == '\\') /* syntax_single_esc */
1206 {
1207 c = do_getc ();
1208 if (c == EOF)
1209 /* Invalid input. Be tolerant, no error message. */
1210 break;
1211 }
1212 grow_token (op->token);
1213 op->token->chars[op->token->charcount++].ch = c;
1214 }
1215 op->type = t_string;
1216
1217 if (extract_all)
1218 {
1219 lex_pos_ty pos;
1220
1221 pos.file_name = logical_file_name;
1222 pos.line_number = op->line_number_at_start;
1223 remember_a_message (mlp, NULL, string_of_object (op), false,
1224 false, null_context, &pos,
1225 NULL, savable_comment, false);
1226 }
1227 last_non_comment_line = line_number;
1228 return;
1229 }
1230
1231 case '#':
1232 /* Dispatch macro handling. */
1233 {
1234 int dmc;
1235
1236 for (;;)
1237 {
1238 dmc = do_getc ();
1239 if (dmc == EOF)
1240 /* Invalid input. Be tolerant, no error message. */
1241 {
1242 op->type = t_other;
1243 return;
1244 }
1245 if (!(dmc >= '0' && dmc <= '9'))
1246 break;
1247 }
1248
1249 switch (dmc)
1250 {
1251 case '(':
1252 case '"':
1253 do_ungetc (dmc);
1254 FALLTHROUGH;
1255 case '\'':
1256 case ':':
1257 case '.':
1258 case ',':
1259 case 'A': case 'a':
1260 case 'C': case 'c':
1261 case 'P': case 'p':
1262 case 'S': case 's':
1263 {
1264 struct object inner;
1265 ++nesting_depth;
1266 read_object (&inner, null_context);
1267 nesting_depth--;
1268 /* Dots and EOF are not allowed here.
1269 But be tolerant. */
1270 free_object (&inner);
1271 op->type = t_other;
1272 last_non_comment_line = line_number;
1273 return;
1274 }
1275
1276 case '|':
1277 {
1278 int depth = 0;
1279 int c;
1280
1281 comment_start ();
1282 c = do_getc ();
1283 for (;;)
1284 {
1285 if (c == EOF)
1286 break;
1287 if (c == '|')
1288 {
1289 c = do_getc ();
1290 if (c == EOF)
1291 break;
1292 if (c == '#')
1293 {
1294 if (depth == 0)
1295 {
1296 comment_line_end (0);
1297 break;
1298 }
1299 depth--;
1300 comment_add ('|');
1301 comment_add ('#');
1302 c = do_getc ();
1303 }
1304 else
1305 comment_add ('|');
1306 }
1307 else if (c == '#')
1308 {
1309 c = do_getc ();
1310 if (c == EOF)
1311 break;
1312 comment_add ('#');
1313 if (c == '|')
1314 {
1315 depth++;
1316 comment_add ('|');
1317 c = do_getc ();
1318 }
1319 }
1320 else
1321 {
1322 /* We skip all leading white space. */
1323 if (!(buflen == 0 && (c == ' ' || c == '\t')))
1324 comment_add (c);
1325 if (c == '\n')
1326 {
1327 comment_line_end (1);
1328 comment_start ();
1329 }
1330 c = do_getc ();
1331 }
1332 }
1333 if (c == EOF)
1334 {
1335 /* EOF not allowed here. But be tolerant. */
1336 op->type = t_eof;
1337 return;
1338 }
1339 last_comment_line = line_number;
1340 continue;
1341 }
1342
1343 case '\\':
1344 {
1345 struct token token;
1346 struct char_syntax first;
1347 first.ch = '\\';
1348 first.scode = syntax_single_esc;
1349 read_token (&token, &first);
1350 free_token (&token);
1351 op->type = t_other;
1352 last_non_comment_line = line_number;
1353 return;
1354 }
1355
1356 case 'B': case 'b':
1357 case 'O': case 'o':
1358 case 'X': case 'x':
1359 case 'R': case 'r':
1360 case '*':
1361 {
1362 struct token token;
1363 read_token (&token, NULL);
1364 free_token (&token);
1365 op->type = t_other;
1366 last_non_comment_line = line_number;
1367 return;
1368 }
1369
1370 case '=':
1371 /* Ignore read labels. */
1372 continue;
1373
1374 case '#':
1375 /* Don't bother looking up the corresponding object. */
1376 op->type = t_other;
1377 last_non_comment_line = line_number;
1378 return;
1379
1380 case '+':
1381 case '-':
1382 /* Simply assume every feature expression is true. */
1383 {
1384 struct object inner;
1385 ++nesting_depth;
1386 read_object (&inner, null_context);
1387 nesting_depth--;
1388 /* Dots and EOF are not allowed here.
1389 But be tolerant. */
1390 free_object (&inner);
1391 continue;
1392 }
1393
1394 default:
1395 op->type = t_other;
1396 last_non_comment_line = line_number;
1397 return;
1398 }
1399 /*NOTREACHED*/
1400 abort ();
1401 }
1402
1403 default:
1404 /*NOTREACHED*/
1405 abort ();
1406 }
1407
1408 default:
1409 /*NOTREACHED*/
1410 abort ();
1411 }
1412 }
1413 }
1414
1415
1416 void
1417 extract_lisp (FILE *f,
1418 const char *real_filename, const char *logical_filename,
1419 flag_context_list_table_ty *flag_table,
1420 msgdomain_list_ty *mdlp)
1421 {
1422 mlp = mdlp->item[0]->messages;
1423
1424 fp = f;
1425 real_file_name = real_filename;
1426 logical_file_name = xstrdup (logical_filename);
1427 line_number = 1;
1428
1429 last_comment_line = -1;
1430 last_non_comment_line = -1;
1431
1432 flag_context_list_table = flag_table;
1433 nesting_depth = 0;
1434
1435 init_keywords ();
1436
1437 /* Eat tokens until eof is seen. When read_object returns
1438 due to an unbalanced closing parenthesis, just restart it. */
1439 do
1440 {
1441 struct object toplevel_object;
1442
1443 read_object (&toplevel_object, null_context);
1444
1445 if (toplevel_object.type == t_eof)
1446 break;
1447
1448 free_object (&toplevel_object);
1449 }
1450 while (!feof (fp));
1451
1452 /* Close scanner. */
1453 fp = NULL;
1454 real_file_name = NULL;
1455 logical_file_name = NULL;
1456 line_number = 0;
1457 }