1 /* xgettext Smalltalk backend.
2 Copyright (C) 2002-2003, 2005-2009, 2011, 2018-2020 Free Software Foundation, Inc.
3
4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22
23 /* Specification. */
24 #include "x-smalltalk.h"
25
26 #include <errno.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29
30 #include "attribute.h"
31 #include "message.h"
32 #include "xgettext.h"
33 #include "xg-pos.h"
34 #include "xg-message.h"
35 #include "error.h"
36 #include "xalloc.h"
37 #include "gettext.h"
38
/* Shorthand for looking up the translation of a message.  */
#define _(s) gettext(s)

/* Number of elements of the array A.  A must be a real array, not a
   pointer.  The argument is parenthesized in the element expression so
   that any array-valued expression can be passed safely.  */
#define SIZEOF(a) (sizeof(a) / sizeof((a)[0]))
42
43
44 /* The relevant parts of the Smalltalk syntax are:
45
46 stringliteral ::= string | stringconst | symconst
47 stringconst ::= "#"string
48 string ::= "'"[char]*"'"
49 symconst ::= "#"symbol
50 symbol ::= id | binsel | keysel[keysel]*
51 keysel ::= id":"
52 id ::= letter[letter|digit]*
53 letter ::= "A".."Z" | "a".."z"
54 digit ::= "0".."9"
55 binsel ::= selchar[selchar]
56 selchar ::= "+" | "-" | "*" | "/" | "~" | "|" | "," | "<" | ">"
57 | "=" | "&" | "@" | "?" | "%" | "\"
58
59 Strings can contain any characters; to include the string delimiter itself,
60 it must be duplicated.
61
62 Character constants are written "$"char
63
64 Comments are enclosed within double quotes.
65
66 In well-formed expressions, {} and [] and () are balanced.
67 */
68
69
70 /* ======================== Reading of characters. ======================== */
71
/* The input file stream.  Set by extract_smalltalk for the duration of a
   scan and reset to NULL afterwards; read through phase1_getc and
   phase1_ungetc.  */
static FILE *fp;
74
75
76 /* 1. line_number handling. */
77
78 static int
79 phase1_getc ()
80 {
81 int c = getc (fp);
82
83 if (c == EOF)
84 {
85 if (ferror (fp))
86 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
87 real_file_name);
88 return EOF;
89 }
90
91 if (c == '\n')
92 line_number++;
93
94 return c;
95 }
96
97 /* Supports only one pushback character. */
98 static void
99 phase1_ungetc (int c)
100 {
101 if (c != EOF)
102 {
103 if (c == '\n')
104 --line_number;
105
106 ungetc (c, fp);
107 }
108 }
109
110
111 /* Accumulating comments. */
112
/* Text of the comment line currently being accumulated.
   buffer points to a heap array of bufmax bytes, of which the first
   buflen are in use.  The array is grown on demand by comment_add and
   comment_line_end and is reused from one comment line to the next.  */
static char *buffer;
static size_t bufmax;
static size_t buflen;
116
/* Begin accumulating a new comment line: forget the buffered text but
   keep the allocated storage.  */
static inline void
comment_start ()
{
  buflen = 0;
}
122
123 static inline void
124 comment_add (int c)
125 {
126 if (buflen >= bufmax)
127 {
128 bufmax = 2 * bufmax + 10;
129 buffer = xrealloc (buffer, bufmax);
130 }
131 buffer[buflen++] = c;
132 }
133
134 static inline void
135 comment_line_end ()
136 {
137 while (buflen >= 1
138 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
139 --buflen;
140 if (buflen >= bufmax)
141 {
142 bufmax = 2 * bufmax + 10;
143 buffer = xrealloc (buffer, bufmax);
144 }
145 buffer[buflen] = '\0';
146 savable_comment_add (buffer);
147 }
148
149
/* These are for tracking whether comments count as immediately before
   keyword.
   last_comment_line is the line on which the most recent comment started;
   last_non_comment_line is the line of the most recent token that is
   neither a comment nor whitespace.  Both are reset to -1 when a new file
   is scanned.  */
static int last_comment_line;
static int last_non_comment_line;
154
155
156 /* ========================== Reading of tokens. ========================== */
157
158
/* The kinds of tokens the tokenizer distinguishes.  */
typedef enum token_type_ty
{
  token_type_eof,               /* end of input */
  token_type_uniq,              /* # */
  token_type_symbol,            /* symbol */
  token_type_string_literal,    /* string, stringconst, symconst */
  token_type_other              /* misc. operator */
} token_type_ty;

/* A token, together with the line on which it starts.  */
typedef struct token_ty
{
  token_type_ty type;
  char *string;         /* for token_type_string_literal, token_type_symbol */
  int line_number;
} token_ty;
176
177
178 /* 2. Combine characters into tokens. Discard comments and whitespace. */
179
/* One-element pushback stack for phase 2.  phase2_unget stores a token
   here and phase2_get returns it before reading any further input.
   phase2_pushback_length is the number of slots in use (0 or 1).  */
static token_ty phase2_pushback[1];
static int phase2_pushback_length;
182
183 static void
184 phase2_get (token_ty *tp)
185 {
186 static char *buffer;
187 static int bufmax;
188 int bufpos;
189 int c;
190
191 if (phase2_pushback_length)
192 {
193 *tp = phase2_pushback[--phase2_pushback_length];
194 return;
195 }
196
197 tp->string = NULL;
198
199 for (;;)
200 {
201 tp->line_number = line_number;
202 c = phase1_getc ();
203 switch (c)
204 {
205 case EOF:
206 tp->type = token_type_eof;
207 return;
208
209 case '"':
210 {
211 /* Comment. */
212 int lineno;
213
214 comment_start ();
215 lineno = line_number;
216 for (;;)
217 {
218 c = phase1_getc ();
219 if (c == '"' || c == EOF)
220 break;
221 if (c == '\n')
222 {
223 comment_line_end ();
224 comment_start ();
225 }
226 else
227 {
228 /* We skip all leading white space, but not EOLs. */
229 if (!(buflen == 0 && (c == ' ' || c == '\t')))
230 comment_add (c);
231 }
232 }
233 comment_line_end ();
234 last_comment_line = lineno;
235 continue;
236 }
237
238 case '\n':
239 if (last_non_comment_line > last_comment_line)
240 savable_comment_reset ();
241 FALLTHROUGH;
242 case ' ':
243 case '\t':
244 case '\r':
245 /* Ignore whitespace. */
246 continue;
247 }
248
249 last_non_comment_line = tp->line_number;
250
251 switch (c)
252 {
253 case '\'':
254 /* String literal. */
255 bufpos = 0;
256 for (;;)
257 {
258 c = phase1_getc ();
259 if (c == EOF)
260 break;
261 if (c == '\'')
262 {
263 c = phase1_getc ();
264 if (c != '\'')
265 {
266 phase1_ungetc (c);
267 break;
268 }
269 }
270 if (bufpos >= bufmax)
271 {
272 bufmax = 2 * bufmax + 10;
273 buffer = xrealloc (buffer, bufmax);
274 }
275 buffer[bufpos++] = c;
276 }
277 if (bufpos >= bufmax)
278 {
279 bufmax = 2 * bufmax + 10;
280 buffer = xrealloc (buffer, bufmax);
281 }
282 buffer[bufpos] = 0;
283 tp->type = token_type_string_literal;
284 tp->string = xstrdup (buffer);
285 return;
286
287 case '+':
288 case '-':
289 case '*':
290 case '/':
291 case '~':
292 case '|':
293 case ',':
294 case '<':
295 case '>':
296 case '=':
297 case '&':
298 case '@':
299 case '?':
300 case '%':
301 case '\\':
302 {
303 char *name;
304 int c2 = phase1_getc ();
305 switch (c2)
306 {
307 case '+':
308 case '-':
309 case '*':
310 case '/':
311 case '~':
312 case '|':
313 case ',':
314 case '<':
315 case '>':
316 case '=':
317 case '&':
318 case '@':
319 case '?':
320 case '%':
321 name = XNMALLOC (3, char);
322 name[0] = c;
323 name[1] = c2;
324 name[2] = '\0';
325 tp->type = token_type_symbol;
326 tp->string = name;
327 return;
328 default:
329 phase1_ungetc (c2);
330 break;
331 }
332 name = XNMALLOC (2, char);
333 name[0] = c;
334 name[1] = '\0';
335 tp->type = token_type_symbol;
336 tp->string = name;
337 return;
338 }
339
340 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
341 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
342 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
343 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
344 case 'Y': case 'Z':
345 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
346 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
347 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
348 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
349 case 'y': case 'z':
350 /* Recognize id or id":"[id":"]* or id":"[id":"]*id. */
351 bufpos = 0;
352 for (;;)
353 {
354 if (bufpos >= bufmax)
355 {
356 bufmax = 2 * bufmax + 10;
357 buffer = xrealloc (buffer, bufmax);
358 }
359 buffer[bufpos++] = c;
360 c = phase1_getc ();
361 switch (c)
362 {
363 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
364 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
365 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
366 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
367 case 'Y': case 'Z':
368 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
369 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
370 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
371 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
372 case 'y': case 'z':
373 case '0': case '1': case '2': case '3': case '4':
374 case '5': case '6': case '7': case '8': case '9':
375 continue;
376 case ':':
377 if (bufpos >= bufmax)
378 {
379 bufmax = 2 * bufmax + 10;
380 buffer = xrealloc (buffer, bufmax);
381 }
382 buffer[bufpos++] = c;
383 c = phase1_getc ();
384 switch (c)
385 {
386 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
387 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
388 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
389 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
390 case 'Y': case 'Z':
391 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
392 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
393 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
394 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
395 case 'y': case 'z':
396 continue;
397 default:
398 phase1_ungetc (c);
399 break;
400 }
401 break;
402 default:
403 phase1_ungetc (c);
404 break;
405 }
406 break;
407 }
408 if (bufpos >= bufmax)
409 {
410 bufmax = 2 * bufmax + 10;
411 buffer = xrealloc (buffer, bufmax);
412 }
413 buffer[bufpos] = '\0';
414 tp->string = xstrdup (buffer);
415 tp->type = token_type_symbol;
416 return;
417
418 case '#':
419 /* Uniquification operator. */
420 tp->type = token_type_uniq;
421 return;
422
423 case '$':
424 c = phase1_getc ();
425 tp->type = token_type_other;
426 return;
427
428 default:
429 tp->type = token_type_other;
430 return;
431 }
432 }
433 }
434
435 /* Supports only one pushback token. */
436 static void
437 phase2_unget (token_ty *tp)
438 {
439 if (tp->type != token_type_eof)
440 {
441 if (phase2_pushback_length == SIZEOF (phase2_pushback))
442 abort ();
443 phase2_pushback[phase2_pushback_length++] = *tp;
444 }
445 }
446
447
448 /* 3. Combine "# string_literal" and "# symbol" to a single token. */
449
/* One-element pushback stack for phase 3.  phase3_unget stores a token
   here and phase3_get returns it before consulting phase 2.
   phase3_pushback_length is the number of slots in use (0 or 1).  */
static token_ty phase3_pushback[1];
static int phase3_pushback_length;
452
453 static void
454 phase3_get (token_ty *tp)
455 {
456 if (phase3_pushback_length)
457 {
458 *tp = phase3_pushback[--phase3_pushback_length];
459 return;
460 }
461
462 phase2_get (tp);
463 if (tp->type == token_type_uniq)
464 {
465 token_ty token2;
466
467 phase2_get (&token2);
468 if (token2.type == token_type_symbol
469 || token2.type == token_type_string_literal)
470 {
471 tp->type = token_type_string_literal;
472 tp->string = token2.string;
473 }
474 else
475 phase2_unget (&token2);
476 }
477 }
478
479 /* Supports only one pushback token. */
480 static void
481 phase3_unget (token_ty *tp)
482 {
483 if (tp->type != token_type_eof)
484 {
485 if (phase3_pushback_length == SIZEOF (phase3_pushback))
486 abort ();
487 phase3_pushback[phase3_pushback_length++] = *tp;
488 }
489 }
490
491
492 /* ========================= Extracting strings. ========================== */
493
494 /* The file is broken into tokens. Scan the token stream, looking for the
495 following patterns
496 NLS ? <string>
497 NLS at: <string>
498 NLS at: <string> plural: <string>
499 where <string> is one of
500 string_literal
501 # string_literal
502 # symbol
503 */
504
505 void
506 extract_smalltalk (FILE *f,
507 const char *real_filename, const char *logical_filename,
508 flag_context_list_table_ty *flag_table,
509 msgdomain_list_ty *mdlp)
510 {
511 message_list_ty *mlp = mdlp->item[0]->messages;
512
513 fp = f;
514 real_file_name = real_filename;
515 logical_file_name = xstrdup (logical_filename);
516 line_number = 1;
517
518 last_comment_line = -1;
519 last_non_comment_line = -1;
520
521 phase2_pushback_length = 0;
522 phase3_pushback_length = 0;
523
524 /* Eat tokens until eof is seen. */
525 {
526 /* 0 when no "NLS" has been seen.
527 1 after "NLS".
528 2 after "NLS ?".
529 3 after "NLS at:".
530 4 after "NLS at: <string>".
531 5 after "NLS at: <string> plural:". */
532 int state;
533 /* Remember the message containing the msgid, for msgid_plural.
534 Non-NULL in states 4, 5. */
535 message_ty *plural_mp = NULL;
536
537 /* Start state is 0. */
538 state = 0;
539
540 for (;;)
541 {
542 token_ty token;
543
544 phase3_get (&token);
545
546 switch (token.type)
547 {
548 case token_type_symbol:
549 state = (strcmp (token.string, "NLS") == 0 ? 1 :
550 strcmp (token.string, "?") == 0 && state == 1 ? 2 :
551 strcmp (token.string, "at:") == 0 && state == 1 ? 3 :
552 strcmp (token.string, "plural:") == 0 && state == 4 ? 5 :
553 0);
554 free (token.string);
555 break;
556
557 case token_type_string_literal:
558 if (state == 2)
559 {
560 lex_pos_ty pos;
561 pos.file_name = logical_file_name;
562 pos.line_number = token.line_number;
563 remember_a_message (mlp, NULL, token.string, false, false,
564 null_context, &pos, NULL, savable_comment,
565 false);
566 state = 0;
567 break;
568 }
569 if (state == 3)
570 {
571 lex_pos_ty pos;
572 token_ty token2;
573
574 pos.file_name = logical_file_name;
575 pos.line_number = token.line_number;
576
577 phase3_get (&token2);
578
579 plural_mp =
580 remember_a_message (mlp, NULL, token.string, false,
581 token2.type == token_type_symbol
582 && strcmp (token.string, "plural:") == 0,
583 null_context, &pos,
584 NULL, savable_comment, false);
585
586 phase3_unget (&token2);
587
588 state = 4;
589 break;
590 }
591 if (state == 5)
592 {
593 lex_pos_ty pos;
594 pos.file_name = logical_file_name;
595 pos.line_number = token.line_number;
596 if (plural_mp != NULL)
597 remember_a_message_plural (plural_mp, token.string, false,
598 null_context, &pos,
599 savable_comment, false);
600 state = 0;
601 break;
602 }
603 state = 0;
604 free (token.string);
605 break;
606
607 case token_type_uniq:
608 case token_type_other:
609 state = 0;
610 break;
611
612 case token_type_eof:
613 break;
614
615 default:
616 abort ();
617 }
618
619 if (token.type == token_type_eof)
620 break;
621 }
622 }
623
624 /* Close scanner. */
625 fp = NULL;
626 real_file_name = NULL;
627 logical_file_name = NULL;
628 line_number = 0;
629 }