1  /* xgettext Smalltalk backend.
       2     Copyright (C) 2002-2003, 2005-2009, 2011, 2018-2020 Free Software Foundation, Inc.
       3  
       4     This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
       5  
       6     This program is free software: you can redistribute it and/or modify
       7     it under the terms of the GNU General Public License as published by
       8     the Free Software Foundation; either version 3 of the License, or
       9     (at your option) any later version.
      10  
      11     This program is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14     GNU General Public License for more details.
      15  
      16     You should have received a copy of the GNU General Public License
      17     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      18  
      19  #ifdef HAVE_CONFIG_H
      20  # include "config.h"
      21  #endif
      22  
      23  /* Specification.  */
      24  #include "x-smalltalk.h"
      25  
      26  #include <errno.h>
      27  #include <stdio.h>
      28  #include <stdlib.h>
      29  
      30  #include "attribute.h"
      31  #include "message.h"
      32  #include "xgettext.h"
      33  #include "xg-pos.h"
      34  #include "xg-message.h"
      35  #include "error.h"
      36  #include "xalloc.h"
      37  #include "gettext.h"
      38  
      39  #define _(s) gettext(s)
      40  
      41  #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
      42  
      43  
      44  /* The relevant parts of the Smalltalk syntax are:
      45  
      46       stringliteral ::= string | stringconst | symconst
      47       stringconst ::= "#"string
      48       string      ::= "'"[char]*"'"
      49       symconst    ::= "#"symbol
      50       symbol      ::= id | binsel | keysel[keysel]*
      51       keysel      ::= id":"
      52       id          ::= letter[letter|digit]*
      53       letter      ::= "A".."Z" | "a".."z"
      54       digit       ::= "0".."9"
      55       binsel      ::= selchar[selchar]
      56       selchar     ::= "+" | "-" | "*" | "/" | "~" | "|" | "," | "<" | ">"
      57                       | "=" | "&" | "@" | "?" | "%" | "\"
      58  
      59     Strings can contain any characters; to include the string delimiter itself,
      60     it must be duplicated.
      61  
      62     Character constants are written  "$"char
      63  
      64     Comments are enclosed within double quotes.
      65  
      66     In well-formed expressions, {} and [] and () are balanced.
      67   */
      68  
      69  
      70  /* ======================== Reading of characters.  ======================== */
      71  
      72  /* The input file stream.  */
      73  static FILE *fp;
      74  
      75  
      76  /* 1. line_number handling.  */
      77  
      78  static int
      79  phase1_getc ()
      80  {
      81    int c = getc (fp);
      82  
      83    if (c == EOF)
      84      {
      85        if (ferror (fp))
      86          error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
      87                 real_file_name);
      88        return EOF;
      89      }
      90  
      91    if (c == '\n')
      92      line_number++;
      93  
      94    return c;
      95  }
      96  
      97  /* Supports only one pushback character.  */
      98  static void
      99  phase1_ungetc (int c)
     100  {
     101    if (c != EOF)
     102      {
     103        if (c == '\n')
     104          --line_number;
     105  
     106        ungetc (c, fp);
     107      }
     108  }
     109  
     110  
     111  /* Accumulating comments.  */
     112  
     113  static char *buffer;
     114  static size_t bufmax;
     115  static size_t buflen;
     116  
     117  static inline void
     118  comment_start ()
     119  {
     120    buflen = 0;
     121  }
     122  
     123  static inline void
     124  comment_add (int c)
     125  {
     126    if (buflen >= bufmax)
     127      {
     128        bufmax = 2 * bufmax + 10;
     129        buffer = xrealloc (buffer, bufmax);
     130      }
     131    buffer[buflen++] = c;
     132  }
     133  
     134  static inline void
     135  comment_line_end ()
     136  {
     137    while (buflen >= 1
     138           && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
     139      --buflen;
     140    if (buflen >= bufmax)
     141      {
     142        bufmax = 2 * bufmax + 10;
     143        buffer = xrealloc (buffer, bufmax);
     144      }
     145    buffer[buflen] = '\0';
     146    savable_comment_add (buffer);
     147  }
     148  
     149  
     150  /* These are for tracking whether comments count as immediately before
     151     keyword.  */
     152  static int last_comment_line;
     153  static int last_non_comment_line;
     154  
     155  
     156  /* ========================== Reading of tokens.  ========================== */
     157  
     158  
     159  enum token_type_ty
     160  {
     161    token_type_eof,
     162    token_type_uniq,              /* # */
     163    token_type_symbol,            /* symbol */
     164    token_type_string_literal,    /* string, stringconst, symbolconst */
     165    token_type_other              /* misc. operator */
     166  };
     167  typedef enum token_type_ty token_type_ty;
     168  
     169  typedef struct token_ty token_ty;
     170  struct token_ty
     171  {
     172    token_type_ty type;
     173    char *string;         /* for token_type_string_literal, token_type_symbol */
     174    int line_number;
     175  };
     176  
     177  
     178  /* 2. Combine characters into tokens.  Discard comments and whitespace.  */
     179  
     180  static token_ty phase2_pushback[1];
     181  static int phase2_pushback_length;
     182  
     183  static void
     184  phase2_get (token_ty *tp)
     185  {
     186    static char *buffer;
     187    static int bufmax;
     188    int bufpos;
     189    int c;
     190  
     191    if (phase2_pushback_length)
     192      {
     193        *tp = phase2_pushback[--phase2_pushback_length];
     194        return;
     195      }
     196  
     197    tp->string = NULL;
     198  
     199    for (;;)
     200      {
     201        tp->line_number = line_number;
     202        c = phase1_getc ();
     203        switch (c)
     204          {
     205          case EOF:
     206            tp->type = token_type_eof;
     207            return;
     208  
     209          case '"':
     210            {
     211              /* Comment.  */
     212              int lineno;
     213  
     214              comment_start ();
     215              lineno = line_number;
     216              for (;;)
     217                {
     218                  c = phase1_getc ();
     219                  if (c == '"' || c == EOF)
     220                    break;
     221                  if (c == '\n')
     222                    {
     223                      comment_line_end ();
     224                      comment_start ();
     225                    }
     226                  else
     227                    {
     228                      /* We skip all leading white space, but not EOLs.  */
     229                      if (!(buflen == 0 && (c == ' ' || c == '\t')))
     230                        comment_add (c);
     231                    }
     232                }
     233              comment_line_end ();
     234              last_comment_line = lineno;
     235              continue;
     236            }
     237  
     238          case '\n':
     239            if (last_non_comment_line > last_comment_line)
     240              savable_comment_reset ();
     241            FALLTHROUGH;
     242          case ' ':
     243          case '\t':
     244          case '\r':
     245            /* Ignore whitespace.  */
     246            continue;
     247          }
     248  
     249        last_non_comment_line = tp->line_number;
     250  
     251        switch (c)
     252          {
     253          case '\'':
     254            /* String literal.  */
     255            bufpos = 0;
     256            for (;;)
     257              {
     258                c = phase1_getc ();
     259                if (c == EOF)
     260                  break;
     261                if (c == '\'')
     262                  {
     263                    c = phase1_getc ();
     264                    if (c != '\'')
     265                      {
     266                        phase1_ungetc (c);
     267                        break;
     268                      }
     269                  }
     270                if (bufpos >= bufmax)
     271                  {
     272                    bufmax = 2 * bufmax + 10;
     273                    buffer = xrealloc (buffer, bufmax);
     274                  }
     275                buffer[bufpos++] = c;
     276              }
     277            if (bufpos >= bufmax)
     278              {
     279                bufmax = 2 * bufmax + 10;
     280                buffer = xrealloc (buffer, bufmax);
     281              }
     282            buffer[bufpos] = 0;
     283            tp->type = token_type_string_literal;
     284            tp->string = xstrdup (buffer);
     285            return;
     286  
     287          case '+':
     288          case '-':
     289          case '*':
     290          case '/':
     291          case '~':
     292          case '|':
     293          case ',':
     294          case '<':
     295          case '>':
     296          case '=':
     297          case '&':
     298          case '@':
     299          case '?':
     300          case '%':
     301          case '\\':
     302            {
     303              char *name;
     304              int c2 = phase1_getc ();
     305              switch (c2)
     306                {
     307                case '+':
     308                case '-':
     309                case '*':
     310                case '/':
     311                case '~':
     312                case '|':
     313                case ',':
     314                case '<':
     315                case '>':
     316                case '=':
     317                case '&':
     318                case '@':
     319                case '?':
     320                case '%':
     321                  name = XNMALLOC (3, char);
     322                  name[0] = c;
     323                  name[1] = c2;
     324                  name[2] = '\0';
     325                  tp->type = token_type_symbol;
     326                  tp->string = name;
     327                  return;
     328                default:
     329                  phase1_ungetc (c2);
     330                  break;
     331                }
     332              name = XNMALLOC (2, char);
     333              name[0] = c;
     334              name[1] = '\0';
     335              tp->type = token_type_symbol;
     336              tp->string = name;
     337              return;
     338            }
     339  
     340          case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
     341          case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
     342          case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
     343          case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
     344          case 'Y': case 'Z':
     345          case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
     346          case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
     347          case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
     348          case 's': case 't': case 'u': case 'v': case 'w': case 'x':
     349          case 'y': case 'z':
     350            /* Recognize id or id":"[id":"]* or id":"[id":"]*id.  */
     351            bufpos = 0;
     352            for (;;)
     353              {
     354                if (bufpos >= bufmax)
     355                  {
     356                    bufmax = 2 * bufmax + 10;
     357                    buffer = xrealloc (buffer, bufmax);
     358                  }
     359                buffer[bufpos++] = c;
     360                c = phase1_getc ();
     361                switch (c)
     362                  {
     363                  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
     364                  case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
     365                  case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
     366                  case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
     367                  case 'Y': case 'Z':
     368                  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
     369                  case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
     370                  case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
     371                  case 's': case 't': case 'u': case 'v': case 'w': case 'x':
     372                  case 'y': case 'z':
     373                  case '0': case '1': case '2': case '3': case '4':
     374                  case '5': case '6': case '7': case '8': case '9':
     375                    continue;
     376                  case ':':
     377                    if (bufpos >= bufmax)
     378                      {
     379                        bufmax = 2 * bufmax + 10;
     380                        buffer = xrealloc (buffer, bufmax);
     381                      }
     382                    buffer[bufpos++] = c;
     383                    c = phase1_getc ();
     384                    switch (c)
     385                      {
     386                      case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
     387                      case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
     388                      case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
     389                      case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
     390                      case 'Y': case 'Z':
     391                      case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
     392                      case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
     393                      case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
     394                      case 's': case 't': case 'u': case 'v': case 'w': case 'x':
     395                      case 'y': case 'z':
     396                        continue;
     397                      default:
     398                        phase1_ungetc (c);
     399                        break;
     400                      }
     401                    break;
     402                  default:
     403                    phase1_ungetc (c);
     404                    break;
     405                  }
     406                break;
     407              }
     408            if (bufpos >= bufmax)
     409              {
     410                bufmax = 2 * bufmax + 10;
     411                buffer = xrealloc (buffer, bufmax);
     412              }
     413            buffer[bufpos] = '\0';
     414            tp->string = xstrdup (buffer);
     415            tp->type = token_type_symbol;
     416            return;
     417  
     418          case '#':
     419            /* Uniquification operator.  */
     420            tp->type = token_type_uniq;
     421            return;
     422  
     423          case '$':
     424            c = phase1_getc ();
     425            tp->type = token_type_other;
     426            return;
     427  
     428          default:
     429            tp->type = token_type_other;
     430            return;
     431          }
     432      }
     433  }
     434  
     435  /* Supports only one pushback token.  */
     436  static void
     437  phase2_unget (token_ty *tp)
     438  {
     439    if (tp->type != token_type_eof)
     440      {
     441        if (phase2_pushback_length == SIZEOF (phase2_pushback))
     442          abort ();
     443        phase2_pushback[phase2_pushback_length++] = *tp;
     444      }
     445  }
     446  
     447  
     448  /* 3. Combine "# string_literal" and "# symbol" to a single token.  */
     449  
     450  static token_ty phase3_pushback[1];
     451  static int phase3_pushback_length;
     452  
     453  static void
     454  phase3_get (token_ty *tp)
     455  {
     456    if (phase3_pushback_length)
     457      {
     458        *tp = phase3_pushback[--phase3_pushback_length];
     459        return;
     460      }
     461  
     462    phase2_get (tp);
     463    if (tp->type == token_type_uniq)
     464      {
     465        token_ty token2;
     466  
     467        phase2_get (&token2);
     468        if (token2.type == token_type_symbol
     469            || token2.type == token_type_string_literal)
     470          {
     471            tp->type = token_type_string_literal;
     472            tp->string = token2.string;
     473          }
     474        else
     475          phase2_unget (&token2);
     476      }
     477  }
     478  
     479  /* Supports only one pushback token.  */
     480  static void
     481  phase3_unget (token_ty *tp)
     482  {
     483    if (tp->type != token_type_eof)
     484      {
     485        if (phase3_pushback_length == SIZEOF (phase3_pushback))
     486          abort ();
     487        phase3_pushback[phase3_pushback_length++] = *tp;
     488      }
     489  }
     490  
     491  
     492  /* ========================= Extracting strings.  ========================== */
     493  
     494  /* The file is broken into tokens.  Scan the token stream, looking for the
     495     following patterns
     496        NLS ? <string>
     497        NLS at: <string>
     498        NLS at: <string> plural: <string>
     499     where <string> is one of
     500        string_literal
     501        # string_literal
     502        # symbol
     503   */
     504  
     505  void
     506  extract_smalltalk (FILE *f,
     507                     const char *real_filename, const char *logical_filename,
     508                     flag_context_list_table_ty *flag_table,
     509                     msgdomain_list_ty *mdlp)
     510  {
     511    message_list_ty *mlp = mdlp->item[0]->messages;
     512  
     513    fp = f;
     514    real_file_name = real_filename;
     515    logical_file_name = xstrdup (logical_filename);
     516    line_number = 1;
     517  
     518    last_comment_line = -1;
     519    last_non_comment_line = -1;
     520  
     521    phase2_pushback_length = 0;
     522    phase3_pushback_length = 0;
     523  
     524    /* Eat tokens until eof is seen.  */
     525    {
     526      /* 0 when no "NLS" has been seen.
     527         1 after "NLS".
     528         2 after "NLS ?".
     529         3 after "NLS at:".
     530         4 after "NLS at: <string>".
     531         5 after "NLS at: <string> plural:".  */
     532      int state;
     533      /* Remember the message containing the msgid, for msgid_plural.
     534         Non-NULL in states 4, 5.  */
     535      message_ty *plural_mp = NULL;
     536  
     537      /* Start state is 0.  */
     538      state = 0;
     539  
     540      for (;;)
     541        {
     542          token_ty token;
     543  
     544          phase3_get (&token);
     545  
     546          switch (token.type)
     547            {
     548            case token_type_symbol:
     549              state = (strcmp (token.string, "NLS") == 0 ? 1 :
     550                       strcmp (token.string, "?") == 0 && state == 1 ? 2 :
     551                       strcmp (token.string, "at:") == 0 && state == 1 ? 3 :
     552                       strcmp (token.string, "plural:") == 0 && state == 4 ? 5 :
     553                       0);
     554              free (token.string);
     555              break;
     556  
     557            case token_type_string_literal:
     558              if (state == 2)
     559                {
     560                  lex_pos_ty pos;
     561                  pos.file_name = logical_file_name;
     562                  pos.line_number = token.line_number;
     563                  remember_a_message (mlp, NULL, token.string, false, false,
     564                                      null_context, &pos, NULL, savable_comment,
     565                                      false);
     566                  state = 0;
     567                  break;
     568                }
     569              if (state == 3)
     570                {
     571                  lex_pos_ty pos;
     572                  token_ty token2;
     573  
     574                  pos.file_name = logical_file_name;
     575                  pos.line_number = token.line_number;
     576  
     577                  phase3_get (&token2);
     578  
     579                  plural_mp =
     580                    remember_a_message (mlp, NULL, token.string, false,
     581                                        token2.type == token_type_symbol
     582                                        && strcmp (token.string, "plural:") == 0,
     583                                        null_context, &pos,
     584                                        NULL, savable_comment, false);
     585  
     586                  phase3_unget (&token2);
     587  
     588                  state = 4;
     589                  break;
     590                }
     591              if (state == 5)
     592                {
     593                  lex_pos_ty pos;
     594                  pos.file_name = logical_file_name;
     595                  pos.line_number = token.line_number;
     596                  if (plural_mp != NULL)
     597                    remember_a_message_plural (plural_mp, token.string, false,
     598                                               null_context, &pos,
     599                                               savable_comment, false);
     600                  state = 0;
     601                  break;
     602                }
     603              state = 0;
     604              free (token.string);
     605              break;
     606  
     607            case token_type_uniq:
     608            case token_type_other:
     609              state = 0;
     610              break;
     611  
     612            case token_type_eof:
     613              break;
     614  
     615            default:
     616              abort ();
     617            }
     618  
     619          if (token.type == token_type_eof)
     620            break;
     621        }
     622    }
     623  
     624    /* Close scanner.  */
     625    fp = NULL;
     626    real_file_name = NULL;
     627    logical_file_name = NULL;
     628    line_number = 0;
     629  }