1  /* Copyright (C) 1996-2023 Free Software Foundation, Inc.
       2     This file is part of the GNU C Library.
       3  
       4     This program is free software; you can redistribute it and/or modify
       5     it under the terms of the GNU General Public License as published
       6     by the Free Software Foundation; version 2 of the License, or
       7     (at your option) any later version.
       8  
       9     This program is distributed in the hope that it will be useful,
      10     but WITHOUT ANY WARRANTY; without even the implied warranty of
      11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12     GNU General Public License for more details.
      13  
      14     You should have received a copy of the GNU General Public License
      15     along with this program; if not, see <https://www.gnu.org/licenses/>.  */
      16  
      17  #ifdef HAVE_CONFIG_H
      18  # include <config.h>
      19  #endif
      20  
      21  #include <assert.h>
      22  #include <ctype.h>
      23  #include <errno.h>
      24  #include <libintl.h>
      25  #include <stdarg.h>
      26  #include <stdlib.h>
      27  #include <string.h>
      28  #include <stdint.h>
      29  
      30  #include "localedef.h"
      31  #include "charmap.h"
      32  #include "error.h"
      33  #include "linereader.h"
      34  #include "locfile.h"
      35  
/* Prototypes for local functions.  These helpers are defined further
   down in this file but are called earlier (from lr_token and
   get_string), hence the forward declarations.  */

/* Tokenizes a numeric character code introduced by the escape
   character.  */
static struct token *get_toplvl_escape (struct linereader *lr);
/* Tokenizes a symbolic name written as <...>.  */
static struct token *get_symname (struct linereader *lr);
/* Tokenizes a bare word (keyword or identifier).  */
static struct token *get_ident (struct linereader *lr);
/* Tokenizes a double-quoted string.  */
static struct token *get_string (struct linereader *lr,
				 const struct charmap_t *charmap,
				 struct localedef_t *locale,
				 const struct repertoire_t *repertoire,
				 int verbose);
/* Decodes one UTF-8 sequence whose first byte CH1 was already read.  */
static bool utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch);
      46  
      47  
      48  struct linereader *
      49  lr_open (const char *fname, kw_hash_fct_t hf)
      50  {
      51    FILE *fp;
      52  
      53    if (fname == NULL || strcmp (fname, "-") == 0
      54        || strcmp (fname, "/dev/stdin") == 0)
      55      return lr_create (stdin, "<stdin>", hf);
      56    else
      57      {
      58        fp = fopen (fname, "rm");
      59        if (fp == NULL)
      60  	return NULL;
      61        return lr_create (fp, fname, hf);
      62      }
      63  }
      64  
      65  struct linereader *
      66  lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf)
      67  {
      68    struct linereader *result;
      69    int n;
      70  
      71    result = (struct linereader *) xmalloc (sizeof (*result));
      72  
      73    result->fp = fp;
      74    result->fname = xstrdup (fname);
      75    result->buf = NULL;
      76    result->bufsize = 0;
      77    result->lineno = 1;
      78    result->idx = 0;
      79    result->comment_char = '#';
      80    result->escape_char = '\\';
      81    result->translate_strings = 1;
      82    result->return_widestr = 0;
      83  
      84    n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
      85    if (n < 0)
      86      {
      87        int save = errno;
      88        fclose (result->fp);
      89        free ((char *) result->fname);
      90        free (result);
      91        errno = save;
      92        return NULL;
      93      }
      94  
      95    if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
      96      n -= 2;
      97  
      98    result->buf[n] = '\0';
      99    result->bufact = n;
     100    result->hash_fct = hf;
     101  
     102    return result;
     103  }
     104  
     105  
     106  int
     107  lr_eof (struct linereader *lr)
     108  {
     109    return lr->bufact = 0;
     110  }
     111  
     112  
     113  void
     114  lr_ignore_rest (struct linereader *lr, int verbose)
     115  {
     116    if (verbose)
     117      {
     118        while (isspace (lr->buf[lr->idx]) && lr->buf[lr->idx] != '\n'
     119  	     && lr->buf[lr->idx] != lr->comment_char)
     120  	if (lr->buf[lr->idx] == '\0')
     121  	  {
     122  	    if (lr_next (lr) < 0)
     123  	      return;
     124  	  }
     125  	else
     126  	  ++lr->idx;
     127  
     128        if (lr->buf[lr->idx] != '\n' && ! feof (lr->fp)
     129  	  && lr->buf[lr->idx] != lr->comment_char)
     130  	lr_error (lr, _("trailing garbage at end of line"));
     131      }
     132  
     133    /* Ignore continued line.  */
     134    while (lr->bufact > 0 && lr->buf[lr->bufact - 1] != '\n')
     135      if (lr_next (lr) < 0)
     136        break;
     137  
     138    lr->idx = lr->bufact;
     139  }
     140  
     141  
     142  void
     143  lr_close (struct linereader *lr)
     144  {
     145    fclose (lr->fp);
     146    free (lr->buf);
     147    free (lr);
     148  }
     149  
     150  
     151  int
     152  lr_next (struct linereader *lr)
     153  {
     154    int n;
     155  
     156    n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
     157    if (n < 0)
     158      return -1;
     159  
     160    ++lr->lineno;
     161  
     162    if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
     163      {
     164  #if 0
     165        /* XXX Is this correct?  */
     166        /* An escaped newline character is substituted with a single <SP>.  */
     167        --n;
     168        lr->buf[n - 1] = ' ';
     169  #else
     170        n -= 2;
     171  #endif
     172      }
     173  
     174    lr->buf[n] = '\0';
     175    lr->bufact = n;
     176    lr->idx = 0;
     177  
     178    return 0;
     179  }
     180  
     181  
/* Defined in error.c.  */
/* This variable is incremented each time `error' is called.  */
/* NOTE(review): neither of the two variables declared here is
   referenced by the functions visible in this part of the file;
   check whether the declarations are still needed.  */
extern unsigned int error_message_count;

/* The calling program should define program_name and set it to the
   name of the executing program.  */
extern char *program_name;
     189  
     190  
     191  struct token *
     192  lr_token (struct linereader *lr, const struct charmap_t *charmap,
     193  	  struct localedef_t *locale, const struct repertoire_t *repertoire,
     194  	  int verbose)
     195  {
     196    int ch;
     197  
     198    while (1)
     199      {
     200        do
     201  	{
     202  	  ch = lr_getc (lr);
     203  
     204  	  if (ch == EOF)
     205  	    {
     206  	      lr->token.tok = tok_eof;
     207  	      return &lr->token;
     208  	    };
     209  
     210  	  if (ch == '\n')
     211  	    {
     212  	      lr->token.tok = tok_eol;
     213  	      return &lr->token;
     214  	    }
     215  	}
     216        while (isspace (ch));
     217  
     218        if (ch != lr->comment_char)
     219  	break;
     220  
     221        /* Is there an newline at the end of the buffer?  */
     222        if (lr->buf[lr->bufact - 1] != '\n')
     223  	{
     224  	  /* No.  Some people want this to mean that only the line in
     225  	     the file not the logical, concatenated line is ignored.
     226  	     Let's try this.  */
     227  	  lr->idx = lr->bufact;
     228  	  continue;
     229  	}
     230  
     231        /* Ignore rest of line.  */
     232        lr_ignore_rest (lr, 0);
     233        lr->token.tok = tok_eol;
     234        return &lr->token;
     235      }
     236  
     237    /* Match escape sequences.  */
     238    if (ch == lr->escape_char)
     239      return get_toplvl_escape (lr);
     240  
     241    /* Match ellipsis.  */
     242    if (ch == '.')
     243      {
     244        if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0)
     245  	{
     246  	  int cnt;
     247  	  for (cnt = 0; cnt < 10; ++cnt)
     248  	    lr_getc (lr);
     249  	  lr->token.tok = tok_ellipsis4_2;
     250  	  return &lr->token;
     251  	}
     252        if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
     253  	{
     254  	  lr_getc (lr);
     255  	  lr_getc (lr);
     256  	  lr_getc (lr);
     257  	  lr->token.tok = tok_ellipsis4;
     258  	  return &lr->token;
     259  	}
     260        if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
     261  	{
     262  	  lr_getc (lr);
     263  	  lr_getc (lr);
     264  	  lr->token.tok = tok_ellipsis3;
     265  	  return &lr->token;
     266  	}
     267        if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0)
     268  	{
     269  	  int cnt;
     270  	  for (cnt = 0; cnt < 6; ++cnt)
     271  	    lr_getc (lr);
     272  	  lr->token.tok = tok_ellipsis2_2;
     273  	  return &lr->token;
     274  	}
     275        if (lr->buf[lr->idx] == '.')
     276  	{
     277  	  lr_getc (lr);
     278  	  lr->token.tok = tok_ellipsis2;
     279  	  return &lr->token;
     280  	}
     281      }
     282  
     283    switch (ch)
     284      {
     285      case '<':
     286        return get_symname (lr);
     287  
     288      case '0' ... '9':
     289        lr->token.tok = tok_number;
     290        lr->token.val.num = ch - '0';
     291  
     292        while (isdigit (ch = lr_getc (lr)))
     293  	{
     294  	  lr->token.val.num *= 10;
     295  	  lr->token.val.num += ch - '0';
     296  	}
     297        if (isalpha (ch))
     298  	lr_error (lr, _("garbage at end of number"));
     299        lr_ungetn (lr, 1);
     300  
     301        return &lr->token;
     302  
     303      case ';':
     304        lr->token.tok = tok_semicolon;
     305        return &lr->token;
     306  
     307      case ',':
     308        lr->token.tok = tok_comma;
     309        return &lr->token;
     310  
     311      case '(':
     312        lr->token.tok = tok_open_brace;
     313        return &lr->token;
     314  
     315      case ')':
     316        lr->token.tok = tok_close_brace;
     317        return &lr->token;
     318  
     319      case '"':
     320        return get_string (lr, charmap, locale, repertoire, verbose);
     321  
     322      case '-':
     323        ch = lr_getc (lr);
     324        if (ch == '1')
     325  	{
     326  	  lr->token.tok = tok_minus1;
     327  	  return &lr->token;
     328  	}
     329        lr_ungetn (lr, 2);
     330        break;
     331  
     332      case 0x80 ... 0xff:		/* UTF-8 sequence.  */
     333        {
     334  	uint32_t wch;
     335  	if (!utf8_decode (lr, ch, &wch))
     336  	  {
     337  	    lr->token.tok = tok_error;
     338  	    return &lr->token;
     339  	  }
     340  	lr->token.tok = tok_ucs4;
     341  	lr->token.val.ucs4 = wch;
     342  	return &lr->token;
     343        }
     344      }
     345  
     346    return get_ident (lr);
     347  }
     348  
     349  
     350  static struct token *
     351  get_toplvl_escape (struct linereader *lr)
     352  {
     353    /* This is supposed to be a numeric value.  We return the
     354       numerical value and the number of bytes.  */
     355    size_t start_idx = lr->idx - 1;
     356    unsigned char *bytes = lr->token.val.charcode.bytes;
     357    size_t nbytes = 0;
     358    int ch;
     359  
     360    do
     361      {
     362        unsigned int byte = 0;
     363        unsigned int base = 8;
     364  
     365        ch = lr_getc (lr);
     366  
     367        if (ch == 'd')
     368  	{
     369  	  base = 10;
     370  	  ch = lr_getc (lr);
     371  	}
     372        else if (ch == 'x')
     373  	{
     374  	  base = 16;
     375  	  ch = lr_getc (lr);
     376  	}
     377  
     378        if ((base == 16 && !isxdigit (ch))
     379  	  || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
     380  	{
     381  	esc_error:
     382  	  lr->token.val.str.startmb = &lr->buf[start_idx];
     383  
     384  	  while (ch != EOF && !isspace (ch))
     385  	    ch = lr_getc (lr);
     386  	  lr->token.val.str.lenmb = lr->idx - start_idx;
     387  
     388  	  lr->token.tok = tok_error;
     389  	  return &lr->token;
     390  	}
     391  
     392        if (isdigit (ch))
     393  	byte = ch - '0';
     394        else
     395  	byte = tolower (ch) - 'a' + 10;
     396  
     397        ch = lr_getc (lr);
     398        if ((base == 16 && !isxdigit (ch))
     399  	  || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
     400  	goto esc_error;
     401  
     402        byte *= base;
     403        if (isdigit (ch))
     404  	byte += ch - '0';
     405        else
     406  	byte += tolower (ch) - 'a' + 10;
     407  
     408        ch = lr_getc (lr);
     409        if (base != 16 && isdigit (ch))
     410  	{
     411  	  byte *= base;
     412  	  byte += ch - '0';
     413  
     414  	  ch = lr_getc (lr);
     415  	}
     416  
     417        bytes[nbytes++] = byte;
     418      }
     419    while (ch == lr->escape_char
     420  	 && nbytes < (int) sizeof (lr->token.val.charcode.bytes));
     421  
     422    if (!isspace (ch))
     423      lr_error (lr, _("garbage at end of character code specification"));
     424  
     425    lr_ungetn (lr, 1);
     426  
     427    lr->token.tok = tok_charcode;
     428    lr->token.val.charcode.nbytes = nbytes;
     429  
     430    return &lr->token;
     431  }
     432  
/* Multibyte string buffer.  A growable byte array used while the text
   of a token is assembled.  */
struct lr_buffer
{
  /* Number of bytes currently stored in BUF.  */
  size_t act;
  /* Number of bytes allocated for BUF.  */
  size_t max;
  /* Heap-allocated storage.  The contents are not null-terminated
     unless the user appends the terminator.  */
  char *buf;
};
     440  
     441  /* Initialize *LRB with a default-sized buffer.  */
     442  static void
     443  lr_buffer_init (struct lr_buffer *lrb)
     444  {
     445   lrb->act = 0;
     446   lrb->max = 56;
     447   lrb->buf = xmalloc (lrb->max);
     448  }
     449  
     450  /* Transfers the buffer string from *LRB to LR->token.mbstr.  */
     451  static void
     452  lr_buffer_to_token (struct lr_buffer *lrb, struct linereader *lr)
     453  {
     454    lr->token.val.str.startmb = xrealloc (lrb->buf, lrb->act + 1);
     455    lr->token.val.str.startmb[lrb->act] = '\0';
     456    lr->token.val.str.lenmb = lrb->act;
     457  }
     458  
     459  /* Adds CH to *LRB.  */
     460  static void
     461  addc (struct lr_buffer *lrb, char ch)
     462  {
     463    if (lrb->act == lrb->max)
     464      {
     465        lrb->max *= 2;
     466        lrb->buf = xrealloc (lrb->buf, lrb->max);
     467      }
     468    lrb->buf[lrb->act++] = ch;
     469  }
     470  
     471  /* Adds L bytes at S to *LRB.  */
     472  static void
     473  adds (struct lr_buffer *lrb, const unsigned char *s, size_t l)
     474  {
     475    if (lrb->max - lrb->act < l)
     476      {
     477        size_t required_size = lrb->act + l;
     478        size_t new_max = 2 * lrb->max;
     479        if (new_max < required_size)
     480  	new_max = required_size;
     481        lrb->buf = xrealloc (lrb->buf, new_max);
     482        lrb->max = new_max;
     483      }
     484    memcpy (lrb->buf + lrb->act, s, l);
     485    lrb->act += l;
     486  }
     487  
     488  #define ADDWC(ch) \
     489    do									      \
     490      {									      \
     491        if (buf2act == buf2max)						      \
     492  	{								      \
     493  	  buf2max *= 2;							      \
     494  	  buf2 = xrealloc (buf2, buf2max * 4);				      \
     495  	}								      \
     496        buf2[buf2act++] = (ch);						      \
     497      }									      \
     498    while (0)
     499  
     500  
     501  static struct token *
     502  get_symname (struct linereader *lr)
     503  {
     504    /* Symbol in brackets.  We must distinguish three kinds:
     505       1. reserved words
     506       2. ISO 10646 position values
     507       3. all other.  */
     508    const struct keyword_t *kw;
     509    int ch;
     510    struct lr_buffer lrb;
     511  
     512    lr_buffer_init (&lrb);
     513  
     514    do
     515      {
     516        ch = lr_getc (lr);
     517        if (ch == lr->escape_char)
     518  	{
     519  	  int c2 = lr_getc (lr);
     520  	  addc (&lrb, c2);
     521  
     522  	  if (c2 == '\n')
     523  	    ch = '\n';
     524  	}
     525        else
     526  	addc (&lrb, ch);
     527      }
     528    while (ch != '>' && ch != '\n');
     529  
     530    if (ch == '\n')
     531      lr_error (lr, _("unterminated symbolic name"));
     532  
     533    /* Test for ISO 10646 position value.  */
     534    if (lrb.buf[0] == 'U' && (lrb.act == 6 || lrb.act == 10))
     535      {
     536        char *cp = lrb.buf + 1;
     537        while (cp < &lrb.buf[lrb.act - 1] && isxdigit (*cp))
     538  	++cp;
     539  
     540        if (cp == &lrb.buf[lrb.act - 1])
     541  	{
     542  	  /* Yes, it is.  */
     543  	  lr->token.tok = tok_ucs4;
     544  	  lr->token.val.ucs4 = strtoul (lrb.buf + 1, NULL, 16);
     545  
     546  	  return &lr->token;
     547  	}
     548      }
     549  
     550    /* It is a symbolic name.  Test for reserved words.  */
     551    kw = lr->hash_fct (lrb.buf, lrb.act - 1);
     552  
     553    if (kw != NULL && kw->symname_or_ident == 1)
     554      {
     555        lr->token.tok = kw->token;
     556        free (lrb.buf);
     557      }
     558    else
     559      {
     560        lr->token.tok = tok_bsymbol;
     561        lr_buffer_to_token (&lrb, lr);
     562        --lr->token.val.str.lenmb;  /* Hide the training '>'.  */
     563      }
     564  
     565    return &lr->token;
     566  }
     567  
     568  
     569  static struct token *
     570  get_ident (struct linereader *lr)
     571  {
     572    const struct keyword_t *kw;
     573    int ch;
     574    struct lr_buffer lrb;
     575  
     576    lr_buffer_init (&lrb);
     577  
     578    addc (&lrb, lr->buf[lr->idx - 1]);
     579  
     580    while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
     581  	 && ch != '<' && ch != ',' && ch != EOF)
     582      {
     583        if (ch == lr->escape_char)
     584  	{
     585  	  ch = lr_getc (lr);
     586  	  if (ch == '\n' || ch == EOF)
     587  	    {
     588  	      lr_error (lr, _("invalid escape sequence"));
     589  	      break;
     590  	    }
     591  	}
     592        addc (&lrb, ch);
     593      }
     594  
     595    lr_ungetc (lr, ch);
     596  
     597    kw = lr->hash_fct (lrb.buf, lrb.act);
     598  
     599    if (kw != NULL && kw->symname_or_ident == 0)
     600      {
     601        lr->token.tok = kw->token;
     602        free (lrb.buf);
     603      }
     604    else
     605      {
     606        lr->token.tok = tok_ident;
     607        lr_buffer_to_token (&lrb, lr);
     608      }
     609  
     610    return &lr->token;
     611  }
     612  
     613  /* Process a decoded Unicode codepoint WCH in a string, placing the
     614     multibyte sequence into LRB.  Return false if the character is not
     615     found in CHARMAP/REPERTOIRE.  */
     616  static bool
     617  translate_unicode_codepoint (struct localedef_t *locale,
     618  			     const struct charmap_t *charmap,
     619  			     const struct repertoire_t *repertoire,
     620  			     uint32_t wch, struct lr_buffer *lrb)
     621  {
     622    /* See whether the charmap contains the Uxxxxxxxx names.  */
     623    char utmp[10];
     624    snprintf (utmp, sizeof (utmp), "U%08X", wch);
     625    struct charseq *seq = charmap_find_value (charmap, utmp, 9);
     626  
     627    if (seq == NULL)
     628      {
     629        /* No, this isn't the case.  Now determine from
     630  	 the repertoire the name of the character and
     631  	 find it in the charmap.  */
     632        if (repertoire != NULL)
     633  	{
     634  	  const char *symbol = repertoire_find_symbol (repertoire, wch);
     635  	  if (symbol != NULL)
     636  	    seq = charmap_find_value (charmap, symbol, strlen (symbol));
     637  	}
     638  
     639        if (seq == NULL)
     640  	{
     641  #ifndef NO_TRANSLITERATION
     642  	  /* Transliterate if possible.  */
     643  	  if (locale != NULL)
     644  	    {
     645  	      if ((locale->avail & CTYPE_LOCALE) == 0)
     646  		{
     647  		  /* Load the CTYPE data now.  */
     648  		  int old_needed = locale->needed;
     649  
     650  		  locale->needed = 0;
     651  		  locale = load_locale (LC_CTYPE, locale->name,
     652  					locale->repertoire_name,
     653  					charmap, locale);
     654  		  locale->needed = old_needed;
     655  		}
     656  
     657  	      uint32_t *translit;
     658  	      if ((locale->avail & CTYPE_LOCALE) != 0
     659  		  && ((translit = find_translit (locale, charmap, wch))
     660  		      != NULL))
     661  		/* The CTYPE data contains a matching
     662  		   transliteration.  */
     663  		{
     664  		  for (int i = 0; translit[i] != 0; ++i)
     665  		    {
     666  		      snprintf (utmp, sizeof (utmp), "U%08X", translit[i]);
     667  		      seq = charmap_find_value (charmap, utmp, 9);
     668  		      assert (seq != NULL);
     669  		      adds (lrb, seq->bytes, seq->nbytes);
     670  		    }
     671  		  return true;
     672  		}
     673  	    }
     674  #endif	/* NO_TRANSLITERATION */
     675  
     676  	  /* Not a known name.  */
     677  	  return false;
     678  	}
     679      }
     680  
     681    if (seq != NULL)
     682      {
     683        adds (lrb, seq->bytes, seq->nbytes);
     684        return true;
     685      }
     686    else
     687      return false;
     688  }
     689  
     690  /* Returns true if ch is not EOF (that is, non-negative) and a valid
     691     UTF-8 trailing byte.  */
     692  static bool
     693  utf8_valid_trailing (int ch)
     694  {
     695    return ch >= 0 && (ch & 0xc0) == 0x80;
     696  }
     697  
     698  /* Reports an error for a broken UTF-8 sequence.  CH2 to CH4 may be
     699     EOF.  Always returns false.  */
     700  static bool
     701  utf8_sequence_error (struct linereader *lr, uint8_t ch1, int ch2, int ch3,
     702  		     int ch4)
     703  {
     704    char buf[38];
     705  
     706    if (ch2 < 0)
     707      snprintf (buf, sizeof (buf), "0x%02x", ch1);
     708    else if (ch3 < 0)
     709      snprintf (buf, sizeof (buf), "0x%02x 0x%02x", ch1, ch2);
     710    else if (ch4 < 0)
     711      snprintf (buf, sizeof (buf), "0x%02x 0x%02x 0x%02x", ch1, ch2, ch3);
     712    else
     713      snprintf (buf, sizeof (buf), "0x%02x 0x%02x 0x%02x 0x%02x",
     714  	      ch1, ch2, ch3, ch4);
     715  
     716    lr_error (lr, _("invalid UTF-8 sequence %s"), buf);
     717    return false;
     718  }
     719  
     720  /* Reads a UTF-8 sequence from LR, with the leading byte CH1, and
     721     stores the decoded codepoint in *WCH.  Returns false on failure and
     722     reports an error.  */
     723  static bool
     724  utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch)
     725  {
     726    /* See RFC 3629 section 4 and __gconv_transform_utf8_internal.  */
     727    if (ch1 < 0xc2)
     728      return utf8_sequence_error (lr, ch1, -1, -1, -1);
     729  
     730    int ch2 = lr_getc (lr);
     731    if (!utf8_valid_trailing (ch2))
     732      return utf8_sequence_error (lr, ch1, ch2, -1, -1);
     733  
     734    if (ch1 <= 0xdf)
     735      {
     736        uint32_t result = ((ch1 & 0x1f)  << 6) | (ch2 & 0x3f);
     737        if (result < 0x80)
     738  	return utf8_sequence_error (lr, ch1, ch2, -1, -1);
     739        *wch = result;
     740        return true;
     741      }
     742  
     743    int ch3 = lr_getc (lr);
     744    if (!utf8_valid_trailing (ch3) || ch1 < 0xe0)
     745      return utf8_sequence_error (lr, ch1, ch2, ch3, -1);
     746  
     747    if (ch1 <= 0xef)
     748      {
     749        uint32_t result = (((ch1 & 0x0f)  << 12)
     750  			 | ((ch2 & 0x3f) << 6)
     751  			 | (ch3 & 0x3f));
     752        if (result < 0x800)
     753  	return utf8_sequence_error (lr, ch1, ch2, ch3, -1);
     754        *wch = result;
     755        return true;
     756      }
     757  
     758    int ch4 = lr_getc (lr);
     759    if (!utf8_valid_trailing (ch4) || ch1 < 0xf0 || ch1 > 0xf4)
     760      return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);
     761  
     762    uint32_t result = (((ch1 & 0x07)  << 18)
     763  		     | ((ch2 & 0x3f) << 12)
     764  		     | ((ch3 & 0x3f) << 6)
     765  		     | (ch4 & 0x3f));
     766    if (result < 0x10000)
     767      return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);
     768    *wch = result;
     769    return true;
     770  }
     771  
     772  static struct token *
     773  get_string (struct linereader *lr, const struct charmap_t *charmap,
     774  	    struct localedef_t *locale, const struct repertoire_t *repertoire,
     775  	    int verbose)
     776  {
     777    int return_widestr = lr->return_widestr;
     778    struct lr_buffer lrb;
     779    wchar_t *buf2 = NULL;
     780  
     781    lr_buffer_init (&lrb);
     782  
     783    /* We know it'll be a string.  */
     784    lr->token.tok = tok_string;
     785  
     786    /* If we need not translate the strings (i.e., expand <...> parts)
     787       we can run a simple loop.  */
     788    if (!lr->translate_strings)
     789      {
     790        int ch;
     791  
     792        buf2 = NULL;
     793        while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
     794  	{
     795  	  if (ch >= 0x80)
     796  	    lr_error (lr, _("illegal 8-bit character in untranslated string"));
     797  	  addc (&lrb, ch);
     798  	}
     799  
     800        /* Catch errors with trailing escape character.  */
     801        if (lrb.act > 0 && lrb.buf[lrb.act - 1] == lr->escape_char
     802  	  && (lrb.act == 1 || lrb.buf[lrb.act - 2] != lr->escape_char))
     803  	{
     804  	  lr_error (lr, _("illegal escape sequence at end of string"));
     805  	  --lrb.act;
     806  	}
     807        else if (ch == '\n' || ch == EOF)
     808  	lr_error (lr, _("unterminated string"));
     809  
     810        addc (&lrb, '\0');
     811      }
     812    else
     813      {
     814        bool illegal_string = false;
     815        size_t buf2act = 0;
     816        size_t buf2max = 56 * sizeof (uint32_t);
     817        int ch;
     818  
     819        /* We have to provide the wide character result as well.  */
     820        if (return_widestr)
     821  	buf2 = xmalloc (buf2max);
     822  
     823        /* Read until the end of the string (or end of the line or file).  */
     824        while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
     825  	{
     826  	  size_t startidx;
     827  	  uint32_t wch;
     828  	  struct charseq *seq;
     829  
     830  	  if (ch != '<')
     831  	    {
     832  	      /* The standards leave it up to the implementation to
     833  		 decide what to do with characters which stand for
     834  		 themselves.  This implementation treats the input
     835  		 file as encoded in UTF-8.  */
     836  	      if (ch == lr->escape_char)
     837  		{
     838  		  ch = lr_getc (lr);
     839  		  if (ch >= 0x80)
     840  		    {
     841  		      lr_error (lr, _("illegal 8-bit escape sequence"));
     842  		      illegal_string = true;
     843  		      break;
     844  		    }
     845  		  if (ch == '\n' || ch == EOF)
     846  		    break;
     847  		  addc (&lrb, ch);
     848  		  wch = ch;
     849  		}
     850  	      else if (ch < 0x80)
     851  		{
     852  		  wch = ch;
     853  		  addc (&lrb, ch);
     854  		}
     855  	      else 		/* UTF-8 sequence.  */
     856  		{
     857  		  if (!utf8_decode (lr, ch, &wch))
     858  		    {
     859  		      illegal_string = true;
     860  		      break;
     861  		    }
     862  		  if (!translate_unicode_codepoint (locale, charmap,
     863  						    repertoire, wch, &lrb))
     864  		    {
     865  		      /* Ignore the rest of the string.  Callers may
     866  			 skip this string because it cannot be encoded
     867  			 in the output character set.  */
     868  		      illegal_string = true;
     869  		      continue;
     870  		    }
     871  		}
     872  
     873  	      if (return_widestr)
     874  		ADDWC (wch);
     875  
     876  	      continue;
     877  	    }
     878  
     879  	  /* Now we have to search for the end of the symbolic name, i.e.,
     880  	     the closing '>'.  */
     881  	  startidx = lrb.act;
     882  	  while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
     883  	    {
     884  	      if (ch == lr->escape_char)
     885  		{
     886  		  ch = lr_getc (lr);
     887  		  if (ch == '\n' || ch == EOF)
     888  		    break;
     889  		}
     890  	      addc (&lrb, ch);
     891  	    }
     892  	  if (ch == '\n' || ch == EOF)
     893  	    /* Not a correct string.  */
     894  	    break;
     895  	  if (lrb.act == startidx)
     896  	    {
     897  	      /* <> is no correct name.  Ignore it and also signal an
     898  		 error.  */
     899  	      illegal_string = true;
     900  	      continue;
     901  	    }
     902  
     903  	  /* It might be a Uxxxx symbol.  */
     904  	  if (lrb.buf[startidx] == 'U'
     905  	      && (lrb.act - startidx == 5 || lrb.act - startidx == 9))
     906  	    {
     907  	      char *cp = lrb.buf + startidx + 1;
     908  	      while (cp < &lrb.buf[lrb.act] && isxdigit (*cp))
     909  		++cp;
     910  
     911  	      if (cp == &lrb.buf[lrb.act])
     912  		{
     913  		  /* Yes, it is.  */
     914  		  addc (&lrb, '\0');
     915  		  wch = strtoul (lrb.buf + startidx + 1, NULL, 16);
     916  
     917  		  /* Now forget about the name we just added.  */
     918  		  lrb.act = startidx;
     919  
     920  		  if (return_widestr)
     921  		    ADDWC (wch);
     922  
     923  		  if (!translate_unicode_codepoint (locale, charmap,
     924  						    repertoire, wch, &lrb))
     925  		    illegal_string = true;
     926  		  continue;
     927  		}
     928  	    }
     929  
     930  	  /* We now have the symbolic name in lrb.buf[startidx] to
     931  	     lrb.buf[lrb.act-1].  Now find out the value for this character
     932  	     in the charmap as well as in the repertoire map (in this
     933  	     order).  */
     934  	  seq = charmap_find_value (charmap, &lrb.buf[startidx],
     935  				    lrb.act - startidx);
     936  
     937  	  if (seq == NULL)
     938  	    {
     939  	      /* This name is not in the charmap.  */
     940  	      lr_error (lr, _("symbol `%.*s' not in charmap"),
     941  			(int) (lrb.act - startidx), &lrb.buf[startidx]);
     942  	      illegal_string = true;
     943  	    }
     944  
     945  	  if (return_widestr)
     946  	    {
     947  	      /* Now the same for the multibyte representation.  */
     948  	      if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
     949  		wch = seq->ucs4;
     950  	      else
     951  		{
     952  		  wch = repertoire_find_value (repertoire, &lrb.buf[startidx],
     953  					       lrb.act - startidx);
     954  		  if (seq != NULL)
     955  		    seq->ucs4 = wch;
     956  		}
     957  
     958  	      if (wch == ILLEGAL_CHAR_VALUE)
     959  		{
     960  		  /* This name is not in the repertoire map.  */
     961  		  lr_error (lr, _("symbol `%.*s' not in repertoire map"),
     962  			    (int) (lrb.act - startidx), &lrb.buf[startidx]);
     963  		  illegal_string = true;
     964  		}
     965  	      else
     966  		ADDWC (wch);
     967  	    }
     968  
     969  	  /* Now forget about the name we just added.  */
     970  	  lrb.act = startidx;
     971  
     972  	  /* And copy the bytes.  */
     973  	  if (seq != NULL)
     974  	    adds (&lrb, seq->bytes, seq->nbytes);
     975  	}
     976  
     977        if (ch == '\n' || ch == EOF)
     978  	{
     979  	  lr_error (lr, _("unterminated string"));
     980  	  illegal_string = true;
     981  	}
     982  
     983        if (illegal_string)
     984  	{
     985  	  free (lrb.buf);
     986  	  free (buf2);
     987  	  lr->token.val.str.startmb = NULL;
     988  	  lr->token.val.str.lenmb = 0;
     989  	  lr->token.val.str.startwc = NULL;
     990  	  lr->token.val.str.lenwc = 0;
     991  
     992  	  return &lr->token;
     993  	}
     994  
     995        addc (&lrb, '\0');
     996  
     997        if (return_widestr)
     998  	{
     999  	  ADDWC (0);
    1000  	  lr->token.val.str.startwc = xrealloc (buf2,
    1001  						buf2act * sizeof (uint32_t));
    1002  	  lr->token.val.str.lenwc = buf2act;
    1003  	}
    1004      }
    1005  
    1006    lr_buffer_to_token (&lrb, lr);
    1007  
    1008    return &lr->token;
    1009  }