(root)/
glibc-2.38/
locale/
programs/
charmap.c
       1  /* Copyright (C) 1996-2023 Free Software Foundation, Inc.
       2     This file is part of the GNU C Library.
       3  
       4     This program is free software; you can redistribute it and/or modify
       5     it under the terms of the GNU General Public License as published
       6     by the Free Software Foundation; version 2 of the License, or
       7     (at your option) any later version.
       8  
       9     This program is distributed in the hope that it will be useful,
      10     but WITHOUT ANY WARRANTY; without even the implied warranty of
      11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12     GNU General Public License for more details.
      13  
      14     You should have received a copy of the GNU General Public License
      15     along with this program; if not, see <https://www.gnu.org/licenses/>.  */
      16  
      17  #ifdef HAVE_CONFIG_H
      18  # include <config.h>
      19  #endif
      20  
      21  #include <ctype.h>
      22  #include <errno.h>
      23  #include <libintl.h>
      24  #include <limits.h>
      25  #include <stdio.h>
      26  #include <stdlib.h>
      27  #include <string.h>
      28  #include <stdint.h>
      29  
      30  #include "localedef.h"
      31  #include "linereader.h"
      32  #include "charmap.h"
      33  #include "charmap-dir.h"
      34  
      35  #include <assert.h>
      36  
      37  
      38  /* Define the lookup function.  */
      39  #include "charmap-kw.h"
      40  
      41  
      42  /* Prototypes for local functions.  */
      43  static struct charmap_t *parse_charmap (struct linereader *cmfile,
      44  					int verbose, int be_quiet);
      45  static void new_width (struct linereader *cmfile, struct charmap_t *result,
      46  		       const char *from, const char *to,
      47  		       unsigned long int width);
      48  static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
      49  			      size_t nbytes, unsigned char *bytes,
      50  			      const char *from, const char *to,
      51  			      int decimal_ellipsis, int step);
      52  
      53  
/* Set to true by charmap_read when the charmap it loaded fails the
   ASCII compatibility test for the basic character set.  It is set
   regardless of whether the corresponding warning is enabled.  */
      54  bool enc_not_ascii_compatible;
      55  
      56  
      57  #ifdef NEED_NULL_POINTER
      58  static const char *null_pointer;
      59  #endif
      60  
      61  static struct linereader *
      62  cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
      63  {
      64    FILE *fp;
      65  
      66    fp = charmap_open (directory, name);
      67    if (fp == NULL)
      68      return NULL;
      69    else
      70      {
      71        size_t dlen = strlen (directory);
      72        int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
      73        size_t nlen = strlen (name);
      74        char *pathname;
      75        char *p;
      76  
      77        pathname = alloca (dlen + add_slash + nlen + 1);
      78        p = stpcpy (pathname, directory);
      79        if (add_slash)
      80  	*p++ = '/';
      81        stpcpy (p, name);
      82  
      83        return lr_create (fp, pathname, hf);
      84      }
      85  }
      86  
      87  struct charmap_t *
      88  charmap_read (const char *filename, int verbose, int error_not_found,
      89  	      int be_quiet, int use_default)
      90  {
      91    struct charmap_t *result = NULL;
      92  
      93    if (filename != NULL)
      94      {
      95        struct linereader *cmfile;
      96  
      97        /* First try the name as found in the parameter.  */
      98        cmfile = lr_open (filename, charmap_hash);
      99        if (cmfile == NULL)
     100  	{
     101  	  /* No successful.  So start looking through the directories
     102  	     in the I18NPATH if this is a simple name.  */
     103  	  if (strchr (filename, '/') == NULL)
     104  	    {
     105  	      char *i18npath = getenv ("I18NPATH");
     106  	      if (i18npath != NULL && *i18npath != '\0')
     107  		{
     108  		  const size_t pathlen = strlen (i18npath);
     109  		  char i18npathbuf[pathlen + 1];
     110  		  char path[pathlen + sizeof ("/charmaps")];
     111  		  char *next;
     112  		  i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
     113  
     114  		  while (cmfile == NULL
     115  			 && (next = strsep (&i18npath, ":")) != NULL)
     116  		    {
     117  		      stpcpy (stpcpy (path, next), "/charmaps");
     118  		      cmfile = cmlr_open (path, filename, charmap_hash);
     119  
     120  		      if (cmfile == NULL)
     121  			/* Try without the "/charmaps" part.  */
     122  			cmfile = cmlr_open (next, filename, charmap_hash);
     123  		    }
     124  		}
     125  
     126  	      if (cmfile == NULL)
     127  		/* Try the default directory.  */
     128  		cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
     129  	    }
     130  	}
     131  
     132        if (cmfile != NULL)
     133  	result = parse_charmap (cmfile, verbose, be_quiet);
     134  
     135        if (result == NULL && error_not_found)
     136  	record_error (0, errno,
     137  		      _("character map file `%s' not found"),
     138  		      filename);
     139      }
     140  
     141    if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
     142      {
     143        /* OK, one more try.  We also accept the names given to the
     144  	 character sets in the files.  Sometimes they differ from the
     145  	 file name.  */
     146        CHARMAP_DIR *dir;
     147  
     148        dir = charmap_opendir (CHARMAP_PATH);
     149        if (dir != NULL)
     150  	{
     151  	  const char *dirent;
     152  
     153  	  while ((dirent = charmap_readdir (dir)) != NULL)
     154  	    {
     155  	      char **aliases;
     156  	      char **p;
     157  	      int found;
     158  
     159  	      aliases = charmap_aliases (CHARMAP_PATH, dirent);
     160  	      found = 0;
     161  	      for (p = aliases; *p; p++)
     162  		if (strcasecmp (*p, filename) == 0)
     163  		  {
     164  		    found = 1;
     165  		    break;
     166  		  }
     167  	      charmap_free_aliases (aliases);
     168  
     169  	      if (found)
     170  		{
     171  		  struct linereader *cmfile;
     172  
     173  		  cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
     174  		  if (cmfile != NULL)
     175  		    result = parse_charmap (cmfile, verbose, be_quiet);
     176  
     177  		  break;
     178  		}
     179  	    }
     180  
     181  	  charmap_closedir (dir);
     182  	}
     183      }
     184  
     185    if (result == NULL && DEFAULT_CHARMAP != NULL)
     186      {
     187        struct linereader *cmfile;
     188  
     189        cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
     190        if (cmfile != NULL)
     191  	result = parse_charmap (cmfile, verbose, be_quiet);
     192  
     193        if (result == NULL)
     194  	record_error (4, errno,
     195  		      _("default character map file `%s' not found"),
     196  		      DEFAULT_CHARMAP);
     197      }
     198  
     199    if (result != NULL && result->code_set_name == NULL)
     200      /* The input file does not specify a code set name.  This
     201         shouldn't happen but we should cope with it.  */
     202      result->code_set_name = basename (filename);
     203  
     204    /* Test of ASCII compatibility of locale encoding.
     205  
     206       Verify that the encoding to be used in a locale is ASCII compatible,
     207       at least for the graphic characters, excluding the control characters,
     208       '$' and '@'.  This constraint comes from an ISO C 99 restriction.
     209  
     210       ISO C 99 section 7.17.(2) (about wchar_t):
     211         the null character shall have the code value zero and each member of
     212         the basic character set shall have a code value equal to its value
     213         when used as the lone character in an integer character constant.
     214       ISO C 99 section 5.2.1.(3):
     215         Both the basic source and basic execution character sets shall have
     216         the following members: the 26 uppercase letters of the Latin alphabet
     217              A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
     218         the 26 lowercase letters of the Latin alphabet
     219              a b c d e f g h i j k l m n o p q r s t u v w x y z
     220         the 10 decimal digits
     221              0 1 2 3 4 5 6 7 8 9
     222         the following 29 graphic characters
     223              ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
     224         the space character, and control characters representing horizontal
     225         tab, vertical tab, and form feed.
     226  
     227       Therefore, for all members of the "basic character set", the 'char' code
     228       must have the same value as the 'wchar_t' code, which in glibc is the
     229       same as the Unicode code, which for all of the enumerated characters
     230       is identical to the ASCII code. */
     231    if (result != NULL && use_default)
     232      {
     233        static const char basic_charset[] =
     234  	{
     235  	  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
     236  	  'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
     237  	  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
     238  	  'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
     239  	  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
     240  	  '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
     241  	  '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
     242  	  '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
     243  	};
     244        int failed = 0;
     245        const char *p = basic_charset;
     246  
     247        do
     248  	{
     249  	  struct charseq *seq = charmap_find_symbol (result, p, 1);
     250  
     251  	  if (seq == NULL || seq->ucs4 != (uint32_t) *p)
     252  	    failed = 1;
     253  	}
     254        while (*p++ != '\0');
     255  
     256        if (failed)
     257  	{
     258  	  /* A user may disable the ASCII compatibility warning check,
     259  	     but we must remember that the encoding is not ASCII
     260  	     compatible, since it may have other implications.  Later
     261  	     we will set _NL_CTYPE_MAP_TO_NONASCII from this value.  */
     262  	  if (warn_ascii)
     263  	    record_warning (_(
     264  "character map `%s' is not ASCII compatible, locale not ISO C compliant "
     265  "[--no-warnings=ascii]"),
     266  			    result->code_set_name);
     267  	  enc_not_ascii_compatible = true;
     268  	}
     269      }
     270  
     271    return result;
     272  }
     273  
     274  
     275  static struct charmap_t *
     276  parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
     277  {
     278    struct charmap_t *result;
     279    int state;
     280    enum token_t expected_tok = tok_error;
     281    const char *expected_str = NULL;
     282    char *from_name = NULL;
     283    char *to_name = NULL;
     284    enum token_t ellipsis = 0;
     285    int step = 1;
     286  
     287    /* We don't want symbolic names in string to be translated.  */
     288    cmfile->translate_strings = 0;
     289  
     290    /* Allocate room for result.  */
     291    result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
     292    memset (result, '\0', sizeof (struct charmap_t));
     293    /* The default DEFAULT_WIDTH is 1.  */
     294    result->width_default = 1;
     295  
     296  #define obstack_chunk_alloc malloc
     297  #define obstack_chunk_free free
     298    obstack_init (&result->mem_pool);
     299  
     300    if (init_hash (&result->char_table, 256)
     301        || init_hash (&result->byte_table, 256))
     302      {
     303        free (result);
     304        return NULL;
     305      }
     306  
     307    /* We use a state machine to describe the charmap description file
     308       format.  */
     309    state = 1;
     310    while (1)
     311      {
     312        /* What's on?  */
     313        struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
     314        enum token_t nowtok = now->tok;
     315        struct token *arg;
     316  
     317        if (nowtok == tok_eof)
     318  	break;
     319  
     320        switch (state)
     321  	{
     322  	case 1:
     323  	  /* The beginning.  We expect the special declarations, EOL or
     324  	     `CHARMAP'.  */
     325  	  if (nowtok == tok_eol)
     326  	    /* Ignore empty lines.  */
     327  	    continue;
     328  
     329  	  if (nowtok == tok_charmap)
     330  	    {
     331  	      from_name = NULL;
     332  	      to_name = NULL;
     333  
     334  	      /* We have to set up the real work.  Fill in some
     335  		 default values.  */
     336  	      if (result->mb_cur_max == 0)
     337  		result->mb_cur_max = 1;
     338  	      if (result->mb_cur_min == 0)
     339  		result->mb_cur_min = result->mb_cur_max;
     340  	      if (result->mb_cur_min > result->mb_cur_max)
     341  		{
     342  		  record_error (0, 0, _("\
     343  %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
     344  				cmfile->fname);
     345  
     346  		  result->mb_cur_min = result->mb_cur_max;
     347  		}
     348  
     349  	      lr_ignore_rest (cmfile, 1);
     350  
     351  	      state = 2;
     352  	      continue;
     353  	    }
     354  
     355  	  if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
     356  	      && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
     357  	      && nowtok != tok_comment_char && nowtok != tok_g0esc
     358  	      && nowtok != tok_g1esc && nowtok != tok_g2esc
     359  	      && nowtok != tok_g3esc && nowtok != tok_repertoiremap
     360  	      && nowtok != tok_include)
     361  	    {
     362  	      lr_error (cmfile, _("syntax error in prolog: %s"),
     363  			_("invalid definition"));
     364  
     365  	      lr_ignore_rest (cmfile, 0);
     366  	      continue;
     367  	    }
     368  
     369  	  /* We know that we need an argument.  */
     370  	  arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
     371  
     372  	  switch (nowtok)
     373  	    {
     374  	    case tok_code_set_name:
     375  	    case tok_repertoiremap:
     376  	      if (arg->tok != tok_ident && arg->tok != tok_string)
     377  		{
     378  		badarg:
     379  		  lr_error (cmfile, _("syntax error in prolog: %s"),
     380  			    _("bad argument"));
     381  
     382  		  lr_ignore_rest (cmfile, 0);
     383  		  continue;
     384  		}
     385  
     386  	      if (nowtok == tok_code_set_name)
     387  		result->code_set_name = obstack_copy0 (&result->mem_pool,
     388  						       arg->val.str.startmb,
     389  						       arg->val.str.lenmb);
     390  	      else
     391  		result->repertoiremap = obstack_copy0 (&result->mem_pool,
     392  						       arg->val.str.startmb,
     393  						       arg->val.str.lenmb);
     394  
     395  	      lr_ignore_rest (cmfile, 1);
     396  	      continue;
     397  
     398  	    case tok_mb_cur_max:
     399  	    case tok_mb_cur_min:
     400  	      if (arg->tok != tok_number)
     401  		goto badarg;
     402  
     403  	      if ((nowtok == tok_mb_cur_max
     404  		       && result->mb_cur_max != 0)
     405  		      || (nowtok == tok_mb_cur_max
     406  			  && result->mb_cur_max != 0))
     407  		lr_error (cmfile, _("duplicate definition of <%s>"),
     408  			  nowtok == tok_mb_cur_min
     409  			  ? "mb_cur_min" : "mb_cur_max");
     410  
     411  	      if (arg->val.num < 1)
     412  		{
     413  		  lr_error (cmfile,
     414  			    _("value for <%s> must be 1 or greater"),
     415  			    nowtok == tok_mb_cur_min
     416  			    ? "mb_cur_min" : "mb_cur_max");
     417  
     418  		  lr_ignore_rest (cmfile, 0);
     419  		  continue;
     420  		}
     421  	      if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
     422  		   && (int) arg->val.num < result->mb_cur_min)
     423  		  || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
     424  		      && (int) arg->val.num > result->mb_cur_max))
     425  		{
     426  		  lr_error (cmfile, _("\
     427  value of <%s> must be greater or equal than the value of <%s>"),
     428  			    "mb_cur_max", "mb_cur_min");
     429  
     430  		  lr_ignore_rest (cmfile, 0);
     431  		  continue;
     432  		}
     433  
     434  	      if (nowtok == tok_mb_cur_max)
     435  		result->mb_cur_max = arg->val.num;
     436  	      else
     437  		result->mb_cur_min = arg->val.num;
     438  
     439  	      lr_ignore_rest (cmfile, 1);
     440  	      continue;
     441  
     442  	    case tok_escape_char:
     443  	    case tok_comment_char:
     444  	      if (arg->tok != tok_ident)
     445  		goto badarg;
     446  
     447  	      if (arg->val.str.lenmb != 1)
     448  		{
     449  		  lr_error (cmfile, _("\
     450  argument to <%s> must be a single character"),
     451  			    nowtok == tok_escape_char ? "escape_char"
     452  						      : "comment_char");
     453  
     454  		  lr_ignore_rest (cmfile, 0);
     455  		  continue;
     456  		}
     457  
     458  	      if (nowtok == tok_escape_char)
     459  		cmfile->escape_char = *arg->val.str.startmb;
     460  	      else
     461  		cmfile->comment_char = *arg->val.str.startmb;
     462  
     463  	      lr_ignore_rest (cmfile, 1);
     464  	      continue;
     465  
     466  	    case tok_g0esc:
     467  	    case tok_g1esc:
     468  	    case tok_g2esc:
     469  	    case tok_g3esc:
     470  	    case tok_escseq:
     471  	      lr_ignore_rest (cmfile, 0); /* XXX */
     472  	      continue;
     473  
     474  	    case tok_include:
     475  	      lr_error (cmfile, _("\
     476  character sets with locking states are not supported"));
     477  	      exit (4);
     478  
     479  	    default:
     480  	      /* Cannot happen.  */
     481  	      assert (! "Should not happen");
     482  	    }
     483  	  break;
     484  
     485  	case 2:
     486  	  /* We have seen `CHARMAP' and now are in the body.  Each line
     487  	     must have the format "%s %s %s\n" or "%s...%s %s %s\n".  */
     488  	  if (nowtok == tok_eol)
     489  	    /* Ignore empty lines.  */
     490  	    continue;
     491  
     492  	  if (nowtok == tok_end)
     493  	    {
     494  	      expected_tok = tok_charmap;
     495  	      expected_str = "CHARMAP";
     496  	      state = 90;
     497  	      continue;
     498  	    }
     499  
     500  	  if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
     501  	    {
     502  	      lr_error (cmfile, _("syntax error in %s definition: %s"),
     503  			"CHARMAP", _("no symbolic name given"));
     504  
     505  	      lr_ignore_rest (cmfile, 0);
     506  	      continue;
     507  	    }
     508  
     509  	  /* If the previous line was not completely correct free the
     510  	     used memory.  */
     511  	  if (from_name != NULL)
     512  	    obstack_free (&result->mem_pool, from_name);
     513  
     514  	  if (nowtok == tok_bsymbol)
     515  	    from_name = (char *) obstack_copy0 (&result->mem_pool,
     516  						now->val.str.startmb,
     517  						now->val.str.lenmb);
     518  	  else
     519  	    {
     520  	      obstack_printf (&result->mem_pool, "U%08X",
     521  			      cmfile->token.val.ucs4);
     522  	      obstack_1grow (&result->mem_pool, '\0');
     523  	      from_name = (char *) obstack_finish (&result->mem_pool);
     524  	    }
     525  	  to_name = NULL;
     526  
     527  	  state = 3;
     528  	  continue;
     529  
     530  	case 3:
     531  	  /* We have two possibilities: We can see an ellipsis or an
     532  	     encoding value.  */
     533  	  if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
     534  	      || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
     535  	      || nowtok == tok_ellipsis2_2)
     536  	    {
     537  	      ellipsis = nowtok;
     538  	      if (nowtok == tok_ellipsis4_2)
     539  		{
     540  		  step = 2;
     541  		  nowtok = tok_ellipsis4;
     542  		}
     543  	      else if (nowtok == tok_ellipsis2_2)
     544  		{
     545  		  step = 2;
     546  		  nowtok = tok_ellipsis2;
     547  		}
     548  	      state = 4;
     549  	      continue;
     550  	    }
     551  	  /* FALLTHROUGH */
     552  
     553  	case 5:
     554  	  if (nowtok != tok_charcode)
     555  	    {
     556  	      lr_error (cmfile, _("syntax error in %s definition: %s"),
     557  			"CHARMAP", _("invalid encoding given"));
     558  
     559  	      lr_ignore_rest (cmfile, 0);
     560  
     561  	      state = 2;
     562  	      continue;
     563  	    }
     564  
     565  	  if (now->val.charcode.nbytes < result->mb_cur_min)
     566  	    lr_error (cmfile, _("too few bytes in character encoding"));
     567  	  else if (now->val.charcode.nbytes > result->mb_cur_max)
     568  	    lr_error (cmfile, _("too many bytes in character encoding"));
     569  	  else
     570  	    charmap_new_char (cmfile, result, now->val.charcode.nbytes,
     571  			      now->val.charcode.bytes, from_name, to_name,
     572  			      ellipsis != tok_ellipsis2, step);
     573  
     574  	  /* Ignore trailing comment silently.  */
     575  	  lr_ignore_rest (cmfile, 0);
     576  
     577  	  from_name = NULL;
     578  	  to_name = NULL;
     579  	  ellipsis = tok_none;
     580  	  step = 1;
     581  
     582  	  state = 2;
     583  	  continue;
     584  
     585  	case 4:
     586  	  if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
     587  	    {
     588  	      lr_error (cmfile, _("syntax error in %s definition: %s"),
     589  			"CHARMAP",
     590  			_("no symbolic name given for end of range"));
     591  
     592  	      lr_ignore_rest (cmfile, 0);
     593  	      continue;
     594  	    }
     595  
     596  	  /* Copy the to-name in a safe place.  */
     597  	  if (nowtok == tok_bsymbol)
     598  	    to_name = (char *) obstack_copy0 (&result->mem_pool,
     599  					      cmfile->token.val.str.startmb,
     600  					      cmfile->token.val.str.lenmb);
     601  	  else
     602  	    {
     603  	      obstack_printf (&result->mem_pool, "U%08X",
     604  			      cmfile->token.val.ucs4);
     605  	      obstack_1grow (&result->mem_pool, '\0');
     606  	      to_name = (char *) obstack_finish (&result->mem_pool);
     607  	    }
     608  
     609  	  state = 5;
     610  	  continue;
     611  
     612  	case 90:
     613  	  if (nowtok != expected_tok)
     614  	    lr_error (cmfile, _("\
     615  %1$s: definition does not end with `END %1$s'"), expected_str);
     616  
     617  	  lr_ignore_rest (cmfile, nowtok == expected_tok);
     618  	  state = 91;
     619  	  continue;
     620  
     621  	case 91:
     622  	  /* Waiting for WIDTH... */
     623  	  if (nowtok == tok_eol)
     624  	    /* Ignore empty lines.  */
     625  	    continue;
     626  
     627  	  if (nowtok == tok_width_default)
     628  	    {
     629  	      state = 92;
     630  	      continue;
     631  	    }
     632  
     633  	  if (nowtok == tok_width)
     634  	    {
     635  	      lr_ignore_rest (cmfile, 1);
     636  	      state = 93;
     637  	      continue;
     638  	    }
     639  
     640  	  if (nowtok == tok_width_variable)
     641  	    {
     642  	      lr_ignore_rest (cmfile, 1);
     643  	      state = 98;
     644  	      continue;
     645  	    }
     646  
     647  	  lr_error (cmfile, _("\
     648  only WIDTH definitions are allowed to follow the CHARMAP definition"));
     649  
     650  	  lr_ignore_rest (cmfile, 0);
     651  	  continue;
     652  
     653  	case 92:
     654  	  if (nowtok != tok_number)
     655  	    lr_error (cmfile, _("value for %s must be an integer"),
     656  		      "WIDTH_DEFAULT");
     657  	  else
     658  	    result->width_default = now->val.num;
     659  
     660  	  lr_ignore_rest (cmfile, nowtok == tok_number);
     661  
     662  	  state = 91;
     663  	  continue;
     664  
     665  	case 93:
     666  	  /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
     667  	     "%s...%s %d\n".  */
     668  	  if (nowtok == tok_eol)
     669  	    /* ignore empty lines.  */
     670  	    continue;
     671  
     672  	  if (nowtok == tok_end)
     673  	    {
     674  	      expected_tok = tok_width;
     675  	      expected_str = "WIDTH";
     676  	      state = 90;
     677  	      continue;
     678  	    }
     679  
     680  	  if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
     681  	    {
     682  	      lr_error (cmfile, _("syntax error in %s definition: %s"),
     683  			"WIDTH", _("no symbolic name given"));
     684  
     685  	      lr_ignore_rest (cmfile, 0);
     686  	      continue;
     687  	    }
     688  
     689  	  if (from_name != NULL)
     690  	    obstack_free (&result->mem_pool, from_name);
     691  
     692  	  if (nowtok == tok_bsymbol)
     693  	    from_name = (char *) obstack_copy0 (&result->mem_pool,
     694  						now->val.str.startmb,
     695  						now->val.str.lenmb);
     696  	  else
     697  	    {
     698  	      obstack_printf (&result->mem_pool, "U%08X",
     699  			      cmfile->token.val.ucs4);
     700  	      obstack_1grow (&result->mem_pool, '\0');
     701  	      from_name = (char *) obstack_finish (&result->mem_pool);
     702  	    }
     703  
     704  	  to_name = NULL;
     705  
     706  	  state = 94;
     707  	  continue;
     708  
     709  	case 94:
     710  	  if (nowtok == tok_ellipsis3)
     711  	    {
     712  	      state = 95;
     713  	      continue;
     714  	    }
     715  	  /* Fall through.  */
     716  
     717  	case 96:
     718  	  if (nowtok != tok_number)
     719  	    lr_error (cmfile, _("value for %s must be an integer"),
     720  		      "WIDTH");
     721  	  else
     722  	    {
     723  	      /* Store width for chars.  */
     724  	      new_width (cmfile, result, from_name, to_name, now->val.num);
     725  
     726  	      from_name = NULL;
     727  	      to_name = NULL;
     728  	    }
     729  
     730  	  lr_ignore_rest (cmfile, nowtok == tok_number);
     731  
     732  	  state = 93;
     733  	  continue;
     734  
     735  	case 95:
     736  	  if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
     737  	    {
     738  	      lr_error (cmfile, _("syntax error in %s definition: %s"),
     739  			"WIDTH", _("no symbolic name given for end of range"));
     740  
     741  	      lr_ignore_rest (cmfile, 0);
     742  
     743  	      state = 93;
     744  	      continue;
     745  	    }
     746  
     747  	  if (nowtok == tok_bsymbol)
     748  	    to_name = (char *) obstack_copy0 (&result->mem_pool,
     749  					      now->val.str.startmb,
     750  					      now->val.str.lenmb);
     751  	  else
     752  	    {
     753  	      obstack_printf (&result->mem_pool, "U%08X",
     754  			      cmfile->token.val.ucs4);
     755  	      obstack_1grow (&result->mem_pool, '\0');
     756  	      to_name = (char *) obstack_finish (&result->mem_pool);
     757  	    }
     758  
     759  	  state = 96;
     760  	  continue;
     761  
     762  	case 98:
     763  	  /* We now expect `END WIDTH_VARIABLE' or lines of the format
     764  	     "%s\n" or "%s...%s\n".  */
     765  	  if (nowtok == tok_eol)
     766  	    /* ignore empty lines.  */
     767  	    continue;
     768  
     769  	  if (nowtok == tok_end)
     770  	    {
     771  	      expected_tok = tok_width_variable;
     772  	      expected_str = "WIDTH_VARIABLE";
     773  	      state = 90;
     774  	      continue;
     775  	    }
     776  
     777  	  if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
     778  	    {
     779  	      lr_error (cmfile, _("syntax error in %s definition: %s"),
     780  			"WIDTH_VARIABLE", _("no symbolic name given"));
     781  
     782  	      lr_ignore_rest (cmfile, 0);
     783  
     784  	      continue;
     785  	    }
     786  
     787  	  if (from_name != NULL)
     788  	    obstack_free (&result->mem_pool, from_name);
     789  
     790  	  if (nowtok == tok_bsymbol)
     791  	    from_name = (char *) obstack_copy0 (&result->mem_pool,
     792  						now->val.str.startmb,
     793  						now->val.str.lenmb);
     794  	  else
     795  	    {
     796  	      obstack_printf (&result->mem_pool, "U%08X",
     797  			      cmfile->token.val.ucs4);
     798  	      obstack_1grow (&result->mem_pool, '\0');
     799  	      from_name = (char *) obstack_finish (&result->mem_pool);
     800  	    }
     801  	  to_name = NULL;
     802  
     803  	  state = 99;
     804  	  continue;
     805  
     806  	case 99:
     807  	  if (nowtok == tok_ellipsis3)
     808  	    state = 100;
     809  
     810  	  /* Store info.  */
     811  	  from_name = NULL;
     812  
     813  	  /* Warn */
     814  	  state = 98;
     815  	  continue;
     816  
     817  	case 100:
     818  	  if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
     819  	    {
     820  	      lr_error (cmfile, _("syntax error in %s definition: %s"),
     821  			"WIDTH_VARIABLE",
     822  			_("no symbolic name given for end of range"));
     823  	      lr_ignore_rest (cmfile, 0);
     824  	      continue;
     825  	    }
     826  
     827  	  if (nowtok == tok_bsymbol)
     828  	    to_name = (char *) obstack_copy0 (&result->mem_pool,
     829  					      now->val.str.startmb,
     830  					      now->val.str.lenmb);
     831  	  else
     832  	    {
     833  	      obstack_printf (&result->mem_pool, "U%08X",
     834  			      cmfile->token.val.ucs4);
     835  	      obstack_1grow (&result->mem_pool, '\0');
     836  	      to_name = (char *) obstack_finish (&result->mem_pool);
     837  	    }
     838  
     839  	  /* XXX Enter value into table.  */
     840  
     841  	  lr_ignore_rest (cmfile, 1);
     842  
     843  	  state = 98;
     844  	  continue;
     845  
     846  	default:
     847  	  record_error (5, 0, _("%s: error in state machine"),
     848  			__FILE__);
     849  	  /* NOTREACHED */
     850  	}
     851        break;
     852      }
     853  
     854    if (state != 91)
     855      record_error (0, 0, _("%s: premature end of file"),
     856  		  cmfile->fname);
     857  
     858    lr_close (cmfile);
     859  
     860    return result;
     861  }
     862  
     863  
     864  static void
     865  new_width (struct linereader *cmfile, struct charmap_t *result,
     866  	   const char *from, const char *to, unsigned long int width)
     867  {
     868    struct charseq *from_val;
     869    struct charseq *to_val;
     870  
     871    from_val = charmap_find_value (result, from, strlen (from));
     872    if (from_val == NULL)
     873      {
     874        lr_error (cmfile, _("unknown character `%s'"), from);
     875        return;
     876      }
     877  
     878    if (to == NULL)
     879      to_val = from_val;
     880    else
     881      {
     882        to_val = charmap_find_value (result, to, strlen (to));
     883        if (to_val == NULL)
     884  	{
     885  	  lr_error (cmfile, _("unknown character `%s'"), to);
     886  	  return;
     887  	}
     888  
     889        /* Make sure the number of bytes for the end points of the range
     890  	 is correct.  */
     891        if (from_val->nbytes != to_val->nbytes)
     892  	{
     893  	  lr_error (cmfile, _("\
     894  number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
     895  		    from_val->nbytes, to_val->nbytes);
     896  	  return;
     897  	}
     898      }
     899  
     900    if (result->nwidth_rules >= result->nwidth_rules_max)
     901      {
     902        size_t new_size = result->nwidth_rules + 32;
     903        struct width_rule *new_rules =
     904  	(struct width_rule *) obstack_alloc (&result->mem_pool,
     905  					     (new_size
     906  					      * sizeof (struct width_rule)));
     907  
     908        memcpy (new_rules, result->width_rules,
     909  	      result->nwidth_rules_max * sizeof (struct width_rule));
     910  
     911        result->width_rules = new_rules;
     912        result->nwidth_rules_max = new_size;
     913      }
     914  
     915    result->width_rules[result->nwidth_rules].from = from_val;
     916    result->width_rules[result->nwidth_rules].to = to_val;
     917    result->width_rules[result->nwidth_rules].width = (unsigned int) width;
     918    ++result->nwidth_rules;
     919  }
     920  
     921  
     922  struct charseq *
     923  charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
     924  {
     925    void *result;
     926  
     927    return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
     928  	  < 0 ? NULL : (struct charseq *) result);
     929  }
     930  
     931  
     932  static void
     933  charmap_new_char (struct linereader *lr, struct charmap_t *cm,
     934  		  size_t nbytes, unsigned char *bytes,
     935  		  const char *from, const char *to,
     936  		  int decimal_ellipsis, int step)
     937  {
     938    hash_table *ht = &cm->char_table;
     939    hash_table *bt = &cm->byte_table;
     940    struct obstack *ob = &cm->mem_pool;
     941    char *from_end;
     942    char *to_end;
     943    const char *cp;
     944    int prefix_len, len1, len2;
     945    unsigned int from_nr, to_nr, cnt;
     946    struct charseq *newp;
     947  
     948    len1 = strlen (from);
     949  
     950    if (to == NULL)
     951      {
     952        newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
     953        newp->nbytes = nbytes;
     954        memcpy (newp->bytes, bytes, nbytes);
     955        newp->name = from;
     956  
     957        newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
     958        if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
     959  	{
     960  	  /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
     961  	     xxxx and xxxxxxxx are hexadecimal numbers.  In this case
     962  	     we use the value of xxxx or xxxxxxxx as the UCS4 value of
     963  	     this character and we don't have to consult the repertoire
     964  	     map.
     965  
     966  	     If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
     967  	     and xxxxxxxx also give the code point in UCS4 but this must
     968  	     be in the private, i.e., unassigned, area.  This should be
     969  	     used for characters which do not (yet) have an equivalent
     970  	     in ISO 10646 and Unicode.  */
     971  	  char *endp;
     972  
     973  	  errno = 0;
     974  	  newp->ucs4 = strtoul (from + 1, &endp, 16);
     975  	  if (endp - from != len1
     976  	      || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
     977  	      || newp->ucs4 >= 0x80000000)
     978  	    /* This wasn't successful.  Signal this name cannot be a
     979  	       correct UCS value.  */
     980  	    newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
     981  	}
     982  
     983        insert_entry (ht, from, len1, newp);
     984        insert_entry (bt, newp->bytes, nbytes, newp);
     985        /* Please note that it isn't a bug if a symbol is defined more
     986  	 than once.  All later definitions are simply discarded.  */
     987        return;
     988      }
     989  
     990    /* We have a range: the names must have names with equal prefixes
     991       and an equal number of digits, where the second number is greater
     992       or equal than the first.  */
     993    len2 = strlen (to);
     994  
     995    if (len1 != len2)
     996      {
     997      illegal_range:
     998        lr_error (lr, _("invalid names for character range"));
     999        return;
    1000      }
    1001  
    1002    cp = &from[len1 - 1];
    1003    if (decimal_ellipsis)
    1004      while (isdigit (*cp) && cp >= from)
    1005        --cp;
    1006    else
    1007      while (isxdigit (*cp) && cp >= from)
    1008        {
    1009  	if (!isdigit (*cp) && !isupper (*cp))
    1010  	  lr_error (lr, _("\
    1011  hexadecimal range format should use only capital characters"));
    1012  	--cp;
    1013        }
    1014  
    1015    prefix_len = (cp - from) + 1;
    1016  
    1017    if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
    1018      goto illegal_range;
    1019  
    1020    errno = 0;
    1021    from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
    1022    if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE)
    1023        || ((to_nr = strtoul (&to[prefix_len], &to_end,
    1024  			    decimal_ellipsis ? 10 : 16)) == UINT_MAX
    1025  	  && errno == ERANGE)
    1026        || *to_end != '\0')
    1027      {
    1028        lr_error (lr, _("<%s> and <%s> are invalid names for range"), from, to);
    1029        return;
    1030      }
    1031  
    1032    if (from_nr > to_nr)
    1033      {
    1034        lr_error (lr, _("upper limit in range is smaller than lower limit"));
    1035        return;
    1036      }
    1037  
    1038    for (cnt = from_nr; cnt <= to_nr; cnt += step)
    1039      {
    1040        char *name_end;
    1041        obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
    1042  		      prefix_len, from, len1 - prefix_len, cnt);
    1043        obstack_1grow (ob, '\0');
    1044        name_end = obstack_finish (ob);
    1045  
    1046        newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
    1047        newp->nbytes = nbytes;
    1048        memcpy (newp->bytes, bytes, nbytes);
    1049        newp->name = name_end;
    1050  
    1051        newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
    1052        if ((name_end[0] == 'U' || name_end[0] == 'P')
    1053  	  && (len1 == 5 || len1 == 9))
    1054  	{
    1055  	  /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
    1056  	     xxxx and xxxxxxxx are hexadecimal numbers.  In this case
    1057  	     we use the value of xxxx or xxxxxxxx as the UCS4 value of
    1058  	     this character and we don't have to consult the repertoire
    1059  	     map.
    1060  
    1061  	     If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
    1062  	     and xxxxxxxx also give the code point in UCS4 but this must
    1063  	     be in the private, i.e., unassigned, area.  This should be
    1064  	     used for characters which do not (yet) have an equivalent
    1065  	     in ISO 10646 and Unicode.  */
    1066  	  char *endp;
    1067  
    1068  	  errno = 0;
    1069  	  newp->ucs4 = strtoul (name_end + 1, &endp, 16);
    1070  	  if (endp - name_end != len1
    1071  	      || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
    1072  	      || newp->ucs4 >= 0x80000000)
    1073  	    /* This wasn't successful.  Signal this name cannot be a
    1074  	       correct UCS value.  */
    1075  	    newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
    1076  	}
    1077  
    1078        insert_entry (ht, name_end, len1, newp);
    1079        insert_entry (bt, newp->bytes, nbytes, newp);
    1080        /* Please note we don't examine the return value since it is no error
    1081  	 if we have two definitions for a symbol.  */
    1082  
    1083        /* Increment the value in the byte sequence.  */
    1084        if (++bytes[nbytes - 1] == '\0')
    1085  	{
    1086  	  int b = nbytes - 2;
    1087  
    1088  	  do
    1089  	    if (b < 0)
    1090  	      {
    1091  		lr_error (lr,
    1092  			  _("resulting bytes for range not representable."));
    1093  		return;
    1094  	      }
    1095  	  while (++bytes[b--] == 0);
    1096  	}
    1097      }
    1098  }
    1099  
    1100  
    1101  struct charseq *
    1102  charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
    1103  		     size_t nbytes)
    1104  {
    1105    void *result;
    1106  
    1107    return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
    1108  	  < 0 ? NULL : (struct charseq *) result);
    1109  }