(root)/
glibc-2.38/
locale/
programs/
repertoire.c
       1  /* Copyright (C) 1998-2023 Free Software Foundation, Inc.
       2     This file is part of the GNU C Library.
       3  
       4     This program is free software; you can redistribute it and/or modify
       5     it under the terms of the GNU General Public License as published
       6     by the Free Software Foundation; version 2 of the License, or
       7     (at your option) any later version.
       8  
       9     This program is distributed in the hope that it will be useful,
      10     but WITHOUT ANY WARRANTY; without even the implied warranty of
      11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12     GNU General Public License for more details.
      13  
      14     You should have received a copy of the GNU General Public License
      15     along with this program; if not, see <https://www.gnu.org/licenses/>.  */
      16  
      17  #ifdef HAVE_CONFIG_H
      18  # include <config.h>
      19  #endif
      20  
      21  #include <errno.h>
      22  #include <limits.h>
      23  #include <obstack.h>
      24  #include <search.h>
      25  #include <stdlib.h>
      26  #include <string.h>
      27  #include <unistd.h>
      28  #include <stdint.h>
      29  
      30  #include "localedef.h"
      31  #include "linereader.h"
      32  #include "charmap.h"
      33  #include "repertoire.h"
      34  #include "simple-hash.h"
      35  
      36  
      37  /* Simple keyword hashing for the repertoiremap.  */
      38  static const struct keyword_t *repertoiremap_hash (const char *str,
      39  						   size_t len);
      40  static void repertoire_new_char (struct linereader *lr, hash_table *ht,
      41  				 hash_table *rt, struct obstack *ob,
      42  				 uint32_t value, const char *from,
      43  				 const char *to, int decimal_ellipsis);
      44  static int repertoire_compare (const void *p1, const void *p2);
      45  
      46  /* Already known repertoire maps.  */
      47  static void *known;
      48  
      49  /* List of repertoire maps which are not available and which have been
      50     reported to not be.  */
      51  static void *unavailable;
      52  
      53  
      54  struct repertoire_t *
      55  repertoire_read (const char *filename)
      56  {
      57    struct linereader *repfile;
      58    struct repertoire_t *result;
      59    struct repertoire_t **resultp;
      60    struct repertoire_t search;
      61    int state;
      62    char *from_name = NULL;
      63    char *to_name = NULL;
      64    enum token_t ellipsis = tok_none;
      65  
      66    search.name = filename;
      67    resultp = tfind (&search, &known, &repertoire_compare);
      68    if (resultp != NULL)
      69      return *resultp;
      70  
      71    /* Determine path.  */
      72    repfile = lr_open (filename, repertoiremap_hash);
      73    if (repfile == NULL)
      74      {
      75        if (strchr (filename, '/') == NULL)
      76  	{
      77  	  char *i18npath = getenv ("I18NPATH");
      78  	  if (i18npath != NULL && *i18npath != '\0')
      79  	    {
      80  	      const size_t pathlen = strlen (i18npath);
      81  	      char i18npathbuf[pathlen + 1];
      82  	      char path[strlen (filename) + 1 + pathlen
      83  		        + sizeof ("/repertoiremaps/") - 1];
      84  	      char *next;
      85  	      i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
      86  
      87  	      while (repfile == NULL
      88  		     && (next = strsep (&i18npath, ":")) != NULL)
      89  		{
      90  		  stpcpy (stpcpy (stpcpy (path, next), "/repertoiremaps/"),
      91  			  filename);
      92  
      93  		  repfile = lr_open (path, repertoiremap_hash);
      94  
      95  		  if (repfile == NULL)
      96  		    {
      97  		      stpcpy (stpcpy (stpcpy (path, next), "/"), filename);
      98  
      99  		      repfile = lr_open (path, repertoiremap_hash);
     100  		    }
     101  		}
     102  	    }
     103  
     104  	  if (repfile == NULL)
     105  	    {
     106  	      /* Look in the systems charmap directory.  */
     107  	      char *buf = xmalloc (strlen (filename) + 1
     108  				   + sizeof (REPERTOIREMAP_PATH));
     109  
     110  	      stpcpy (stpcpy (stpcpy (buf, REPERTOIREMAP_PATH), "/"),
     111  		      filename);
     112  	      repfile = lr_open (buf, repertoiremap_hash);
     113  
     114  	      free (buf);
     115  	    }
     116  	}
     117  
     118        if (repfile == NULL)
     119  	return NULL;
     120      }
     121  
     122    /* We don't want symbolic names in string to be translated.  */
     123    repfile->translate_strings = 0;
     124  
     125    /* Allocate room for result.  */
     126    result = (struct repertoire_t *) xmalloc (sizeof (struct repertoire_t));
     127    memset (result, '\0', sizeof (struct repertoire_t));
     128  
     129    result->name = xstrdup (filename);
     130  
     131  #define obstack_chunk_alloc malloc
     132  #define obstack_chunk_free free
     133    obstack_init (&result->mem_pool);
     134  
     135    if (init_hash (&result->char_table, 256)
     136        || init_hash (&result->reverse_table, 256)
     137        || init_hash (&result->seq_table, 256))
     138      {
     139        free (result);
     140        return NULL;
     141      }
     142  
     143    /* We use a state machine to describe the charmap description file
     144       format.  */
     145    state = 1;
     146    while (1)
     147      {
     148        /* What's on?  */
     149        struct token *now = lr_token (repfile, NULL, NULL, NULL, verbose);
     150        enum token_t nowtok = now->tok;
     151        struct token *arg;
     152  
     153        if (nowtok == tok_eof)
     154  	break;
     155  
     156        switch (state)
     157  	{
     158  	case 1:
     159  	  /* We haven't yet read any character definition.  This is where
     160  	     we accept escape_char and comment_char definitions.  */
     161  	  if (nowtok == tok_eol)
     162  	    /* Ignore empty lines.  */
     163  	    continue;
     164  
     165  	  if (nowtok == tok_escape_char || nowtok == tok_comment_char)
     166  	    {
     167  	      /* We know that we need an argument.  */
     168  	      arg = lr_token (repfile, NULL, NULL, NULL, verbose);
     169  
     170  	      if (arg->tok != tok_ident)
     171  		{
     172  		  lr_error (repfile, _("syntax error in prolog: %s"),
     173  			    _("bad argument"));
     174  
     175  		  lr_ignore_rest (repfile, 0);
     176  		  continue;
     177  		}
     178  
     179  	      if (arg->val.str.lenmb != 1)
     180  		{
     181  		  lr_error (repfile, _("\
     182  argument to <%s> must be a single character"),
     183  			    nowtok == tok_escape_char ? "escape_char"
     184  						      : "comment_char");
     185  
     186  		  lr_ignore_rest (repfile, 0);
     187  		  continue;
     188  		}
     189  
     190  	      if (nowtok == tok_escape_char)
     191  		repfile->escape_char = *arg->val.str.startmb;
     192  	      else
     193  		repfile->comment_char = *arg->val.str.startmb;
     194  
     195  	      lr_ignore_rest (repfile, 1);
     196  	      continue;
     197  	    }
     198  
     199  	  if (nowtok == tok_charids)
     200  	    {
     201  	      lr_ignore_rest (repfile, 1);
     202  
     203  	      state = 2;
     204  	      continue;
     205  	    }
     206  
     207  	  /* Otherwise we start reading the character definitions.  */
     208  	  state = 2;
     209  	  /* FALLTHROUGH */
     210  
     211  	case 2:
     212  	  /* We are now are in the body.  Each line
     213  	     must have the format "%s %s %s\n" or "%s...%s %s %s\n".  */
     214  	  if (nowtok == tok_eol)
     215  	    /* Ignore empty lines.  */
     216  	    continue;
     217  
     218  	  if (nowtok == tok_end)
     219  	    {
     220  	      state = 90;
     221  	      continue;
     222  	    }
     223  
     224  	  if (nowtok != tok_bsymbol)
     225  	    {
     226  	      lr_error (repfile,
     227  			_("syntax error in repertoire map definition: %s"),
     228  			_("no symbolic name given"));
     229  
     230  	      lr_ignore_rest (repfile, 0);
     231  	      continue;
     232  	    }
     233  
     234  	  /* If the previous line was not completely correct free the
     235  	     used memory.  */
     236  	  if (from_name != NULL)
     237  	    obstack_free (&result->mem_pool, from_name);
     238  
     239  	  from_name = (char *) obstack_copy0 (&result->mem_pool,
     240  					      now->val.str.startmb,
     241  					      now->val.str.lenmb);
     242  	  to_name = NULL;
     243  
     244  	  state = 3;
     245  	  continue;
     246  
     247  	case 3:
     248  	  /* We have two possibilities: We can see an ellipsis or an
     249  	     encoding value.  */
     250  	  if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
     251  	      || nowtok == tok_ellipsis2)
     252  	    {
     253  	      ellipsis = nowtok;
     254  	      state = 4;
     255  	      continue;
     256  	    }
     257  	  /* FALLTHROUGH */
     258  
     259  	case 5:
     260  	  /* We expect a value of the form <Uxxxx> or <Uxxxxxxxx> where
     261  	     the xxx mean a hexadecimal value.  */
     262  	  state = 2;
     263  
     264  	  errno = 0;
     265  	  if (nowtok != tok_ucs4)
     266  	    {
     267  	      lr_error (repfile,
     268  			_("syntax error in repertoire map definition: %s"),
     269  			_("no <Uxxxx> or <Uxxxxxxxx> value given"));
     270  
     271  	      lr_ignore_rest (repfile, 0);
     272  	      continue;
     273  	    }
     274  
     275  	  /* We've found a new valid definition.  */
     276  	  repertoire_new_char (repfile, &result->char_table,
     277  			       &result->reverse_table, &result->mem_pool,
     278  			       now->val.ucs4, from_name, to_name,
     279  			       ellipsis != tok_ellipsis2);
     280  
     281  	  /* Ignore the rest of the line.  */
     282  	  lr_ignore_rest (repfile, 0);
     283  
     284  	  from_name = NULL;
     285  	  to_name = NULL;
     286  
     287  	  continue;
     288  
     289  	case 4:
     290  	  if (nowtok != tok_bsymbol)
     291  	    {
     292  	      lr_error (repfile,
     293  			_("syntax error in repertoire map definition: %s"),
     294  			_("no symbolic name given for end of range"));
     295  
     296  	      lr_ignore_rest (repfile, 0);
     297  	      state = 2;
     298  	      continue;
     299  	    }
     300  
     301  	  /* Copy the to-name in a safe place.  */
     302  	  to_name = (char *) obstack_copy0 (&result->mem_pool,
     303  					    repfile->token.val.str.startmb,
     304  					    repfile->token.val.str.lenmb);
     305  
     306  	  state = 5;
     307  	  continue;
     308  
     309  	case 90:
     310  	  if (nowtok != tok_charids)
     311  	    lr_error (repfile, _("\
     312  %1$s: definition does not end with `END %1$s'"), "CHARIDS");
     313  
     314  	  lr_ignore_rest (repfile, nowtok == tok_charids);
     315  	  break;
     316  	}
     317  
     318        break;
     319      }
     320  
     321    if (state != 2 && state != 90 && !be_quiet)
     322      record_error (0, 0, _("%s: premature end of file"),
     323  		  repfile->fname);
     324  
     325    lr_close (repfile);
     326  
     327    if (tsearch (result, &known, &repertoire_compare) == NULL)
     328      /* Something went wrong.  */
     329      record_error (0, errno, _("cannot save new repertoire map"));
     330  
     331    return result;
     332  }
     333  
     334  
     335  void
     336  repertoire_complain (const char *name)
     337  {
     338    if (tfind (name, &unavailable, (__compar_fn_t) strcmp) == NULL)
     339      {
     340        record_error (0, errno, _("\
     341  repertoire map file `%s' not found"), name);
     342  
     343        /* Remember that we reported this map.  */
     344        tsearch (name, &unavailable, (__compar_fn_t) strcmp);
     345      }
     346  }
     347  
     348  
     349  static int
     350  repertoire_compare (const void *p1, const void *p2)
     351  {
     352    struct repertoire_t *r1 = (struct repertoire_t *) p1;
     353    struct repertoire_t *r2 = (struct repertoire_t *) p2;
     354  
     355    return strcmp (r1->name, r2->name);
     356  }
     357  
     358  
     359  static const struct keyword_t *
     360  repertoiremap_hash (const char *str, size_t len)
     361  {
     362    static const struct keyword_t wordlist[] =
     363    {
     364      {"escape_char",      tok_escape_char,     0},
     365      {"comment_char",     tok_comment_char,    0},
     366      {"CHARIDS",          tok_charids,         0},
     367      {"END",              tok_end,             0},
     368    };
     369  
     370    if (len == 11 && memcmp (wordlist[0].name, str, 11) == 0)
     371      return &wordlist[0];
     372    if (len == 12 && memcmp (wordlist[1].name, str, 12) == 0)
     373      return &wordlist[1];
     374    if (len == 7 && memcmp (wordlist[2].name, str, 7) == 0)
     375      return &wordlist[2];
     376    if (len == 3 && memcmp (wordlist[3].name, str, 3) == 0)
     377      return &wordlist[3];
     378  
     379    return NULL;
     380  }
     381  
     382  
     383  static void
     384  repertoire_new_char (struct linereader *lr, hash_table *ht, hash_table *rt,
     385  		     struct obstack *ob, uint32_t value, const char *from,
     386  		     const char *to, int decimal_ellipsis)
     387  {
     388    char *from_end;
     389    char *to_end;
     390    const char *cp;
     391    char *buf = NULL;
     392    int prefix_len, len1, len2;
     393    unsigned long int from_nr, to_nr, cnt;
     394  
     395    if (to == NULL)
     396      {
     397        insert_entry (ht, from, strlen (from),
     398  		    (void *) (unsigned long int) value);
     399        /* Please note that it isn't a bug if a symbol is defined more
     400  	 than once.  All later definitions are simply discarded.  */
     401  
     402        insert_entry (rt, obstack_copy (ob, &value, sizeof (value)),
     403  		    sizeof (value), (void *) from);
     404  
     405        return;
     406      }
     407  
     408    /* We have a range: the names must have names with equal prefixes
     409       and an equal number of digits, where the second number is greater
     410       or equal than the first.  */
     411    len1 = strlen (from);
     412    len2 = strlen (to);
     413  
     414    if (len1 != len2)
     415      {
     416      invalid_range:
     417        lr_error (lr, _("invalid names for character range"));
     418        return;
     419      }
     420  
     421    cp = &from[len1 - 1];
     422    if (decimal_ellipsis)
     423      while (isdigit (*cp) && cp >= from)
     424        --cp;
     425    else
     426      while (isxdigit (*cp) && cp >= from)
     427        {
     428  	if (!isdigit (*cp) && !isupper (*cp))
     429  	  lr_error (lr, _("\
     430  hexadecimal range format should use only capital characters"));
     431  	--cp;
     432        }
     433  
     434    prefix_len = (cp - from) + 1;
     435  
     436    if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
     437      goto invalid_range;
     438  
     439    errno = 0;
     440    from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
     441    if (*from_end != '\0' || (from_nr == ULONG_MAX && errno == ERANGE)
     442        || ((to_nr = strtoul (&to[prefix_len], &to_end,
     443  			    decimal_ellipsis ? 10 : 16)) == ULONG_MAX
     444            && errno == ERANGE)
     445        || *to_end != '\0')
     446      {
     447        lr_error (lr, _("<%s> and <%s> are invalid names for range"),
     448  		from, to);
     449        return;
     450      }
     451  
     452    if (from_nr > to_nr)
     453      {
     454        lr_error (lr, _("upper limit in range is smaller than lower limit"));
     455        return;
     456      }
     457  
     458    for (cnt = from_nr; cnt <= to_nr; ++cnt)
     459      {
     460        uint32_t this_value = value + (cnt - from_nr);
     461  
     462        obstack_printf (ob, decimal_ellipsis ? "%.*s%0*ld" : "%.*s%0*lX",
     463  		      prefix_len, from, len1 - prefix_len, cnt);
     464        obstack_1grow (ob, '\0');
     465  
     466        insert_entry (ht, buf, len1,
     467  		    (void *) (unsigned long int) this_value);
     468        /* Please note we don't examine the return value since it is no error
     469  	 if we have two definitions for a symbol.  */
     470  
     471        insert_entry (rt, obstack_copy (ob, &this_value, sizeof (this_value)),
     472  		    sizeof (this_value), (void *) from);
     473      }
     474  }
     475  
     476  
     477  uint32_t
     478  repertoire_find_value (const struct repertoire_t *rep, const char *name,
     479  		       size_t len)
     480  {
     481    void *result;
     482  
     483    if (rep == NULL)
     484      return ILLEGAL_CHAR_VALUE;
     485  
     486    if (find_entry ((hash_table *) &rep->char_table, name, len, &result) < 0)
     487      return ILLEGAL_CHAR_VALUE;
     488  
     489    return (uint32_t) ((unsigned long int) result);
     490  }
     491  
     492  
     493  const char *
     494  repertoire_find_symbol (const struct repertoire_t *rep, uint32_t ucs)
     495  {
     496    void *result;
     497  
     498    if (rep == NULL)
     499      return NULL;
     500  
     501    if (find_entry ((hash_table *) &rep->reverse_table, &ucs, sizeof (ucs),
     502  		  &result) < 0)
     503      return NULL;
     504  
     505    return (const char *) result;
     506  }
     507  
     508  
     509  struct charseq *
     510  repertoire_find_seq (const struct repertoire_t *rep, uint32_t ucs)
     511  {
     512    void *result;
     513  
     514    if (rep == NULL)
     515      return NULL;
     516  
     517    if (find_entry ((hash_table *) &rep->seq_table, &ucs, sizeof (ucs),
     518  		  &result) < 0)
     519      return NULL;
     520  
     521    return (struct charseq *) result;
     522  }