(root)/
glibc-2.38/
iconv/
iconv_charmap.c
       1  /* Convert using charmaps and possibly iconv().
       2     Copyright (C) 2001-2023 Free Software Foundation, Inc.
       3     This file is part of the GNU C Library.
       4  
       5     This program is free software; you can redistribute it and/or modify
       6     it under the terms of the GNU General Public License as published
       7     by the Free Software Foundation; version 2 of the License, or
       8     (at your option) any later version.
       9  
      10     This program is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU General Public License for more details.
      14  
      15     You should have received a copy of the GNU General Public License
      16     along with this program; if not, see <https://www.gnu.org/licenses/>.  */
      17  
      18  #include <assert.h>
      19  #include <errno.h>
      20  #include <error.h>
      21  #include <fcntl.h>
      22  #include <iconv.h>
      23  #include <libintl.h>
      24  #include <stdio.h>
      25  #include <stdlib.h>
      26  #include <unistd.h>
      27  #include <stdint.h>
      28  #include <sys/mman.h>
      29  #include <sys/stat.h>
      30  
      31  #include "iconv_prog.h"
      32  
      33  
      34  /* Prototypes for a few program-wide used functions.  */
      35  #include <programs/xmalloc.h>
      36  
      37  
      38  struct convtable
      39  {
      40    int term[256 / 8];
      41    union
      42    {
      43      struct convtable *sub;
      44      struct charseq *out;
      45    } val[256];
      46  };
      47  
      48  
      49  static inline struct convtable *
      50  allocate_table (void)
      51  {
      52    return (struct convtable *) xcalloc (1, sizeof (struct convtable));
      53  }
      54  
      55  static inline void
      56  free_table (struct convtable *tbl)
      57  {
      58    free (tbl);
      59  }
      60  
      61  
      62  static inline int
      63  is_term (struct convtable *tbl, unsigned int idx)
      64  {
      65    return tbl->term[idx / 8] & (1 << (idx % 8));
      66  }
      67  
      68  
      69  static inline void
      70  clear_term (struct convtable *tbl, unsigned int idx)
      71  {
      72    tbl->term[idx / 8] &= ~(1 << (idx % 8));
      73  }
      74  
      75  
      76  static inline void
      77  set_term (struct convtable *tbl, unsigned int idx)
      78  {
      79    tbl->term[idx / 8] |= 1 << (idx % 8);
      80  }
      81  
      82  
      83  /* Generate the conversion table.  */
      84  static struct convtable *use_from_charmap (struct charmap_t *from_charmap,
      85  					   const char *to_code);
      86  static struct convtable *use_to_charmap (const char *from_code,
      87  					 struct charmap_t *to_charmap);
      88  static struct convtable *use_both_charmaps (struct charmap_t *from_charmap,
      89  					    struct charmap_t *to_charmap);
      90  
      91  /* Prototypes for the functions doing the actual work.  */
      92  static int process_block (struct convtable *tbl, char *addr, size_t len,
      93  			  FILE *output);
      94  static int process_fd (struct convtable *tbl, int fd, FILE *output);
      95  static int process_file (struct convtable *tbl, FILE *input, FILE *output);
      96  
      97  
      98  int
      99  charmap_conversion (const char *from_code, struct charmap_t *from_charmap,
     100  		    const char *to_code, struct charmap_t *to_charmap,
     101  		    int argc, int remaining, char *argv[],
     102  		    const char *output_file)
     103  {
     104    struct convtable *cvtbl;
     105    int status = EXIT_SUCCESS;
     106  
     107    /* We have three different cases to handle:
     108  
     109       - both, from_charmap and to_charmap, are available.  This means we
     110         can assume that the symbolic names match and use them to create
     111         the mapping.
     112  
     113       - only from_charmap is available.  In this case we can only hope that
     114         the symbolic names used are of the <Uxxxx> form in which case we
     115         can use a UCS4->"to_code" iconv() conversion for the second step.
     116  
     117       - only to_charmap is available.  This is similar, only that we would
     118         use iconv() for the "to_code"->UCS4 conversion.
     119  
     120         We first create a table which maps input bytes into output bytes.
     121         Once this is done we can handle all three of the cases above
     122         equally.  */
     123    if (from_charmap != NULL)
     124      {
     125        if (to_charmap == NULL)
     126  	cvtbl = use_from_charmap (from_charmap, to_code);
     127        else
     128  	cvtbl = use_both_charmaps (from_charmap, to_charmap);
     129      }
     130    else
     131      {
     132        assert (to_charmap != NULL);
     133        cvtbl = use_to_charmap (from_code, to_charmap);
     134      }
     135  
     136    /* If we couldn't generate a table stop now.  */
     137    if (cvtbl == NULL)
     138      return EXIT_FAILURE;
     139  
     140    /* Determine output file.  */
     141    FILE *output;
     142    if (output_file != NULL && strcmp (output_file, "-") != 0)
     143      {
     144        output = fopen (output_file, "w");
     145        if (output == NULL)
     146  	error (EXIT_FAILURE, errno, _("cannot open output file"));
     147      }
     148    else
     149      output = stdout;
     150  
     151    /* We can now start the conversion.  */
     152    if (remaining == argc)
     153      {
     154        if (process_file (cvtbl, stdin, output) != 0)
     155  	status = EXIT_FAILURE;
     156      }
     157    else
     158      do
     159        {
     160  	int fd;
     161  
     162  	if (verbose)
     163  	  printf ("%s:\n", argv[remaining]);
     164  	if (strcmp (argv[remaining], "-") == 0)
     165  	  fd = 0;
     166  	else
     167  	  {
     168  	    fd = open (argv[remaining], O_RDONLY);
     169  
     170  	    if (fd == -1)
     171  	      {
     172  		error (0, errno, _("cannot open input file `%s'"),
     173  		       argv[remaining]);
     174  		status = EXIT_FAILURE;
     175  		continue;
     176  	      }
     177  	  }
     178  
     179  #ifdef _POSIX_MAPPED_FILES
     180  	struct stat64 st;
     181  	char *addr;
     182  	/* We have possibilities for reading the input file.  First try
     183  	   to mmap() it since this will provide the fastest solution.  */
     184  	if (fstat64 (fd, &st) == 0
     185  	    && ((addr = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE,
     186  			      fd, 0)) != MAP_FAILED))
     187  	  {
     188  	    /* Yes, we can use mmap().  The descriptor is not needed
     189  	       anymore.  */
     190  	    if (close (fd) != 0)
     191  	      error (EXIT_FAILURE, errno,
     192  		     _("error while closing input `%s'"), argv[remaining]);
     193  
     194  	    if (process_block (cvtbl, addr, st.st_size, output) < 0)
     195  	      {
     196  		/* Something went wrong.  */
     197  		status = EXIT_FAILURE;
     198  
     199  		/* We don't need the input data anymore.  */
     200  		munmap ((void *) addr, st.st_size);
     201  
     202  		/* We cannot go on with producing output since it might
     203  		   lead to problem because the last output might leave
     204  		   the output stream in an undefined state.  */
     205  		break;
     206  	      }
     207  
     208  	    /* We don't need the input data anymore.  */
     209  	    munmap ((void *) addr, st.st_size);
     210  	  }
     211  	else
     212  #endif	/* _POSIX_MAPPED_FILES */
     213  	  {
     214  	    /* Read the file in pieces.  */
     215  	    if (process_fd (cvtbl, fd, output) != 0)
     216  	      {
     217  		/* Something went wrong.  */
     218  		status = EXIT_FAILURE;
     219  
     220  		/* We don't need the input file anymore.  */
     221  		close (fd);
     222  
     223  		/* We cannot go on with producing output since it might
     224  		   lead to problem because the last output might leave
     225  		   the output stream in an undefined state.  */
     226  		break;
     227  	      }
     228  
     229  	    /* Now close the file.  */
     230  	    close (fd);
     231  	  }
     232        }
     233      while (++remaining < argc);
     234  
     235    /* All done.  */
     236    if (output != stdout)
     237      fclose (output);
     238    free_table (cvtbl);
     239    return status;
     240  }
     241  
     242  
     243  /* Add the IN->OUT mapping to TBL.  OUT is potentially stored in the table.
     244     IN is used only here, so it need not be kept live afterwards.  */
     245  static void
     246  add_bytes (struct convtable *tbl, const struct charseq *in, struct charseq *out)
     247  {
     248    int n = 0;
     249    unsigned int byte;
     250  
     251    assert (in->nbytes > 0);
     252  
     253    byte = ((unsigned char *) in->bytes)[n];
     254    while (n + 1 < in->nbytes)
     255      {
     256        if (is_term (tbl, byte) || tbl->val[byte].sub == NULL)
     257  	{
     258  	  /* Note that we simply ignore a definition for a byte sequence
     259  	     which is also the prefix for a longer one.  */
     260  	  clear_term (tbl, byte);
     261  	  tbl->val[byte].sub =
     262  	    (struct convtable *) xcalloc (1, sizeof (struct convtable));
     263  	}
     264  
     265        tbl = tbl->val[byte].sub;
     266  
     267        byte = ((unsigned char *) in->bytes)[++n];
     268      }
     269  
     270    /* Only add the new sequence if there is none yet and the byte sequence
     271       is not part of an even longer one.  */
     272    if (! is_term (tbl, byte) && tbl->val[byte].sub == NULL)
     273      {
     274        set_term (tbl, byte);
     275        tbl->val[byte].out = out;
     276      }
     277  }
     278  
     279  /* Try to convert SEQ from WCHAR_T format using CD.
     280     Returns a malloc'd struct or NULL.  */
     281  static struct charseq *
     282  convert_charseq (iconv_t cd, const struct charseq *seq)
     283  {
     284    struct charseq *result = NULL;
     285  
     286    if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
     287      {
     288        /* There is a chance.  Try the iconv module.  */
     289        wchar_t inbuf[1] = { seq->ucs4 };
     290        unsigned char outbuf[64];
     291        char *inptr = (char *) inbuf;
     292        size_t inlen = sizeof (inbuf);
     293        char *outptr = (char *) outbuf;
     294        size_t outlen = sizeof (outbuf);
     295  
     296        (void) iconv (cd, &inptr, &inlen, &outptr, &outlen);
     297  
     298        if (outptr != (char *) outbuf)
     299          {
     300            /* We got some output.  Good, use it.  */
     301            outlen = sizeof (outbuf) - outlen;
     302            assert ((char *) outbuf + outlen == outptr);
     303  
     304            result = xmalloc (sizeof (struct charseq) + outlen);
     305            result->name = seq->name;
     306            result->ucs4 = seq->ucs4;
     307            result->nbytes = outlen;
     308            memcpy (result->bytes, outbuf, outlen);
     309          }
     310  
     311        /* Clear any possible state left behind.  */
     312        (void) iconv (cd, NULL, NULL, NULL, NULL);
     313      }
     314  
     315    return result;
     316  }
     317  
     318  
     319  static struct convtable *
     320  use_from_charmap (struct charmap_t *from_charmap, const char *to_code)
     321  {
     322    /* We iterate over all entries in the from_charmap and for those which
     323       have a known UCS4 representation we use an iconv() call to determine
     324       the mapping to the to_code charset.  */
     325    struct convtable *rettbl;
     326    iconv_t cd;
     327    void *ptr = NULL;
     328    const void *key;
     329    size_t keylen;
     330    void *data;
     331  
     332    cd = iconv_open (to_code, "WCHAR_T");
     333    if (cd == (iconv_t) -1)
     334      /* We cannot do anything.  */
     335      return NULL;
     336  
     337    rettbl = allocate_table ();
     338  
     339    while (iterate_table (&from_charmap->char_table, &ptr, &key, &keylen, &data)
     340  	 >= 0)
     341      {
     342        struct charseq *in = data;
     343        struct charseq *newp = convert_charseq (cd, in);
     344        if (newp != NULL)
     345          add_bytes (rettbl, in, newp);
     346      }
     347  
     348    iconv_close (cd);
     349  
     350    return rettbl;
     351  }
     352  
     353  
     354  static struct convtable *
     355  use_to_charmap (const char *from_code, struct charmap_t *to_charmap)
     356  {
     357    /* We iterate over all entries in the to_charmap and for those which
     358       have a known UCS4 representation we use an iconv() call to determine
     359       the mapping to the from_code charset.  */
     360    struct convtable *rettbl;
     361    iconv_t cd;
     362    void *ptr = NULL;
     363    const void *key;
     364    size_t keylen;
     365    void *data;
     366  
     367    /* Note that the conversion we use here is the reverse direction.  Without
     368       exhaustive search we cannot figure out which input yields the UCS4
     369       character we are looking for.  Therefore we determine it the other
     370       way round.  */
     371    cd = iconv_open (from_code, "WCHAR_T");
     372    if (cd == (iconv_t) -1)
     373      /* We cannot do anything.  */
     374      return NULL;
     375  
     376    rettbl = allocate_table ();
     377  
     378    while (iterate_table (&to_charmap->char_table, &ptr, &key, &keylen, &data)
     379  	 >= 0)
     380      {
     381        struct charseq *out = data;
     382        struct charseq *newp = convert_charseq (cd, out);
     383        if (newp != NULL)
     384          {
     385            add_bytes (rettbl, newp, out);
     386            free (newp);
     387          }
     388      }
     389  
     390    iconv_close (cd);
     391  
     392    return rettbl;
     393  }
     394  
     395  
     396  static struct convtable *
     397  use_both_charmaps (struct charmap_t *from_charmap,
     398  		   struct charmap_t *to_charmap)
     399  {
     400    /* In this case we iterate over all the entries in the from_charmap,
     401       determine the internal name, and find an appropriate entry in the
     402       to_charmap (if it exists).  */
     403    struct convtable *rettbl = allocate_table ();
     404    void *ptr = NULL;
     405    const void *key;
     406    size_t keylen;
     407    void *data;
     408  
     409    while (iterate_table (&from_charmap->char_table, &ptr, &key, &keylen, &data)
     410  	 >= 0)
     411      {
     412        struct charseq *in = (struct charseq *) data;
     413        struct charseq *out = charmap_find_value (to_charmap, key, keylen);
     414  
     415        if (out != NULL)
     416  	add_bytes (rettbl, in, out);
     417      }
     418  
     419    return rettbl;
     420  }
     421  
     422  
     423  static int
     424  process_block (struct convtable *tbl, char *addr, size_t len, FILE *output)
     425  {
     426    size_t n = 0;
     427  
     428    while (n < len)
     429      {
     430        struct convtable *cur = tbl;
     431        unsigned char *curp = (unsigned char *) addr;
     432        unsigned int byte = *curp;
     433        int cnt;
     434        struct charseq *out;
     435  
     436        while (! is_term (cur, byte))
     437  	if (cur->val[byte].sub == NULL)
     438  	  {
     439  	    /* This is an invalid sequence.  Skip the first byte if we are
     440  	       ignoring errors.  Otherwise punt.  */
     441  	    if (! omit_invalid)
     442  	      {
     443  		error (0, 0, _("illegal input sequence at position %zd"), n);
     444  		return -1;
     445  	      }
     446  
     447  	    n -= curp - (unsigned char *) addr;
     448  
     449  	    byte = *(curp = (unsigned char *) ++addr);
     450  	    if (++n >= len)
     451  	      /* All converted.  */
     452  	      return 0;
     453  
     454  	    cur = tbl;
     455  	  }
     456  	else
     457  	  {
     458  	    cur = cur->val[byte].sub;
     459  
     460  	    if (++n >= len)
     461  	      {
     462  		error (0, 0, _("\
     463  incomplete character or shift sequence at end of buffer"));
     464  		return -1;
     465  	      }
     466  
     467  	    byte = *++curp;
     468  	  }
     469  
     470        /* We found a final byte.  Write the output bytes.  */
     471        out = cur->val[byte].out;
     472        for (cnt = 0; cnt < out->nbytes; ++cnt)
     473  	fputc_unlocked (out->bytes[cnt], output);
     474  
     475        addr = (char *) curp + 1;
     476        ++n;
     477      }
     478  
     479    return 0;
     480  }
     481  
     482  
     483  static int
     484  process_fd (struct convtable *tbl, int fd, FILE *output)
     485  {
     486    /* We have a problem with reading from a descriptor since we must not
     487       provide the iconv() function an incomplete character or shift
     488       sequence at the end of the buffer.  Since we have to deal with
     489       arbitrary encodings we must read the whole text in a buffer and
     490       process it in one step.  */
     491    static char *inbuf = NULL;
     492    static size_t maxlen = 0;
     493    char *inptr = inbuf;
     494    size_t actlen = 0;
     495  
     496    while (actlen < maxlen)
     497      {
     498        ssize_t n = read (fd, inptr, maxlen - actlen);
     499  
     500        if (n == 0)
     501  	/* No more text to read.  */
     502  	break;
     503  
     504        if (n == -1)
     505  	{
     506  	  /* Error while reading.  */
     507  	  error (0, errno, _("error while reading the input"));
     508  	  return -1;
     509  	}
     510  
     511        inptr += n;
     512        actlen += n;
     513      }
     514  
     515    if (actlen == maxlen)
     516      while (1)
     517        {
     518  	ssize_t n;
     519  	char *new_inbuf;
     520  
     521  	/* Increase the buffer.  */
     522  	new_inbuf = (char *) realloc (inbuf, maxlen + 32768);
     523  	if (new_inbuf == NULL)
     524  	  {
     525  	    error (0, errno, _("unable to allocate buffer for input"));
     526  	    return -1;
     527  	  }
     528  	inbuf = new_inbuf;
     529  	maxlen += 32768;
     530  	inptr = inbuf + actlen;
     531  
     532  	do
     533  	  {
     534  	    n = read (fd, inptr, maxlen - actlen);
     535  
     536  	    if (n == 0)
     537  	      /* No more text to read.  */
     538  	      break;
     539  
     540  	    if (n == -1)
     541  	      {
     542  		/* Error while reading.  */
     543  		error (0, errno, _("error while reading the input"));
     544  		return -1;
     545  	      }
     546  
     547  	    inptr += n;
     548  	    actlen += n;
     549  	  }
     550  	while (actlen < maxlen);
     551  
     552  	if (n == 0)
     553  	  /* Break again so we leave both loops.  */
     554  	  break;
     555        }
     556  
     557    /* Now we have all the input in the buffer.  Process it in one run.  */
     558    return process_block (tbl, inbuf, actlen, output);
     559  }
     560  
     561  
     562  static int
     563  process_file (struct convtable *tbl, FILE *input, FILE *output)
     564  {
     565    /* This should be safe since we use this function only for `stdin' and
     566       we haven't read anything so far.  */
     567    return process_fd (tbl, fileno (input), output);
     568  }