(root)/
glibc-2.38/
iconv/
gconv_charset.c
       1  /* Charset name normalization.
       2     Copyright (C) 2020-2023 Free Software Foundation, Inc.
       3     This file is part of the GNU C Library.
       4  
       5     The GNU C Library is free software; you can redistribute it and/or
       6     modify it under the terms of the GNU Lesser General Public
       7     License as published by the Free Software Foundation; either
       8     version 2.1 of the License, or (at your option) any later version.
       9  
      10     The GNU C Library is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      13     Lesser General Public License for more details.
      14  
      15     You should have received a copy of the GNU Lesser General Public
      16     License along with the GNU C Library; if not, see
      17     <http://www.gnu.org/licenses/>.  */
      18  
      19  
      20  #include <stdlib.h>
      21  #include <ctype.h>
      22  #include <locale.h>
      23  #include <stdbool.h>
      24  #include <string.h>
      25  #include <sys/stat.h>
      26  #include "gconv_int.h"
      27  #include "gconv_charset.h"
      28  
      29  
      30  /* This function returns a pointer to the last suffix in a conversion code
      31     string.  Valid suffixes matched by this function are of the form: '/' or ','
      32     followed by arbitrary text that doesn't contain '/' or ','.  It does not
      33     edit the string in any way.  The caller is expected to parse the suffix and
      34     remove it (by e.g. truncating the string) before the next call.  */
      35  static char *
      36  find_suffix (char *s)
      37  {
      38    /* The conversion code is in the form of a triplet, separated by '/' chars.
      39       The third component of the triplet contains suffixes. If we don't have two
      40       slashes, we don't have a suffix.  */
      41  
      42    int slash_count = 0;
      43    char *suffix_term = NULL;
      44  
      45    for (int i = 0; s[i] != '\0'; i++)
      46      switch (s[i])
      47        {
      48          case '/':
      49            slash_count++;
      50            /* Fallthrough */
      51          case ',':
      52            suffix_term = &s[i];
      53        }
      54  
      55    if (slash_count >= 2)
      56      return suffix_term;
      57  
      58    return NULL;
      59  }
      60  
      61  
/* Working state used by gconv_parse_code while it parses one conversion
   code string.  */
struct gconv_parsed_code
{
  /* Writable, NUL-terminated conversion code.  gconv_parse_code truncates
     it in place as trailing separators and suffixes are removed.  */
  char *code;
  /* Set to true by gconv_parse_code when a suffix matching
     GCONV_TRANSLIT_SUFFIX was found in CODE, false otherwise.  */
  bool translit;
  /* Set to true by gconv_parse_code when a suffix matching
     GCONV_IGNORE_ERRORS_SUFFIX was found in CODE, false otherwise.  */
  bool ignore;
};
      68  
      69  
      70  /* This function parses an iconv_open encoding PC.CODE, strips any suffixes
      71     (such as TRANSLIT or IGNORE) from it and sets corresponding flags in it.  */
      72  static void
      73  gconv_parse_code (struct gconv_parsed_code *pc)
      74  {
      75    pc->translit = false;
      76    pc->ignore = false;
      77  
      78    while (1)
      79      {
      80        /* First drop any trailing whitespaces and separators.  */
      81        size_t len = strlen (pc->code);
      82        while ((len > 0)
      83               && (isspace (pc->code[len - 1])
      84                   || pc->code[len - 1] == ','
      85                   || pc->code[len - 1] == '/'))
      86          len--;
      87  
      88        pc->code[len] = '\0';
      89  
      90        if (len == 0)
      91          return;
      92  
      93        char * suffix = find_suffix (pc->code);
      94        if (suffix == NULL)
      95          {
      96            /* At this point, we have processed and removed all suffixes from the
      97               code and what remains of the code is suffix free.  */
      98            return;
      99          }
     100        else
     101          {
     102            /* A suffix is processed from the end of the code array going
     103               backwards, one suffix at a time.  The suffix is an index into the
     104               code character array and points to: one past the end of the code
     105               and any unprocessed suffixes, and to the beginning of the suffix
     106               currently being processed during this iteration.  We must process
     107               this suffix and then drop it from the code by terminating the
     108               preceding text with NULL.
     109  
     110               We want to allow and recognize suffixes such as:
     111  
     112               "/TRANSLIT"         i.e. single suffix
     113               "//TRANSLIT"        i.e. single suffix and multiple separators
     114               "//TRANSLIT/IGNORE" i.e. suffixes separated by "/"
     115               "/TRANSLIT//IGNORE" i.e. suffixes separated by "//"
     116               "//IGNORE,TRANSLIT" i.e. suffixes separated by ","
     117               "//IGNORE,"         i.e. trailing ","
     118               "//TRANSLIT/"       i.e. trailing "/"
     119               "//TRANSLIT//"      i.e. trailing "//"
     120               "/"                 i.e. empty suffix.
     121  
     122               Unknown suffixes are silently discarded and ignored.  */
     123  
     124            if ((__strcasecmp_l (suffix,
     125                                 GCONV_TRIPLE_SEPARATOR
     126                                 GCONV_TRANSLIT_SUFFIX,
     127                                 _nl_C_locobj_ptr) == 0)
     128                || (__strcasecmp_l (suffix,
     129                                    GCONV_SUFFIX_SEPARATOR
     130                                    GCONV_TRANSLIT_SUFFIX,
     131                                    _nl_C_locobj_ptr) == 0))
     132              pc->translit = true;
     133  
     134            if ((__strcasecmp_l (suffix,
     135                                 GCONV_TRIPLE_SEPARATOR
     136                                 GCONV_IGNORE_ERRORS_SUFFIX,
     137                                 _nl_C_locobj_ptr) == 0)
     138                || (__strcasecmp_l (suffix,
     139                                    GCONV_SUFFIX_SEPARATOR
     140                                    GCONV_IGNORE_ERRORS_SUFFIX,
     141                                    _nl_C_locobj_ptr) == 0))
     142              pc->ignore = true;
     143  
     144            /* We just processed this suffix.  We can now drop it from the
     145               code string by truncating it at the suffix's position.  */
     146            suffix[0] = '\0';
     147          }
     148      }
     149  }
     150  
     151  
     152  /* This function accepts the charset names of the source and destination of the
     153     conversion and populates *conv_spec with an equivalent conversion
     154     specification that may later be used by __gconv_open.  The charset names
     155     might contain options in the form of suffixes that alter the conversion,
     156     e.g. "ISO-10646/UTF-8/TRANSLIT".  It processes the charset names, ignoring
     157     and truncating any suffix options in fromcode, and processing and truncating
     158     any suffix options in tocode.  Supported suffix options ("TRANSLIT" or
     159     "IGNORE") when found in tocode lead to the corresponding flag in *conv_spec
     160     to be set to true.  Unrecognized suffix options are silently discarded.  If
     161     the function succeeds, it returns conv_spec back to the caller.  It returns
     162     NULL upon failure.  conv_spec must be allocated and freed by the caller.  */
     163  struct gconv_spec *
     164  __gconv_create_spec (struct gconv_spec *conv_spec, const char *fromcode,
     165                     const char *tocode)
     166  {
     167    struct gconv_parsed_code pfc, ptc;
     168    struct gconv_spec *ret = NULL;
     169  
     170    pfc.code = __strdup (fromcode);
     171    ptc.code = __strdup (tocode);
     172  
     173    if ((pfc.code == NULL)
     174        || (ptc.code == NULL))
     175      goto out;
     176  
     177    gconv_parse_code (&pfc);
     178    gconv_parse_code (&ptc);
     179  
     180    /* We ignore suffixes in the fromcode because that is how the current
     181       implementation has always handled them.  Only suffixes in the tocode are
     182       processed and handled.  The reality is that invalid input in the input
     183       character set should only be ignored if the fromcode specifies IGNORE.
     184       The current implementation ignores invalid input in the input character
     185       set if the tocode contains IGNORE.  We preserve this behavior for
     186       backwards compatibility.  In the future we may split the handling of
     187       IGNORE to allow a finer grained specification of ignoring invalid input
     188       and/or ignoring invalid output.  */
     189    conv_spec->translit = ptc.translit;
     190    conv_spec->ignore = ptc.ignore;
     191  
     192    /* 3 extra bytes because 1 extra for '\0', and 2 extra so strip might
     193       be able to add one or two trailing '/' characters if necessary.  */
     194    conv_spec->fromcode = malloc (strlen (fromcode) + 3);
     195    if (conv_spec->fromcode == NULL)
     196      goto out;
     197  
     198    conv_spec->tocode = malloc (strlen (tocode) + 3);
     199    if (conv_spec->tocode == NULL)
     200      {
     201        free (conv_spec->fromcode);
     202        conv_spec->fromcode = NULL;
     203        goto out;
     204      }
     205  
     206    /* Strip unrecognized characters and ensure that the code has two '/'
     207       characters as per conversion code triplet specification.  */
     208    strip (conv_spec->fromcode, pfc.code);
     209    strip (conv_spec->tocode, ptc.code);
     210    ret = conv_spec;
     211  
     212  out:
     213    free (pfc.code);
     214    free (ptc.code);
     215  
     216    return ret;
     217  }
     218  libc_hidden_def (__gconv_create_spec)
     219  
     220  
     221  void
     222  __gconv_destroy_spec (struct gconv_spec *conv_spec)
     223  {
     224    free (conv_spec->fromcode);
     225    free (conv_spec->tocode);
     226    return;
     227  }
     228  libc_hidden_def (__gconv_destroy_spec)