(root)/
glibc-2.38/
iconv/
gconv_trans.c
       1  /* Transliteration using the locale's data.
       2     Copyright (C) 2000-2023 Free Software Foundation, Inc.
       3     This file is part of the GNU C Library.
       4  
       5     The GNU C Library is free software; you can redistribute it and/or
       6     modify it under the terms of the GNU Lesser General Public
       7     License as published by the Free Software Foundation; either
       8     version 2.1 of the License, or (at your option) any later version.
       9  
      10     The GNU C Library is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      13     Lesser General Public License for more details.
      14  
      15     You should have received a copy of the GNU Lesser General Public
      16     License along with the GNU C Library; if not, see
      17     <https://www.gnu.org/licenses/>.  */
      18  
      19  #include <assert.h>
      20  #include <dlfcn.h>
      21  #include <search.h>
      22  #include <stdint.h>
      23  #include <string.h>
      24  #include <stdlib.h>
      25  
      26  #include <libc-lock.h>
      27  #include "gconv_int.h"
      28  #include "../locale/localeinfo.h"
      29  #include <pointer_guard.h>
      30  
      31  
      32  int
      33  __gconv_transliterate (struct __gconv_step *step,
      34  		       struct __gconv_step_data *step_data,
      35  		       const unsigned char *inbufstart,
      36  		       const unsigned char **inbufp,
      37  		       const unsigned char *inbufend,
      38  		       unsigned char **outbufstart, size_t *irreversible)
      39  {
      40    /* Find out about the locale's transliteration.  */
      41    uint32_t size;
      42    const uint32_t *from_idx;
      43    const uint32_t *from_tbl;
      44    const uint32_t *to_idx;
      45    const uint32_t *to_tbl;
      46    const uint32_t *winbuf;
      47    const uint32_t *winbufend;
      48    uint32_t low;
      49    uint32_t high;
      50  
      51    /* The input buffer.  There are actually 4-byte values.  */
      52    winbuf = (const uint32_t *) *inbufp;
      53    winbufend = (const uint32_t *) inbufend;
      54  
      55    __gconv_fct fct = step->__fct;
      56    if (step->__shlib_handle != NULL)
      57      PTR_DEMANGLE (fct);
      58  
      59    /* If there is no transliteration information in the locale don't do
      60       anything and return the error.  */
      61    size = _NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_TAB_SIZE);
      62    if (size == 0)
      63      goto no_rules;
      64  
      65    /* Get the rest of the values.  */
      66    from_idx =
      67      (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_FROM_IDX);
      68    from_tbl =
      69      (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_FROM_TBL);
      70    to_idx =
      71      (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_TO_IDX);
      72    to_tbl =
      73      (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_TO_TBL);
      74  
      75    /* Test whether there is enough input.  */
      76    if (winbuf + 1 > winbufend)
      77      return (winbuf == winbufend
      78  	    ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);
      79  
      80    /* The array starting at FROM_IDX contains indices to the string table
      81       in FROM_TBL.  The indices are sorted wrt to the strings.  I.e., we
      82       are doing binary search.  */
      83    low = 0;
      84    high = size;
      85    while (low < high)
      86      {
      87        uint32_t med = (low + high) / 2;
      88        uint32_t idx;
      89        int cnt;
      90  
      91        /* Compare the string at this index with the string at the current
      92  	 position in the input buffer.  */
      93        idx = from_idx[med];
      94        cnt = 0;
      95        do
      96  	{
      97  	  if (from_tbl[idx + cnt] != winbuf[cnt])
      98  	    /* Does not match.  */
      99  	    break;
     100  	  ++cnt;
     101  	}
     102        while (from_tbl[idx + cnt] != L'\0' && winbuf + cnt < winbufend);
     103  
     104        if (cnt > 0 && from_tbl[idx + cnt] == L'\0')
     105  	{
     106  	  /* Found a matching input sequence.  Now try to convert the
     107  	     possible replacements.  */
     108  	  uint32_t idx2 = to_idx[med];
     109  
     110  	  do
     111  	    {
     112  	      /* Determine length of replacement.  */
     113  	      unsigned int len = 0;
     114  	      int res;
     115  	      const unsigned char *toinptr;
     116  	      unsigned char *outptr;
     117  
     118  	      while (to_tbl[idx2 + len] != L'\0')
     119  		++len;
     120  
     121  	      /* Try this input text.  */
     122  	      toinptr = (const unsigned char *) &to_tbl[idx2];
     123  	      outptr = *outbufstart;
     124  	      res = DL_CALL_FCT (fct,
     125  				 (step, step_data, &toinptr,
     126  				  (const unsigned char *) &to_tbl[idx2 + len],
     127  				  &outptr, NULL, 0, 0));
     128  	      if (res != __GCONV_ILLEGAL_INPUT)
     129  		{
     130  		  /* If the conversion succeeds we have to increment the
     131  		     input buffer.  */
     132  		  if (res == __GCONV_EMPTY_INPUT)
     133  		    {
     134  		      *inbufp += cnt * sizeof (uint32_t);
     135  		      ++*irreversible;
     136  		      res = __GCONV_OK;
     137  		    }
     138  		  /* Do not increment the output pointer if we could not
     139  		     store the entire output. */
     140  		  if (res != __GCONV_FULL_OUTPUT)
     141  		    *outbufstart = outptr;
     142  
     143  		  return res;
     144  		}
     145  
     146  	      /* Next replacement.  */
     147  	      idx2 += len + 1;
     148  	    }
     149  	  while (to_tbl[idx2] != L'\0');
     150  
     151  	  /* Nothing found, continue searching.  */
     152  	}
     153        else if (cnt > 0)
     154  	/* This means that the input buffer contents matches a prefix of
     155  	   an entry.  Since we cannot match it unless we get more input,
     156  	   we will tell the caller about it.  */
     157  	return __GCONV_INCOMPLETE_INPUT;
     158  
     159        if (winbuf + cnt >= winbufend || from_tbl[idx + cnt] < winbuf[cnt])
     160  	low = med + 1;
     161        else
     162  	high = med;
     163      }
     164  
     165   no_rules:
     166    /* Maybe the character is supposed to be ignored.  */
     167    if (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_IGNORE_LEN) != 0)
     168      {
     169        int n = _NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_IGNORE_LEN);
     170        const uint32_t *ranges =
     171  	(const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_IGNORE);
     172        const uint32_t wc = *(const uint32_t *) (*inbufp);
     173        int i;
     174  
     175        /* Test whether there is enough input.  */
     176        if (winbuf + 1 > winbufend)
     177  	return (winbuf == winbufend
     178  		? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);
     179  
     180        for (i = 0; i < n; ranges += 3, ++i)
     181  	if (ranges[0] <= wc && wc <= ranges[1]
     182  	    && (wc - ranges[0]) % ranges[2] == 0)
     183  	  {
     184  	    /* Matches the range.  Ignore it.  */
     185  	    *inbufp += 4;
     186  	    ++*irreversible;
     187  	    return __GCONV_OK;
     188  	  }
     189  	else if (wc < ranges[0])
     190  	  /* There cannot be any other matching range since they are
     191               sorted.  */
     192  	  break;
     193      }
     194  
     195    /* One last chance: use the default replacement.  */
     196    if (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN) != 0)
     197      {
     198        const uint32_t *default_missing = (const uint32_t *)
     199  	_NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_DEFAULT_MISSING);
     200        const unsigned char *toinptr = (const unsigned char *) default_missing;
     201        uint32_t len = _NL_CURRENT_WORD (LC_CTYPE,
     202  				       _NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN);
     203        unsigned char *outptr;
     204        int res;
     205  
     206        /* Test whether there is enough input.  */
     207        if (winbuf + 1 > winbufend)
     208  	return (winbuf == winbufend
     209  		? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);
     210  
     211        outptr = *outbufstart;
     212        res = DL_CALL_FCT (fct,
     213  			 (step, step_data, &toinptr,
     214  			  (const unsigned char *) (default_missing + len),
     215  			  &outptr, NULL, 0, 0));
     216  
     217        if (res != __GCONV_ILLEGAL_INPUT)
     218  	{
     219  	  /* If the conversion succeeds we have to increment the
     220  	     input buffer.  */
     221  	  if (res == __GCONV_EMPTY_INPUT)
     222  	    {
     223  	      /* This worked but is not reversible.  */
     224  	      ++*irreversible;
     225  	      *inbufp += 4;
     226  	      res = __GCONV_OK;
     227  	    }
     228  	  *outbufstart = outptr;
     229  
     230  	  return res;
     231  	}
     232      }
     233  
     234    /* Haven't found a match.  */
     235    return __GCONV_ILLEGAL_INPUT;
     236  }
     237  libc_hidden_def (__gconv_transliterate)