(root)/
glibc-2.38/
iconvdata/
utf-16.c
       1  /* Conversion module for UTF-16.
       2     Copyright (C) 1999-2023 Free Software Foundation, Inc.
       3     This file is part of the GNU C Library.
       4  
       5     The GNU C Library is free software; you can redistribute it and/or
       6     modify it under the terms of the GNU Lesser General Public
       7     License as published by the Free Software Foundation; either
       8     version 2.1 of the License, or (at your option) any later version.
       9  
      10     The GNU C Library is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      13     Lesser General Public License for more details.
      14  
      15     You should have received a copy of the GNU Lesser General Public
      16     License along with the GNU C Library; if not, see
      17     <https://www.gnu.org/licenses/>.  */
      18  
      19  #include <byteswap.h>
      20  #include <dlfcn.h>
      21  #include <gconv.h>
      22  #include <stddef.h>
      23  #include <stdint.h>
      24  #include <stdlib.h>
      25  #include <string.h>
      26  
/* This is the Byte Order Mark character (BOM), U+FEFF.  PREPARE_LOOP
   compares the first 16-bit unit of UTF-16 input against it and writes
   it at the start of UTF-16 output.  */
#define BOM	0xfeff
/* And in the other byte order: the value obtained when the two bytes of
   the mark are exchanged.  PREPARE_LOOP sets __GCONV_SWAP when the input
   starts with this value.  */
#define BOM_OE	0xfffe
      31  
      32  
/* Definitions used in the body of the `gconv' function.  */
/* Names given to the two conversion loops instantiated further down.  In
   this file the "from" side is UTF-16 and the "to" side is the 4-byte
   form that the loop bodies access with get32/put32.  */
#define FROM_LOOP		from_utf16_loop
#define TO_LOOP			to_utf16_loop
/* gconv_init and gconv_end are written out by hand in this file.
   NOTE(review): presumably 0 here keeps <iconv/skeleton.c> from generating
   its own versions -- confirm against the skeleton.  */
#define DEFINE_INIT		0
#define DEFINE_FINI		0
/* Bytes per character.  The UTF-16 side uses one 16-bit unit or a
   surrogate pair (2 or 4 bytes); the other side always uses 4 bytes.  */
#define MIN_NEEDED_FROM		2
#define MAX_NEEDED_FROM		4
#define MIN_NEEDED_TO		4
/* gconv_init accepts UTF-16 on either side, so both directions exist.  */
#define ONE_DIRECTION		0
/* True when decoding UTF-16; `dir' is the local declared by
   PREPARE_LOOP.  */
#define FROM_DIRECTION		(dir == from_utf16)
/* Per-call setup.  It is expanded where `step', `data', `inptr', `inend',
   `inptrp', `outbuf' and `outend' are in scope; none of them is declared
   in this file.  It loads the direction and variant that gconv_init
   stored in step->__data and, only while data->__invocation_counter is
   still 0, settles the byte order:

     - UTF-16, decoding: two bytes of input must be available, otherwise
       the enclosing function returns __GCONV_EMPTY_INPUT (inptr == inend)
       or __GCONV_INCOMPLETE_INPUT.  A leading BOM is skipped; a leading
       byte-swapped BOM is skipped and turns on __GCONV_SWAP.  Anything
       else is left in place and the flag is not touched.
     - UTF-16, encoding: unless data->__internal_use is set, a BOM is
       written to the output, or __GCONV_FULL_OUTPUT is returned when
       fewer than 2 bytes are free.
     - UTF-16LE / UTF-16BE: __GCONV_SWAP is turned on when the requested
       order differs from BYTE_ORDER.  No BOM is read or written.

   Finally `swap' is set from the __GCONV_SWAP bit of data->__flags, so on
   later calls the decision made on the first call is simply read back.
   EXTRA_LOOP_ARGS names `swap' as an additional loop argument.  */
#define PREPARE_LOOP \
  enum direction dir = ((struct utf16_data *) step->__data)->dir;	      \
  enum variant var = ((struct utf16_data *) step->__data)->var;		      \
  if (__glibc_unlikely (data->__invocation_counter == 0))		      \
    {									      \
      if (var == UTF_16)						      \
	{								      \
	  if (FROM_DIRECTION)						      \
	    {								      \
	      /* We have to find out which byte order the file is	      \
		 encoded in.  */					      \
	      if (inptr + 2 > inend)					      \
		return (inptr == inend					      \
			? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);    \
									      \
	      if (get16 (inptr) == BOM)					      \
		/* Simply ignore the BOM character.  */			      \
		*inptrp = inptr += 2;					      \
	      else if (get16 (inptr) == BOM_OE)				      \
		{							      \
		  /* Mark seen with its bytes exchanged: remember to swap     \
		     every unit, then skip the mark.  */		      \
		  data->__flags |= __GCONV_SWAP;			      \
		  *inptrp = inptr += 2;					      \
		}							      \
	    }								      \
	  /* !FROM_DIRECTION is always true here, since this is the else      \
	     branch of the FROM_DIRECTION test above.  */		      \
	  else if (!FROM_DIRECTION && !data->__internal_use)		      \
	    {								      \
	      /* Emit the Byte Order Mark.  */				      \
	      if (__glibc_unlikely (outbuf + 2 > outend))		      \
		return __GCONV_FULL_OUTPUT;				      \
									      \
	      put16 (outbuf, BOM);					      \
	      outbuf += 2;						      \
	    }								      \
	}								      \
      else if ((var == UTF_16LE && BYTE_ORDER == BIG_ENDIAN)		      \
	       || (var == UTF_16BE && BYTE_ORDER == LITTLE_ENDIAN))	      \
	data->__flags |= __GCONV_SWAP;					      \
    }									      \
  const int swap = data->__flags & __GCONV_SWAP;
#define EXTRA_LOOP_ARGS		, swap
      83  
      84  
      85  /* Direction of the transformation.  */
      86  enum direction
      87  {
      88    illegal_dir,
      89    to_utf16,
      90    from_utf16
      91  };
      92  
      93  enum variant
      94  {
      95    illegal_var,
      96    UTF_16,
      97    UTF_16LE,
      98    UTF_16BE
      99  };
     100  
     101  struct utf16_data
     102  {
     103    enum direction dir;
     104    enum variant var;
     105  };
     106  
     107  
     108  extern int gconv_init (struct __gconv_step *step);
     109  int
     110  gconv_init (struct __gconv_step *step)
     111  {
     112    /* Determine which direction.  */
     113    struct utf16_data *new_data;
     114    enum direction dir = illegal_dir;
     115    enum variant var = illegal_var;
     116    int result;
     117  
     118    if (__strcasecmp (step->__from_name, "UTF-16//") == 0)
     119      {
     120        dir = from_utf16;
     121        var = UTF_16;
     122      }
     123    else if (__strcasecmp (step->__to_name, "UTF-16//") == 0)
     124      {
     125        dir = to_utf16;
     126        var = UTF_16;
     127      }
     128    else if (__strcasecmp (step->__from_name, "UTF-16BE//") == 0)
     129      {
     130        dir = from_utf16;
     131        var = UTF_16BE;
     132      }
     133    else if (__strcasecmp (step->__to_name, "UTF-16BE//") == 0)
     134      {
     135        dir = to_utf16;
     136        var = UTF_16BE;
     137      }
     138    else if (__strcasecmp (step->__from_name, "UTF-16LE//") == 0)
     139      {
     140        dir = from_utf16;
     141        var = UTF_16LE;
     142      }
     143    else if (__strcasecmp (step->__to_name, "UTF-16LE//") == 0)
     144      {
     145        dir = to_utf16;
     146        var = UTF_16LE;
     147      }
     148  
     149    result = __GCONV_NOCONV;
     150    if (__builtin_expect (dir, to_utf16) != illegal_dir)
     151      {
     152        new_data = (struct utf16_data *) malloc (sizeof (struct utf16_data));
     153  
     154        result = __GCONV_NOMEM;
     155        if (new_data != NULL)
     156  	{
     157  	  new_data->dir = dir;
     158  	  new_data->var = var;
     159  	  step->__data = new_data;
     160  
     161  	  if (dir == from_utf16)
     162  	    {
     163  	      step->__min_needed_from = MIN_NEEDED_FROM;
     164  	      step->__max_needed_from = MAX_NEEDED_FROM;
     165  	      step->__min_needed_to = MIN_NEEDED_TO;
     166  	      step->__max_needed_to = MIN_NEEDED_TO;
     167  	    }
     168  	  else
     169  	    {
     170  	      step->__min_needed_from = MIN_NEEDED_TO;
     171  	      step->__max_needed_from = MIN_NEEDED_TO;
     172  	      step->__min_needed_to = MIN_NEEDED_FROM;
     173  	      step->__max_needed_to = MAX_NEEDED_FROM;
     174  	    }
     175  
     176  	  step->__stateful = 0;
     177  
     178  	  result = __GCONV_OK;
     179  	}
     180      }
     181  
     182    return result;
     183  }
     184  
     185  
     186  extern void gconv_end (struct __gconv_step *data);
     187  void
     188  gconv_end (struct __gconv_step *data)
     189  {
     190    free (data->__data);
     191  }
     192  
     193  
/* Convert from the internal (UCS4-like) format to UTF-16.

   Each pass of BODY reads one 32-bit value with get32 and stores it as
   one 16-bit unit, or as a surrogate pair when it is 0x10000 or above:

     - 0xd800..0xdfff: rejected.  `result' becomes __GCONV_ILLEGAL_INPUT
       and BODY breaks out, unless ignore_errors_p () holds, in which
       case the 4 input bytes are skipped, *irreversible is incremented
       and BODY continues with the next value.
     - 0x110000 and above: handed to STANDARD_TO_LOOP_ERR_HANDLER (4).
     - 0x10000..0x10ffff: needs 4 bytes of output, otherwise BODY breaks
       out with __GCONV_FULL_OUTPUT.

   When `swap' is nonzero every 16-bit unit goes through bswap_16 before
   it is stored; apart from that the two branches are the same.

   NOTE(review): BODY has no output-space check of its own for the
   single-unit case; presumably <iconv/loop.c> guarantees
   MIN_NEEDED_OUTPUT bytes before running BODY -- confirm there.  */
#define MIN_NEEDED_INPUT	MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT	MAX_NEEDED_FROM
#define LOOPFCT			TO_LOOP
#define BODY \
  {									      \
    uint32_t c = get32 (inptr);						      \
									      \
    if (__glibc_unlikely (c >= 0xd800 && c < 0xe000))			      \
      {									      \
	/* Surrogate characters in UCS-4 input are not valid.		      \
	   We must catch this.  If we let surrogates pass through,	      \
	   attackers could make a security hole exploit by		      \
	   synthesizing any desired plane 1-16 character.  */		      \
	result = __GCONV_ILLEGAL_INPUT;					      \
	if (! ignore_errors_p ())					      \
	  break;							      \
	inptr += 4;							      \
	++*irreversible;						      \
	continue;							      \
      }									      \
									      \
    if (swap)								      \
      {									      \
	if (__glibc_unlikely (c >= 0x10000))				      \
	  {								      \
	    if (__glibc_unlikely (c >= 0x110000))			      \
	      {								      \
		STANDARD_TO_LOOP_ERR_HANDLER (4);			      \
	      }								      \
									      \
	    /* Generate a surrogate character.  */			      \
	    if (__glibc_unlikely (outptr + 4 > outend))			      \
	      {								      \
		/* Overflow in the output buffer.  */			      \
		result = __GCONV_FULL_OUTPUT;				      \
		break;							      \
	      }								      \
									      \
	    /* 0xd7c0 is 0xd800 - (0x10000 >> 10), so the first unit is	      \
	       0xd800 plus the top ten bits of c - 0x10000.  */		      \
	    put16 (outptr, bswap_16 (0xd7c0 + (c >> 10)));		      \
	    outptr += 2;						      \
	    put16 (outptr, bswap_16 (0xdc00 + (c & 0x3ff)));		      \
	  }								      \
	else								      \
	  put16 (outptr, bswap_16 (c));					      \
      }									      \
    else								      \
      {									      \
	if (__glibc_unlikely (c >= 0x10000))				      \
	  {								      \
	    if (__glibc_unlikely (c >= 0x110000))			      \
	      {								      \
		STANDARD_TO_LOOP_ERR_HANDLER (4);			      \
	      }								      \
									      \
	    /* Generate a surrogate character.  */			      \
	    if (__glibc_unlikely (outptr + 4 > outend))			      \
	      {								      \
		/* Overflow in the output buffer.  */			      \
		result = __GCONV_FULL_OUTPUT;				      \
		break;							      \
	      }								      \
									      \
	    /* Same arithmetic as in the swapped branch, stored without	      \
	       exchanging the bytes.  */				      \
	    put16 (outptr, 0xd7c0 + (c >> 10));				      \
	    outptr += 2;						      \
	    put16 (outptr, 0xdc00 + (c & 0x3ff));			      \
	  }								      \
	else								      \
	  put16 (outptr, c);						      \
      }									      \
    /* Common tail: step over the last unit stored and over the input	      \
       value just consumed.  */						      \
    outptr += 2;							      \
    inptr += 4;								      \
  }
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS \
	, int swap
#include <iconv/loop.c>
     272  
     273  
/* Convert from UTF-16 to the internal (UCS4-like) format.

   Each pass of BODY reads one 16-bit unit with get16 (through bswap_16
   when `swap' is nonzero) and stores one 32-bit value with put32:

     - outside 0xd800..0xdfff: stored as is, 2 input bytes consumed.
     - 0xdc00..0xdfff: a second-half surrogate in first position, handed
       to STANDARD_FROM_LOOP_ERR_HANDLER (2).
     - 0xd800..0xdbff: a second unit is required.  With fewer than 4
       bytes of input left BODY breaks out with __GCONV_INCOMPLETE_INPUT.
       If the second unit is not in 0xdc00..0xdfff, inptr is moved back
       to the first unit and STANDARD_FROM_LOOP_ERR_HANDLER (2) is
       invoked.  Otherwise the pair is combined and 4 input bytes are
       consumed.

   The swapped and unswapped branches differ only in the bswap_16
   calls.  */
#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT	MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
#define LOOPFCT			FROM_LOOP
#define BODY \
  {									      \
    uint16_t u1 = get16 (inptr);					      \
									      \
    if (swap)								      \
      {									      \
	u1 = bswap_16 (u1);						      \
									      \
	if (__builtin_expect (u1 < 0xd800, 1) || u1 > 0xdfff)		      \
	  {								      \
	    /* No surrogate.  */					      \
	    put32 (outptr, u1);						      \
	    inptr += 2;							      \
	  }								      \
	else								      \
	  {								      \
	    uint16_t u2;						      \
									      \
	    if (__glibc_unlikely (u1 >= 0xdc00))			      \
	      {								      \
		/* This is no valid first word for a surrogate.  */	      \
		STANDARD_FROM_LOOP_ERR_HANDLER (2);			      \
	      }								      \
									      \
	    /* It's a surrogate character.  At least the first word says      \
	       it is.  */						      \
	    if (__glibc_unlikely (inptr + 4 > inend))			      \
	      {								      \
		/* We don't have enough input for another complete input      \
		   character.  */					      \
		result = __GCONV_INCOMPLETE_INPUT;			      \
		break;							      \
	      }								      \
									      \
	    inptr += 2;							      \
	    u2 = bswap_16 (get16 (inptr));				      \
	    if (__builtin_expect (u2 < 0xdc00, 0)			      \
		|| __builtin_expect (u2 > 0xdfff, 0))			      \
	      {								      \
		/* This is no valid second word for a surrogate.  */	      \
		/* Point inptr at the first word again before the error	      \
		   is handled.  */					      \
		inptr -= 2;						      \
		STANDARD_FROM_LOOP_ERR_HANDLER (2);			      \
	      }								      \
									      \
	    /* (u1 - 0xd7c0) << 10 equals ((u1 - 0xd800) << 10) + 0x10000.  */ \
	    put32 (outptr, ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00));	      \
	    inptr += 2;							      \
	  }								      \
      }									      \
    else								      \
      {									      \
	if (__builtin_expect (u1 < 0xd800, 1) || u1 > 0xdfff)		      \
	  {								      \
	    /* No surrogate.  */					      \
	    put32 (outptr, u1);						      \
	    inptr += 2;							      \
	  }								      \
	else								      \
	  {								      \
	    if (__glibc_unlikely (u1 >= 0xdc00))			      \
	      {								      \
		/* This is no valid first word for a surrogate.  */	      \
		STANDARD_FROM_LOOP_ERR_HANDLER (2);			      \
	      }								      \
									      \
	    /* It's a surrogate character.  At least the first word says      \
	       it is.  */						      \
	    if (__glibc_unlikely (inptr + 4 > inend))			      \
	      {								      \
		/* We don't have enough input for another complete input      \
		   character.  */					      \
		result = __GCONV_INCOMPLETE_INPUT;			      \
		break;							      \
	      }								      \
									      \
	    inptr += 2;							      \
	    uint16_t u2 = get16 (inptr);				      \
	    if (__builtin_expect (u2 < 0xdc00, 0)			      \
		|| __builtin_expect (u2 > 0xdfff, 0))			      \
	      {								      \
		/* This is no valid second word for a surrogate.  */	      \
		/* Point inptr at the first word again before the error	      \
		   is handled.  */					      \
		inptr -= 2;						      \
		STANDARD_FROM_LOOP_ERR_HANDLER (2);			      \
	      }								      \
									      \
	    /* (u1 - 0xd7c0) << 10 equals ((u1 - 0xd800) << 10) + 0x10000.  */ \
	    put32 (outptr, ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00));	      \
	    inptr += 2;							      \
	  }								      \
      }									      \
    /* Exactly one 32-bit value was stored on every path reaching here.  */   \
    outptr += 4;							      \
  }
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS \
	, int swap
#include <iconv/loop.c>
     373  
     374  
     375  /* Now define the toplevel functions.  */
     376  #include <iconv/skeleton.c>