(root)/
glibc-2.38/
iconvdata/
shift_jisx0213.c
       1  /* Conversion from and to Shift_JISX0213.
       2     Copyright (C) 2002-2023 Free Software Foundation, Inc.
       3     This file is part of the GNU C Library.
       4  
       5     The GNU C Library is free software; you can redistribute it and/or
       6     modify it under the terms of the GNU Lesser General Public
       7     License as published by the Free Software Foundation; either
       8     version 2.1 of the License, or (at your option) any later version.
       9  
      10     The GNU C Library is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      13     Lesser General Public License for more details.
      14  
      15     You should have received a copy of the GNU Lesser General Public
      16     License along with the GNU C Library; if not, see
      17     <https://www.gnu.org/licenses/>.  */
      18  
      19  #include <dlfcn.h>
      20  #include <stdint.h>
      21  #include <gconv.h>
      22  
      23  /* The structure of Shift_JISX0213 is as follows:
      24  
      25     0x00..0x7F: ISO646-JP, an ASCII variant
      26  
      27     0x{A1..DF}: JISX0201 Katakana.
      28  
      29     0x{81..9F,E0..EF}{40..7E,80..FC}: JISX0213 plane 1.
      30  
      31     0x{F0..FC}{40..7E,80..FC}: JISX0213 plane 2, with irregular row mapping.
      32  
      33     Note that some JISX0213 characters are not contained in Unicode 3.2
      34     and are therefore best represented as sequences of Unicode characters.
      35  */
      36  
      37  #include "jisx0213.h"
      38  
      39  /* Definitions used in the body of the `gconv' function.  */
#define CHARSET_NAME		"SHIFT_JISX0213//"
#define FROM_LOOP		from_shift_jisx0213
#define TO_LOOP			to_shift_jisx0213
#define DEFINE_INIT		1
#define DEFINE_FINI		1
#define ONE_DIRECTION		0
/* Shift_JISX0213 -> UCS-4: a character takes one or two input bytes and
   yields one 4-byte UCS-4 character, or two of them (8 bytes) when it is
   converted to a sequence of two UCS-4 characters.  */
#define FROM_LOOP_MIN_NEEDED_FROM	1
#define FROM_LOOP_MAX_NEEDED_FROM	2
#define FROM_LOOP_MIN_NEEDED_TO		4
#define FROM_LOOP_MAX_NEEDED_TO		8
/* UCS-4 -> Shift_JISX0213: every input character is 4 bytes wide and
   yields one or two output bytes.  */
#define TO_LOOP_MIN_NEEDED_FROM		4
#define TO_LOOP_MAX_NEEDED_FROM		4
#define TO_LOOP_MIN_NEEDED_TO		1
#define TO_LOOP_MAX_NEEDED_TO		2
/* Local variables for the conversion: STATEP points to the COUNT element
   of the conversion state, SAVED_STATE is the storage used by
   SAVE_RESET_STATE.  */
#define PREPARE_LOOP \
  int saved_state;							      \
  int *statep = &data->__statep->__count;
/* Additional argument, presumably appended to the calls of the loop
   functions; it matches the parameter declared by EXTRA_LOOP_DECLS.  */
#define EXTRA_LOOP_ARGS		, statep
      58  
      59  
/* Since we might have to reset input pointer we must be able to save
   and restore the state.  With a nonzero SAVE the current state is copied
   into SAVED_STATE; otherwise the state is restored from SAVED_STATE.
   Both variables are provided by PREPARE_LOOP.  The expansion ends
   without a semicolon, which has to be supplied where the macro is
   used.  */
#define SAVE_RESET_STATE(Save) \
  if (Save)								      \
    saved_state = *statep;						      \
  else									      \
    *statep = saved_state
      67  
      68  
/* During Shift_JISX0213 to UCS-4 conversion, the COUNT element of the state
   contains the UCS-4 character that is still waiting to be output (the
   second character of a two-character sequence), shifted by 3 bits.
   During UCS-4 to Shift_JISX0213 conversion, the COUNT element of the state
   contains the last two bytes to be output, shifted by 3 bits.  */
      73  
      74  /* Since this is a stateful encoding we have to provide code which resets
      75     the output state to the initial state.  This has to be done during the
      76     flushing.  */
      77  #define EMIT_SHIFT_TO_INIT \
      78    if (data->__statep->__count != 0)					      \
      79      {									      \
      80        if (FROM_DIRECTION)						      \
      81  	{								      \
      82  	  if (__glibc_likely (outbuf + 4 <= outend))			      \
      83  	    {								      \
      84  	      /* Write out the last character.  */			      \
      85  	      *((uint32_t *) outbuf) = data->__statep->__count >> 3;	      \
      86  	      outbuf += sizeof (uint32_t);				      \
      87  	      data->__statep->__count = 0;				      \
      88  	    }								      \
      89  	  else								      \
      90  	    /* We don't have enough room in the output buffer.  */	      \
      91  	    status = __GCONV_FULL_OUTPUT;				      \
      92  	}								      \
      93        else								      \
      94  	{								      \
      95  	  if (__glibc_likely (outbuf + 2 <= outend))			      \
      96  	    {								      \
      97  	      /* Write out the last character.  */			      \
      98  	      uint32_t lasttwo = data->__statep->__count >> 3;		      \
      99  	      *outbuf++ = (lasttwo >> 8) & 0xff;			      \
     100  	      *outbuf++ = lasttwo & 0xff;				      \
     101  	      data->__statep->__count = 0;				      \
     102  	    }								      \
     103  	  else								      \
     104  	    /* We don't have enough room in the output buffer.  */	      \
     105  	    status = __GCONV_FULL_OUTPUT;				      \
     106  	}								      \
     107      }
     108  
     109  
     110  /* First define the conversion function from Shift_JISX0213 to UCS-4.  */
     111  #define MIN_NEEDED_INPUT	FROM_LOOP_MIN_NEEDED_FROM
     112  #define MAX_NEEDED_INPUT	FROM_LOOP_MAX_NEEDED_FROM
     113  #define MIN_NEEDED_OUTPUT	FROM_LOOP_MIN_NEEDED_TO
     114  #define MAX_NEEDED_OUTPUT	FROM_LOOP_MAX_NEEDED_TO
     115  #define LOOPFCT			FROM_LOOP
     116  #define BODY \
     117    {									      \
     118      uint32_t ch;							      \
     119  									      \
     120      /* Determine whether there is a buffered character pending.  */	      \
     121      ch = *statep >> 3;							      \
     122      if (__glibc_likely (ch == 0))					      \
     123        {									      \
     124  	/* No - so look at the next input byte.  */			      \
     125  	ch = *inptr;							      \
     126  									      \
     127  	if (ch < 0x80)							      \
     128  	  {								      \
     129  	    /* Plain ISO646-JP character.  */				      \
     130  	    if (__glibc_unlikely (ch == 0x5c))				      \
     131  	      ch = 0xa5;						      \
     132  	    else if (__glibc_unlikely (ch == 0x7e))			      \
     133  	      ch = 0x203e;						      \
     134  	    ++inptr;							      \
     135  	  }								      \
     136  	else if (ch >= 0xa1 && ch <= 0xdf)				      \
     137  	  {								      \
     138  	    /* Half-width katakana.  */					      \
     139  	    ch += 0xfec0;						      \
     140  	    ++inptr;							      \
     141  	  }								      \
     142  	else if ((ch >= 0x81 && ch <= 0x9f) || (ch >= 0xe0 && ch <= 0xfc))    \
     143  	  {								      \
     144  	    /* Two byte character.  */					      \
     145  	    uint32_t ch2;						      \
     146  									      \
     147  	    if (__glibc_unlikely (inptr + 1 >= inend))			      \
     148  	      {								      \
     149  		/* The second byte is not available.  */		      \
     150  		result = __GCONV_INCOMPLETE_INPUT;			      \
     151  		break;							      \
     152  	      }								      \
     153  									      \
     154  	    ch2 = inptr[1];						      \
     155  									      \
     156  	    /* The second byte must be in the range 0x{40..7E,80..FC}.  */    \
     157  	    if (__glibc_unlikely (ch2 < 0x40 || ch2 == 0x7f || ch2 > 0xfc))   \
     158  	      {								      \
     159  		/* This is an illegal character.  */			      \
     160  		STANDARD_FROM_LOOP_ERR_HANDLER (1);			      \
     161  	      }								      \
     162  									      \
     163  	    /* Convert to row and column.  */				      \
     164  	    if (ch < 0xe0)						      \
     165  	      ch -= 0x81;						      \
     166  	    else							      \
     167  	      ch -= 0xc1;						      \
     168  	    if (ch2 < 0x80)						      \
     169  	      ch2 -= 0x40;						      \
     170  	    else							      \
     171  	      ch2 -= 0x41;						      \
     172  	    /* Now 0 <= ch <= 0x3b, 0 <= ch2 <= 0xbb.  */		      \
     173  	    ch = 2 * ch;						      \
     174  	    if (ch2 >= 0x5e)						      \
     175  	      ch2 -= 0x5e, ch++;					      \
     176  	    ch2 += 0x21;						      \
     177  	    if (ch >= 0x5e)						      \
     178  	      {								      \
     179  		/* Handling of JISX 0213 plane 2 rows.  */		      \
     180  		if (ch >= 0x67)						      \
     181  		  ch += 230;						      \
     182  		else if (ch >= 0x63 || ch == 0x5f)			      \
     183  		  ch += 168;						      \
     184  		else 							      \
     185  		  ch += 162;						      \
     186  	      }								      \
     187  									      \
     188  	    ch = jisx0213_to_ucs4 (0x121 + ch, ch2);			      \
     189  									      \
     190  	    if (ch == 0)						      \
     191  	      {								      \
     192  		/* This is an illegal character.  */			      \
     193  		STANDARD_FROM_LOOP_ERR_HANDLER (1);			      \
     194  	      }								      \
     195  									      \
     196  	    inptr += 2;							      \
     197  									      \
     198  	    if (ch < 0x80)						      \
     199  	      {								      \
     200  		/* It's a combining character.  */			      \
     201  		uint32_t u1 = __jisx0213_to_ucs_combining[ch - 1][0];	      \
     202  		uint32_t u2 = __jisx0213_to_ucs_combining[ch - 1][1];	      \
     203  									      \
     204  		put32 (outptr, u1);					      \
     205  		outptr += 4;						      \
     206  									      \
     207  		/* See whether we have room for two characters.  */	      \
     208  		if (outptr + 4 <= outend)				      \
     209  		  {							      \
     210  		    put32 (outptr, u2);					      \
     211  		    outptr += 4;					      \
     212  		    continue;						      \
     213  		  }							      \
     214  									      \
     215  		/* Otherwise store only the first character now, and	      \
     216  		   put the second one into the queue.  */		      \
     217  		*statep = u2 << 3;					      \
     218  		/* Tell the caller why we terminate the loop.  */	      \
     219  		result = __GCONV_FULL_OUTPUT;				      \
     220  		break;							      \
     221  	      }								      \
     222  	  }								      \
     223  	else								      \
     224  	  {								      \
     225  	    /* This is illegal.  */					      \
     226  	    STANDARD_FROM_LOOP_ERR_HANDLER (1);				      \
     227  	  }								      \
     228        }									      \
     229  									      \
     230      put32 (outptr, ch);							      \
     231      outptr += 4;							      \
     232    }
     233  #define LOOP_NEED_FLAGS
     234  #define EXTRA_LOOP_DECLS	, int *statep
     235  #define ONEBYTE_BODY \
     236    {									      \
     237      if (c < 0x80)							      \
     238        {									      \
     239  	if (c == 0x5c)							      \
     240  	  return 0xa5;							      \
     241  	if (c == 0x7e)							      \
     242  	  return 0x203e;						      \
     243  	return c;							      \
     244        }									      \
     245      if (c >= 0xa1 && c <= 0xdf)						      \
     246        return 0xfec0 + c;						      \
     247      return WEOF;							      \
     248    }
     249  #include <iconv/loop.c>
     250  
     251  
     252  /* Next, define the other direction, from UCS-4 to Shift_JISX0213.  */
     253  
/* Composition tables for each of the relevant combining characters.

   Every entry pairs the Shift_JISX0213 code of a base character (BASE)
   with the Shift_JISX0213 code of the character that results when the
   base is followed by the combining character of its group (COMPOSED).
   The entries are grouped by combining character: COMP_TABLE_IDX_xxxx is
   the index of the first entry of the group for U+xxxx and
   COMP_TABLE_LEN_xxxx is the number of entries in that group.  The
   comment after each entry states the same relation with JISX0213
   plane/row/column codes: composed = base + combining character.  */
static const struct
{
  uint16_t base;	/* Two-byte code of the buffered base character.  */
  uint16_t composed;	/* Two-byte code that is output instead of BASE.  */
} comp_table_data[] =
{
#define COMP_TABLE_IDX_02E5 0
#define COMP_TABLE_LEN_02E5 1
  { 0x8684, 0x8685 }, /* 0x12B65 = 0x12B64 U+02E5 */
#define COMP_TABLE_IDX_02E9 (COMP_TABLE_IDX_02E5 + COMP_TABLE_LEN_02E5)
#define COMP_TABLE_LEN_02E9 1
  { 0x8680, 0x8686 }, /* 0x12B66 = 0x12B60 U+02E9 */
#define COMP_TABLE_IDX_0300 (COMP_TABLE_IDX_02E9 + COMP_TABLE_LEN_02E9)
#define COMP_TABLE_LEN_0300 5
  { 0x857b, 0x8663 }, /* 0x12B44 = 0x1295C U+0300 */
  { 0x8657, 0x8667 }, /* 0x12B48 = 0x12B38 U+0300 */
  { 0x8656, 0x8669 }, /* 0x12B4A = 0x12B37 U+0300 */
  { 0x864f, 0x866b }, /* 0x12B4C = 0x12B30 U+0300 */
  { 0x8662, 0x866d }, /* 0x12B4E = 0x12B43 U+0300 */
#define COMP_TABLE_IDX_0301 (COMP_TABLE_IDX_0300 + COMP_TABLE_LEN_0300)
#define COMP_TABLE_LEN_0301 4
  { 0x8657, 0x8668 }, /* 0x12B49 = 0x12B38 U+0301 */
  { 0x8656, 0x866a }, /* 0x12B4B = 0x12B37 U+0301 */
  { 0x864f, 0x866c }, /* 0x12B4D = 0x12B30 U+0301 */
  { 0x8662, 0x866e }, /* 0x12B4F = 0x12B43 U+0301 */
#define COMP_TABLE_IDX_309A (COMP_TABLE_IDX_0301 + COMP_TABLE_LEN_0301)
#define COMP_TABLE_LEN_309A 14
  { 0x82a9, 0x82f5 }, /* 0x12477 = 0x1242B U+309A */
  { 0x82ab, 0x82f6 }, /* 0x12478 = 0x1242D U+309A */
  { 0x82ad, 0x82f7 }, /* 0x12479 = 0x1242F U+309A */
  { 0x82af, 0x82f8 }, /* 0x1247A = 0x12431 U+309A */
  { 0x82b1, 0x82f9 }, /* 0x1247B = 0x12433 U+309A */
  { 0x834a, 0x8397 }, /* 0x12577 = 0x1252B U+309A */
  { 0x834c, 0x8398 }, /* 0x12578 = 0x1252D U+309A */
  { 0x834e, 0x8399 }, /* 0x12579 = 0x1252F U+309A */
  { 0x8350, 0x839a }, /* 0x1257A = 0x12531 U+309A */
  { 0x8352, 0x839b }, /* 0x1257B = 0x12533 U+309A */
  { 0x835a, 0x839c }, /* 0x1257C = 0x1253B U+309A */
  { 0x8363, 0x839d }, /* 0x1257D = 0x12544 U+309A */
  { 0x8367, 0x839e }, /* 0x1257E = 0x12548 U+309A */
  { 0x83f3, 0x83f6 }, /* 0x12678 = 0x12675 U+309A */
};
     297  
/* The conversion function from UCS-4 to Shift_JISX0213.

   *STATEP >> 3 holds a two-byte Shift_JISX0213 character that has been
   converted but not yet written, because it may combine with the next
   input character (see comp_table_data).  Each execution of BODY either
   resolves such a buffered character - writing the composed character if
   the current input character combines with it, the buffered character
   alone otherwise - or converts one UCS-4 character.  When the output
   buffer is too small, RESULT is set to __GCONV_FULL_OUTPUT and the loop
   is left with the state and the input pointer untouched.  */
#define MIN_NEEDED_INPUT	TO_LOOP_MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT	TO_LOOP_MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT	TO_LOOP_MIN_NEEDED_TO
#define MAX_NEEDED_OUTPUT	TO_LOOP_MAX_NEEDED_TO
#define LOOPFCT			TO_LOOP
#define BODY \
  {									      \
    uint32_t ch = get32 (inptr);					      \
									      \
    if ((*statep >> 3) != 0)						      \
      {									      \
	/* Attempt to combine the last character with this one.  */	      \
	uint16_t lasttwo = *statep >> 3;				      \
	unsigned int idx;						      \
	unsigned int len;						      \
									      \
	if (ch == 0x02e5)						      \
	  idx = COMP_TABLE_IDX_02E5, len = COMP_TABLE_LEN_02E5;		      \
	else if (ch == 0x02e9)						      \
	  idx = COMP_TABLE_IDX_02E9, len = COMP_TABLE_LEN_02E9;		      \
	else if (ch == 0x0300)						      \
	  idx = COMP_TABLE_IDX_0300, len = COMP_TABLE_LEN_0300;		      \
	else if (ch == 0x0301)						      \
	  idx = COMP_TABLE_IDX_0301, len = COMP_TABLE_LEN_0301;		      \
	else if (ch == 0x309a)						      \
	  idx = COMP_TABLE_IDX_309A, len = COMP_TABLE_LEN_309A;		      \
	else								      \
	  goto not_combining;						      \
									      \
	/* Look for the buffered character among the bases that combine	      \
	   with CH.  LEN remains nonzero only if it is found.  */	      \
	do								      \
	  if (comp_table_data[idx].base == lasttwo)			      \
	    break;							      \
	while (++idx, --len > 0);					      \
									      \
	if (len > 0)							      \
	  {								      \
	    /* Output the combined character.  */			      \
	    if (__glibc_unlikely (outptr + 1 >= outend))		      \
	      {								      \
		result = __GCONV_FULL_OUTPUT;				      \
		break;							      \
	      }								      \
	    lasttwo = comp_table_data[idx].composed;			      \
	    *outptr++ = (lasttwo >> 8) & 0xff;				      \
	    *outptr++ = lasttwo & 0xff;					      \
	    *statep = 0;						      \
	    inptr += 4;							      \
	    continue;							      \
	  }								      \
									      \
      not_combining:							      \
	/* Output the buffered character.  */				      \
	if (__glibc_unlikely (outptr + 1 >= outend))			      \
	  {								      \
	    result = __GCONV_FULL_OUTPUT;				      \
	    break;							      \
	  }								      \
	*outptr++ = (lasttwo >> 8) & 0xff;				      \
	*outptr++ = lasttwo & 0xff;					      \
	*statep = 0;							      \
	/* CH has not been consumed; the next iteration converts it.  */      \
	continue;							      \
      }									      \
									      \
    if (ch < 0x80)							      \
      /* Plain ISO646-JP character.  */					      \
      *outptr++ = ch;							      \
    else if (ch == 0xa5)						      \
      *outptr++ = 0x5c;							      \
    else if (ch == 0x203e)						      \
      *outptr++ = 0x7e;							      \
    else if (ch >= 0xff61 && ch <= 0xff9f)				      \
      /* Half-width katakana.  */					      \
      *outptr++ = ch - 0xfec0;						      \
    else								      \
      {									      \
	unsigned int s1, s2;						      \
	uint32_t jch = ucs4_to_jisx0213 (ch);				      \
	if (jch == 0)							      \
	  {								      \
	    UNICODE_TAG_HANDLER (ch, 4);				      \
									      \
	    /* Illegal character.  */					      \
	    STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
	  }								      \
									      \
	/* Convert it to shifted representation.  */			      \
	s1 = jch >> 8;							      \
	s2 = jch & 0x7f;						      \
	s1 -= 0x21;							      \
	s2 -= 0x21;							      \
	if (s1 >= 0x5e)							      \
	  {								      \
	    /* Handling of JISX 0213 plane 2 rows.  */			      \
	    if (s1 >= 0xcd) /* rows 0x26E..0x27E */			      \
	      s1 -= 102;						      \
	    else if (s1 >= 0x8b || s1 == 0x87) /* rows 0x228, 0x22C..0x22F */ \
	      s1 -= 40;							      \
	    else /* rows 0x221, 0x223..0x225 */				      \
	      s1 -= 34;							      \
	    /* Now 0x5e <= s1 <= 0x77.  */				      \
	  }								      \
	if (s1 & 1)							      \
	  s2 += 0x5e;							      \
	s1 = s1 >> 1;							      \
	if (s1 < 0x1f)							      \
	  s1 += 0x81;							      \
	else								      \
	  s1 += 0xc1;							      \
	if (s2 < 0x3f)							      \
	  s2 += 0x40;							      \
	else								      \
	  s2 += 0x41;							      \
									      \
	if (jch & 0x0080)						      \
	  {								      \
	    /* A possible match in comp_table_data.  We have to buffer it.  */\
									      \
	    /* We know it's a JISX 0213 plane 1 character.  */		      \
	    assert ((jch & 0x8000) == 0);				      \
									      \
	    *statep = ((s1 << 8) | s2) << 3;				      \
	    inptr += 4;							      \
	    continue;							      \
	  }								      \
									      \
	/* Output the shifted representation.  */			      \
	if (__glibc_unlikely (outptr + 1 >= outend))			      \
	  {								      \
	    result = __GCONV_FULL_OUTPUT;				      \
	    break;							      \
	  }								      \
	*outptr++ = s1;							      \
	*outptr++ = s2;							      \
      }									      \
									      \
    inptr += 4;								      \
  }
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS	, int *statep
     437  #include <iconv/loop.c>
     438  
     439  
     440  /* Now define the toplevel functions.  */
     441  #include <iconv/skeleton.c>