(root)/
Python-3.12.0/
Modules/
cjkcodecs/
_codecs_kr.c
       1  /*
       2   * _codecs_kr.c: Codecs collection for Korean encodings
       3   *
       4   * Written by Hye-Shik Chang <perky@FreeBSD.org>
       5   */
       6  
       7  #include "cjkcodecs.h"
       8  #include "mappings_kr.h"
       9  
      10  /*
      11   * EUC-KR codec
      12   */
      13  
/* Lead byte of every two-byte unit in the 8-byte make-up sequence that the
 * euc_kr encoder emits and the euc_kr decoder recognises. */
#define EUCKR_JAMO_FIRSTBYTE    0xA4
/* Trail byte of the unit that opens a make-up sequence (0xA4 0xD4).  The same
 * value is used as the trail byte of the last unit when the syllable has no
 * final consonant (see u2cgk_jongseong[0]). */
#define EUCKR_JAMO_FILLER       0xD4

/* Trail byte for the initial-consonant unit of a make-up sequence.
 * Indexed by (syllable - 0xAC00) / 588. */
static const unsigned char u2cgk_choseong[19] = {
    0xa1, 0xa2, 0xa4, 0xa7, 0xa8, 0xa9, 0xb1, 0xb2,
    0xb3, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb,
    0xbc, 0xbd, 0xbe
};
/* Trail byte for the vowel unit of a make-up sequence.
 * Indexed by ((syllable - 0xAC00) / 28) % 21; the values are the contiguous
 * range 0xBF..0xD3. */
static const unsigned char u2cgk_jungseong[21] = {
    0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6,
    0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce,
    0xcf, 0xd0, 0xd1, 0xd2, 0xd3
};
/* Trail byte for the final-consonant unit of a make-up sequence.
 * Indexed by (syllable - 0xAC00) % 28.  Entry 0 (no final consonant) is
 * 0xD4, i.e. EUCKR_JAMO_FILLER. */
static const unsigned char u2cgk_jongseong[28] = {
    0xd4, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
    0xb1, 0xb2, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xba,
    0xbb, 0xbc, 0xbd, 0xbe
};
      33  
      34  ENCODER(euc_kr)
      35  {
      36      while (*inpos < inlen) {
      37          Py_UCS4 c = INCHAR1;
      38          DBCHAR code;
      39  
      40          if (c < 0x80) {
      41              WRITEBYTE1((unsigned char)c);
      42              NEXT(1, 1);
      43              continue;
      44          }
      45  
      46          if (c > 0xFFFF)
      47              return 1;
      48  
      49          REQUIRE_OUTBUF(2);
      50          if (TRYMAP_ENC(cp949, code, c))
      51              ;
      52          else
      53              return 1;
      54  
      55          if ((code & 0x8000) == 0) {
      56              /* KS X 1001 coded character */
      57              OUTBYTE1((code >> 8) | 0x80);
      58              OUTBYTE2((code & 0xFF) | 0x80);
      59              NEXT(1, 2);
      60          }
      61          else {
      62              /* Mapping is found in CP949 extension,
      63                 but we encode it in KS X 1001:1998,
      64                 make-up sequence for EUC-KR. */
      65  
      66              REQUIRE_OUTBUF(8);
      67  
      68              /* syllable composition precedence */
      69              OUTBYTE1(EUCKR_JAMO_FIRSTBYTE);
      70              OUTBYTE2(EUCKR_JAMO_FILLER);
      71  
      72              /* All code points in CP949 extension are in unicode
      73               * Hangul Syllable area. */
      74              assert(0xac00 <= c && c <= 0xd7a3);
      75              c -= 0xac00;
      76  
      77              OUTBYTE3(EUCKR_JAMO_FIRSTBYTE);
      78              OUTBYTE4(u2cgk_choseong[c / 588]);
      79              NEXT_OUT(4);
      80  
      81              OUTBYTE1(EUCKR_JAMO_FIRSTBYTE);
      82              OUTBYTE2(u2cgk_jungseong[(c / 28) % 21]);
      83              OUTBYTE3(EUCKR_JAMO_FIRSTBYTE);
      84              OUTBYTE4(u2cgk_jongseong[c % 28]);
      85              NEXT(1, 4);
      86          }
      87      }
      88  
      89      return 0;
      90  }
      91  
/* Marker for "this byte does not name a component"; the euc_kr decoder
 * rejects the sequence when it sees it.  Undefined again after the decoder. */
#define NONE    127

/* Make-up sequence trail byte -> initial-consonant index (0..18).
 * Indexed by (byte - 0xA1); inverse of u2cgk_choseong. */
static const unsigned char cgk2u_choseong[] = { /* [A1, BE] */
       0,    1, NONE,    2, NONE, NONE,    3,    4,
       5, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
       6,    7,    8, NONE,    9,   10,   11,   12,
      13,   14,   15,   16,   17,   18
};
/* Make-up sequence trail byte -> final-consonant index (1..27).
 * Indexed by (byte - 0xA1); inverse of u2cgk_jongseong[1..27].  Index 0
 * (no final consonant) is not in this table: the decoder handles the
 * EUCKR_JAMO_FILLER byte separately. */
static const unsigned char cgk2u_jongseong[] = { /* [A1, BE] */
       1,    2,    3,    4,    5,    6,    7, NONE,
       8,    9,   10,   11,   12,   13,   14,   15,
      16,   17, NONE,   18,   19,   20,   21,   22,
    NONE,   23,   24,   25,   26,   27
};
     106  
     107  DECODER(euc_kr)
     108  {
     109      while (inleft > 0) {
     110          unsigned char c = INBYTE1;
     111          Py_UCS4 decoded;
     112  
     113          if (c < 0x80) {
     114              OUTCHAR(c);
     115              NEXT_IN(1);
     116              continue;
     117          }
     118  
     119          REQUIRE_INBUF(2);
     120  
     121          if (c == EUCKR_JAMO_FIRSTBYTE &&
     122              INBYTE2 == EUCKR_JAMO_FILLER) {
     123              /* KS X 1001:1998 make-up sequence */
     124              DBCHAR cho, jung, jong;
     125  
     126              REQUIRE_INBUF(8);
     127              if ((*inbuf)[2] != EUCKR_JAMO_FIRSTBYTE ||
     128                  (*inbuf)[4] != EUCKR_JAMO_FIRSTBYTE ||
     129                  (*inbuf)[6] != EUCKR_JAMO_FIRSTBYTE)
     130                  return 1;
     131  
     132              c = (*inbuf)[3];
     133              if (0xa1 <= c && c <= 0xbe)
     134                  cho = cgk2u_choseong[c - 0xa1];
     135              else
     136                  cho = NONE;
     137  
     138              c = (*inbuf)[5];
     139              jung = (0xbf <= c && c <= 0xd3) ? c - 0xbf : NONE;
     140  
     141              c = (*inbuf)[7];
     142              if (c == EUCKR_JAMO_FILLER)
     143                  jong = 0;
     144              else if (0xa1 <= c && c <= 0xbe)
     145                  jong = cgk2u_jongseong[c - 0xa1];
     146              else
     147                  jong = NONE;
     148  
     149              if (cho == NONE || jung == NONE || jong == NONE)
     150                  return 1;
     151  
     152              OUTCHAR(0xac00 + cho*588 + jung*28 + jong);
     153              NEXT_IN(8);
     154          }
     155          else if (TRYMAP_DEC(ksx1001, decoded, c ^ 0x80, INBYTE2 ^ 0x80)) {
     156              OUTCHAR(decoded);
     157              NEXT_IN(2);
     158          }
     159          else
     160              return 1;
     161      }
     162  
     163      return 0;
     164  }
     165  #undef NONE
     166  
     167  
     168  /*
     169   * CP949 codec
     170   */
     171  
     172  ENCODER(cp949)
     173  {
     174      while (*inpos < inlen) {
     175          Py_UCS4 c = INCHAR1;
     176          DBCHAR code;
     177  
     178          if (c < 0x80) {
     179              WRITEBYTE1((unsigned char)c);
     180              NEXT(1, 1);
     181              continue;
     182          }
     183  
     184          if (c > 0xFFFF)
     185              return 1;
     186  
     187          REQUIRE_OUTBUF(2);
     188          if (TRYMAP_ENC(cp949, code, c))
     189              ;
     190          else
     191              return 1;
     192  
     193          OUTBYTE1((code >> 8) | 0x80);
     194          if (code & 0x8000)
     195              OUTBYTE2(code & 0xFF); /* MSB set: CP949 */
     196          else
     197              OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: ks x 1001 */
     198          NEXT(1, 2);
     199      }
     200  
     201      return 0;
     202  }
     203  
     204  DECODER(cp949)
     205  {
     206      while (inleft > 0) {
     207          unsigned char c = INBYTE1;
     208          Py_UCS4 decoded;
     209  
     210          if (c < 0x80) {
     211              OUTCHAR(c);
     212              NEXT_IN(1);
     213              continue;
     214          }
     215  
     216          REQUIRE_INBUF(2);
     217          if (TRYMAP_DEC(ksx1001, decoded, c ^ 0x80, INBYTE2 ^ 0x80))
     218              OUTCHAR(decoded);
     219          else if (TRYMAP_DEC(cp949ext, decoded, c, INBYTE2))
     220              OUTCHAR(decoded);
     221          else
     222              return 1;
     223  
     224          NEXT_IN(2);
     225      }
     226  
     227      return 0;
     228  }
     229  
     230  
     231  /*
     232   * JOHAB codec
     233   */
     234  
/*
 * Syllable component index -> 5-bit JOHAB field, used by the johab encoder.
 * Each array is declared with 32 slots but only the listed entries are
 * initialised explicitly (the rest are zero) and only those are indexed by
 * the encoder.  The initialisers are laid out on an 8-column grid so that
 * each value sits in the column given by its low three bits.
 */

/* Indexed by (syllable - 0xAC00) / 588; shifted into bits 10..14. */
static const unsigned char u2johabidx_choseong[32] = {
                0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14,
};
/* Indexed by ((syllable - 0xAC00) / 28) % 21; shifted into bits 5..9. */
static const unsigned char u2johabidx_jungseong[32] = {
                      0x03, 0x04, 0x05, 0x06, 0x07,
                0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
                0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
                0x1a, 0x1b, 0x1c, 0x1d,
};
/* Indexed by (syllable - 0xAC00) % 28; occupies bits 0..4.  Note the gap
 * at 0x12. */
static const unsigned char u2johabidx_jongseong[32] = {
          0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11,       0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d,
};
/* Complete two-byte JOHAB code for each code point U+3131..U+3163, indexed
 * by (code point - 0x3131).  The encoder writes the high byte first. */
static const DBCHAR u2johabjamo[] = {
            0x8841, 0x8c41, 0x8444, 0x9041, 0x8446, 0x8447, 0x9441,
    0x9841, 0x9c41, 0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f,
    0x8450, 0xa041, 0xa441, 0xa841, 0x8454, 0xac41, 0xb041, 0xb441,
    0xb841, 0xbc41, 0xc041, 0xc441, 0xc841, 0xcc41, 0xd041, 0x8461,
    0x8481, 0x84a1, 0x84c1, 0x84e1, 0x8541, 0x8561, 0x8581, 0x85a1,
    0x85c1, 0x85e1, 0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1,
    0x8741, 0x8761, 0x8781, 0x87a1,
};
     261  
     262  ENCODER(johab)
     263  {
     264      while (*inpos < inlen) {
     265          Py_UCS4 c = INCHAR1;
     266          DBCHAR code;
     267  
     268          if (c < 0x80) {
     269              WRITEBYTE1((unsigned char)c);
     270              NEXT(1, 1);
     271              continue;
     272          }
     273  
     274          if (c > 0xFFFF)
     275              return 1;
     276  
     277          REQUIRE_OUTBUF(2);
     278  
     279          if (c >= 0xac00 && c <= 0xd7a3) {
     280              c -= 0xac00;
     281              code = 0x8000 |
     282                  (u2johabidx_choseong[c / 588] << 10) |
     283                  (u2johabidx_jungseong[(c / 28) % 21] << 5) |
     284                  u2johabidx_jongseong[c % 28];
     285          }
     286          else if (c >= 0x3131 && c <= 0x3163)
     287              code = u2johabjamo[c - 0x3131];
     288          else if (TRYMAP_ENC(cp949, code, c)) {
     289              unsigned char c1, c2, t2;
     290              unsigned short t1;
     291  
     292              assert((code & 0x8000) == 0);
     293              c1 = code >> 8;
     294              c2 = code & 0xff;
     295              if (((c1 >= 0x21 && c1 <= 0x2c) ||
     296                  (c1 >= 0x4a && c1 <= 0x7d)) &&
     297                  (c2 >= 0x21 && c2 <= 0x7e)) {
     298                  t1 = (c1 < 0x4a ? (c1 - 0x21 + 0x1b2) :
     299                            (c1 - 0x21 + 0x197));
     300                  t2 = ((t1 & 1) ? 0x5e : 0) + (c2 - 0x21);
     301                  OUTBYTE1(t1 >> 1);
     302                  OUTBYTE2(t2 < 0x4e ? t2 + 0x31 : t2 + 0x43);
     303                  NEXT(1, 2);
     304                  continue;
     305              }
     306              else
     307                  return 1;
     308          }
     309          else
     310              return 1;
     311  
     312          OUTBYTE1(code >> 8);
     313          OUTBYTE2(code & 0xff);
     314          NEXT(1, 2);
     315      }
     316  
     317      return 0;
     318  }
     319  
/* Markers used in the JOHAB decode tables below (both are undefined again
 * after the decoder):
 *   FILL - the 5-bit field says "this component is absent";
 *   NONE - the 5-bit field is not valid, the decoder rejects the input. */
#define FILL 0xfd
#define NONE 0xff

/*
 * 5-bit JOHAB field -> syllable component index.  The decoder combines the
 * three indexes as 0xac00 + cho * 588 + jung * 28 + jong.
 */

/* Initial consonant: fields 0x02..0x14 map to indexes 0..18. */
static const unsigned char johabidx_choseong[32] = {
    NONE, FILL, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
    0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
    0x0e, 0x0f, 0x10, 0x11, 0x12, NONE, NONE, NONE,
    NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
};
/* Vowel: indexes 0..20; the first two fields of every row of eight are
 * invalid or FILL. */
static const unsigned char johabidx_jungseong[32] = {
    NONE, NONE, FILL, 0x00, 0x01, 0x02, 0x03, 0x04,
    NONE, NONE, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
    NONE, NONE, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
    NONE, NONE, 0x11, 0x12, 0x13, 0x14, NONE, NONE,
};
/* Final consonant: indexes 1..27.  Index 0 (no final consonant) is
 * expressed by FILL, which the decoder turns into 0. */
static const unsigned char johabidx_jongseong[32] = {
    NONE, FILL, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, NONE, 0x11, 0x12, 0x13, 0x14, 0x15,
    0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, NONE, NONE,
};

/*
 * 5-bit JOHAB field -> low byte of a U+31xx code point, used when only one
 * of the three components is present.  The decoder ORs the value with
 * 0x3100, and reads an entry only after the matching johabidx_* entry was
 * found to be neither NONE nor FILL.
 */

/* Lone initial consonant. */
static const unsigned char johabjamo_choseong[32] = {
    NONE, FILL, 0x31, 0x32, 0x34, 0x37, 0x38, 0x39,
    0x41, 0x42, 0x43, 0x45, 0x46, 0x47, 0x48, 0x49,
    0x4a, 0x4b, 0x4c, 0x4d, 0x4e, NONE, NONE, NONE,
    NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
};
/* Lone vowel. */
static const unsigned char johabjamo_jungseong[32] = {
    NONE, NONE, FILL, 0x4f, 0x50, 0x51, 0x52, 0x53,
    NONE, NONE, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
    NONE, NONE, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    NONE, NONE, 0x60, 0x61, 0x62, 0x63, NONE, NONE,
};
/* Lone final consonant. */
static const unsigned char johabjamo_jongseong[32] = {
    NONE, FILL, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
    0x37, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    0x40, 0x41, NONE, 0x42, 0x44, 0x45, 0x46, 0x47,
    0x48, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, NONE, NONE,
};
     360  
     361  DECODER(johab)
     362  {
     363      while (inleft > 0) {
     364          unsigned char c = INBYTE1, c2;
     365          Py_UCS4 decoded;
     366  
     367          if (c < 0x80) {
     368              OUTCHAR(c);
     369              NEXT_IN(1);
     370              continue;
     371          }
     372  
     373          REQUIRE_INBUF(2);
     374          c2 = INBYTE2;
     375  
     376          if (c < 0xd8) {
     377              /* johab hangul */
     378              unsigned char c_cho, c_jung, c_jong;
     379              unsigned char i_cho, i_jung, i_jong;
     380  
     381              c_cho = (c >> 2) & 0x1f;
     382              c_jung = ((c << 3) | c2 >> 5) & 0x1f;
     383              c_jong = c2 & 0x1f;
     384  
     385              i_cho = johabidx_choseong[c_cho];
     386              i_jung = johabidx_jungseong[c_jung];
     387              i_jong = johabidx_jongseong[c_jong];
     388  
     389              if (i_cho == NONE || i_jung == NONE || i_jong == NONE)
     390                  return 1;
     391  
     392              /* we don't use U+1100 hangul jamo yet. */
     393              if (i_cho == FILL) {
     394                  if (i_jung == FILL) {
     395                      if (i_jong == FILL)
     396                          OUTCHAR(0x3000);
     397                      else
     398                          OUTCHAR(0x3100 |
     399                              johabjamo_jongseong[c_jong]);
     400                  }
     401                  else {
     402                      if (i_jong == FILL)
     403                          OUTCHAR(0x3100 |
     404                              johabjamo_jungseong[c_jung]);
     405                      else
     406                          return 1;
     407                  }
     408              } else {
     409                  if (i_jung == FILL) {
     410                      if (i_jong == FILL)
     411                          OUTCHAR(0x3100 |
     412                              johabjamo_choseong[c_cho]);
     413                      else
     414                          return 1;
     415                  }
     416                  else
     417                      OUTCHAR(0xac00 +
     418                          i_cho * 588 +
     419                          i_jung * 28 +
     420                          (i_jong == FILL ? 0 : i_jong));
     421              }
     422              NEXT_IN(2);
     423          } else {
     424              /* KS X 1001 except hangul jamos and syllables */
     425              if (c == 0xdf || c > 0xf9 ||
     426                  c2 < 0x31 || (c2 >= 0x80 && c2 < 0x91) ||
     427                  (c2 & 0x7f) == 0x7f ||
     428                  (c == 0xda && (c2 >= 0xa1 && c2 <= 0xd3)))
     429                  return 1;
     430              else {
     431                  unsigned char t1, t2;
     432  
     433                  t1 = (c < 0xe0 ? 2 * (c - 0xd9) :
     434                           2 * c - 0x197);
     435                  t2 = (c2 < 0x91 ? c2 - 0x31 : c2 - 0x43);
     436                  t1 = t1 + (t2 < 0x5e ? 0 : 1) + 0x21;
     437                  t2 = (t2 < 0x5e ? t2 : t2 - 0x5e) + 0x21;
     438  
     439                  if (TRYMAP_DEC(ksx1001, decoded, t1, t2)) {
     440                      OUTCHAR(decoded);
     441                      NEXT_IN(2);
     442                  }
     443                  else {
     444                      return 1;
     445                  }
     446              }
     447          }
     448      }
     449  
     450      return 0;
     451  }
     452  #undef NONE
     453  #undef FILL
     454  
     455  
/* Mapping tables used by the codecs in this file: ksx1001 and cp949ext are
 * only ever passed to TRYMAP_DEC, cp949 only to TRYMAP_ENC.  The argument of
 * BEGIN_MAPPINGS_LIST equals the number of entries listed.
 * NOTE(review): the macros are defined outside this file (presumably in
 * cjkcodecs.h); confirm their exact expansion there. */
BEGIN_MAPPINGS_LIST(3)
  MAPPING_DECONLY(ksx1001)
  MAPPING_ENCONLY(cp949)
  MAPPING_DECONLY(cp949ext)
END_MAPPINGS_LIST

/* The three codecs defined above via ENCODER()/DECODER(); the argument of
 * BEGIN_CODECS_LIST equals the number of entries listed. */
BEGIN_CODECS_LIST(3)
  CODEC_STATELESS(euc_kr)
  CODEC_STATELESS(cp949)
  CODEC_STATELESS(johab)
END_CODECS_LIST

/* NOTE(review): presumably expands to the module definition for the "kr"
 * codec collection; the macro is not visible in this file. */
I_AM_A_MODULE_FOR(kr)