(root)/
Python-3.12.0/
Modules/
cjkcodecs/
_codecs_iso2022.c
       1  /*
       2   * _codecs_iso2022.c: Codecs collection for ISO-2022 encodings.
       3   *
       4   * Written by Hye-Shik Chang <perky@FreeBSD.org>
       5   */
       6  
/* Build-time switches. They are defined ahead of the #include lines below,
 * so presumably "cjkcodecs.h" and the other included headers test them --
 * TODO confirm against those headers. */
#define USING_IMPORTED_MAPS
#define USING_BINARY_PAIR_SEARCH
#define EXTERN_JISX0213_PAIR
/* Both expand to MAP_UNMAPPABLE, which is only defined further down in this
 * file. That is fine: a macro body is expanded where it is used, not where
 * it is defined. Presumably these are the values the JIS X 0213:2000
 * emulation macros yield for characters they reject -- TODO confirm in
 * "emu_jisx0213_2000.h". */
#define EMULATE_JISX0213_2000_ENCODE_INVALID MAP_UNMAPPABLE
#define EMULATE_JISX0213_2000_DECODE_INVALID MAP_UNMAPPABLE
      12  
/* Pointer fields, one per imported mapping table, grouped by the locale
 * module they are loaded from (kr / jp / cn). The *_init() functions in this
 * file fill them through IMPORT_MAP(..., &st->field, ...) where `st` is
 * codec->modstate, so this field list is presumably spliced into the module
 * state struct declared by "cjkcodecs.h" -- TODO confirm. */
#define CJK_MOD_SPECIFIC_STATE                  \
    /* kr */                                    \
    const encode_map *cp949_encmap;             \
    const decode_map *ksx1001_decmap;           \
                                                \
    /* jp */                                    \
    const encode_map *jisxcommon_encmap;        \
    const decode_map *jisx0208_decmap;          \
    const decode_map *jisx0212_decmap;          \
    const encode_map *jisx0213_bmp_encmap;      \
    const decode_map *jisx0213_1_bmp_decmap;    \
    const decode_map *jisx0213_2_bmp_decmap;    \
    const encode_map *jisx0213_emp_encmap;      \
    const decode_map *jisx0213_1_emp_decmap;    \
    const decode_map *jisx0213_2_emp_decmap;    \
                                                \
    /* cn */                                    \
    const encode_map *gbcommon_encmap;          \
    const decode_map *gb2312_decmap;
      32  
      33  
      34  #include "cjkcodecs.h"
      35  #include "alg_jisx0201.h"
      36  #include "emu_jisx0213_2000.h"
      37  #include "mappings_jisx0213_pair.h"
      38  
      39  /* STATE
      40  
      41     state->c[0-3]
      42  
      43      00000000
      44      ||^^^^^|
      45      |+-----+----  G0-3 Character Set
      46      +-----------  Is G0-3 double byte?
      47  
      48     state->c[4]
      49  
      50      00000000
      51            ||
      52            |+----  Locked-Shift?
      53            +-----  ESC Throughout
      54  */
      55  
/* Control bytes the codec reacts to. */
#define ESC                     0x1B    /* starts an escape sequence */
#define SO                      0x0E    /* shift out: sets F_SHIFTED (G1 in use) */
#define SI                      0x0F    /* shift in: clears F_SHIFTED (G0 in use) */
#define LF                      0x0A    /* line feed: the decoder also clears F_SHIFTED */

/* Upper bound on how many bytes iso2022processesc() scans while looking for
 * the final byte of an escape sequence. */
#define MAX_ESCSEQLEN           16
      62  
/* Character set identifiers as stored in state->c[0..3] and in
 * iso2022_designation.mark. Single-byte sets are identified by the final
 * byte of their designating escape sequence. */
#define CHARSET_ISO8859_1       'A'
#define CHARSET_ASCII           'B'
#define CHARSET_ISO8859_7       'F'
#define CHARSET_JISX0201_K      'I'
#define CHARSET_JISX0201_R      'J'

/* Double-byte sets: the same final byte with the CHARSET_DBCS bit OR-ed in,
 * which keeps e.g. GB2312 ('A' | 0x80) distinct from ISO-8859-1 ('A').
 * CHARSET_DBCS is defined below; macros are expanded at the point of use. */
#define CHARSET_GB2312          ('A'|CHARSET_DBCS)
#define CHARSET_JISX0208        ('B'|CHARSET_DBCS)
#define CHARSET_KSX1001         ('C'|CHARSET_DBCS)
#define CHARSET_JISX0212        ('D'|CHARSET_DBCS)
#define CHARSET_GB2312_8565     ('E'|CHARSET_DBCS)
#define CHARSET_CNS11643_1      ('G'|CHARSET_DBCS)
#define CHARSET_CNS11643_2      ('H'|CHARSET_DBCS)
#define CHARSET_JISX0213_2000_1 ('O'|CHARSET_DBCS)
#define CHARSET_JISX0213_2      ('P'|CHARSET_DBCS)
#define CHARSET_JISX0213_2004_1 ('Q'|CHARSET_DBCS)
#define CHARSET_JISX0208_O      ('@'|CHARSET_DBCS)

#define CHARSET_DBCS            0x80
/* Strip the double-byte bit again to recover the final byte that is written
 * into an escape sequence. */
#define ESCMARK(mark)           ((mark) & 0x7f)
      83  
/* True when `c` can terminate an escape sequence: 'A'..'Z' or '@'.
 * Note that `c` is evaluated several times. */
#define IS_ESCEND(c)    (((c) >= 'A' && (c) <= 'Z') || (c) == '@')
/* True when `c2`, the byte right after ESC, introduces an escape sequence
 * that iso2022processesc() handles. */
#define IS_ISO2022ESC(c2) \
        ((c2) == '(' || (c2) == ')' || (c2) == '$' || \
         (c2) == '.' || (c2) == '&')
    /* This is not a complete list of ISO-2022 escape sequence headers,
     * but it is enough to implement the CJK instances of ISO-2022. */
      90  
/* Sentinel results of the per-charset encoder/decoder callbacks. */
#define MAP_UNMAPPABLE          0xFFFF /* the charset cannot represent the character */
#define MAP_MULTIPLE_AVAIL      0xFFFE /* for JIS X 0213 */
                                       /* (encoder: the result depends on the
                                        * following character; call again
                                        * with *length set to 2 or -1) */

/* Bits kept in state->c[4]. */
#define F_SHIFTED               0x01   /* set by SO, cleared by SI */
#define F_ESCTHROUGHOUT         0x02   /* decoder is copying an unrecognised
                                        * escape sequence through verbatim */
      96  
/* Accessors for the codec state. All of them expect a variable named
 * `state` to be in scope at the point of use. state->c[dn] holds the
 * character set currently designated to graphic set G<dn>. */
#define STATE_SETG(dn, v)       do { ((state)->c[dn]) = (v); } while (0)
#define STATE_GETG(dn)          ((state)->c[dn])

#define STATE_G0                STATE_GETG(0)
#define STATE_G1                STATE_GETG(1)
#define STATE_G2                STATE_GETG(2)
#define STATE_G3                STATE_GETG(3)
#define STATE_SETG0(v)          STATE_SETG(0, v)
#define STATE_SETG1(v)          STATE_SETG(1, v)
#define STATE_SETG2(v)          STATE_SETG(2, v)
#define STATE_SETG3(v)          STATE_SETG(3, v)

/* state->c[4] is a bit set of the F_* flags. */
#define STATE_SETFLAG(f)        do { ((state)->c[4]) |= (f); } while (0)
#define STATE_GETFLAG(f)        ((state)->c[4] & (f))
#define STATE_CLEARFLAG(f)      do { ((state)->c[4]) &= ~(f); } while (0)
#define STATE_CLEARFLAGS()      do { ((state)->c[4]) = 0; } while (0)
     113  
/* Accessors for the per-encoding configuration. They expect a variable
 * named `codec` in scope whose `config` member points at a
 * struct iso2022_config. */
#define ISO2022_CONFIG          ((const struct iso2022_config *)(codec->config))
#define CONFIG_ISSET(flag)      (ISO2022_CONFIG->flags & (flag))
#define CONFIG_DESIGNATIONS     (ISO2022_CONFIG->designations)

/* iso2022_config.flags */
#define NO_SHIFT                0x01 /* decoder passes SO/SI through as plain bytes */
#define USE_G2                  0x02 /* accept "ESC ." designations and ESC N (SS2) */
#define USE_JISX0208_EXT        0x04 /* accept the 6-byte "ESC & @ ESC $ B" form */
     122  
     123  /*-*- internal data structures -*-*/
     124  
/* Loads whatever the charset needs; CODEC_INIT treats any non-zero return
 * value as failure. */
typedef int (*iso2022_init_func)(const MultibyteCodec *codec);
/* Decodes the character starting at `data` (the decoder guarantees that
 * `width` bytes are available) and returns its code point, or
 * MAP_UNMAPPABLE. */
typedef Py_UCS4 (*iso2022_decode_func)(const MultibyteCodec *codec,
                                       const unsigned char *data);
/* Encodes the character(s) at `data`. `*length` is in/out: the caller passes
 * 1 first and, after a MAP_MULTIPLE_AVAIL result, 2 (a following character
 * is available at data[1]) or -1 (no following character). Returns the
 * encoded value, MAP_UNMAPPABLE or MAP_MULTIPLE_AVAIL. */
typedef DBCHAR (*iso2022_encode_func)(const MultibyteCodec *codec,
                                      const Py_UCS4 *data,
                                      Py_ssize_t *length);
     131  
/* One character set an encoding may designate. Tables of these are
 * terminated by an entry whose `mark` is 0. */
struct iso2022_designation {
    unsigned char mark;             /* CHARSET_* id, compared with state->c[] */
    unsigned char plane;            /* graphic set it is designated to: 0 = G0, 1 = G1 */
    unsigned char width;            /* bytes per encoded character: 1 or 2 */
    iso2022_init_func initializer;  /* may be NULL */
    iso2022_decode_func decoder;
    iso2022_encode_func encoder;
};
     140  
/* Per-encoding configuration, reached through codec->config. */
struct iso2022_config {
    int flags;                                      /* OR of NO_SHIFT, USE_G2, USE_JISX0208_EXT */
    const struct iso2022_designation *designations; /* non-ascii desigs */
};
     145  
     146  /*-*- iso-2022 codec implementation -*-*/
     147  
     148  CODEC_INIT(iso2022)
     149  {
     150      const struct iso2022_designation *desig;
     151      for (desig = CONFIG_DESIGNATIONS; desig->mark; desig++) {
     152          if (desig->initializer != NULL && desig->initializer(codec) != 0) {
     153              return -1;
     154          }
     155      }
     156      return 0;
     157  }
     158  
     159  ENCODER_INIT(iso2022)
     160  {
     161      STATE_CLEARFLAGS();
     162      STATE_SETG0(CHARSET_ASCII);
     163      STATE_SETG1(CHARSET_ASCII);
     164      return 0;
     165  }
     166  
     167  ENCODER_RESET(iso2022)
     168  {
     169      if (STATE_GETFLAG(F_SHIFTED)) {
     170          WRITEBYTE1(SI);
     171          NEXT_OUT(1);
     172          STATE_CLEARFLAG(F_SHIFTED);
     173      }
     174      if (STATE_G0 != CHARSET_ASCII) {
     175          WRITEBYTE3(ESC, '(', 'B');
     176          NEXT_OUT(3);
     177          STATE_SETG0(CHARSET_ASCII);
     178      }
     179      return 0;
     180  }
     181  
     182  ENCODER(iso2022)
     183  {
     184      while (*inpos < inlen) {
     185          const struct iso2022_designation *dsg;
     186          DBCHAR encoded;
     187          Py_UCS4 c = INCHAR1;
     188          Py_ssize_t insize;
     189  
     190          if (c < 0x80) {
     191              if (STATE_G0 != CHARSET_ASCII) {
     192                  WRITEBYTE3(ESC, '(', 'B');
     193                  STATE_SETG0(CHARSET_ASCII);
     194                  NEXT_OUT(3);
     195              }
     196              if (STATE_GETFLAG(F_SHIFTED)) {
     197                  WRITEBYTE1(SI);
     198                  STATE_CLEARFLAG(F_SHIFTED);
     199                  NEXT_OUT(1);
     200              }
     201              WRITEBYTE1((unsigned char)c);
     202              NEXT(1, 1);
     203              continue;
     204          }
     205  
     206          insize = 1;
     207  
     208          encoded = MAP_UNMAPPABLE;
     209          for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
     210              Py_ssize_t length = 1;
     211              encoded = dsg->encoder(codec, &c, &length);
     212              if (encoded == MAP_MULTIPLE_AVAIL) {
     213                  /* this implementation won't work for pair
     214                   * of non-bmp characters. */
     215                  if (inlen - *inpos < 2) {
     216                      if (!(flags & MBENC_FLUSH))
     217                          return MBERR_TOOFEW;
     218                      length = -1;
     219                  }
     220                  else
     221                      length = 2;
     222                  encoded = dsg->encoder(codec, &c, &length);
     223                  if (encoded != MAP_UNMAPPABLE) {
     224                      insize = length;
     225                      break;
     226                  }
     227              }
     228              else if (encoded != MAP_UNMAPPABLE)
     229                  break;
     230          }
     231  
     232          if (!dsg->mark)
     233              return 1;
     234          assert(dsg->width == 1 || dsg->width == 2);
     235  
     236          switch (dsg->plane) {
     237          case 0: /* G0 */
     238              if (STATE_GETFLAG(F_SHIFTED)) {
     239                  WRITEBYTE1(SI);
     240                  STATE_CLEARFLAG(F_SHIFTED);
     241                  NEXT_OUT(1);
     242              }
     243              if (STATE_G0 != dsg->mark) {
     244                  if (dsg->width == 1) {
     245                      WRITEBYTE3(ESC, '(', ESCMARK(dsg->mark));
     246                      STATE_SETG0(dsg->mark);
     247                      NEXT_OUT(3);
     248                  }
     249                  else if (dsg->mark == CHARSET_JISX0208) {
     250                      WRITEBYTE3(ESC, '$', ESCMARK(dsg->mark));
     251                      STATE_SETG0(dsg->mark);
     252                      NEXT_OUT(3);
     253                  }
     254                  else {
     255                      WRITEBYTE4(ESC, '$', '(',
     256                          ESCMARK(dsg->mark));
     257                      STATE_SETG0(dsg->mark);
     258                      NEXT_OUT(4);
     259                  }
     260              }
     261              break;
     262          case 1: /* G1 */
     263              if (STATE_G1 != dsg->mark) {
     264                  if (dsg->width == 1) {
     265                      WRITEBYTE3(ESC, ')', ESCMARK(dsg->mark));
     266                      STATE_SETG1(dsg->mark);
     267                      NEXT_OUT(3);
     268                  }
     269                  else {
     270                      WRITEBYTE4(ESC, '$', ')', ESCMARK(dsg->mark));
     271                      STATE_SETG1(dsg->mark);
     272                      NEXT_OUT(4);
     273                  }
     274              }
     275              if (!STATE_GETFLAG(F_SHIFTED)) {
     276                  WRITEBYTE1(SO);
     277                  STATE_SETFLAG(F_SHIFTED);
     278                  NEXT_OUT(1);
     279              }
     280              break;
     281          default: /* G2 and G3 is not supported: no encoding in
     282                    * CJKCodecs are using them yet */
     283              return MBERR_INTERNAL;
     284          }
     285  
     286          if (dsg->width == 1) {
     287              WRITEBYTE1((unsigned char)encoded);
     288              NEXT_OUT(1);
     289          }
     290          else {
     291              WRITEBYTE2(encoded >> 8, encoded & 0xff);
     292              NEXT_OUT(2);
     293          }
     294          NEXT_INCHAR(insize);
     295      }
     296  
     297      return 0;
     298  }
     299  
     300  DECODER_INIT(iso2022)
     301  {
     302      STATE_CLEARFLAGS();
     303      STATE_SETG0(CHARSET_ASCII);
     304      STATE_SETG1(CHARSET_ASCII);
     305      STATE_SETG2(CHARSET_ASCII);
     306      return 0;
     307  }
     308  
     309  DECODER_RESET(iso2022)
     310  {
     311      STATE_SETG0(CHARSET_ASCII);
     312      STATE_CLEARFLAG(F_SHIFTED);
     313      return 0;
     314  }
     315  
     316  static Py_ssize_t
     317  iso2022processesc(const MultibyteCodec *codec, MultibyteCodec_State *state,
     318                    const unsigned char **inbuf, Py_ssize_t *inleft)
     319  {
     320      unsigned char charset, designation;
     321      Py_ssize_t i, esclen = 0;
     322  
     323      for (i = 1;i < MAX_ESCSEQLEN;i++) {
     324          if (i >= *inleft)
     325              return MBERR_TOOFEW;
     326          if (IS_ESCEND((*inbuf)[i])) {
     327              esclen = i + 1;
     328              break;
     329          }
     330          else if (CONFIG_ISSET(USE_JISX0208_EXT) && i+1 < *inleft &&
     331                   (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@') {
     332              i += 2;
     333          }
     334      }
     335  
     336      switch (esclen) {
     337      case 0:
     338          return 1; /* unterminated escape sequence */
     339      case 3:
     340          if (INBYTE2 == '$') {
     341              charset = INBYTE3 | CHARSET_DBCS;
     342              designation = 0;
     343          }
     344          else {
     345              charset = INBYTE3;
     346              if (INBYTE2 == '(')
     347                  designation = 0;
     348              else if (INBYTE2 == ')')
     349                  designation = 1;
     350              else if (CONFIG_ISSET(USE_G2) && INBYTE2 == '.')
     351                  designation = 2;
     352              else
     353                  return 3;
     354          }
     355          break;
     356      case 4:
     357          if (INBYTE2 != '$')
     358              return 4;
     359  
     360          charset = INBYTE4 | CHARSET_DBCS;
     361          if (INBYTE3 == '(')
     362              designation = 0;
     363          else if (INBYTE3 == ')')
     364              designation = 1;
     365          else
     366              return 4;
     367          break;
     368      case 6: /* designation with prefix */
     369          if (CONFIG_ISSET(USE_JISX0208_EXT) &&
     370              (*inbuf)[3] == ESC && (*inbuf)[4] == '$' &&
     371              (*inbuf)[5] == 'B') {
     372              charset = 'B' | CHARSET_DBCS;
     373              designation = 0;
     374          }
     375          else
     376              return 6;
     377          break;
     378      default:
     379          return esclen;
     380      }
     381  
     382      /* raise error when the charset is not designated for this encoding */
     383      if (charset != CHARSET_ASCII) {
     384          const struct iso2022_designation *dsg;
     385  
     386          for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
     387              if (dsg->mark == charset)
     388                  break;
     389          }
     390          if (!dsg->mark)
     391              return esclen;
     392      }
     393  
     394      STATE_SETG(designation, charset);
     395      *inleft -= esclen;
     396      (*inbuf) += esclen;
     397      return 0;
     398  }
     399  
     400  #define ISO8859_7_DECODE(c, writer)                                \
     401      if ((c) < 0xa0) {                                              \
     402          OUTCHAR(c);                                                \
     403      } else if ((c) < 0xc0 && (0x288f3bc9L & (1L << ((c)-0xa0)))) { \
     404          OUTCHAR(c);                                                \
     405      } else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 ||       \
     406               (0xbffffd77L & (1L << ((c)-0xb4))))) {                \
     407          OUTCHAR(0x02d0 + (c));                                     \
     408      } else if ((c) == 0xa1) {                                      \
     409          OUTCHAR(0x2018);                                           \
     410      } else if ((c) == 0xa2) {                                      \
     411          OUTCHAR(0x2019);                                           \
     412      } else if ((c) == 0xaf) {                                      \
     413          OUTCHAR(0x2015);                                           \
     414      }
     415  
     416  static Py_ssize_t
     417  iso2022processg2(const MultibyteCodec *codec, MultibyteCodec_State *state,
     418                   const unsigned char **inbuf, Py_ssize_t *inleft,
     419                   _PyUnicodeWriter *writer)
     420  {
     421      /* not written to use encoder, decoder functions because only few
     422       * encodings use G2 designations in CJKCodecs */
     423      if (STATE_G2 == CHARSET_ISO8859_1) {
     424          if (INBYTE3 < 0x80)
     425              OUTCHAR(INBYTE3 + 0x80);
     426          else
     427              return 3;
     428      }
     429      else if (STATE_G2 == CHARSET_ISO8859_7) {
     430          ISO8859_7_DECODE(INBYTE3 ^ 0x80, writer)
     431          else
     432              return 3;
     433      }
     434      else if (STATE_G2 == CHARSET_ASCII) {
     435          if (INBYTE3 & 0x80)
     436              return 3;
     437          else
     438              OUTCHAR(INBYTE3);
     439      }
     440      else
     441          return MBERR_INTERNAL;
     442  
     443      (*inbuf) += 3;
     444      *inleft -= 3;
     445      return 0;
     446  }
     447  
     448  DECODER(iso2022)
     449  {
     450      const struct iso2022_designation *dsgcache = NULL;
     451  
     452      while (inleft > 0) {
     453          unsigned char c = INBYTE1;
     454          Py_ssize_t err;
     455  
     456          if (STATE_GETFLAG(F_ESCTHROUGHOUT)) {
     457              /* ESC throughout mode:
     458               * for non-iso2022 escape sequences */
     459              OUTCHAR(c); /* assume as ISO-8859-1 */
     460              NEXT_IN(1);
     461              if (IS_ESCEND(c)) {
     462                  STATE_CLEARFLAG(F_ESCTHROUGHOUT);
     463              }
     464              continue;
     465          }
     466  
     467          switch (c) {
     468          case ESC:
     469              REQUIRE_INBUF(2);
     470              if (IS_ISO2022ESC(INBYTE2)) {
     471                  err = iso2022processesc(codec, state,
     472                                          inbuf, &inleft);
     473                  if (err != 0)
     474                      return err;
     475              }
     476              else if (CONFIG_ISSET(USE_G2) && INBYTE2 == 'N') {/* SS2 */
     477                  REQUIRE_INBUF(3);
     478                  err = iso2022processg2(codec, state,
     479                                         inbuf, &inleft, writer);
     480                  if (err != 0)
     481                      return err;
     482              }
     483              else {
     484                  OUTCHAR(ESC);
     485                  STATE_SETFLAG(F_ESCTHROUGHOUT);
     486                  NEXT_IN(1);
     487              }
     488              break;
     489          case SI:
     490              if (CONFIG_ISSET(NO_SHIFT))
     491                  goto bypass;
     492              STATE_CLEARFLAG(F_SHIFTED);
     493              NEXT_IN(1);
     494              break;
     495          case SO:
     496              if (CONFIG_ISSET(NO_SHIFT))
     497                  goto bypass;
     498              STATE_SETFLAG(F_SHIFTED);
     499              NEXT_IN(1);
     500              break;
     501          case LF:
     502              STATE_CLEARFLAG(F_SHIFTED);
     503              OUTCHAR(LF);
     504              NEXT_IN(1);
     505              break;
     506          default:
     507              if (c < 0x20) /* C0 */
     508                  goto bypass;
     509              else if (c >= 0x80)
     510                  return 1;
     511              else {
     512                  const struct iso2022_designation *dsg;
     513                  unsigned char charset;
     514                  Py_UCS4 decoded;
     515  
     516                  if (STATE_GETFLAG(F_SHIFTED))
     517                      charset = STATE_G1;
     518                  else
     519                      charset = STATE_G0;
     520  
     521                  if (charset == CHARSET_ASCII) {
     522  bypass:
     523                      OUTCHAR(c);
     524                      NEXT_IN(1);
     525                      break;
     526                  }
     527  
     528                  if (dsgcache != NULL &&
     529                      dsgcache->mark == charset)
     530                          dsg = dsgcache;
     531                  else {
     532                      for (dsg = CONFIG_DESIGNATIONS;
     533                           dsg->mark != charset
     534  #ifdef Py_DEBUG
     535                              && dsg->mark != '\0'
     536  #endif
     537                           ; dsg++)
     538                      {
     539                          /* noop */
     540                      }
     541                      assert(dsg->mark != '\0');
     542                      dsgcache = dsg;
     543                  }
     544  
     545                  REQUIRE_INBUF(dsg->width);
     546                  decoded = dsg->decoder(codec, *inbuf);
     547                  if (decoded == MAP_UNMAPPABLE)
     548                      return dsg->width;
     549  
     550                  if (decoded < 0x10000) {
     551                      OUTCHAR(decoded);
     552                  }
     553                  else if (decoded < 0x30000) {
     554                      OUTCHAR(decoded);
     555                  }
     556                  else { /* JIS X 0213 pairs */
     557                      OUTCHAR2(decoded >> 16, decoded & 0xffff);
     558                  }
     559                  NEXT_IN(dsg->width);
     560              }
     561              break;
     562          }
     563      }
     564      return 0;
     565  }
     566  
     567  /*-*- mapping access functions -*-*/
     568  
     569  static int
     570  ksx1001_init(const MultibyteCodec *codec)
     571  {
     572      cjkcodecs_module_state *st = codec->modstate;
     573      if (IMPORT_MAP(kr, cp949, &st->cp949_encmap, NULL) ||
     574          IMPORT_MAP(kr, ksx1001, NULL, &st->ksx1001_decmap))
     575      {
     576          return -1;
     577      }
     578      return 0;
     579  }
     580  
     581  static Py_UCS4
     582  ksx1001_decoder(const MultibyteCodec *codec, const unsigned char *data)
     583  {
     584      Py_UCS4 u;
     585      if (TRYMAP_DEC_ST(ksx1001, u, data[0], data[1]))
     586          return u;
     587      else
     588          return MAP_UNMAPPABLE;
     589  }
     590  
     591  static DBCHAR
     592  ksx1001_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
     593                  Py_ssize_t *length)
     594  {
     595      DBCHAR coded;
     596      assert(*length == 1);
     597      if (*data < 0x10000) {
     598          if (TRYMAP_ENC_ST(cp949, coded, *data)) {
     599              if (!(coded & 0x8000))
     600                  return coded;
     601          }
     602      }
     603      return MAP_UNMAPPABLE;
     604  }
     605  
     606  static int
     607  jisx0208_init(const MultibyteCodec *codec)
     608  {
     609      cjkcodecs_module_state *st = codec->modstate;
     610      if (IMPORT_MAP(jp, jisxcommon, &st->jisxcommon_encmap, NULL) ||
     611          IMPORT_MAP(jp, jisx0208, NULL, &st->jisx0208_decmap))
     612      {
     613          return -1;
     614      }
     615      return 0;
     616  }
     617  
     618  static Py_UCS4
     619  jisx0208_decoder(const MultibyteCodec *codec, const unsigned char *data)
     620  {
     621      Py_UCS4 u;
     622      if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
     623          return 0xff3c;
     624      else if (TRYMAP_DEC_ST(jisx0208, u, data[0], data[1]))
     625          return u;
     626      else
     627          return MAP_UNMAPPABLE;
     628  }
     629  
     630  static DBCHAR
     631  jisx0208_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
     632                   Py_ssize_t *length)
     633  {
     634      DBCHAR coded;
     635      assert(*length == 1);
     636      if (*data < 0x10000) {
     637          if (*data == 0xff3c) /* F/W REVERSE SOLIDUS */
     638              return 0x2140;
     639          else if (TRYMAP_ENC_ST(jisxcommon, coded, *data)) {
     640              if (!(coded & 0x8000))
     641                  return coded;
     642          }
     643      }
     644      return MAP_UNMAPPABLE;
     645  }
     646  
     647  static int
     648  jisx0212_init(const MultibyteCodec *codec)
     649  {
     650      cjkcodecs_module_state *st = codec->modstate;
     651      if (IMPORT_MAP(jp, jisxcommon, &st->jisxcommon_encmap, NULL) ||
     652          IMPORT_MAP(jp, jisx0212, NULL, &st->jisx0212_decmap))
     653      {
     654          return -1;
     655      }
     656      return 0;
     657  }
     658  
     659  static Py_UCS4
     660  jisx0212_decoder(const MultibyteCodec *codec, const unsigned char *data)
     661  {
     662      Py_UCS4 u;
     663      if (TRYMAP_DEC_ST(jisx0212, u, data[0], data[1]))
     664          return u;
     665      else
     666          return MAP_UNMAPPABLE;
     667  }
     668  
     669  static DBCHAR
     670  jisx0212_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
     671                   Py_ssize_t *length)
     672  {
     673      DBCHAR coded;
     674      assert(*length == 1);
     675      if (*data < 0x10000) {
     676          if (TRYMAP_ENC_ST(jisxcommon, coded, *data)) {
     677              if (coded & 0x8000)
     678                  return coded & 0x7fff;
     679          }
     680      }
     681      return MAP_UNMAPPABLE;
     682  }
     683  
     684  static int
     685  jisx0213_init(const MultibyteCodec *codec)
     686  {
     687      cjkcodecs_module_state *st = codec->modstate;
     688      if (jisx0208_init(codec) ||
     689          IMPORT_MAP(jp, jisx0213_bmp, &st->jisx0213_bmp_encmap, NULL) ||
     690          IMPORT_MAP(jp, jisx0213_1_bmp, NULL, &st->jisx0213_1_bmp_decmap) ||
     691          IMPORT_MAP(jp, jisx0213_2_bmp, NULL, &st->jisx0213_2_bmp_decmap) ||
     692          IMPORT_MAP(jp, jisx0213_emp, &st->jisx0213_emp_encmap, NULL) ||
     693          IMPORT_MAP(jp, jisx0213_1_emp, NULL, &st->jisx0213_1_emp_decmap) ||
     694          IMPORT_MAP(jp, jisx0213_2_emp, NULL, &st->jisx0213_2_emp_decmap) ||
     695          IMPORT_MAP(jp, jisx0213_pair,
     696                     &jisx0213_pair_encmap, &jisx0213_pair_decmap))
     697      {
     698          return -1;
     699      }
     700      return 0;
     701  }
     702  
/* The EMULATE_* macros take a `config` argument. Until the matching #undef
 * below, the name expands to the constant the two :2000 decoders pass. */
#define config ((void *)2000)
/* Decode the two bytes at `data` as JIS X 0213:2000 plane 1. Returns the
 * code point (0x20000 is OR-ed in for hits in the jisx0213_1_emp map), a
 * pair-map value, or MAP_UNMAPPABLE. The decoder loop in this file writes a
 * result of 0x30000 or more as two characters (high and low 16 bits), which
 * is presumably what the pair map holds -- TODO confirm. */
static Py_UCS4
jisx0213_2000_1_decoder(const MultibyteCodec *codec, const unsigned char *data)
{
    Py_UCS4 u;
    /* The macro from "emu_jisx0213_2000.h" opens the if / else-if chain
     * continued below; presumably it handles characters that the 2000
     * edition lacks -- TODO confirm. */
    EMULATE_JISX0213_2000_DECODE_PLANE1(config, u, data[0], data[1])
    else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
        return 0xff3c;
    else if (TRYMAP_DEC_ST(jisx0208, u, data[0], data[1]))
        ;
    else if (TRYMAP_DEC_ST(jisx0213_1_bmp, u, data[0], data[1]))
        ;
    else if (TRYMAP_DEC_ST(jisx0213_1_emp, u, data[0], data[1]))
        u |= 0x20000;
    else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]))
        ;
    else
        return MAP_UNMAPPABLE;
    return u;
}
     723  
/* Decode the two bytes at `data` as JIS X 0213:2000 plane 2. Returns the
 * code point (0x20000 is OR-ed in for hits in the jisx0213_2_emp map), or
 * MAP_UNMAPPABLE. */
static Py_UCS4
jisx0213_2000_2_decoder(const MultibyteCodec *codec, const unsigned char *data)
{
    Py_UCS4 u;
    /* NOTE(review): unlike the plane-1 decoder, the emulation macro is
     * followed by a plain `if`, not `else if`. If the macro assigns an
     * emulated value to `u`, the lookup below runs anyway and overwrites it
     * or returns MAP_UNMAPPABLE. Check "emu_jisx0213_2000.h" to decide
     * whether an `else` is missing here. */
    EMULATE_JISX0213_2000_DECODE_PLANE2_CHAR(config, u, data[0], data[1])
    if (TRYMAP_DEC_ST(jisx0213_2_bmp, u, data[0], data[1]))
        ;
    else if (TRYMAP_DEC_ST(jisx0213_2_emp, u, data[0], data[1]))
        u |= 0x20000;
    else
        return MAP_UNMAPPABLE;
    return u;
}
/* From here on `config` is an ordinary identifier again. */
#undef config
     738  
     739  static Py_UCS4
     740  jisx0213_2004_1_decoder(const MultibyteCodec *codec, const unsigned char *data)
     741  {
     742      Py_UCS4 u;
     743      if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
     744          return 0xff3c;
     745      else if (TRYMAP_DEC_ST(jisx0208, u, data[0], data[1]))
     746          ;
     747      else if (TRYMAP_DEC_ST(jisx0213_1_bmp, u, data[0], data[1]))
     748          ;
     749      else if (TRYMAP_DEC_ST(jisx0213_1_emp, u, data[0], data[1]))
     750          u |= 0x20000;
     751      else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]))
     752          ;
     753      else
     754          return MAP_UNMAPPABLE;
     755      return u;
     756  }
     757  
     758  static Py_UCS4
     759  jisx0213_2004_2_decoder(const MultibyteCodec *codec, const unsigned char *data)
     760  {
     761      Py_UCS4 u;
     762      if (TRYMAP_DEC_ST(jisx0213_2_bmp, u, data[0], data[1]))
     763          ;
     764      else if (TRYMAP_DEC_ST(jisx0213_2_emp, u, data[0], data[1]))
     765          u |= 0x20000;
     766      else
     767          return MAP_UNMAPPABLE;
     768      return u;
     769  }
     770  
     771  static DBCHAR
     772  jisx0213_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
     773                   Py_ssize_t *length, const void *config)
     774  {
     775      DBCHAR coded;
     776  
     777      switch (*length) {
     778      case 1: /* first character */
     779          if (*data >= 0x10000) {
     780              if ((*data) >> 16 == 0x20000 >> 16) {
     781                  EMULATE_JISX0213_2000_ENCODE_EMP(config, coded, *data)
     782                  else if (TRYMAP_ENC_ST(jisx0213_emp, coded, (*data) & 0xffff))
     783                      return coded;
     784              }
     785              return MAP_UNMAPPABLE;
     786          }
     787  
     788          EMULATE_JISX0213_2000_ENCODE_BMP(config, coded, *data)
     789          else if (TRYMAP_ENC_ST(jisx0213_bmp, coded, *data)) {
     790              if (coded == MULTIC)
     791                  return MAP_MULTIPLE_AVAIL;
     792          }
     793          else if (TRYMAP_ENC_ST(jisxcommon, coded, *data)) {
     794              if (coded & 0x8000)
     795                  return MAP_UNMAPPABLE;
     796          }
     797          else
     798              return MAP_UNMAPPABLE;
     799          return coded;
     800  
     801      case 2: /* second character of unicode pair */
     802          coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
     803                                  jisx0213_pair_encmap, JISX0213_ENCPAIRS);
     804          if (coded != DBCINV)
     805              return coded;
     806          /* fall through */
     807  
     808      case -1: /* flush unterminated */
     809          *length = 1;
     810          coded = find_pairencmap((ucs2_t)data[0], 0,
     811                                  jisx0213_pair_encmap, JISX0213_ENCPAIRS);
     812          if (coded == DBCINV)
     813              return MAP_UNMAPPABLE;
     814          else
     815              return coded;
     816          break;
     817  
     818      default:
     819          return MAP_UNMAPPABLE;
     820      }
     821  }
     822  
     823  static DBCHAR
     824  jisx0213_2000_1_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
     825                          Py_ssize_t *length)
     826  {
     827      DBCHAR coded = jisx0213_encoder(codec, data, length, (void *)2000);
     828      if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
     829          return coded;
     830      else if (coded & 0x8000)
     831          return MAP_UNMAPPABLE;
     832      else
     833          return coded;
     834  }
     835  
     836  static DBCHAR
     837  jisx0213_2000_1_encoder_paironly(const MultibyteCodec *codec,
     838                                   const Py_UCS4 *data, Py_ssize_t *length)
     839  {
     840      DBCHAR coded;
     841      Py_ssize_t ilength = *length;
     842  
     843      coded = jisx0213_encoder(codec, data, length, (void *)2000);
     844      switch (ilength) {
     845      case 1:
     846          if (coded == MAP_MULTIPLE_AVAIL)
     847              return MAP_MULTIPLE_AVAIL;
     848          else
     849              return MAP_UNMAPPABLE;
     850      case 2:
     851          if (*length != 2)
     852              return MAP_UNMAPPABLE;
     853          else
     854              return coded;
     855      default:
     856          return MAP_UNMAPPABLE;
     857      }
     858  }
     859  
     860  static DBCHAR
     861  jisx0213_2000_2_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
     862                          Py_ssize_t *length)
     863  {
     864      DBCHAR coded = jisx0213_encoder(codec, data, length, (void *)2000);
     865      if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
     866          return coded;
     867      else if (coded & 0x8000)
     868          return coded & 0x7fff;
     869      else
     870          return MAP_UNMAPPABLE;
     871  }
     872  
     873  static DBCHAR
     874  jisx0213_2004_1_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
     875                          Py_ssize_t *length)
     876  {
     877      DBCHAR coded = jisx0213_encoder(codec, data, length, NULL);
     878      if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
     879          return coded;
     880      else if (coded & 0x8000)
     881          return MAP_UNMAPPABLE;
     882      else
     883          return coded;
     884  }
     885  
     886  static DBCHAR
     887  jisx0213_2004_1_encoder_paironly(const MultibyteCodec *codec,
     888                                   const Py_UCS4 *data, Py_ssize_t *length)
     889  {
     890      DBCHAR coded;
     891      Py_ssize_t ilength = *length;
     892  
     893      coded = jisx0213_encoder(codec, data, length, NULL);
     894      switch (ilength) {
     895      case 1:
     896          if (coded == MAP_MULTIPLE_AVAIL)
     897              return MAP_MULTIPLE_AVAIL;
     898          else
     899              return MAP_UNMAPPABLE;
     900      case 2:
     901          if (*length != 2)
     902              return MAP_UNMAPPABLE;
     903          else
     904              return coded;
     905      default:
     906          return MAP_UNMAPPABLE;
     907      }
     908  }
     909  
     910  static DBCHAR
     911  jisx0213_2004_2_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
     912                          Py_ssize_t *length)
     913  {
     914      DBCHAR coded = jisx0213_encoder(codec, data, length, NULL);
     915      if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
     916          return coded;
     917      else if (coded & 0x8000)
     918          return coded & 0x7fff;
     919      else
     920          return MAP_UNMAPPABLE;
     921  }
     922  
     923  static Py_UCS4
     924  jisx0201_r_decoder(const MultibyteCodec *codec, const unsigned char *data)
     925  {
     926      Py_UCS4 u;
     927      JISX0201_R_DECODE_CHAR(*data, u)
     928      else
     929          return MAP_UNMAPPABLE;
     930      return u;
     931  }
     932  
     933  static DBCHAR
     934  jisx0201_r_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
     935                     Py_ssize_t *length)
     936  {
     937      DBCHAR coded;
     938      JISX0201_R_ENCODE(*data, coded)
     939      else
     940          return MAP_UNMAPPABLE;
     941      return coded;
     942  }
     943  
     944  static Py_UCS4
     945  jisx0201_k_decoder(const MultibyteCodec *codec, const unsigned char *data)
     946  {
     947      Py_UCS4 u;
     948      JISX0201_K_DECODE_CHAR(*data ^ 0x80, u)
     949      else
     950          return MAP_UNMAPPABLE;
     951      return u;
     952  }
     953  
     954  static DBCHAR
     955  jisx0201_k_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
     956                     Py_ssize_t *length)
     957  {
     958      DBCHAR coded;
     959      JISX0201_K_ENCODE(*data, coded)
     960      else
     961          return MAP_UNMAPPABLE;
     962      return coded - 0x80;
     963  }
     964  
     965  static int
     966  gb2312_init(const MultibyteCodec *codec)
     967  {
     968      cjkcodecs_module_state *st = codec->modstate;
     969      if (IMPORT_MAP(cn, gbcommon, &st->gbcommon_encmap, NULL) ||
     970          IMPORT_MAP(cn, gb2312, NULL, &st->gb2312_decmap))
     971      {
     972          return -1;
     973      }
     974      return 0;
     975  }
     976  
     977  static Py_UCS4
     978  gb2312_decoder(const MultibyteCodec *codec, const unsigned char *data)
     979  {
     980      Py_UCS4 u;
     981      if (TRYMAP_DEC_ST(gb2312, u, data[0], data[1]))
     982          return u;
     983      else
     984          return MAP_UNMAPPABLE;
     985  }
     986  
     987  static DBCHAR
     988  gb2312_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
     989                 Py_ssize_t *length)
     990  {
     991      DBCHAR coded;
     992      assert(*length == 1);
     993      if (*data < 0x10000) {
     994          if (TRYMAP_ENC_ST(gbcommon, coded, *data)) {
     995              if (!(coded & 0x8000))
     996                  return coded;
     997          }
     998      }
     999      return MAP_UNMAPPABLE;
    1000  }
    1001  
    1002  
    1003  static Py_UCS4
    1004  dummy_decoder(const MultibyteCodec *codec, const unsigned char *data)
    1005  {
    1006      return MAP_UNMAPPABLE;
    1007  }
    1008  
    1009  static DBCHAR
    1010  dummy_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
    1011                Py_ssize_t *length)
    1012  {
    1013      return MAP_UNMAPPABLE;
    1014  }
    1015  
    1016  /*-*- registry tables -*-*/
    1017  
    1018  #define REGISTRY_KSX1001_G0     { CHARSET_KSX1001, 0, 2,                \
    1019                    ksx1001_init,                                         \
    1020                    ksx1001_decoder, ksx1001_encoder }
    1021  #define REGISTRY_KSX1001_G1     { CHARSET_KSX1001, 1, 2,                \
    1022                    ksx1001_init,                                         \
    1023                    ksx1001_decoder, ksx1001_encoder }
    1024  #define REGISTRY_JISX0201_R     { CHARSET_JISX0201_R, 0, 1,             \
    1025                    NULL,                                                 \
    1026                    jisx0201_r_decoder, jisx0201_r_encoder }
    1027  #define REGISTRY_JISX0201_K     { CHARSET_JISX0201_K, 0, 1,             \
    1028                    NULL,                                                 \
    1029                    jisx0201_k_decoder, jisx0201_k_encoder }
    1030  #define REGISTRY_JISX0208       { CHARSET_JISX0208, 0, 2,               \
    1031                    jisx0208_init,                                        \
    1032                    jisx0208_decoder, jisx0208_encoder }
    1033  #define REGISTRY_JISX0208_O     { CHARSET_JISX0208_O, 0, 2,             \
    1034                    jisx0208_init,                                        \
    1035                    jisx0208_decoder, jisx0208_encoder }
    1036  #define REGISTRY_JISX0212       { CHARSET_JISX0212, 0, 2,               \
    1037                    jisx0212_init,                                        \
    1038                    jisx0212_decoder, jisx0212_encoder }
    1039  #define REGISTRY_JISX0213_2000_1 { CHARSET_JISX0213_2000_1, 0, 2,       \
    1040                    jisx0213_init,                                        \
    1041                    jisx0213_2000_1_decoder,                              \
    1042                    jisx0213_2000_1_encoder }
    1043  #define REGISTRY_JISX0213_2000_1_PAIRONLY { CHARSET_JISX0213_2000_1, 0, 2, \
    1044                    jisx0213_init,                                        \
    1045                    jisx0213_2000_1_decoder,                              \
    1046                    jisx0213_2000_1_encoder_paironly }
    1047  #define REGISTRY_JISX0213_2000_2 { CHARSET_JISX0213_2, 0, 2,            \
    1048                    jisx0213_init,                                        \
    1049                    jisx0213_2000_2_decoder,                              \
    1050                    jisx0213_2000_2_encoder }
    1051  #define REGISTRY_JISX0213_2004_1 { CHARSET_JISX0213_2004_1, 0, 2,       \
    1052                    jisx0213_init,                                        \
    1053                    jisx0213_2004_1_decoder,                              \
    1054                    jisx0213_2004_1_encoder }
    1055  #define REGISTRY_JISX0213_2004_1_PAIRONLY { CHARSET_JISX0213_2004_1, 0, 2, \
    1056                    jisx0213_init,                                        \
    1057                    jisx0213_2004_1_decoder,                              \
    1058                    jisx0213_2004_1_encoder_paironly }
    1059  #define REGISTRY_JISX0213_2004_2 { CHARSET_JISX0213_2, 0, 2,            \
    1060                    jisx0213_init,                                        \
    1061                    jisx0213_2004_2_decoder,                              \
    1062                    jisx0213_2004_2_encoder }
    1063  #define REGISTRY_GB2312         { CHARSET_GB2312, 0, 2,                 \
    1064                    gb2312_init,                                          \
    1065                    gb2312_decoder, gb2312_encoder }
    1066  #define REGISTRY_CNS11643_1     { CHARSET_CNS11643_1, 1, 2,             \
    1067                    cns11643_init,                                        \
    1068                    cns11643_1_decoder, cns11643_1_encoder }
    1069  #define REGISTRY_CNS11643_2     { CHARSET_CNS11643_2, 2, 2,             \
    1070                    cns11643_init,                                        \
    1071                    cns11643_2_decoder, cns11643_2_encoder }
    1072  #define REGISTRY_ISO8859_1      { CHARSET_ISO8859_1, 2, 1,              \
    1073                    NULL, dummy_decoder, dummy_encoder }
    1074  #define REGISTRY_ISO8859_7      { CHARSET_ISO8859_7, 2, 1,              \
    1075                    NULL, dummy_decoder, dummy_encoder }
    1076  #define REGISTRY_SENTINEL       { 0, }
    1077  #define CONFIGDEF(var, attrs)                                           \
    1078      static const struct iso2022_config iso2022_##var##_config = {       \
    1079          attrs, iso2022_##var##_designations                             \
    1080      };
    1081  
    1082  static const struct iso2022_designation iso2022_kr_designations[] = {
    1083      REGISTRY_KSX1001_G1, REGISTRY_SENTINEL
    1084  };
    1085  CONFIGDEF(kr, 0)
    1086  
    1087  static const struct iso2022_designation iso2022_jp_designations[] = {
    1088      REGISTRY_JISX0208, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
    1089      REGISTRY_SENTINEL
    1090  };
    1091  CONFIGDEF(jp, NO_SHIFT | USE_JISX0208_EXT)
    1092  
    1093  static const struct iso2022_designation iso2022_jp_1_designations[] = {
    1094      REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
    1095      REGISTRY_JISX0208_O, REGISTRY_SENTINEL
    1096  };
    1097  CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT)
    1098  
    1099  static const struct iso2022_designation iso2022_jp_2_designations[] = {
    1100      REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0,
    1101      REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
    1102      REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL
    1103  };
    1104  CONFIGDEF(jp_2, NO_SHIFT | USE_G2 | USE_JISX0208_EXT)
    1105  
    1106  static const struct iso2022_designation iso2022_jp_2004_designations[] = {
    1107      REGISTRY_JISX0213_2004_1_PAIRONLY, REGISTRY_JISX0208,
    1108      REGISTRY_JISX0213_2004_1, REGISTRY_JISX0213_2004_2, REGISTRY_SENTINEL
    1109  };
    1110  CONFIGDEF(jp_2004, NO_SHIFT | USE_JISX0208_EXT)
    1111  
    1112  static const struct iso2022_designation iso2022_jp_3_designations[] = {
    1113      REGISTRY_JISX0213_2000_1_PAIRONLY, REGISTRY_JISX0208,
    1114      REGISTRY_JISX0213_2000_1, REGISTRY_JISX0213_2000_2, REGISTRY_SENTINEL
    1115  };
    1116  CONFIGDEF(jp_3, NO_SHIFT | USE_JISX0208_EXT)
    1117  
    1118  static const struct iso2022_designation iso2022_jp_ext_designations[] = {
    1119      REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
    1120      REGISTRY_JISX0201_K, REGISTRY_JISX0208_O, REGISTRY_SENTINEL
    1121  };
    1122  CONFIGDEF(jp_ext, NO_SHIFT | USE_JISX0208_EXT)
    1123  
    1124  
    1125  BEGIN_MAPPINGS_LIST(0)
    1126    /* no mapping table here */
    1127  END_MAPPINGS_LIST
    1128  
    1129  #define ISO2022_CODEC(variation)                \
    1130  NEXT_CODEC = (MultibyteCodec){                  \
    1131      "iso2022_" #variation,                      \
    1132      &iso2022_##variation##_config,              \
    1133      iso2022_codec_init,                         \
    1134      _STATEFUL_METHODS(iso2022)                  \
    1135  };
    1136  
    1137  BEGIN_CODECS_LIST(7)
    1138    ISO2022_CODEC(kr)
    1139    ISO2022_CODEC(jp)
    1140    ISO2022_CODEC(jp_1)
    1141    ISO2022_CODEC(jp_2)
    1142    ISO2022_CODEC(jp_2004)
    1143    ISO2022_CODEC(jp_3)
    1144    ISO2022_CODEC(jp_ext)
    1145  END_CODECS_LIST
    1146  
    1147  I_AM_A_MODULE_FOR(iso2022)