(root)/
Python-3.12.0/
Modules/
cjkcodecs/
_codecs_cn.c
       1  /*
       2   * _codecs_cn.c: Codecs collection for Mainland Chinese encodings
       3   *
       4   * Written by Hye-Shik Chang <perky@FreeBSD.org>
       5   */
       6  
       7  #include "cjkcodecs.h"
       8  #include "mappings_cn.h"
       9  
/**
 * hz is predefined as 100 on AIX, so undefine it there to avoid a
 * conflict with the hz codec's identifiers below.
 */
#ifdef _AIX
#undef hz
#endif
      17  
/* GBK and GB2312 map a few code points differently; they are listed below:
 *
 *              gb2312                          gbk
 * A1A4         U+30FB KATAKANA MIDDLE DOT      U+00B7 MIDDLE DOT
 * A1AA         U+2015 HORIZONTAL BAR           U+2014 EM DASH
 * A844         undefined                       U+2015 HORIZONTAL BAR
 */
      25  
      26  #define GBK_DECODE(dc1, dc2, writer)                                \
      27      if ((dc1) == 0xa1 && (dc2) == 0xaa) {                           \
      28          OUTCHAR(0x2014);                                            \
      29      }                                                               \
      30      else if ((dc1) == 0xa8 && (dc2) == 0x44) {                      \
      31          OUTCHAR(0x2015);                                            \
      32      }                                                               \
      33      else if ((dc1) == 0xa1 && (dc2) == 0xa4) {                      \
      34          OUTCHAR(0x00b7);                                            \
      35      }                                                               \
      36      else if (TRYMAP_DEC(gb2312, decoded, dc1 ^ 0x80, dc2 ^ 0x80)) { \
      37          OUTCHAR(decoded);                                           \
      38      }                                                               \
      39      else if (TRYMAP_DEC(gbkext, decoded, dc1, dc2)) {               \
      40          OUTCHAR(decoded);                                           \
      41      }
      42  
/*
 * GBK_ENCODE(code, assi): look up the GBK double-byte value for the code
 * point `code` and store it in `assi`.
 *
 * U+2014, U+2015 and U+00B7 are assigned their GBK-specific values directly
 * (see the GBK/GB2312 difference table above).  U+30FB is deliberately kept
 * away from the gbcommon lookup, so it falls through as "not encodable".
 * NOTE(review): presumably gbcommon carries the GB2312 assignment for
 * U+30FB (A1A4), which GBK gives to U+00B7 instead -- confirm against the
 * mapping table.
 *
 * The expansion is an open if / else-if chain: the call site must continue
 * it with its own `else` branch, which is taken when no mapping was found.
 */
#define GBK_ENCODE(code, assi)                                         \
    if ((code) == 0x2014) {                                            \
        (assi) = 0xa1aa;                                               \
    } else if ((code) == 0x2015) {                                     \
        (assi) = 0xa844;                                               \
    } else if ((code) == 0x00b7) {                                     \
        (assi) = 0xa1a4;                                               \
    } else if ((code) != 0x30fb && TRYMAP_ENC(gbcommon, assi, code)) { \
        ;                                                              \
    }
      53  
/*
 * Codecs in this file use the first byte of MultibyteCodec_State.c[8]
 * to store a 0 or 1 state value.  In the hz codec below, 0 means ASCII
 * mode and 1 means GB mode (between "~{" and "~}").
 */
#define CN_STATE_OFFSET 0
      59  
      60  /*
      61   * GB2312 codec
      62   */
      63  
      64  ENCODER(gb2312)
      65  {
      66      while (*inpos < inlen) {
      67          Py_UCS4 c = INCHAR1;
      68          DBCHAR code;
      69  
      70          if (c < 0x80) {
      71              WRITEBYTE1((unsigned char)c);
      72              NEXT(1, 1);
      73              continue;
      74          }
      75  
      76          if (c > 0xFFFF)
      77              return 1;
      78  
      79          REQUIRE_OUTBUF(2);
      80          if (TRYMAP_ENC(gbcommon, code, c))
      81              ;
      82          else
      83              return 1;
      84  
      85          if (code & 0x8000) /* MSB set: GBK */
      86              return 1;
      87  
      88          OUTBYTE1((code >> 8) | 0x80);
      89          OUTBYTE2((code & 0xFF) | 0x80);
      90          NEXT(1, 2);
      91      }
      92  
      93      return 0;
      94  }
      95  
      96  DECODER(gb2312)
      97  {
      98      while (inleft > 0) {
      99          unsigned char c = **inbuf;
     100          Py_UCS4 decoded;
     101  
     102          if (c < 0x80) {
     103              OUTCHAR(c);
     104              NEXT_IN(1);
     105              continue;
     106          }
     107  
     108          REQUIRE_INBUF(2);
     109          if (TRYMAP_DEC(gb2312, decoded, c ^ 0x80, INBYTE2 ^ 0x80)) {
     110              OUTCHAR(decoded);
     111              NEXT_IN(2);
     112          }
     113          else
     114              return 1;
     115      }
     116  
     117      return 0;
     118  }
     119  
     120  
     121  /*
     122   * GBK codec
     123   */
     124  
     125  ENCODER(gbk)
     126  {
     127      while (*inpos < inlen) {
     128          Py_UCS4 c = INCHAR1;
     129          DBCHAR code;
     130  
     131          if (c < 0x80) {
     132              WRITEBYTE1((unsigned char)c);
     133              NEXT(1, 1);
     134              continue;
     135          }
     136  
     137          if (c > 0xFFFF)
     138              return 1;
     139  
     140          REQUIRE_OUTBUF(2);
     141  
     142          GBK_ENCODE(c, code)
     143          else
     144              return 1;
     145  
     146          OUTBYTE1((code >> 8) | 0x80);
     147          if (code & 0x8000)
     148              OUTBYTE2((code & 0xFF)); /* MSB set: GBK */
     149          else
     150              OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: GB2312 */
     151          NEXT(1, 2);
     152      }
     153  
     154      return 0;
     155  }
     156  
     157  DECODER(gbk)
     158  {
     159      while (inleft > 0) {
     160          unsigned char c = INBYTE1;
     161          Py_UCS4 decoded;
     162  
     163          if (c < 0x80) {
     164              OUTCHAR(c);
     165              NEXT_IN(1);
     166              continue;
     167          }
     168  
     169          REQUIRE_INBUF(2);
     170  
     171          GBK_DECODE(c, INBYTE2, writer)
     172          else
     173              return 1;
     174  
     175          NEXT_IN(2);
     176      }
     177  
     178      return 0;
     179  }
     180  
     181  
     182  /*
     183   * GB18030 codec
     184   */
     185  
     186  ENCODER(gb18030)
     187  {
     188      while (*inpos < inlen) {
     189          Py_UCS4 c = INCHAR1;
     190          DBCHAR code;
     191  
     192          if (c < 0x80) {
     193              WRITEBYTE1(c);
     194              NEXT(1, 1);
     195              continue;
     196          }
     197  
     198          if (c >= 0x10000) {
     199              Py_UCS4 tc = c - 0x10000;
     200              assert (c <= 0x10FFFF);
     201  
     202              REQUIRE_OUTBUF(4);
     203  
     204              OUTBYTE4((unsigned char)(tc % 10) + 0x30);
     205              tc /= 10;
     206              OUTBYTE3((unsigned char)(tc % 126) + 0x81);
     207              tc /= 126;
     208              OUTBYTE2((unsigned char)(tc % 10) + 0x30);
     209              tc /= 10;
     210              OUTBYTE1((unsigned char)(tc + 0x90));
     211  
     212              NEXT(1, 4);
     213              continue;
     214          }
     215  
     216          REQUIRE_OUTBUF(2);
     217  
     218          GBK_ENCODE(c, code)
     219          else if (TRYMAP_ENC(gb18030ext, code, c))
     220              ;
     221          else {
     222              const struct _gb18030_to_unibmp_ranges *utrrange;
     223  
     224              REQUIRE_OUTBUF(4);
     225  
     226              for (utrrange = gb18030_to_unibmp_ranges;
     227                   utrrange->first != 0;
     228                   utrrange++)
     229                  if (utrrange->first <= c &&
     230                      c <= utrrange->last) {
     231                      Py_UCS4 tc;
     232  
     233                      tc = c - utrrange->first +
     234                           utrrange->base;
     235  
     236                      OUTBYTE4((unsigned char)(tc % 10) + 0x30);
     237                      tc /= 10;
     238                      OUTBYTE3((unsigned char)(tc % 126) + 0x81);
     239                      tc /= 126;
     240                      OUTBYTE2((unsigned char)(tc % 10) + 0x30);
     241                      tc /= 10;
     242                      OUTBYTE1((unsigned char)tc + 0x81);
     243  
     244                      NEXT(1, 4);
     245                      break;
     246                  }
     247  
     248              if (utrrange->first == 0)
     249                  return 1;
     250              continue;
     251          }
     252  
     253          OUTBYTE1((code >> 8) | 0x80);
     254          if (code & 0x8000)
     255              OUTBYTE2((code & 0xFF)); /* MSB set: GBK or GB18030ext */
     256          else
     257              OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: GB2312 */
     258  
     259          NEXT(1, 2);
     260      }
     261  
     262      return 0;
     263  }
     264  
     265  DECODER(gb18030)
     266  {
     267      while (inleft > 0) {
     268          unsigned char c = INBYTE1, c2;
     269          Py_UCS4 decoded;
     270  
     271          if (c < 0x80) {
     272              OUTCHAR(c);
     273              NEXT_IN(1);
     274              continue;
     275          }
     276  
     277          REQUIRE_INBUF(2);
     278  
     279          c2 = INBYTE2;
     280          if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
     281              const struct _gb18030_to_unibmp_ranges *utr;
     282              unsigned char c3, c4;
     283              Py_UCS4 lseq;
     284  
     285              REQUIRE_INBUF(4);
     286              c3 = INBYTE3;
     287              c4 = INBYTE4;
     288              if (c  < 0x81 || c  > 0xFE ||
     289                  c3 < 0x81 || c3 > 0xFE ||
     290                  c4 < 0x30 || c4 > 0x39)
     291                  return 1;
     292              c -= 0x81;  c2 -= 0x30;
     293              c3 -= 0x81; c4 -= 0x30;
     294  
     295              if (c < 4) { /* U+0080 - U+FFFF */
     296                  lseq = ((Py_UCS4)c * 10 + c2) * 1260 +
     297                      (Py_UCS4)c3 * 10 + c4;
     298                  if (lseq < 39420) {
     299                      for (utr = gb18030_to_unibmp_ranges;
     300                           lseq >= (utr + 1)->base;
     301                           utr++) ;
     302                      OUTCHAR(utr->first - utr->base + lseq);
     303                      NEXT_IN(4);
     304                      continue;
     305                  }
     306              }
     307              else if (c >= 15) { /* U+10000 - U+10FFFF */
     308                  lseq = 0x10000 + (((Py_UCS4)c-15) * 10 + c2)
     309                      * 1260 + (Py_UCS4)c3 * 10 + c4;
     310                  if (lseq <= 0x10FFFF) {
     311                      OUTCHAR(lseq);
     312                      NEXT_IN(4);
     313                      continue;
     314                  }
     315              }
     316              return 1;
     317          }
     318  
     319          GBK_DECODE(c, c2, writer)
     320          else if (TRYMAP_DEC(gb18030ext, decoded, c, c2))
     321              OUTCHAR(decoded);
     322          else
     323              return 1;
     324  
     325          NEXT_IN(2);
     326      }
     327  
     328      return 0;
     329  }
     330  
     331  
     332  /*
     333   * HZ codec
     334   */
     335  
/*
 * Initialise the hz encoder state: the state byte starts at 0, which the
 * encoder below treats as "not inside a ~{ ... ~} section".
 */
ENCODER_INIT(hz)
{
    state->c[CN_STATE_OFFSET] = 0;
    return 0;
}
     341  
     342  ENCODER_RESET(hz)
     343  {
     344      if (state->c[CN_STATE_OFFSET] != 0) {
     345          WRITEBYTE2('~', '}');
     346          state->c[CN_STATE_OFFSET] = 0;
     347          NEXT_OUT(2);
     348      }
     349      return 0;
     350  }
     351  
     352  ENCODER(hz)
     353  {
     354      while (*inpos < inlen) {
     355          Py_UCS4 c = INCHAR1;
     356          DBCHAR code;
     357  
     358          if (c < 0x80) {
     359              if (state->c[CN_STATE_OFFSET]) {
     360                  WRITEBYTE2('~', '}');
     361                  NEXT_OUT(2);
     362                  state->c[CN_STATE_OFFSET] = 0;
     363              }
     364              WRITEBYTE1((unsigned char)c);
     365              NEXT(1, 1);
     366              if (c == '~') {
     367                  WRITEBYTE1('~');
     368                  NEXT_OUT(1);
     369              }
     370              continue;
     371          }
     372  
     373          if (c > 0xFFFF)
     374              return 1;
     375  
     376          if (TRYMAP_ENC(gbcommon, code, c))
     377              ;
     378          else
     379              return 1;
     380  
     381          if (code & 0x8000) /* MSB set: GBK */
     382              return 1;
     383  
     384          if (state->c[CN_STATE_OFFSET] == 0) {
     385              WRITEBYTE4('~', '{', code >> 8, code & 0xff);
     386              NEXT(1, 4);
     387              state->c[CN_STATE_OFFSET] = 1;
     388          }
     389          else {
     390              WRITEBYTE2(code >> 8, code & 0xff);
     391              NEXT(1, 2);
     392          }
     393      }
     394  
     395      return 0;
     396  }
     397  
/*
 * Initialise the hz decoder state: the state byte starts at 0, which the
 * decoder below treats as ASCII mode.
 */
DECODER_INIT(hz)
{
    state->c[CN_STATE_OFFSET] = 0;
    return 0;
}
     403  
/*
 * Reset the hz decoder: put the state byte back to 0 (ASCII mode),
 * discarding any open GB section.
 */
DECODER_RESET(hz)
{
    state->c[CN_STATE_OFFSET] = 0;
    return 0;
}
     409  
     410  DECODER(hz)
     411  {
     412      while (inleft > 0) {
     413          unsigned char c = INBYTE1;
     414          Py_UCS4 decoded;
     415  
     416          if (c == '~') {
     417              unsigned char c2 = INBYTE2;
     418  
     419              REQUIRE_INBUF(2);
     420              if (c2 == '~' && state->c[CN_STATE_OFFSET] == 0)
     421                  OUTCHAR('~');
     422              else if (c2 == '{' && state->c[CN_STATE_OFFSET] == 0)
     423                  state->c[CN_STATE_OFFSET] = 1; /* set GB */
     424              else if (c2 == '\n' && state->c[CN_STATE_OFFSET] == 0)
     425                  ; /* line-continuation */
     426              else if (c2 == '}' && state->c[CN_STATE_OFFSET] == 1)
     427                  state->c[CN_STATE_OFFSET] = 0; /* set ASCII */
     428              else
     429                  return 1;
     430              NEXT_IN(2);
     431              continue;
     432          }
     433  
     434          if (c & 0x80)
     435              return 1;
     436  
     437          if (state->c[CN_STATE_OFFSET] == 0) { /* ASCII mode */
     438              OUTCHAR(c);
     439              NEXT_IN(1);
     440          }
     441          else { /* GB mode */
     442              REQUIRE_INBUF(2);
     443              if (TRYMAP_DEC(gb2312, decoded, c, INBYTE2)) {
     444                  OUTCHAR(decoded);
     445                  NEXT_IN(2);
     446              }
     447              else
     448                  return 1;
     449          }
     450      }
     451  
     452      return 0;
     453  }
     454  
     455  
/*
 * Mapping tables used by the codecs above (four entries).  The direction of
 * each entry matches how the table is used in this file: gb2312 and gbkext
 * are only consulted when decoding, gbcommon only when encoding, and
 * gb18030ext in both directions.
 */
BEGIN_MAPPINGS_LIST(4)
  MAPPING_DECONLY(gb2312)
  MAPPING_DECONLY(gbkext)
  MAPPING_ENCONLY(gbcommon)
  MAPPING_ENCDEC(gb18030ext)
END_MAPPINGS_LIST
     462  
/*
 * Codecs provided by this file (four entries).  gb2312, gbk and gb18030
 * define only an encoder and a decoder; hz additionally defines init and
 * reset functions because it keeps a mode byte in the codec state.
 */
BEGIN_CODECS_LIST(4)
  CODEC_STATELESS(gb2312)
  CODEC_STATELESS(gbk)
  CODEC_STATELESS(gb18030)
  CODEC_STATEFUL(hz)
END_CODECS_LIST
     469  
/* NOTE(review): presumably expands to the module definition/initialisation
 * for the "cn" codec collection -- confirm in cjkcodecs.h. */
I_AM_A_MODULE_FOR(cn)