1 /*
2 * _codecs_iso2022.c: Codecs collection for ISO-2022 encodings.
3 *
4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
5 */
6
/*
 * Build-time switches.  They are defined before the project headers are
 * included further down.
 * NOTE(review): presumably cjkcodecs.h and the JIS X 0213 helper headers
 * are what consume these -- confirm against those headers.
 */
#define USING_IMPORTED_MAPS
#define USING_BINARY_PAIR_SEARCH
#define EXTERN_JISX0213_PAIR

/* Both JIS X 0213:2000 emulation "invalid" results are spelled as this
 * file's MAP_UNMAPPABLE sentinel (defined later in this file). */
#define EMULATE_JISX0213_2000_ENCODE_INVALID    MAP_UNMAPPABLE
#define EMULATE_JISX0213_2000_DECODE_INVALID    MAP_UNMAPPABLE

/*
 * Mapping-table pointers this module needs.  The *_init() functions in
 * this file fill them in through `codec->modstate`.
 */
#define CJK_MOD_SPECIFIC_STATE                          \
    /* Korean tables */                                 \
    const encode_map *cp949_encmap;                     \
    const decode_map *ksx1001_decmap;                   \
                                                        \
    /* Japanese tables */                               \
    const encode_map *jisxcommon_encmap;                \
    const decode_map *jisx0208_decmap;                  \
    const decode_map *jisx0212_decmap;                  \
    const encode_map *jisx0213_bmp_encmap;              \
    const decode_map *jisx0213_1_bmp_decmap;            \
    const decode_map *jisx0213_2_bmp_decmap;            \
    const encode_map *jisx0213_emp_encmap;              \
    const decode_map *jisx0213_1_emp_decmap;            \
    const decode_map *jisx0213_2_emp_decmap;            \
                                                        \
    /* Chinese tables */                                \
    const encode_map *gbcommon_encmap;                  \
    const decode_map *gb2312_decmap;
32
33
34 #include "cjkcodecs.h"
35 #include "alg_jisx0201.h"
36 #include "emu_jisx0213_2000.h"
37 #include "mappings_jisx0213_pair.h"
38
/* STATE

   state->c[0-3]

        00000000
        ||^^^^^|
        |+-----+----  G0-3 Character Set
        +-----------  Is G0-3 double byte?

   state->c[4]

        00000000
              ||
              |+----  Locked-Shift?
              +-----  ESC Throughout
*/
55
/* Control bytes the codec reacts to. */
#define ESC     0x1b
#define SO      0x0e
#define SI      0x0f
#define LF      0x0a

/* Upper bound on how far iso2022processesc() scans for a final byte. */
#define MAX_ESCSEQLEN   16

/* Bit OR-ed into a final byte to tag a double-byte character set;
 * ESCMARK() strips it again to recover the byte used on the wire. */
#define CHARSET_DBCS    0x80
#define ESCMARK(mark)   ((mark) & 0x7f)

/* Single-byte character sets, identified by their final byte. */
#define CHARSET_ISO8859_1       'A'
#define CHARSET_ASCII           'B'
#define CHARSET_ISO8859_7       'F'
#define CHARSET_JISX0201_K      'I'
#define CHARSET_JISX0201_R      'J'

/* Double-byte character sets: final byte plus the CHARSET_DBCS tag. */
#define CHARSET_JISX0208_O      ('@' | CHARSET_DBCS)
#define CHARSET_GB2312          ('A' | CHARSET_DBCS)
#define CHARSET_JISX0208        ('B' | CHARSET_DBCS)
#define CHARSET_KSX1001         ('C' | CHARSET_DBCS)
#define CHARSET_JISX0212        ('D' | CHARSET_DBCS)
#define CHARSET_GB2312_8565     ('E' | CHARSET_DBCS)
#define CHARSET_CNS11643_1      ('G' | CHARSET_DBCS)
#define CHARSET_CNS11643_2      ('H' | CHARSET_DBCS)
#define CHARSET_JISX0213_2000_1 ('O' | CHARSET_DBCS)
#define CHARSET_JISX0213_2      ('P' | CHARSET_DBCS)
#define CHARSET_JISX0213_2004_1 ('Q' | CHARSET_DBCS)
83
/* True when `c` is a byte that terminates an escape sequence:
 * '@' or an upper-case ASCII letter. */
#define IS_ESCEND(c)    ((c) == '@' || ((c) >= 'A' && (c) <= 'Z'))

/* True when `c2`, the byte following ESC, starts an escape sequence that
 * this codec handles.  This is not a complete list of ISO-2022 escape
 * sequence headers, but it is enough to implement the CJK instances of
 * ISO-2022. */
#define IS_ISO2022ESC(c2)                               \
    ((c2) == '(' || (c2) == ')' ||                      \
     (c2) == '$' || (c2) == '.' || (c2) == '&')

/* Sentinel results returned by the per-charset encoder/decoder hooks. */
#define MAP_UNMAPPABLE          0xFFFF
#define MAP_MULTIPLE_AVAIL      0xFFFE  /* for JIS X 0213 */

/* Flag bits stored in state->c[4]. */
#define F_SHIFTED               0x01
#define F_ESCTHROUGHOUT         0x02
96
/*
 * Accessors for the codec state.  All of them operate on a variable named
 * `state` that must be in scope at the point of use.
 * c[0]..c[3] hold the character sets designated to G0..G3.
 */
#define STATE_GETG(dn)          ((state)->c[dn])
#define STATE_SETG(dn, v)       do { STATE_GETG(dn) = (v); } while (0)

#define STATE_G0                STATE_GETG(0)
#define STATE_G1                STATE_GETG(1)
#define STATE_G2                STATE_GETG(2)
#define STATE_G3                STATE_GETG(3)

#define STATE_SETG0(v)          STATE_SETG(0, v)
#define STATE_SETG1(v)          STATE_SETG(1, v)
#define STATE_SETG2(v)          STATE_SETG(2, v)
#define STATE_SETG3(v)          STATE_SETG(3, v)

/* c[4] holds the F_* flag bits. */
#define STATE_GETFLAG(f)        ((state)->c[4] & (f))
#define STATE_SETFLAG(f)        do { (state)->c[4] |= (f); } while (0)
#define STATE_CLEARFLAG(f)      do { (state)->c[4] &= ~(f); } while (0)
#define STATE_CLEARFLAGS()      do { (state)->c[4] = 0; } while (0)
113
/* Bits of iso2022_config.flags */
#define NO_SHIFT                0x01
#define USE_G2                  0x02
#define USE_JISX0208_EXT        0x04

/* Accessors for the per-encoding configuration.  They read
 * `codec->config`, so a variable named `codec` must be in scope. */
#define ISO2022_CONFIG \
    ((const struct iso2022_config *)(codec->config))
#define CONFIG_ISSET(flag)      (ISO2022_CONFIG->flags & (flag))
#define CONFIG_DESIGNATIONS     (ISO2022_CONFIG->designations)
122
123 /*-*- internal data structures -*-*/
124
125 typedef int (*iso2022_init_func)(const MultibyteCodec *codec);
126 typedef Py_UCS4 (*iso2022_decode_func)(const MultibyteCodec *codec,
127 const unsigned char *data);
128 typedef DBCHAR (*iso2022_encode_func)(const MultibyteCodec *codec,
129 const Py_UCS4 *data,
130 Py_ssize_t *length);
131
132 struct iso2022_designation {
133 unsigned char mark;
134 unsigned char plane;
135 unsigned char width;
136 iso2022_init_func initializer;
137 iso2022_decode_func decoder;
138 iso2022_encode_func encoder;
139 };
140
141 struct iso2022_config {
142 int flags;
143 const struct iso2022_designation *designations; /* non-ascii desigs */
144 };
145
146 /*-*- iso-2022 codec implementation -*-*/
147
148 CODEC_INIT(iso2022)
149 {
150 const struct iso2022_designation *desig;
151 for (desig = CONFIG_DESIGNATIONS; desig->mark; desig++) {
152 if (desig->initializer != NULL && desig->initializer(codec) != 0) {
153 return -1;
154 }
155 }
156 return 0;
157 }
158
159 ENCODER_INIT(iso2022)
160 {
161 STATE_CLEARFLAGS();
162 STATE_SETG0(CHARSET_ASCII);
163 STATE_SETG1(CHARSET_ASCII);
164 return 0;
165 }
166
167 ENCODER_RESET(iso2022)
168 {
169 if (STATE_GETFLAG(F_SHIFTED)) {
170 WRITEBYTE1(SI);
171 NEXT_OUT(1);
172 STATE_CLEARFLAG(F_SHIFTED);
173 }
174 if (STATE_G0 != CHARSET_ASCII) {
175 WRITEBYTE3(ESC, '(', 'B');
176 NEXT_OUT(3);
177 STATE_SETG0(CHARSET_ASCII);
178 }
179 return 0;
180 }
181
182 ENCODER(iso2022)
183 {
184 while (*inpos < inlen) {
185 const struct iso2022_designation *dsg;
186 DBCHAR encoded;
187 Py_UCS4 c = INCHAR1;
188 Py_ssize_t insize;
189
190 if (c < 0x80) {
191 if (STATE_G0 != CHARSET_ASCII) {
192 WRITEBYTE3(ESC, '(', 'B');
193 STATE_SETG0(CHARSET_ASCII);
194 NEXT_OUT(3);
195 }
196 if (STATE_GETFLAG(F_SHIFTED)) {
197 WRITEBYTE1(SI);
198 STATE_CLEARFLAG(F_SHIFTED);
199 NEXT_OUT(1);
200 }
201 WRITEBYTE1((unsigned char)c);
202 NEXT(1, 1);
203 continue;
204 }
205
206 insize = 1;
207
208 encoded = MAP_UNMAPPABLE;
209 for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
210 Py_ssize_t length = 1;
211 encoded = dsg->encoder(codec, &c, &length);
212 if (encoded == MAP_MULTIPLE_AVAIL) {
213 /* this implementation won't work for pair
214 * of non-bmp characters. */
215 if (inlen - *inpos < 2) {
216 if (!(flags & MBENC_FLUSH))
217 return MBERR_TOOFEW;
218 length = -1;
219 }
220 else
221 length = 2;
222 encoded = dsg->encoder(codec, &c, &length);
223 if (encoded != MAP_UNMAPPABLE) {
224 insize = length;
225 break;
226 }
227 }
228 else if (encoded != MAP_UNMAPPABLE)
229 break;
230 }
231
232 if (!dsg->mark)
233 return 1;
234 assert(dsg->width == 1 || dsg->width == 2);
235
236 switch (dsg->plane) {
237 case 0: /* G0 */
238 if (STATE_GETFLAG(F_SHIFTED)) {
239 WRITEBYTE1(SI);
240 STATE_CLEARFLAG(F_SHIFTED);
241 NEXT_OUT(1);
242 }
243 if (STATE_G0 != dsg->mark) {
244 if (dsg->width == 1) {
245 WRITEBYTE3(ESC, '(', ESCMARK(dsg->mark));
246 STATE_SETG0(dsg->mark);
247 NEXT_OUT(3);
248 }
249 else if (dsg->mark == CHARSET_JISX0208) {
250 WRITEBYTE3(ESC, '$', ESCMARK(dsg->mark));
251 STATE_SETG0(dsg->mark);
252 NEXT_OUT(3);
253 }
254 else {
255 WRITEBYTE4(ESC, '$', '(',
256 ESCMARK(dsg->mark));
257 STATE_SETG0(dsg->mark);
258 NEXT_OUT(4);
259 }
260 }
261 break;
262 case 1: /* G1 */
263 if (STATE_G1 != dsg->mark) {
264 if (dsg->width == 1) {
265 WRITEBYTE3(ESC, ')', ESCMARK(dsg->mark));
266 STATE_SETG1(dsg->mark);
267 NEXT_OUT(3);
268 }
269 else {
270 WRITEBYTE4(ESC, '$', ')', ESCMARK(dsg->mark));
271 STATE_SETG1(dsg->mark);
272 NEXT_OUT(4);
273 }
274 }
275 if (!STATE_GETFLAG(F_SHIFTED)) {
276 WRITEBYTE1(SO);
277 STATE_SETFLAG(F_SHIFTED);
278 NEXT_OUT(1);
279 }
280 break;
281 default: /* G2 and G3 is not supported: no encoding in
282 * CJKCodecs are using them yet */
283 return MBERR_INTERNAL;
284 }
285
286 if (dsg->width == 1) {
287 WRITEBYTE1((unsigned char)encoded);
288 NEXT_OUT(1);
289 }
290 else {
291 WRITEBYTE2(encoded >> 8, encoded & 0xff);
292 NEXT_OUT(2);
293 }
294 NEXT_INCHAR(insize);
295 }
296
297 return 0;
298 }
299
300 DECODER_INIT(iso2022)
301 {
302 STATE_CLEARFLAGS();
303 STATE_SETG0(CHARSET_ASCII);
304 STATE_SETG1(CHARSET_ASCII);
305 STATE_SETG2(CHARSET_ASCII);
306 return 0;
307 }
308
309 DECODER_RESET(iso2022)
310 {
311 STATE_SETG0(CHARSET_ASCII);
312 STATE_CLEARFLAG(F_SHIFTED);
313 return 0;
314 }
315
316 static Py_ssize_t
317 iso2022processesc(const MultibyteCodec *codec, MultibyteCodec_State *state,
318 const unsigned char **inbuf, Py_ssize_t *inleft)
319 {
320 unsigned char charset, designation;
321 Py_ssize_t i, esclen = 0;
322
323 for (i = 1;i < MAX_ESCSEQLEN;i++) {
324 if (i >= *inleft)
325 return MBERR_TOOFEW;
326 if (IS_ESCEND((*inbuf)[i])) {
327 esclen = i + 1;
328 break;
329 }
330 else if (CONFIG_ISSET(USE_JISX0208_EXT) && i+1 < *inleft &&
331 (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@') {
332 i += 2;
333 }
334 }
335
336 switch (esclen) {
337 case 0:
338 return 1; /* unterminated escape sequence */
339 case 3:
340 if (INBYTE2 == '$') {
341 charset = INBYTE3 | CHARSET_DBCS;
342 designation = 0;
343 }
344 else {
345 charset = INBYTE3;
346 if (INBYTE2 == '(')
347 designation = 0;
348 else if (INBYTE2 == ')')
349 designation = 1;
350 else if (CONFIG_ISSET(USE_G2) && INBYTE2 == '.')
351 designation = 2;
352 else
353 return 3;
354 }
355 break;
356 case 4:
357 if (INBYTE2 != '$')
358 return 4;
359
360 charset = INBYTE4 | CHARSET_DBCS;
361 if (INBYTE3 == '(')
362 designation = 0;
363 else if (INBYTE3 == ')')
364 designation = 1;
365 else
366 return 4;
367 break;
368 case 6: /* designation with prefix */
369 if (CONFIG_ISSET(USE_JISX0208_EXT) &&
370 (*inbuf)[3] == ESC && (*inbuf)[4] == '$' &&
371 (*inbuf)[5] == 'B') {
372 charset = 'B' | CHARSET_DBCS;
373 designation = 0;
374 }
375 else
376 return 6;
377 break;
378 default:
379 return esclen;
380 }
381
382 /* raise error when the charset is not designated for this encoding */
383 if (charset != CHARSET_ASCII) {
384 const struct iso2022_designation *dsg;
385
386 for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
387 if (dsg->mark == charset)
388 break;
389 }
390 if (!dsg->mark)
391 return esclen;
392 }
393
394 STATE_SETG(designation, charset);
395 *inleft -= esclen;
396 (*inbuf) += esclen;
397 return 0;
398 }
399
/*
 * Decode one ISO-8859-7 code `c` (0x00-0xff) and emit it with OUTCHAR.
 *
 * Expands to an if/else-if chain WITHOUT a final else, so the caller
 * appends its own `else` branch for codes that have no mapping.  `c` is
 * evaluated several times and must be free of side effects.  `writer`
 * is not referenced by the expansion itself.
 *
 * The two bitmaps select which codes in 0xa0-0xbf map to themselves and
 * which codes in 0xb4-0xd3 map to U+02D0 + c.  The shift count reaches
 * 31, so the masks and the shifted 1 are unsigned long: shifting a
 * signed 1L by 31 is undefined where long is 32 bits wide.
 */
#define ISO8859_7_DECODE(c, writer)                                       \
    if ((c) < 0xa0) {                                                     \
        OUTCHAR(c);                                                       \
    } else if ((c) < 0xc0 &&                                              \
               (0x288f3bc9UL & (1UL << ((c)-0xa0)))) {                    \
        OUTCHAR(c);                                                       \
    } else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 ||              \
               (0xbffffd77UL & (1UL << ((c)-0xb4))))) {                   \
        OUTCHAR(0x02d0 + (c));                                            \
    } else if ((c) == 0xa1) {                                             \
        OUTCHAR(0x2018);                                                  \
    } else if ((c) == 0xa2) {                                             \
        OUTCHAR(0x2019);                                                  \
    } else if ((c) == 0xaf) {                                             \
        OUTCHAR(0x2015);                                                  \
    }
415
416 static Py_ssize_t
417 iso2022processg2(const MultibyteCodec *codec, MultibyteCodec_State *state,
418 const unsigned char **inbuf, Py_ssize_t *inleft,
419 _PyUnicodeWriter *writer)
420 {
421 /* not written to use encoder, decoder functions because only few
422 * encodings use G2 designations in CJKCodecs */
423 if (STATE_G2 == CHARSET_ISO8859_1) {
424 if (INBYTE3 < 0x80)
425 OUTCHAR(INBYTE3 + 0x80);
426 else
427 return 3;
428 }
429 else if (STATE_G2 == CHARSET_ISO8859_7) {
430 ISO8859_7_DECODE(INBYTE3 ^ 0x80, writer)
431 else
432 return 3;
433 }
434 else if (STATE_G2 == CHARSET_ASCII) {
435 if (INBYTE3 & 0x80)
436 return 3;
437 else
438 OUTCHAR(INBYTE3);
439 }
440 else
441 return MBERR_INTERNAL;
442
443 (*inbuf) += 3;
444 *inleft -= 3;
445 return 0;
446 }
447
448 DECODER(iso2022)
449 {
450 const struct iso2022_designation *dsgcache = NULL;
451
452 while (inleft > 0) {
453 unsigned char c = INBYTE1;
454 Py_ssize_t err;
455
456 if (STATE_GETFLAG(F_ESCTHROUGHOUT)) {
457 /* ESC throughout mode:
458 * for non-iso2022 escape sequences */
459 OUTCHAR(c); /* assume as ISO-8859-1 */
460 NEXT_IN(1);
461 if (IS_ESCEND(c)) {
462 STATE_CLEARFLAG(F_ESCTHROUGHOUT);
463 }
464 continue;
465 }
466
467 switch (c) {
468 case ESC:
469 REQUIRE_INBUF(2);
470 if (IS_ISO2022ESC(INBYTE2)) {
471 err = iso2022processesc(codec, state,
472 inbuf, &inleft);
473 if (err != 0)
474 return err;
475 }
476 else if (CONFIG_ISSET(USE_G2) && INBYTE2 == 'N') {/* SS2 */
477 REQUIRE_INBUF(3);
478 err = iso2022processg2(codec, state,
479 inbuf, &inleft, writer);
480 if (err != 0)
481 return err;
482 }
483 else {
484 OUTCHAR(ESC);
485 STATE_SETFLAG(F_ESCTHROUGHOUT);
486 NEXT_IN(1);
487 }
488 break;
489 case SI:
490 if (CONFIG_ISSET(NO_SHIFT))
491 goto bypass;
492 STATE_CLEARFLAG(F_SHIFTED);
493 NEXT_IN(1);
494 break;
495 case SO:
496 if (CONFIG_ISSET(NO_SHIFT))
497 goto bypass;
498 STATE_SETFLAG(F_SHIFTED);
499 NEXT_IN(1);
500 break;
501 case LF:
502 STATE_CLEARFLAG(F_SHIFTED);
503 OUTCHAR(LF);
504 NEXT_IN(1);
505 break;
506 default:
507 if (c < 0x20) /* C0 */
508 goto bypass;
509 else if (c >= 0x80)
510 return 1;
511 else {
512 const struct iso2022_designation *dsg;
513 unsigned char charset;
514 Py_UCS4 decoded;
515
516 if (STATE_GETFLAG(F_SHIFTED))
517 charset = STATE_G1;
518 else
519 charset = STATE_G0;
520
521 if (charset == CHARSET_ASCII) {
522 bypass:
523 OUTCHAR(c);
524 NEXT_IN(1);
525 break;
526 }
527
528 if (dsgcache != NULL &&
529 dsgcache->mark == charset)
530 dsg = dsgcache;
531 else {
532 for (dsg = CONFIG_DESIGNATIONS;
533 dsg->mark != charset
534 #ifdef Py_DEBUG
535 && dsg->mark != '\0'
536 #endif
537 ; dsg++)
538 {
539 /* noop */
540 }
541 assert(dsg->mark != '\0');
542 dsgcache = dsg;
543 }
544
545 REQUIRE_INBUF(dsg->width);
546 decoded = dsg->decoder(codec, *inbuf);
547 if (decoded == MAP_UNMAPPABLE)
548 return dsg->width;
549
550 if (decoded < 0x10000) {
551 OUTCHAR(decoded);
552 }
553 else if (decoded < 0x30000) {
554 OUTCHAR(decoded);
555 }
556 else { /* JIS X 0213 pairs */
557 OUTCHAR2(decoded >> 16, decoded & 0xffff);
558 }
559 NEXT_IN(dsg->width);
560 }
561 break;
562 }
563 }
564 return 0;
565 }
566
567 /*-*- mapping access functions -*-*/
568
569 static int
570 ksx1001_init(const MultibyteCodec *codec)
571 {
572 cjkcodecs_module_state *st = codec->modstate;
573 if (IMPORT_MAP(kr, cp949, &st->cp949_encmap, NULL) ||
574 IMPORT_MAP(kr, ksx1001, NULL, &st->ksx1001_decmap))
575 {
576 return -1;
577 }
578 return 0;
579 }
580
581 static Py_UCS4
582 ksx1001_decoder(const MultibyteCodec *codec, const unsigned char *data)
583 {
584 Py_UCS4 u;
585 if (TRYMAP_DEC_ST(ksx1001, u, data[0], data[1]))
586 return u;
587 else
588 return MAP_UNMAPPABLE;
589 }
590
591 static DBCHAR
592 ksx1001_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
593 Py_ssize_t *length)
594 {
595 DBCHAR coded;
596 assert(*length == 1);
597 if (*data < 0x10000) {
598 if (TRYMAP_ENC_ST(cp949, coded, *data)) {
599 if (!(coded & 0x8000))
600 return coded;
601 }
602 }
603 return MAP_UNMAPPABLE;
604 }
605
606 static int
607 jisx0208_init(const MultibyteCodec *codec)
608 {
609 cjkcodecs_module_state *st = codec->modstate;
610 if (IMPORT_MAP(jp, jisxcommon, &st->jisxcommon_encmap, NULL) ||
611 IMPORT_MAP(jp, jisx0208, NULL, &st->jisx0208_decmap))
612 {
613 return -1;
614 }
615 return 0;
616 }
617
618 static Py_UCS4
619 jisx0208_decoder(const MultibyteCodec *codec, const unsigned char *data)
620 {
621 Py_UCS4 u;
622 if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
623 return 0xff3c;
624 else if (TRYMAP_DEC_ST(jisx0208, u, data[0], data[1]))
625 return u;
626 else
627 return MAP_UNMAPPABLE;
628 }
629
630 static DBCHAR
631 jisx0208_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
632 Py_ssize_t *length)
633 {
634 DBCHAR coded;
635 assert(*length == 1);
636 if (*data < 0x10000) {
637 if (*data == 0xff3c) /* F/W REVERSE SOLIDUS */
638 return 0x2140;
639 else if (TRYMAP_ENC_ST(jisxcommon, coded, *data)) {
640 if (!(coded & 0x8000))
641 return coded;
642 }
643 }
644 return MAP_UNMAPPABLE;
645 }
646
647 static int
648 jisx0212_init(const MultibyteCodec *codec)
649 {
650 cjkcodecs_module_state *st = codec->modstate;
651 if (IMPORT_MAP(jp, jisxcommon, &st->jisxcommon_encmap, NULL) ||
652 IMPORT_MAP(jp, jisx0212, NULL, &st->jisx0212_decmap))
653 {
654 return -1;
655 }
656 return 0;
657 }
658
659 static Py_UCS4
660 jisx0212_decoder(const MultibyteCodec *codec, const unsigned char *data)
661 {
662 Py_UCS4 u;
663 if (TRYMAP_DEC_ST(jisx0212, u, data[0], data[1]))
664 return u;
665 else
666 return MAP_UNMAPPABLE;
667 }
668
669 static DBCHAR
670 jisx0212_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
671 Py_ssize_t *length)
672 {
673 DBCHAR coded;
674 assert(*length == 1);
675 if (*data < 0x10000) {
676 if (TRYMAP_ENC_ST(jisxcommon, coded, *data)) {
677 if (coded & 0x8000)
678 return coded & 0x7fff;
679 }
680 }
681 return MAP_UNMAPPABLE;
682 }
683
684 static int
685 jisx0213_init(const MultibyteCodec *codec)
686 {
687 cjkcodecs_module_state *st = codec->modstate;
688 if (jisx0208_init(codec) ||
689 IMPORT_MAP(jp, jisx0213_bmp, &st->jisx0213_bmp_encmap, NULL) ||
690 IMPORT_MAP(jp, jisx0213_1_bmp, NULL, &st->jisx0213_1_bmp_decmap) ||
691 IMPORT_MAP(jp, jisx0213_2_bmp, NULL, &st->jisx0213_2_bmp_decmap) ||
692 IMPORT_MAP(jp, jisx0213_emp, &st->jisx0213_emp_encmap, NULL) ||
693 IMPORT_MAP(jp, jisx0213_1_emp, NULL, &st->jisx0213_1_emp_decmap) ||
694 IMPORT_MAP(jp, jisx0213_2_emp, NULL, &st->jisx0213_2_emp_decmap) ||
695 IMPORT_MAP(jp, jisx0213_pair,
696 &jisx0213_pair_encmap, &jisx0213_pair_decmap))
697 {
698 return -1;
699 }
700 return 0;
701 }
702
703 #define config ((void *)2000)
704 static Py_UCS4
705 jisx0213_2000_1_decoder(const MultibyteCodec *codec, const unsigned char *data)
706 {
707 Py_UCS4 u;
708 EMULATE_JISX0213_2000_DECODE_PLANE1(config, u, data[0], data[1])
709 else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
710 return 0xff3c;
711 else if (TRYMAP_DEC_ST(jisx0208, u, data[0], data[1]))
712 ;
713 else if (TRYMAP_DEC_ST(jisx0213_1_bmp, u, data[0], data[1]))
714 ;
715 else if (TRYMAP_DEC_ST(jisx0213_1_emp, u, data[0], data[1]))
716 u |= 0x20000;
717 else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]))
718 ;
719 else
720 return MAP_UNMAPPABLE;
721 return u;
722 }
723
724 static Py_UCS4
725 jisx0213_2000_2_decoder(const MultibyteCodec *codec, const unsigned char *data)
726 {
727 Py_UCS4 u;
728 EMULATE_JISX0213_2000_DECODE_PLANE2_CHAR(config, u, data[0], data[1])
729 if (TRYMAP_DEC_ST(jisx0213_2_bmp, u, data[0], data[1]))
730 ;
731 else if (TRYMAP_DEC_ST(jisx0213_2_emp, u, data[0], data[1]))
732 u |= 0x20000;
733 else
734 return MAP_UNMAPPABLE;
735 return u;
736 }
737 #undef config
738
739 static Py_UCS4
740 jisx0213_2004_1_decoder(const MultibyteCodec *codec, const unsigned char *data)
741 {
742 Py_UCS4 u;
743 if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
744 return 0xff3c;
745 else if (TRYMAP_DEC_ST(jisx0208, u, data[0], data[1]))
746 ;
747 else if (TRYMAP_DEC_ST(jisx0213_1_bmp, u, data[0], data[1]))
748 ;
749 else if (TRYMAP_DEC_ST(jisx0213_1_emp, u, data[0], data[1]))
750 u |= 0x20000;
751 else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]))
752 ;
753 else
754 return MAP_UNMAPPABLE;
755 return u;
756 }
757
758 static Py_UCS4
759 jisx0213_2004_2_decoder(const MultibyteCodec *codec, const unsigned char *data)
760 {
761 Py_UCS4 u;
762 if (TRYMAP_DEC_ST(jisx0213_2_bmp, u, data[0], data[1]))
763 ;
764 else if (TRYMAP_DEC_ST(jisx0213_2_emp, u, data[0], data[1]))
765 u |= 0x20000;
766 else
767 return MAP_UNMAPPABLE;
768 return u;
769 }
770
771 static DBCHAR
772 jisx0213_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
773 Py_ssize_t *length, const void *config)
774 {
775 DBCHAR coded;
776
777 switch (*length) {
778 case 1: /* first character */
779 if (*data >= 0x10000) {
780 if ((*data) >> 16 == 0x20000 >> 16) {
781 EMULATE_JISX0213_2000_ENCODE_EMP(config, coded, *data)
782 else if (TRYMAP_ENC_ST(jisx0213_emp, coded, (*data) & 0xffff))
783 return coded;
784 }
785 return MAP_UNMAPPABLE;
786 }
787
788 EMULATE_JISX0213_2000_ENCODE_BMP(config, coded, *data)
789 else if (TRYMAP_ENC_ST(jisx0213_bmp, coded, *data)) {
790 if (coded == MULTIC)
791 return MAP_MULTIPLE_AVAIL;
792 }
793 else if (TRYMAP_ENC_ST(jisxcommon, coded, *data)) {
794 if (coded & 0x8000)
795 return MAP_UNMAPPABLE;
796 }
797 else
798 return MAP_UNMAPPABLE;
799 return coded;
800
801 case 2: /* second character of unicode pair */
802 coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
803 jisx0213_pair_encmap, JISX0213_ENCPAIRS);
804 if (coded != DBCINV)
805 return coded;
806 /* fall through */
807
808 case -1: /* flush unterminated */
809 *length = 1;
810 coded = find_pairencmap((ucs2_t)data[0], 0,
811 jisx0213_pair_encmap, JISX0213_ENCPAIRS);
812 if (coded == DBCINV)
813 return MAP_UNMAPPABLE;
814 else
815 return coded;
816 break;
817
818 default:
819 return MAP_UNMAPPABLE;
820 }
821 }
822
823 static DBCHAR
824 jisx0213_2000_1_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
825 Py_ssize_t *length)
826 {
827 DBCHAR coded = jisx0213_encoder(codec, data, length, (void *)2000);
828 if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
829 return coded;
830 else if (coded & 0x8000)
831 return MAP_UNMAPPABLE;
832 else
833 return coded;
834 }
835
836 static DBCHAR
837 jisx0213_2000_1_encoder_paironly(const MultibyteCodec *codec,
838 const Py_UCS4 *data, Py_ssize_t *length)
839 {
840 DBCHAR coded;
841 Py_ssize_t ilength = *length;
842
843 coded = jisx0213_encoder(codec, data, length, (void *)2000);
844 switch (ilength) {
845 case 1:
846 if (coded == MAP_MULTIPLE_AVAIL)
847 return MAP_MULTIPLE_AVAIL;
848 else
849 return MAP_UNMAPPABLE;
850 case 2:
851 if (*length != 2)
852 return MAP_UNMAPPABLE;
853 else
854 return coded;
855 default:
856 return MAP_UNMAPPABLE;
857 }
858 }
859
860 static DBCHAR
861 jisx0213_2000_2_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
862 Py_ssize_t *length)
863 {
864 DBCHAR coded = jisx0213_encoder(codec, data, length, (void *)2000);
865 if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
866 return coded;
867 else if (coded & 0x8000)
868 return coded & 0x7fff;
869 else
870 return MAP_UNMAPPABLE;
871 }
872
873 static DBCHAR
874 jisx0213_2004_1_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
875 Py_ssize_t *length)
876 {
877 DBCHAR coded = jisx0213_encoder(codec, data, length, NULL);
878 if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
879 return coded;
880 else if (coded & 0x8000)
881 return MAP_UNMAPPABLE;
882 else
883 return coded;
884 }
885
886 static DBCHAR
887 jisx0213_2004_1_encoder_paironly(const MultibyteCodec *codec,
888 const Py_UCS4 *data, Py_ssize_t *length)
889 {
890 DBCHAR coded;
891 Py_ssize_t ilength = *length;
892
893 coded = jisx0213_encoder(codec, data, length, NULL);
894 switch (ilength) {
895 case 1:
896 if (coded == MAP_MULTIPLE_AVAIL)
897 return MAP_MULTIPLE_AVAIL;
898 else
899 return MAP_UNMAPPABLE;
900 case 2:
901 if (*length != 2)
902 return MAP_UNMAPPABLE;
903 else
904 return coded;
905 default:
906 return MAP_UNMAPPABLE;
907 }
908 }
909
910 static DBCHAR
911 jisx0213_2004_2_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
912 Py_ssize_t *length)
913 {
914 DBCHAR coded = jisx0213_encoder(codec, data, length, NULL);
915 if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
916 return coded;
917 else if (coded & 0x8000)
918 return coded & 0x7fff;
919 else
920 return MAP_UNMAPPABLE;
921 }
922
923 static Py_UCS4
924 jisx0201_r_decoder(const MultibyteCodec *codec, const unsigned char *data)
925 {
926 Py_UCS4 u;
927 JISX0201_R_DECODE_CHAR(*data, u)
928 else
929 return MAP_UNMAPPABLE;
930 return u;
931 }
932
933 static DBCHAR
934 jisx0201_r_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
935 Py_ssize_t *length)
936 {
937 DBCHAR coded;
938 JISX0201_R_ENCODE(*data, coded)
939 else
940 return MAP_UNMAPPABLE;
941 return coded;
942 }
943
944 static Py_UCS4
945 jisx0201_k_decoder(const MultibyteCodec *codec, const unsigned char *data)
946 {
947 Py_UCS4 u;
948 JISX0201_K_DECODE_CHAR(*data ^ 0x80, u)
949 else
950 return MAP_UNMAPPABLE;
951 return u;
952 }
953
954 static DBCHAR
955 jisx0201_k_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
956 Py_ssize_t *length)
957 {
958 DBCHAR coded;
959 JISX0201_K_ENCODE(*data, coded)
960 else
961 return MAP_UNMAPPABLE;
962 return coded - 0x80;
963 }
964
965 static int
966 gb2312_init(const MultibyteCodec *codec)
967 {
968 cjkcodecs_module_state *st = codec->modstate;
969 if (IMPORT_MAP(cn, gbcommon, &st->gbcommon_encmap, NULL) ||
970 IMPORT_MAP(cn, gb2312, NULL, &st->gb2312_decmap))
971 {
972 return -1;
973 }
974 return 0;
975 }
976
977 static Py_UCS4
978 gb2312_decoder(const MultibyteCodec *codec, const unsigned char *data)
979 {
980 Py_UCS4 u;
981 if (TRYMAP_DEC_ST(gb2312, u, data[0], data[1]))
982 return u;
983 else
984 return MAP_UNMAPPABLE;
985 }
986
987 static DBCHAR
988 gb2312_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
989 Py_ssize_t *length)
990 {
991 DBCHAR coded;
992 assert(*length == 1);
993 if (*data < 0x10000) {
994 if (TRYMAP_ENC_ST(gbcommon, coded, *data)) {
995 if (!(coded & 0x8000))
996 return coded;
997 }
998 }
999 return MAP_UNMAPPABLE;
1000 }
1001
1002
1003 static Py_UCS4
1004 dummy_decoder(const MultibyteCodec *codec, const unsigned char *data)
1005 {
1006 return MAP_UNMAPPABLE;
1007 }
1008
1009 static DBCHAR
1010 dummy_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
1011 Py_ssize_t *length)
1012 {
1013 return MAP_UNMAPPABLE;
1014 }
1015
1016 /*-*- registry tables -*-*/
1017
/*
 * Initializers for struct iso2022_designation, in member order:
 *   { mark, plane, width, initializer, decoder, encoder }
 */

/* KS X 1001, on G0 or on G1 */
#define REGISTRY_KSX1001_G0                                     \
    { CHARSET_KSX1001, 0, 2,                                    \
      ksx1001_init, ksx1001_decoder, ksx1001_encoder }
#define REGISTRY_KSX1001_G1                                     \
    { CHARSET_KSX1001, 1, 2,                                    \
      ksx1001_init, ksx1001_decoder, ksx1001_encoder }

/* JIS X 0201: single-byte sets, no tables to load */
#define REGISTRY_JISX0201_R                                     \
    { CHARSET_JISX0201_R, 0, 1,                                 \
      NULL, jisx0201_r_decoder, jisx0201_r_encoder }
#define REGISTRY_JISX0201_K                                     \
    { CHARSET_JISX0201_K, 0, 1,                                 \
      NULL, jisx0201_k_decoder, jisx0201_k_encoder }

/* JIS X 0208 (both marks share the same hooks) and JIS X 0212 */
#define REGISTRY_JISX0208                                       \
    { CHARSET_JISX0208, 0, 2,                                   \
      jisx0208_init, jisx0208_decoder, jisx0208_encoder }
#define REGISTRY_JISX0208_O                                     \
    { CHARSET_JISX0208_O, 0, 2,                                 \
      jisx0208_init, jisx0208_decoder, jisx0208_encoder }
#define REGISTRY_JISX0212                                       \
    { CHARSET_JISX0212, 0, 2,                                   \
      jisx0212_init, jisx0212_decoder, jisx0212_encoder }

/* JIS X 0213:2000 */
#define REGISTRY_JISX0213_2000_1                                \
    { CHARSET_JISX0213_2000_1, 0, 2,                            \
      jisx0213_init,                                            \
      jisx0213_2000_1_decoder, jisx0213_2000_1_encoder }
#define REGISTRY_JISX0213_2000_1_PAIRONLY                       \
    { CHARSET_JISX0213_2000_1, 0, 2,                            \
      jisx0213_init,                                            \
      jisx0213_2000_1_decoder, jisx0213_2000_1_encoder_paironly }
#define REGISTRY_JISX0213_2000_2                                \
    { CHARSET_JISX0213_2, 0, 2,                                 \
      jisx0213_init,                                            \
      jisx0213_2000_2_decoder, jisx0213_2000_2_encoder }

/* JIS X 0213:2004 */
#define REGISTRY_JISX0213_2004_1                                \
    { CHARSET_JISX0213_2004_1, 0, 2,                            \
      jisx0213_init,                                            \
      jisx0213_2004_1_decoder, jisx0213_2004_1_encoder }
#define REGISTRY_JISX0213_2004_1_PAIRONLY                       \
    { CHARSET_JISX0213_2004_1, 0, 2,                            \
      jisx0213_init,                                            \
      jisx0213_2004_1_decoder, jisx0213_2004_1_encoder_paironly }
#define REGISTRY_JISX0213_2004_2                                \
    { CHARSET_JISX0213_2, 0, 2,                                 \
      jisx0213_init,                                            \
      jisx0213_2004_2_decoder, jisx0213_2004_2_encoder }

/* GB 2312 */
#define REGISTRY_GB2312                                         \
    { CHARSET_GB2312, 0, 2,                                     \
      gb2312_init, gb2312_decoder, gb2312_encoder }

/* CNS 11643.  NOTE(review): the cns11643_* hooks named here are not
 * defined in the visible part of this file, and no designation table
 * here uses these two macros. */
#define REGISTRY_CNS11643_1                                     \
    { CHARSET_CNS11643_1, 1, 2,                                 \
      cns11643_init, cns11643_1_decoder, cns11643_1_encoder }
#define REGISTRY_CNS11643_2                                     \
    { CHARSET_CNS11643_2, 2, 2,                                 \
      cns11643_init, cns11643_2_decoder, cns11643_2_encoder }

/* G2-only single-byte sets: registered with the dummy hooks */
#define REGISTRY_ISO8859_1                                      \
    { CHARSET_ISO8859_1, 2, 1,                                  \
      NULL, dummy_decoder, dummy_encoder }
#define REGISTRY_ISO8859_7                                      \
    { CHARSET_ISO8859_7, 2, 1,                                  \
      NULL, dummy_decoder, dummy_encoder }

/* Table terminator: an entry whose mark is 0 */
#define REGISTRY_SENTINEL       { 0, }

/* Define iso2022_<var>_config from a flag set and the matching
 * iso2022_<var>_designations table.  The expansion ends with its own
 * semicolon, so it is used without one. */
#define CONFIGDEF(var, attrs)                                           \
    static const struct iso2022_config iso2022_##var##_config = {      \
        attrs, iso2022_##var##_designations                            \
    };
1081
1082 static const struct iso2022_designation iso2022_kr_designations[] = {
1083 REGISTRY_KSX1001_G1, REGISTRY_SENTINEL
1084 };
1085 CONFIGDEF(kr, 0)
1086
1087 static const struct iso2022_designation iso2022_jp_designations[] = {
1088 REGISTRY_JISX0208, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
1089 REGISTRY_SENTINEL
1090 };
1091 CONFIGDEF(jp, NO_SHIFT | USE_JISX0208_EXT)
1092
1093 static const struct iso2022_designation iso2022_jp_1_designations[] = {
1094 REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
1095 REGISTRY_JISX0208_O, REGISTRY_SENTINEL
1096 };
1097 CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT)
1098
1099 static const struct iso2022_designation iso2022_jp_2_designations[] = {
1100 REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0,
1101 REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
1102 REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL
1103 };
1104 CONFIGDEF(jp_2, NO_SHIFT | USE_G2 | USE_JISX0208_EXT)
1105
1106 static const struct iso2022_designation iso2022_jp_2004_designations[] = {
1107 REGISTRY_JISX0213_2004_1_PAIRONLY, REGISTRY_JISX0208,
1108 REGISTRY_JISX0213_2004_1, REGISTRY_JISX0213_2004_2, REGISTRY_SENTINEL
1109 };
1110 CONFIGDEF(jp_2004, NO_SHIFT | USE_JISX0208_EXT)
1111
1112 static const struct iso2022_designation iso2022_jp_3_designations[] = {
1113 REGISTRY_JISX0213_2000_1_PAIRONLY, REGISTRY_JISX0208,
1114 REGISTRY_JISX0213_2000_1, REGISTRY_JISX0213_2000_2, REGISTRY_SENTINEL
1115 };
1116 CONFIGDEF(jp_3, NO_SHIFT | USE_JISX0208_EXT)
1117
1118 static const struct iso2022_designation iso2022_jp_ext_designations[] = {
1119 REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
1120 REGISTRY_JISX0201_K, REGISTRY_JISX0208_O, REGISTRY_SENTINEL
1121 };
1122 CONFIGDEF(jp_ext, NO_SHIFT | USE_JISX0208_EXT)
1123
1124
1125 BEGIN_MAPPINGS_LIST(0)
1126 /* no mapping table here */
1127 END_MAPPINGS_LIST
1128
1129 #define ISO2022_CODEC(variation) \
1130 NEXT_CODEC = (MultibyteCodec){ \
1131 "iso2022_" #variation, \
1132 &iso2022_##variation##_config, \
1133 iso2022_codec_init, \
1134 _STATEFUL_METHODS(iso2022) \
1135 };
1136
1137 BEGIN_CODECS_LIST(7)
1138 ISO2022_CODEC(kr)
1139 ISO2022_CODEC(jp)
1140 ISO2022_CODEC(jp_1)
1141 ISO2022_CODEC(jp_2)
1142 ISO2022_CODEC(jp_2004)
1143 ISO2022_CODEC(jp_3)
1144 ISO2022_CODEC(jp_ext)
1145 END_CODECS_LIST
1146
1147 I_AM_A_MODULE_FOR(iso2022)