1 /*
2 * cjkcodecs.h: common header for cjkcodecs
3 *
4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
5 */
6
7 #ifndef _CJKCODECS_H_
8 #define _CJKCODECS_H_
9
10 #define PY_SSIZE_T_CLEAN
11 #include "Python.h"
12 #include "multibytecodec.h"
13
14
/*
 * Sentinel values stored in the mapping tables.
 *
 * UNIINV is the "undefined" Unicode code point; the decode lookups treat a
 * table cell holding it as "no mapping".
 * NOCHAR, MULTIC and DBCINV are internal-use DBCS code points which are not
 * used by any charset.  Note that MULTIC and UNIINV share the same value.
 */
#define UNIINV  0xFFFE

#define NOCHAR  0xFFFF
#define MULTIC  0xFFFE
#define DBCINV  0xFFFD

/* One-letter aliases that keep the source of the mapping tables small. */
#define U       UNIINV
#define N       NOCHAR
#define M       MULTIC
#define D       DBCINV
28
29 struct dbcs_index {
30 const ucs2_t *map;
31 unsigned char bottom, top;
32 };
33 typedef struct dbcs_index decode_map;
34
35 struct widedbcs_index {
36 const Py_UCS4 *map;
37 unsigned char bottom, top;
38 };
39 typedef struct widedbcs_index widedecode_map;
40
41 struct unim_index {
42 const DBCHAR *map;
43 unsigned char bottom, top;
44 };
45 typedef struct unim_index encode_map;
46
/*
 * Variant of struct unim_index whose table is addressed as raw bytes
 * (unsigned char) rather than DBCHAR cells.
 */
struct unim_index_bytebased {
    const unsigned char *map;
    unsigned char bottom;
    unsigned char top;
};
51
/*
 * A named charset together with its encode and decode tables.
 *
 * The member order is significant: the MAPPING_* macros fill this struct
 * with positional compound literals.  Either table pointer may be NULL
 * (encode-only or decode-only mappings).
 */
struct dbcs_map {
    const char *charset;                /* charset name */
    const struct unim_index *encmap;    /* encode table, or NULL */
    const struct dbcs_index *decmap;    /* decode table, or NULL */
};
57
58 struct pair_encodemap {
59 Py_UCS4 uniseq;
60 DBCHAR code;
61 };
62
63 #ifndef CJK_MOD_SPECIFIC_STATE
64 #define CJK_MOD_SPECIFIC_STATE
65 #endif
66
67 typedef struct _cjk_mod_state {
68 int num_mappings;
69 int num_codecs;
70 struct dbcs_map *mapping_list;
71 MultibyteCodec *codec_list;
72
73 CJK_MOD_SPECIFIC_STATE
74 } cjkcodecs_module_state;
75
76 static inline cjkcodecs_module_state *
77 get_module_state(PyObject *mod)
78 {
79 void *state = PyModule_GetState(mod);
80 assert(state != NULL);
81 return (cjkcodecs_module_state *)state;
82 }
83
/*
 * Declarator macros for the per-encoding entry points.  Each one expands to
 * the `static` signature of <encoding>_<role>; the function body follows the
 * macro invocation.  The parameter names (state, codec, kind, data, inpos,
 * inlen, inbuf, inleft, outbuf, outleft, flags, writer) are relied upon by
 * the helper macros further down in this header.
 */

/* One-time codec setup. */
#define CODEC_INIT(encoding)                                            \
    static int                                                          \
    encoding##_codec_init(const MultibyteCodec *codec)

/* Encoder entry points. */
#define ENCODER_INIT(encoding)                                          \
    static int                                                          \
    encoding##_encode_init(MultibyteCodec_State *state,                 \
                           const MultibyteCodec *codec)
#define ENCODER(encoding)                                               \
    static Py_ssize_t                                                   \
    encoding##_encode(MultibyteCodec_State *state,                      \
                      const MultibyteCodec *codec,                      \
                      int kind, const void *data,                       \
                      Py_ssize_t *inpos, Py_ssize_t inlen,              \
                      unsigned char **outbuf, Py_ssize_t outleft,       \
                      int flags)
#define ENCODER_RESET(encoding)                                         \
    static Py_ssize_t                                                   \
    encoding##_encode_reset(MultibyteCodec_State *state,                \
                            const MultibyteCodec *codec,                \
                            unsigned char **outbuf, Py_ssize_t outleft)

/* Decoder entry points. */
#define DECODER_INIT(encoding)                                          \
    static int                                                          \
    encoding##_decode_init(MultibyteCodec_State *state,                 \
                           const MultibyteCodec *codec)
#define DECODER(encoding)                                               \
    static Py_ssize_t                                                   \
    encoding##_decode(MultibyteCodec_State *state,                      \
                      const MultibyteCodec *codec,                      \
                      const unsigned char **inbuf, Py_ssize_t inleft,   \
                      _PyUnicodeWriter *writer)
#define DECODER_RESET(encoding)                                         \
    static Py_ssize_t                                                   \
    encoding##_decode_reset(MultibyteCodec_State *state,                \
                            const MultibyteCodec *codec)
112
/*
 * Cursor helpers.  They operate on the local names introduced by the
 * ENCODER()/DECODER() signatures: inbuf/inleft (decoder input bytes),
 * inpos (encoder input position) and outbuf/outleft (encoder output bytes).
 */

/* Consume `nbytes` input bytes. */
#define NEXT_IN(nbytes)                         \
    do {                                        \
        (*inbuf) += (nbytes);                   \
        (inleft) -= (nbytes);                   \
    } while (0)

/* Advance the input character position by `nchars`. */
#define NEXT_INCHAR(nchars)                     \
    do {                                        \
        (*inpos) += (nchars);                   \
    } while (0)

/* Commit `nbytes` bytes already stored in the output buffer. */
#define NEXT_OUT(nbytes)                        \
    do {                                        \
        (*outbuf) += (nbytes);                  \
        (outleft) -= (nbytes);                  \
    } while (0)

/* Advance the input position by `nchars` and the output by `nbytes`. */
#define NEXT(nchars, nbytes)                    \
    do {                                        \
        (*inpos) += (nchars);                   \
        (*outbuf) += (nbytes);                  \
        (outleft) -= (nbytes);                  \
    } while (0)
132
/* Return MBERR_TOOFEW from the enclosing function unless at least `n`
   input bytes remain (inleft >= n). */
#define REQUIRE_INBUF(n)                        \
    do {                                        \
        if (inleft < (n)) {                     \
            return MBERR_TOOFEW;                \
        }                                       \
    } while (0)

/* Return MBERR_TOOSMALL from the enclosing function unless at least `n`
   output bytes are available (outleft >= n). */
#define REQUIRE_OUTBUF(n)                       \
    do {                                        \
        if (outleft < (n)) {                    \
            return MBERR_TOOSMALL;              \
        }                                       \
    } while (0)
144
/* Decoder side: the next four input bytes, counted from the current
   position of *inbuf.  No bounds check is done here. */
#define INBYTE1     ((*inbuf)[0])
#define INBYTE2     ((*inbuf)[1])
#define INBYTE3     ((*inbuf)[2])
#define INBYTE4     ((*inbuf)[3])

/* Encoder side: the character at the current input position and the one
   right after it, read with PyUnicode_READ(kind, data, ...). */
#define INCHAR1     (PyUnicode_READ(kind, data, *inpos))
#define INCHAR2     (PyUnicode_READ(kind, data, *inpos + 1))
152
/* Append one code point to `writer`; return MBERR_EXCEPTION from the
   enclosing function if the write fails. */
#define OUTCHAR(c)                                                      \
    do {                                                                \
        if (_PyUnicodeWriter_WriteChar(writer, (c)) < 0)                \
            return MBERR_EXCEPTION;                                     \
    } while (0)

/* Append two code points to `writer`; return MBERR_EXCEPTION from the
   enclosing function if room for them cannot be prepared.
   Both arguments are evaluated exactly once: they are captured in _c1/_c2
   and only those locals are used afterwards, including for the maximum
   character passed to _PyUnicodeWriter_Prepare(). */
#define OUTCHAR2(c1, c2)                                                \
    do {                                                                \
        Py_UCS4 _c1 = (c1);                                             \
        Py_UCS4 _c2 = (c2);                                             \
        if (_PyUnicodeWriter_Prepare(writer, 2, Py_MAX(_c1, _c2)) < 0)  \
            return MBERR_EXCEPTION;                                     \
        PyUnicode_WRITE(writer->kind, writer->data, writer->pos, _c1);  \
        PyUnicode_WRITE(writer->kind, writer->data, writer->pos + 1, _c2); \
        writer->pos += 2;                                               \
    } while (0)
169
/* Store `byte` at offset `off` from the current output position.  The
   assert (debug builds only) checks that the value fits in an unsigned
   char.  Neither the output pointer nor outleft is changed. */
#define OUTBYTEI(byte, off)                                 \
    do {                                                    \
        assert((unsigned char)(byte) == (byte));            \
        ((*outbuf)[off]) = (byte);                          \
    } while (0)

/* Fixed-offset shorthands for the first four output bytes. */
#define OUTBYTE1(byte)  OUTBYTEI(byte, 0)
#define OUTBYTE2(byte)  OUTBYTEI(byte, 1)
#define OUTBYTE3(byte)  OUTBYTEI(byte, 2)
#define OUTBYTE4(byte)  OUTBYTEI(byte, 3)
180
/*
 * Checked writes of 1 to 4 bytes.  Each macro first requires that many
 * bytes of output space (returning MBERR_TOOSMALL from the enclosing
 * function via REQUIRE_OUTBUF otherwise) and then stores the bytes at
 * offsets 0..n-1.  The output cursor is not advanced here.
 */
#define WRITEBYTE1(b1)                          \
    do {                                        \
        REQUIRE_OUTBUF(1);                      \
        OUTBYTE1(b1);                           \
    } while (0)

#define WRITEBYTE2(b1, b2)                      \
    do {                                        \
        REQUIRE_OUTBUF(2);                      \
        OUTBYTE1(b1);                           \
        OUTBYTE2(b2);                           \
    } while (0)

#define WRITEBYTE3(b1, b2, b3)                  \
    do {                                        \
        REQUIRE_OUTBUF(3);                      \
        OUTBYTE1(b1);                           \
        OUTBYTE2(b2);                           \
        OUTBYTE3(b3);                           \
    } while (0)

#define WRITEBYTE4(b1, b2, b3, b4)              \
    do {                                        \
        REQUIRE_OUTBUF(4);                      \
        OUTBYTE1(b1);                           \
        OUTBYTE2(b2);                           \
        OUTBYTE3(b3);                           \
        OUTBYTE4(b4);                           \
    } while (0)
207
/*
 * Encode-table lookup.  Evaluates to true when row `m` has a table, `val`
 * lies in [bottom, top] and the cell is not NOCHAR.  The cell value is
 * stored into `assi` whenever the range check passes, so after a NOCHAR
 * miss `assi` holds NOCHAR.
 */
#define _TRYMAP_ENC(m, assi, val)                               \
    ((m)->map != NULL &&                                        \
     (val) >= (m)->bottom &&                                    \
     (val) <= (m)->top &&                                       \
     ((assi) = (m)->map[(val) - (m)->bottom]) != NOCHAR)

/* Look `uni` up in the file-scope table <charset>_encmap: the high part
   (uni >> 8) selects the row, the low byte is the index within the row. */
#define TRYMAP_ENC(charset, assi, uni)                          \
    _TRYMAP_ENC(&charset##_encmap[(uni) >> 8], assi, (uni) & 0xff)

/* Same lookup, but the table is reached through codec->modstate. */
#define TRYMAP_ENC_ST(charset, assi, uni)                       \
    _TRYMAP_ENC(&(codec->modstate->charset##_encmap)[(uni) >> 8], \
                assi, (uni) & 0xff)
217
/*
 * Decode-table lookup.  Evaluates to true when row `m` has a table, `val`
 * lies in [bottom, top] and the cell is not UNIINV.  The cell value is
 * stored into `assi` whenever the range check passes.
 */
#define _TRYMAP_DEC(m, assi, val)                               \
    ((m)->map != NULL                                           \
     && (val) >= (m)->bottom                                    \
     && (val) <= (m)->top                                       \
     && ((assi) = (m)->map[(val) - (m)->bottom]) != UNIINV)

/* Look the pair (c1, c2) up in the file-scope table <charset>_decmap:
   c1 selects the row, c2 is the index within the row. */
#define TRYMAP_DEC(charset, assi, c1, c2)                       \
    _TRYMAP_DEC(&charset##_decmap[c1], assi, c2)

/* Same lookup, but the table is reached through codec->modstate. */
#define TRYMAP_DEC_ST(charset, assi, c1, c2)                    \
    _TRYMAP_DEC(&(codec->modstate->charset##_decmap)[c1], assi, c2)
227
/*
 * BEGIN_MAPPINGS_LIST(NUM) ... END_MAPPINGS_LIST define add_mappings(),
 * which allocates st->mapping_list with NUM zeroed entries and fills it
 * from the MAPPING_* lines placed in between.
 *
 * add_mappings() returns 0 on success.  If the allocation fails it sets
 * MemoryError and returns -1; PyMem_Calloc() does not set an exception
 * itself, so without this the caller would report failure with no error
 * set.  st->num_mappings is written only once the allocation succeeded,
 * so the count never describes a NULL array.
 */
#define BEGIN_MAPPINGS_LIST(NUM)                                    \
static int                                                          \
add_mappings(cjkcodecs_module_state *st)                            \
{                                                                   \
    int idx = 0;                                                    \
    (void)idx;                                                      \
    st->mapping_list = PyMem_Calloc(NUM, sizeof(struct dbcs_map));  \
    if (st->mapping_list == NULL) {                                 \
        PyErr_NoMemory();                                           \
        return -1;                                                  \
    }                                                               \
    st->num_mappings = NUM;

/* Each MAPPING_* line fills the next slot; the charset name is the
   stringified macro argument and an absent table is stored as NULL. */
#define MAPPING_ENCONLY(enc) \
    st->mapping_list[idx++] = (struct dbcs_map){#enc, (void*)enc##_encmap, NULL};
#define MAPPING_DECONLY(enc) \
    st->mapping_list[idx++] = (struct dbcs_map){#enc, NULL, (void*)enc##_decmap};
#define MAPPING_ENCDEC(enc) \
    st->mapping_list[idx++] = (struct dbcs_map){#enc, (void*)enc##_encmap, (void*)enc##_decmap};

/* The assert checks that exactly NUM MAPPING_* lines were given. */
#define END_MAPPINGS_LIST                               \
    assert(st->num_mappings == idx);                    \
    return 0;                                           \
}
251
/*
 * BEGIN_CODECS_LIST(NUM) ... END_CODECS_LIST define add_codecs(), which
 * allocates st->codec_list with NUM zeroed entries and fills it from the
 * CODEC_* lines placed in between.
 *
 * add_codecs() returns 0 on success.  If the allocation fails it sets
 * MemoryError and returns -1; PyMem_Calloc() does not set an exception
 * itself.  st->num_codecs is written only once the allocation succeeded,
 * so the count never describes a NULL array.
 */
#define BEGIN_CODECS_LIST(NUM)                                  \
static int                                                      \
add_codecs(cjkcodecs_module_state *st)                          \
{                                                               \
    int idx = 0;                                                \
    (void)idx;                                                  \
    st->codec_list = PyMem_Calloc(NUM, sizeof(MultibyteCodec)); \
    if (st->codec_list == NULL) {                               \
        PyErr_NoMemory();                                       \
        return -1;                                              \
    }                                                           \
    st->num_codecs = NUM;

/* Positional initializer fragments for the six method slots, in the
   order: encode, encode_init, encode_reset, decode, decode_init,
   decode_reset.  Stateless codecs leave the init/reset slots NULL. */
#define _STATEFUL_METHODS(enc)          \
    enc##_encode,                       \
    enc##_encode_init,                  \
    enc##_encode_reset,                 \
    enc##_decode,                       \
    enc##_decode_init,                  \
    enc##_decode_reset,
#define _STATELESS_METHODS(enc)         \
    enc##_encode, NULL, NULL,           \
    enc##_decode, NULL, NULL,

/* The next free slot of the codec array. */
#define NEXT_CODEC \
    st->codec_list[idx++]

/* Each CODEC_* line fills the next slot.  The first initializer is the
   stringified encoding name; the third is <enc>_codec_init for the _WINIT
   form and NULL otherwise. */
#define CODEC_STATEFUL(enc) \
    NEXT_CODEC = (MultibyteCodec){#enc, NULL, NULL, _STATEFUL_METHODS(enc)};
#define CODEC_STATELESS(enc) \
    NEXT_CODEC = (MultibyteCodec){#enc, NULL, NULL, _STATELESS_METHODS(enc)};
#define CODEC_STATELESS_WINIT(enc) \
    NEXT_CODEC = (MultibyteCodec){#enc, NULL, enc##_codec_init, _STATELESS_METHODS(enc)};

/* The assert checks that exactly NUM CODEC_* lines were given; afterwards
   every codec gets a back-pointer to the module state. */
#define END_CODECS_LIST                                 \
    assert(st->num_codecs == idx);                      \
    for (int i = 0; i < st->num_codecs; i++) {          \
        st->codec_list[i].modstate = st;                \
    }                                                   \
    return 0;                                           \
}
292
293
294
295 static PyObject *
296 getmultibytecodec(void)
297 {
298 return _PyImport_GetModuleAttrString("_multibytecodec", "__create_codec");
299 }
300
301 static void
302 destroy_codec_capsule(PyObject *capsule)
303 {
304 void *ptr = PyCapsule_GetPointer(capsule, CODEC_CAPSULE);
305 codec_capsule *data = (codec_capsule *)ptr;
306 Py_DECREF(data->cjk_module);
307 PyMem_Free(ptr);
308 }
309
310 static codec_capsule *
311 capsulate_codec(PyObject *mod, const MultibyteCodec *codec)
312 {
313 codec_capsule *data = PyMem_Malloc(sizeof(codec_capsule));
314 if (data == NULL) {
315 PyErr_NoMemory();
316 return NULL;
317 }
318 data->codec = codec;
319 data->cjk_module = Py_NewRef(mod);
320 return data;
321 }
322
323 static PyObject *
324 _getcodec(PyObject *self, const MultibyteCodec *codec)
325 {
326 PyObject *cofunc = getmultibytecodec();
327 if (cofunc == NULL) {
328 return NULL;
329 }
330
331 codec_capsule *data = capsulate_codec(self, codec);
332 if (data == NULL) {
333 Py_DECREF(cofunc);
334 return NULL;
335 }
336 PyObject *codecobj = PyCapsule_New(data, CODEC_CAPSULE,
337 destroy_codec_capsule);
338 if (codecobj == NULL) {
339 PyMem_Free(data);
340 Py_DECREF(cofunc);
341 return NULL;
342 }
343
344 PyObject *res = PyObject_CallOneArg(cofunc, codecobj);
345 Py_DECREF(codecobj);
346 Py_DECREF(cofunc);
347 return res;
348 }
349
350 static PyObject *
351 getcodec(PyObject *self, PyObject *encoding)
352 {
353 if (!PyUnicode_Check(encoding)) {
354 PyErr_SetString(PyExc_TypeError,
355 "encoding name must be a string.");
356 return NULL;
357 }
358 const char *enc = PyUnicode_AsUTF8(encoding);
359 if (enc == NULL) {
360 return NULL;
361 }
362
363 cjkcodecs_module_state *st = get_module_state(self);
364 for (int i = 0; i < st->num_codecs; i++) {
365 const MultibyteCodec *codec = &st->codec_list[i];
366 if (strcmp(codec->encoding, enc) == 0) {
367 return _getcodec(self, codec);
368 }
369 }
370
371 PyErr_SetString(PyExc_LookupError,
372 "no such codec is supported.");
373 return NULL;
374 }
375
376 static int add_mappings(cjkcodecs_module_state *);
377 static int add_codecs(cjkcodecs_module_state *);
378
379 static int
380 register_maps(PyObject *module)
381 {
382 // Init module state.
383 cjkcodecs_module_state *st = get_module_state(module);
384 if (add_mappings(st) < 0) {
385 return -1;
386 }
387 if (add_codecs(st) < 0) {
388 return -1;
389 }
390
391 for (int i = 0; i < st->num_mappings; i++) {
392 const struct dbcs_map *h = &st->mapping_list[i];
393 char mhname[256] = "__map_";
394 strcpy(mhname + sizeof("__map_") - 1, h->charset);
395
396 PyObject *capsule = PyCapsule_New((void *)h, MAP_CAPSULE, NULL);
397 if (capsule == NULL) {
398 return -1;
399 }
400 if (PyModule_AddObject(module, mhname, capsule) < 0) {
401 Py_DECREF(capsule);
402 return -1;
403 }
404 }
405 return 0;
406 }
407
#ifdef USING_BINARY_PAIR_SEARCH
/*
 * Binary-search `haystack` for the key (body << 16) | modifier.
 *
 * haystack      table to search; the search assumes it is sorted by
 *               ascending `uniseq`
 * haystacksize  number of entries in `haystack`
 *
 * Returns the `code` of the matching entry, or DBCINV if there is no match
 * (including when the table is empty).
 */
static DBCHAR
find_pairencmap(ucs2_t body, ucs2_t modifier,
                const struct pair_encodemap *haystack, int haystacksize)
{
    int pos, min, max;
    /* Widen before shifting.  If ucs2_t is narrower than int (presumably
       16 bits -- confirm in multibytecodec.h), `body << 16` would shift a
       promoted signed int, which is undefined once bit 15 is set. */
    Py_UCS4 value = ((Py_UCS4)body << 16) | modifier;

    /* With no entries the final haystack[pos] probe below would read out
       of bounds. */
    if (haystacksize <= 0) {
        return DBCINV;
    }

    min = 0;
    max = haystacksize;

    /* Narrow [min, max) around the key; stop when the window is empty or
       when the probe position can no longer move. */
    for (pos = haystacksize >> 1; min != max; pos = (min + max) >> 1) {
        if (value < haystack[pos].uniseq) {
            if (max != pos) {
                max = pos;
                continue;
            }
        }
        else if (value > haystack[pos].uniseq) {
            if (min != pos) {
                min = pos;
                continue;
            }
        }
        break;
    }

    if (value == haystack[pos].uniseq) {
        return haystack[pos].code;
    }
    return DBCINV;
}
#endif
441
#ifdef USING_IMPORTED_MAPS
/* Import the tables published as "__map_<charset>" by the module
   "_codecs_<locale>"; see importmap() for the arguments and result. */
#define IMPORT_MAP(locale, charset, encmap, decmap)                     \
    importmap("_codecs_" #locale, "__map_" #charset,                    \
              (const void**)encmap, (const void**)decmap)

/*
 * Import module `modname`, read its attribute `symbol` (which must be a
 * MAP_CAPSULE capsule wrapping a struct dbcs_map) and copy the table
 * pointers out of it.
 *
 * encmap, decmap  where to store the encode / decode table pointers; either
 *                 may be NULL to skip that table
 *
 * Returns 0 on success, -1 with an exception set on failure.  All object
 * references taken here are released on every path.
 */
static int
importmap(const char *modname, const char *symbol,
          const void **encmap, const void **decmap)
{
    int rc = -1;
    PyObject *o = NULL;

    PyObject *mod = PyImport_ImportModule(modname);
    if (mod == NULL) {
        return -1;
    }

    o = PyObject_GetAttrString(mod, symbol);
    if (o == NULL) {
        goto done;
    }
    if (!PyCapsule_IsValid(o, MAP_CAPSULE)) {
        /* `o` is still owned here; it is released at `done`. */
        PyErr_SetString(PyExc_ValueError,
                        "map data must be a Capsule.");
        goto done;
    }

    {
        const struct dbcs_map *map = PyCapsule_GetPointer(o, MAP_CAPSULE);
        if (encmap != NULL) {
            *encmap = map->encmap;
        }
        if (decmap != NULL) {
            *decmap = map->decmap;
        }
    }
    rc = 0;

done:
    Py_XDECREF(o);
    Py_DECREF(mod);
    return rc;
}
#endif
483
484 static int
485 _cjk_exec(PyObject *module)
486 {
487 return register_maps(module);
488 }
489
490 static void
491 _cjk_free(void *mod)
492 {
493 cjkcodecs_module_state *st = get_module_state((PyObject *)mod);
494 PyMem_Free(st->mapping_list);
495 PyMem_Free(st->codec_list);
496 }
497
498 static struct PyMethodDef _cjk_methods[] = {
499 {"getcodec", (PyCFunction)getcodec, METH_O, ""},
500 {NULL, NULL},
501 };
502
503 static PyModuleDef_Slot _cjk_slots[] = {
504 {Py_mod_exec, _cjk_exec},
505 {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
506 {0, NULL}
507 };
508
/*
 * I_AM_A_MODULE_FOR(loc) emits the module definition and the init function
 * PyInit__codecs_<loc> for the extension module "_codecs_<loc>".  The
 * module uses the shared method table, slots and free hook defined above
 * and reserves sizeof(cjkcodecs_module_state) bytes of module state.
 */
#define I_AM_A_MODULE_FOR(loc)                                          \
    static struct PyModuleDef _cjk_module = {                           \
        .m_base = PyModuleDef_HEAD_INIT,                                \
        .m_name = "_codecs_"#loc,                                       \
        .m_size = sizeof(cjkcodecs_module_state),                       \
        .m_methods = _cjk_methods,                                      \
        .m_slots = _cjk_slots,                                          \
        .m_free = _cjk_free,                                            \
    };                                                                  \
                                                                        \
    PyMODINIT_FUNC                                                      \
    PyInit__codecs_##loc(void)                                          \
    {                                                                   \
        return PyModuleDef_Init(&_cjk_module);                          \
    }
524
525 #endif