1 """ Unicode Mapping Parser and Codec Generator.
2
3 This script parses Unicode mapping files as available from the Unicode
4 site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
5 modules from them. The codecs use the standard character mapping codec
6 to actually apply the mapping.
7
8 Synopsis: gencodec.py dir codec_prefix
9
10 All files in dir are scanned and those producing non-empty mappings
11 will be written to <codec_prefix><mapname>.py with <mapname> being the
12 first part of the map's filename ('a' in a.b.c.txt) converted to
13 lowercase with hyphens replaced by underscores.
14
15 The tool also writes marshalled versions of the mapping tables to the
16 same location (with .mapping extension).
17
18 Written by Marc-Andre Lemburg (mal@lemburg.com).
19
20 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
21 (c) Copyright Guido van Rossum, 2000.
22
23 Table generation:
24 (c) Copyright Marc-Andre Lemburg, 2005.
25 Licensed to PSF under a Contributor Agreement.
26
27 """#"
28
29 import re, os, marshal, codecs
30
# Largest key (table index) a generated charmap table may have
MAX_TABLE_SIZE = 8192

# Character used in tables for positions without a Unicode mapping
UNI_UNDEFINED = '\ufffe'

# Sentinel standing in for a code that is absent
MISSING_CODE = -1

# Matches one mapping line: the encoded code(s), whitespace, the
# optional Unicode code(s) or <meta> markers (codes are joined with
# '+'), and an optional trailing '#' comment.
# NOTE(review): the hex class in the second group reads 'a-fA-Z' rather
# than 'a-fA-F'; it looks like a typo, but narrowing it changes which
# lines are accepted -- confirm before touching it.
mapRE = re.compile(
    r'((?:0x[0-9a-fA-F]+\+?)+)'
    r'\s+'
    r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
    r'\s*'
    r'(#.+)?'
)
45
def parsecodes(codes, len=len, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of '+'-separated hexadecimal numbers, e.g.
        '0x41' or '0x81+0x40'.

        Parts which are not valid hexadecimal numbers are ignored;
        this covers meta-codes in angular brackets (e.g. <LR> and
        <RL>) and the empty part left by a trailing '+'.

        Returns the integer itself if exactly one valid code remains
        and a tuple of integers if several remain.  Empty input and
        input without any valid code are returned as MISSING_CODE.

        len and range default to the builtins of the same name and
        are only kept for signature compatibility; callers are not
        expected to pass them.

    """
    if not codes:
        return MISSING_CODE
    parsed = []
    for part in codes.split('+'):
        try:
            parsed.append(int(part, 16))
        except ValueError:
            # Meta-code or otherwise unparsable part: ignore it
            continue
    if not parsed:
        # Nothing usable was found (e.g. a lone '<LR>'); report this
        # the same way as empty input instead of returning ()
        return MISSING_CODE
    if len(parsed) == 1:
        return parsed[0]
    return tuple(parsed)
72
def readmap(filename):

    """ Reads the mapping file filename and returns the decoding map.

        The result is a dictionary mapping encoded codes (an integer
        or a tuple of integers, as returned by parsecodes()) to
        (unicode code, comment) tuples.

        Code points 0x00 - 0x1F and 0x7F start out as identity
        mappings commented 'CONTROL CHARACTER'; entries in the file
        override them.

        If at least as many single-byte codes turned out to be
        identity mappings as are left unmapped, the unmapped codes are
        entered explicitly as (MISSING_CODE, '') and the additional
        key 'IDENTITY' is set to 256.

    """
    with open(filename) as mapfile:
        rawlines = mapfile.readlines()

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    control_codes = tuple(range(32)) + (127,)
    enc2uni = {code: (code, 'CONTROL CHARACTER') for code in control_codes}
    identity = list(control_codes)
    unmapped = [code for code in range(256) if code not in control_codes]

    for rawline in rawlines:
        entry = rawline.strip()
        if not entry or entry.startswith('#'):
            continue
        match = mapRE.match(entry)
        if match is None:
            # Not a mapping line
            continue
        enc_text, uni_text, remark = match.groups()
        enc = parsecodes(enc_text)
        uni = parsecodes(uni_text)
        # Drop the leading '#' of the comment, if there is one
        comment = '' if remark is None else remark[1:].strip()
        if not isinstance(enc, tuple) and enc < 256:
            # Single-byte code: keep the identity/unmapped bookkeeping
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
        enc2uni[enc] = (uni, comment)

    # If there are at least as many identity-mapped entries as unmapped
    # entries, it pays to generate an identity dictionary first, and
    # add explicit undefined mappings for the rest
    if len(identity) >= len(unmapped):
        enc2uni.update((code, (MISSING_CODE, "")) for code in unmapped)
        enc2uni['IDENTITY'] = 256

    return enc2uni
122
def hexrepr(t, precision=4):

    """ Returns t formatted as hexadecimal source text.

        None is returned as 'None'.  An object without a length is
        formatted as a single '0x...' literal; anything with a length
        is formatted item by item as a parenthesised, comma separated
        sequence.  precision is the minimum number of (zero padded,
        upper case) hex digits.

        A TypeError raised while formatting the items is reported on
        stdout and re-raised.

    """
    if t is None:
        return 'None'
    template = '0x%0*X'
    try:
        len(t)
    except TypeError:
        # Not a sequence: format the single value
        return template % (precision, t)
    try:
        items = [template % (precision, item) for item in t]
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise
    return '(' + ', '.join(items) + ')'
137
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns a list of source lines which define the dictionary
        varname with the contents of map.

        If map contains the key 'IDENTITY', the generated dictionary
        starts out as codecs.make_identity_dict(range(n)), with n being
        the value stored under that key, and identity mappings below
        256 are left out.  The 'IDENTITY' key is removed from map in
        place.

        Keys and values given as (code, comment) tuples are unpacked;
        the comment is appended to the line if comments is true.
        precisions is the (key, value) pair of hex digit counts passed
        to hexrepr().

    """
    lines = []
    has_identity = "IDENTITY" in map
    if has_identity:
        lines.append("%s = codecs.make_identity_dict(range(%d))" %
                     (varname, map["IDENTITY"]))
        lines.append("%s.update({" % varname)
        del map["IDENTITY"]
    else:
        lines.append("%s = {" % varname)
    # A non-zero value means the currently open literal sits inside
    # an update() call and has to be closed with '})'
    splits = 1 if has_identity else 0

    entries = sorted(map.items())
    key_precision, value_precision = precisions
    count = 0
    for mapkey, mapvalue in entries:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            mapkey, mapcomment = mapkey
        if isinstance(mapvalue, tuple):
            mapvalue, mapcomment = mapvalue
        if mapkey is None:
            continue
        if has_identity and mapkey == mapvalue and mapkey < 256:
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            lines.append(' %s: %s,\t# %s' % (key, value, mapcomment))
        else:
            lines.append(' %s: %s,' % (key, value))
        count += 1
        if count == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            lines.append('})' if splits else '}')
            lines.append('%s.update({' % varname)
            count = 0
            splits += 1
    lines.append('})' if splits else '}')

    return lines
194
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns a list of source lines which define the decoding
        table varname (a tuple of characters indexed by code) for map.

        If map contains the key 'IDENTITY', the table starts out with
        identity entries for codes 0 - 255 and the key is removed from
        map in place.  Positions without a mapping, or mapped to
        MISSING_CODE, are filled with UNI_UNDEFINED.

        Keys and values given as (code, comment) tuples are unpacked;
        the comment is added to the line if comments is true, with the
        key formatted by hexrepr() using key_precision digits.

        Returns None if no table can be generated: the largest key
        exceeds MAX_TABLE_SIZE or a code maps to more than one
        character (1-n mapping).

    """
    table = {}
    # Consume the 'IDENTITY' marker before sorting: its string key
    # cannot be ordered against the integer codes and it must not
    # end up as an entry of the table.
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        del map['IDENTITY']

    # Analyze map and create table dict
    maxkey = 255
    for mapkey, mapvalue in sorted(map.items()):
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey == MISSING_CODE:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    l = ['%s = (' % varname]
    append = l.append
    maxchar = 0
    for key in range(maxkey + 1):
        mapvalue, mapcomment = table.get(key, (MISSING_CODE, 'UNDEFINED'))
        if mapvalue == MISSING_CODE:
            mapchar = UNI_UNDEFINED
        elif isinstance(mapvalue, tuple):
            # 1-n mappings not supported
            return None
        else:
            mapchar = chr(mapvalue)
        maxchar = max(maxchar, ord(mapchar))
        if mapcomment and comments:
            append(' %a \t# %s -> %s' % (mapchar,
                                         hexrepr(key, key_precision),
                                         mapcomment))
        else:
            append(' %a' % mapchar)

    if maxchar < 256:
        # All characters fit into one byte: add a wide character
        append(' %a \t## Widen to UCS2 for optimization' % UNI_UNDEFINED)
    append(')')
    return l
252
def codegen(name, map, encodingname, comments=1):

    """ Returns Python source for the given map.

        name is mentioned in the generated module docstring as the
        source of the mapping; encodingname is used in that docstring
        and, with underscores replaced by hyphens, as the name of the
        CodecInfo entry.

        A decoding table is emitted if python_tabledef_code() can
        build one; otherwise decoding and encoding maps are emitted.

        Comments are included in the source, if comments is true (default).

    """
    # Generate code.  The first call removes the 'IDENTITY' marker
    # from map in place, so the later calls see the map without it.
    decoding_map_code = python_mapdef_code(
        'decoding_map', map, comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table', map, comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map', codecs.make_encoding_map(map),
        comments=comments, precisions=(4, 2))

    use_table = bool(decoding_table_code)
    suffix = 'table' if use_table else 'map'

    codec_api = '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self, input, errors='strict'):
        return codecs.charmap_encode(input, errors, encoding_%s)

    def decode(self, input, errors='strict'):
        return codecs.charmap_decode(input, errors, decoding_%s)
''' % (encodingname, name, suffix, suffix)

    incremental_api = '''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input, self.errors, encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input, self.errors, decoding_%s)[0]''' % (
        suffix, suffix)

    registry_api = '''
class StreamWriter(Codec, codecs.StreamWriter):
    pass

class StreamReader(Codec, codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-')

    parts = [codec_api, incremental_api, registry_api]

    # Add decoding table or map (with preference to the table),
    # followed by the matching encoding table or map
    if use_table:
        parts.append('\n### Decoding Table\n')
        parts.extend(decoding_table_code)
        parts.append('\n### Encoding table\n'
                     'encoding_table = codecs.charmap_build(decoding_table)\n')
    else:
        parts.append('\n### Decoding Map\n')
        parts.extend(decoding_map_code)
        parts.append('\n### Encoding Map\n')
        parts.extend(encoding_map_code)

    # Final new-line
    parts.append('')

    return '\n'.join(parts).expandtabs()
357
def pymap(name,map,pyfile,encodingname,comments=1):

    """ Writes the codec source generated by codegen() for map to
        the file pyfile.

        name, encodingname and comments are passed on to codegen().

    """
    # Generate first, so that pyfile is only opened once the source
    # is available
    source = codegen(name, map, encodingname, comments)
    with open(pyfile, 'w') as out:
        out.write(source)
363
def marshalmap(name,map,marshalfile):

    """ Writes map as marshalled dictionary to the file marshalfile.

        Every value of map has to unpack into a (code, comment) pair;
        it is stored as a tuple.  name is not used.

    """
    table = {enc: (uni, comment) for enc, (uni, comment) in map.items()}
    with open(marshalfile, 'wb') as out:
        marshal.dump(table, out)
371
def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files found in the directory dir.

        For every regular file a codec module (.py) and a marshalled
        mapping (.mapping) are written.  The base name is the part of
        the file name before the first '.', with hyphens replaced by
        underscores, converted to lowercase and prefixed with
        nameprefix; dirprefix is prepended to both output file names.

        A ValueError raised during a conversion is reported on stdout
        and re-raised.

    """
    for mapname in os.listdir(dir):
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        basename = os.path.split(mapname)[1]
        stem = basename.replace('-','_').split('.')[0].lower()
        name = nameprefix + stem
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              dirprefix + codefile,
                                              dirprefix + marshalfile))
        try:
            map = readmap(os.path.join(dir, mapname))
            if map:
                pymap(mappathname, map, dirprefix + codefile, name, comments)
                marshalmap(mappathname, map, dirprefix + marshalfile)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise
399
def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerates the codec modules from the marshalled mappings
        (*.mapping files) found in the directory dir.

        The module for <name>.mapping is written to <name>.py, with
        dirprefix prepended to the file name.

        A ValueError raised during a conversion is reported on stdout;
        processing then continues with the next file.

    """
    extension = '.mapping'
    for mapname in os.listdir(dir):
        if not mapname.endswith(extension):
            continue
        name = mapname[:-len(extension)]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            with open(os.path.join(dir, mapname), 'rb') as f:
                map = marshal.load(f)
            if map:
                pymap(mapname, map, dirprefix + codefile, name, comments)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
419
if __name__ == '__main__':

    import sys

    # The command line arguments are passed on positionally.  The
    # constant condition selects the mode: conversion of mapping files
    # is active, regeneration from .mapping files is the alternative.
    args = sys.argv[1:]
    if True:
        convertdir(*args)
    else:
        rewritepythondir(*args)