1 #!/usr/bin/env python3
2 #
3 # Original script modified in November 2003 to take advantage of
4 # the character-validation range routines, and updated to the
5 # current Unicode information (Version 4.0.1)
6 #
7 # NOTE: there is an 'alias' facility for blocks which are not present in
8 # the current release, but are needed for ABI compatibility. This
9 # must be accomplished MANUALLY! Please see the comments below under
10 # 'blockAliases'
11 #
12 import sys
13 import string
14 import time
15
# URL of the Unicode Character Database description and the two local data
# files (space separated: blocks file first, character data file second)
# that this script reads.
webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"

#
# blockAliases is a small hack - it maps block names which were used in
# the 3.1 release, but are missing or changed in the current release.
# The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
#
blockAliases = [
    "CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols",
    "Greek:GreekandCoptic",
    "PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A,"
    "SupplementaryPrivateUseArea-B",
]

# minTableSize gives the minimum number of ranges which must be present
# before a range table is produced.  If there are fewer than this
# number, inline comparisons are generated
minTableSize = 8

# Split the source list into the blocks file and the categories file.
blockfile, catfile = sources.split()
35
36
#
# Now process the "blocks" file, reducing it to a dictionary
# indexed by blockname, containing a list of (start, end) tuples
# with the applicable block range(s).  The bounds are kept as
# "0x"-prefixed hexadecimal strings.
#
BlockNames = {}
try:
    blocks = open(blockfile, "r")
except OSError:
    print("Missing %s, aborting ..." % blockfile)
    sys.exit(1)

# The context manager closes the file even if parsing raises unexpectedly.
with blocks:
    for line in blocks:
        # Comment lines are recognised before stripping, blank lines after.
        if line[0] == '#':
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            fields = line.split(';')
            # ValueError: the first field is not of the form "start..end";
            # IndexError: the line has no second (name) field.
            (start, end) = fields[0].strip().split("..")
            name = fields[1].strip().replace(' ', '')
        except (ValueError, IndexError):
            print("Failed to process line: %s" % (line))
            continue
        BlockNames.setdefault(name, []).append(("0x" + start, "0x" + end))
print("Parsed %d blocks descriptions" % (len(BlockNames)))
72
# Give every legacy alias the combined ranges of the current blocks it maps
# to.  A component block that was not found in the blocks file is reported
# and skipped; the alias entry is only created once a component is found.
for spec in blockAliases:
    parts = spec.split(':')
    old_name = parts[0]
    for component in parts[1].split(','):
        if component not in BlockNames:
            print("Alias %s: %s not in Blocks" % (old_name, component))
            continue
        target = BlockNames.setdefault(old_name, [])
        for span in BlockNames[component]:
            target.append(span)
85
#
# Next process the Categories file.  This is more complex, since
# the file is in code sequence, and we need to invert it.  We use
# a dictionary with index category-name, with each entry containing
# all the codepoints of that category.  Note that category
# names comprise two parts - the general category, and the "subclass"
# within that category.  Therefore, both "general category" (which is
# the first character of the 2-character category-name) and the full
# (2-character) name are entered into this dictionary.
#
try:
    data = open(catfile, "r")
except OSError:
    print("Missing %s, aborting ..." % catfile)
    sys.exit(1)

nbchar = 0
Categories = {}
# The context manager closes the categories file itself (the previous code
# closed the already-closed blocks file and leaked this one).
with data:
    for line in data:
        if line[0] == '#':
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            fields = line.split(';')
            # The first field is the code point in hexadecimal; int() rejects
            # malformed values instead of silently skipping bad digits.
            value = int(fields[0].strip(), 16)
            # The third field is the category name.
            name = fields[2]
            if name == '':
                raise ValueError("empty category name")
        except (ValueError, IndexError):
            print("Failed to process line: %s" % (line))
            continue

        nbchar = nbchar + 1
        # update entry for "full name"
        Categories.setdefault(name, []).append(value)
        # update "general category" name (skipped when it is the same key,
        # so a one-character name is not recorded twice)
        if name[0] != name:
            Categories.setdefault(name[0], []).append(value)

print("Parsed %d char generating %d categories" % (nbchar, len(Categories)))
148
#
# The data is now all read.  Time to process it into a more useful form.
#
def _collapse_to_ranges(values):
    """Turn a sequence of code points into a list of (low, high) runs.

    Consecutive values (each exactly one more than the previous) are merged
    into a single inclusive range; any other value starts a new range.  An
    empty input yields the single placeholder range (-1, -1).
    """
    spans = []
    first = -1
    last = -1
    for code in values:
        if first == -1:
            # very first value: open the initial run
            first = last = code
        elif code == last + 1:
            # contiguous: extend the current run
            last = code
        else:
            # gap: close the current run and open a new one
            spans.append((first, last))
            first = last = code
    # close whatever run is still open
    spans.append((first, last))
    return spans


# reduce the number list of every category into ranges
for cat_name in Categories:
    Categories[cat_name] = _collapse_to_ranges(Categories[cat_name])
182
#
# The generated tables are searched with a binary search, so the block
# names and the category names must both be emitted in sorted order.
#
bkeys = sorted(BlockNames)
ckeys = sorted(Categories)
190
#
# Generate the resulting files
#
# Both output files are opened up front; failure to open either one is
# reported and aborts the script with exit status 1.
try:
    header = open("include/libxml/xmlunicode.h", "w")
except OSError:
    print("Failed to open include/libxml/xmlunicode.h")
    sys.exit(1)

try:
    output = open("xmlunicode.c", "w")
except OSError:
    # Do not leave the already-opened header handle dangling.
    header.close()
    print("Failed to open xmlunicode.c")
    sys.exit(1)

# Generation timestamp (current local time) quoted in both file preambles.
date = time.asctime()
207
# Write the opening of the generated header: a banner comment quoting the
# database URL (webpage), the generation date and the source file names,
# followed by the include guard, the LIBXML_UNICODE_ENABLED guard and the
# opening of the extern "C" section.
header.write(
"""/*
* Summary: Unicode character APIs
* Description: API for the Unicode character APIs
*
* This file is automatically generated from the
* UCS description files of the Unicode Character Database
* %s
* using the genUnicode.py Python script.
*
* Generation date: %s
* Sources: %s
* Author: Daniel Veillard
*/

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#include <libxml/xmlversion.h>

#ifdef LIBXML_UNICODE_ENABLED

#ifdef __cplusplus
extern "C" {
#endif

""" % (webpage, date, sources));

# Write the opening of the generated C file: the same kind of banner, the
# includes, the xmlUnicodeRange / xmlUnicodeNameTable type definitions, the
# forward declaration of xmlUnicodeLookup, and the first line of the
# xmlUnicodeBlocks array (its entries are written by the code that follows).
output.write(
"""/*
* xmlunicode.c: this module implements the Unicode character APIs
*
* This file is automatically generated from the
* UCS description files of the Unicode Character Database
* %s
* using the genUnicode.py Python script.
*
* Generation date: %s
* Sources: %s
* Daniel Veillard <veillard@redhat.com>
*/

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
#include <libxml/chvalid.h>

typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted */

typedef struct {
const char *rangename;
xmlIntFunc *func;
} xmlUnicodeRange;

typedef struct {
const xmlUnicodeRange *table;
int numentries;
} xmlUnicodeNameTable;


static xmlIntFunc *xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname);

static const xmlUnicodeRange xmlUnicodeBlocks[] = {
""" % (webpage, date, sources));
277
# Entries of the xmlUnicodeBlocks array: the block name as written in the
# blocks file paired with its predicate, whose C identifier is the block
# name with the dashes removed.  Entries are separated by ",\n".
block_rows = [' {"%s", xmlUCSIs%s}' % (key, key.replace('-', ''))
              for key in bkeys]
output.write(',\n'.join(block_rows))
output.write('};\n\n')

# Same layout for the categories: name paired with xmlUCSIsCat<name>.
output.write('static const xmlUnicodeRange xmlUnicodeCats[] = {\n')
cat_rows = [' {"%s", xmlUCSIsCat%s}' % (key, key) for key in ckeys]
output.write(',\n'.join(cat_rows))
output.write('};\n\n')
297
#
# For any categories with more than minTableSize ranges we generate
# a range table suitable for xmlCharInRange
#
# Each such category gets an xml<Cat>S array for the ranges whose upper
# bound is below 0x10000 and/or an xml<Cat>L array for the other ranges,
# followed by an xmlChRangeGroup xml<Cat>G holding both counts and both
# array names ("NULL" for an array that was not produced).
for name in ckeys:
    if len(Categories[name]) > minTableSize:
        numshort = 0    # entries written to the xml<Cat>S array
        numlong = 0     # entries written to the xml<Cat>L array
        ranges = Categories[name]
        sptr = "NULL"
        lptr = "NULL"
        # pline holds the output line currently being assembled.
        for range in ranges:
            (low, high) = range
            if high < 0x10000:
                if numshort == 0:
                    # first short range: start the S array
                    pline = "static const xmlChSRange xml%sS[] = {" % name
                    sptr = "xml%sS" % name
                else:
                    pline += ","
                numshort += 1
            else:
                if numlong == 0:
                    # first long range: terminate the S array (if one was
                    # started) and start the L array
                    if numshort > 0:
                        output.write(pline + " };\n")
                    pline = "static const xmlChLRange xml%sL[] = {" % name
                    lptr = "xml%sL" % name
                else:
                    pline += ","
                numlong += 1
            # Flush the pending line once it is longer than 60 characters,
            # otherwise put a space after a separating comma.
            if len(pline) > 60:
                output.write(pline + "\n")
                pline = " "
            elif pline[-1:] == ",":
                pline += " "
            pline += "{%s, %s}" % (hex(low), hex(high))
        # NOTE(review): a short range appearing after a long one would be
        # appended to the L array - presumably the ranges are always in
        # ascending order; confirm against the input data.
        output.write(pline + " };\nstatic const xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
            % (name, numshort, numlong, sptr, lptr))
335
336
# Emit the two xmlUnicodeNameTable descriptors (array name plus number of
# entries, taken from len(BlockNames) and len(Categories)) and the C helper
# xmlUnicodeLookup, which binary-searches a table with strcmp and returns
# the matching range function or NULL.
# NOTE(review): the emitted C doc comment documents "@name" while the
# parameter is called "tname" - confirm whether the generated text should
# be corrected.
output.write(
"""static const xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
static const xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**
* xmlUnicodeLookup:
* @tptr: pointer to the name table
* @name: name to be found
*
* binary table lookup for user-supplied name
*
* Returns pointer to range function if found, otherwise NULL
*/
static xmlIntFunc
*xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname) {
int low, high, mid, cmp;
const xmlUnicodeRange *sptr;

if ((tptr == NULL) || (tname == NULL)) return(NULL);

low = 0;
high = tptr->numentries - 1;
sptr = tptr->table;
while (low <= high) {
mid = (low + high) / 2;
if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
return (sptr[mid].func);
if (cmp < 0)
high = mid - 1;
else
low = mid + 1;
}
return (NULL);
}

""" % (len(BlockNames), len(Categories)) )
373
# For every block, declare its predicate in the header and define it in the
# C file: the function returns the OR of one inclusive range comparison per
# (start, end) pair recorded for the block.
for blk in bkeys:
    ident = blk.replace('-', '')    # C identifier: block name without dashes
    header.write("XMLPUBFUN int xmlUCSIs%s\t(int code);\n" % ident)
    output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % ident)
    output.write(" *\n * Check whether the character is part of %s UCS Block\n"
                 % blk)
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIs%s(int code) {\n return(" % ident)
    comparisons = ["((code >= %s) && (code <= %s))" % (low, high)
                   for (low, high) in BlockNames[blk]]
    output.write(" ||\n ".join(comparisons))
    output.write(");\n}\n\n")
390
# Declare and define xmlUCSIsBlock, the by-name entry point for blocks: the
# emitted C looks the block name up in xmlUnicodeBlockTbl and calls the
# predicate found, returning -1 when the name is unknown.
header.write("\nXMLPUBFUN int xmlUCSIsBlock\t(int code, const char *block);\n\n")
output.write(
"""/**
* xmlUCSIsBlock:
* @code: UCS code point
* @block: UCS block name
*
* Check whether the character is part of the UCS Block
*
* Returns 1 if true, 0 if false and -1 on unknown block
*/
int
xmlUCSIsBlock(int code, const char *block) {
xmlIntFunc *func;

func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
if (func == NULL)
return (-1);
return (func(code));
}

""")
413
# For every category, declare its predicate in the header and define it in
# the C file.  Categories with more than minTableSize ranges defer to
# xmlCharInRange on their xml<Cat>G range group; the others are expanded
# into inline comparisons (an equality test for a single code point, a
# bounds test otherwise).
for cat in ckeys:
    spans = Categories[cat]
    header.write("XMLPUBFUN int xmlUCSIsCat%s\t(int code);\n" % cat)
    output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % cat)
    output.write(" *\n * Check whether the character is part of %s UCS Category\n"
                 % cat)
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIsCat%s(int code) {\n" % cat)
    if len(spans) > minTableSize:
        output.write(" return(xmlCharInRange((unsigned int)code, &xml%sG)"
                     % cat)
    elif spans:
        clauses = []
        for (low, high) in spans:
            if low == high:
                clauses.append("(code == %s)" % hex(low))
            else:
                clauses.append("((code >= %s) && (code <= %s))"
                               % (hex(low), hex(high)))
        output.write(" return(" + " ||\n ".join(clauses))
    output.write(");\n}\n\n")
440
# Declare and define xmlUCSIsCat, the by-name entry point for categories:
# the emitted C looks the category name up in xmlUnicodeCatTbl and calls the
# predicate found, returning -1 when the name is unknown.  The literal also
# closes the LIBXML_UNICODE_ENABLED guard of the C file.
header.write("\nXMLPUBFUN int xmlUCSIsCat\t(int code, const char *cat);\n")
output.write(
"""/**
* xmlUCSIsCat:
* @code: UCS code point
* @cat: UCS Category name
*
* Check whether the character is part of the UCS Category
*
* Returns 1 if true, 0 if false and -1 on unknown category
*/
int
xmlUCSIsCat(int code, const char *cat) {
xmlIntFunc *func;

func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
if (func == NULL)
return (-1);
return (func(code));
}

#endif /* LIBXML_UNICODE_ENABLED */
""")
464
# Finish the header: close the extern "C" section, the
# LIBXML_UNICODE_ENABLED guard and the include guard.
header.write("""
#ifdef __cplusplus
}
#endif

#endif /* LIBXML_UNICODE_ENABLED */

#endif /* __XML_UNICODE_H__ */
""");

# Both generated files are complete; close them.
header.close()
output.close()