#!/usr/bin/env python3

# Run this script like so:
#
#   ./update-gtranslit.py /path/to/glibc/localedata/locales > gtranslit-data.h

import os
import sys


localedir = sys.argv[1]


# returns True if the name looks like a POSIX locale name
def looks_like_locale(name):
    name, _, variant = name.partition("@")

    if "_" not in name:
        return False

    lang, _, land = name.partition("_")

    return (len(lang) == 2 or len(lang) == 3) and len(land) == 2
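
# For example, "ca_ES", "sr_RS@latin" and "bem_ZM" all look like locale
# names under this test, while glibc data files such as "i18n" or
# "translit_combining" do not.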


# handles <U1234> style escapes
def unescape(string):
    chunks = []

    n = len(string)
    i = 0

    while i < n:
        start_escape = string.find("<", i)

        if start_escape == -1:
            chunks.append(string[i:])
            break

        assert string[start_escape : (start_escape + 2)] == "<U"
        start_escape += 2

        end_escape = string.find(">", start_escape)
        assert end_escape != -1

        chunks.append(chr(int(string[start_escape:end_escape], 16)))
        i = end_escape + 1

    return "".join(chunks)
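
# For example, unescape('<U0041><U0308>x') yields 'A', then U+0308
# COMBINING DIAERESIS, then a literal 'x'.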


# Checks if a string is ASCII
def is_ascii(string):
    return all(ord(c) < 0x80 for c in string)


# A Mapping is a map from non-ascii strings to ascii strings.
#
# It corresponds to a sequence of one or more mapping lines:
#
#   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
#
# in a file.
class Mapping:
    def __init__(self):
        self.serialised = None
        self.mapping = {}

    # Scans a string like
    #
    #   <U00C4> "<U0041><U0308>";"<U0041><U0045>" % \
    #     LATIN CAPITAL LETTER A WITH DIAERESIS.
    #
    # and adds the first all-ascii choice (or IGNORE) to the mapping
    # dictionary, with the origin string as the key.  In the case of
    # IGNORE, stores the empty string.
    def consider_mapping_line(self, line):
        key, value, rest = (line + " % comment").split(maxsplit=2)

        key = unescape(key)

        for alternative in value.split(";"):
            if alternative[0] == '"' and alternative[-1] == '"':
                unescaped = unescape(alternative[1:-1])
                if is_ascii(unescaped):
                    self.mapping[key] = unescaped
                    break

            elif alternative[0] == "<" and alternative[-1] == ">":
                unescaped = unescape(alternative)
                if is_ascii(unescaped):
                    self.mapping[key] = unescaped
                    break

            elif alternative == "IGNORE":
                self.mapping[key] = ""
                break
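
    # For the example line above, the first alternative "<U0041><U0308>"
    # decodes to a non-ascii string, so the second alternative wins and
    # the entry stored is {'\u00c4': 'AE'}.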

    # Performs a normal dictionary merge, but ensures that there are no
    # conflicting entries between the original dictionary and the requested
    # changes
    def merge_mapping(self, changes):
        for key in changes.mapping:
            if key in self.mapping:
                assert self.mapping[key] == changes.mapping[key]

        self.mapping.update(changes.mapping)

    # Can't get much flatter...
    def get_flattened(self):
        return [self]

    def serialise(self, serialiser):
        if self.serialised is None:
            self.serialised = serialiser.add_mapping(self.mapping)

        return self.serialised


# A Chain is a sequence of mappings and chains.
#
# A chain contains another chain whenever "copy" or "include" is
# encountered in a source file.
#
# A chain contains a mapping whenever a sequence of mapping lines:
#
#   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
#
# is encountered in a file.
#
# The lookup order is reversed: later entries override earlier ones.
class Chain:
    def __init__(self, name):
        self.serialised = None
        self.name = name
        self.chain = []
        self.links = 0

        self.read_from_file(os.path.join(localedir, name))

    def read_from_file(self, filename):
        current_mapping = None
        in_lc_ctype = False
        in_translit = False

        fp = open(filename, encoding="ascii", errors="surrogateescape")

        for line in fp:
            line = line.strip()

            if in_lc_ctype:
                if line == "END LC_CTYPE":
                    break

                if line.startswith("copy") or line.startswith("include"):
                    if current_mapping:
                        self.chain.append(current_mapping)

                    copyname = unescape(line.split('"', 3)[1])
                    copyfile = get_chain(copyname)
                    self.chain.append(copyfile)
                    copyfile.links += 1

                    current_mapping = None

                elif line == "translit_start":
                    in_translit = True

                elif line == "translit_end":
                    in_translit = False

                elif in_translit and line.startswith("<U"):
                    if not current_mapping:
                        current_mapping = Mapping()

                    current_mapping.consider_mapping_line(line)

                elif line == "" or line.startswith("%"):
                    pass

                elif line == "default_missing <U003F>":
                    pass

                elif in_translit:
                    print("unknown line:", line)
                    assert False

            elif line == "LC_CTYPE":
                in_lc_ctype = True

        if current_mapping:
            self.chain.append(current_mapping)
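
    # As a rough sketch (not copied from any particular glibc file), the
    # parts of a locale file that this parser cares about look like:
    #
    #   LC_CTYPE
    #   translit_start
    #   include "translit_combining";""
    #   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
    #   translit_end
    #   END LC_CTYPE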

    # If there is only one link to this chain, we may as well just
    # return the contents of the chain so that they can be merged into
    # our sole parent directly.  Otherwise, return ourselves.
    def get_flattened(self):
        if self.links == 1:
            return sum((item.get_flattened() for item in self.chain), [])
        else:
            return [self]

    def serialise(self, serialiser):
        if self.serialised is None:
            # Before we serialise, see if we can optimise a bit
            self.chain = sum((item.get_flattened() for item in self.chain), [])

            i = 0
            while i < len(self.chain) - 1:
                if isinstance(self.chain[i], Mapping) and isinstance(
                    self.chain[i + 1], Mapping
                ):
                    # We have two mappings in a row.  Try to merge them.
                    self.chain[i].merge_mapping(self.chain[i + 1])
                    del self.chain[i + 1]
                else:
                    i += 1

            # If all that is left is one item, just serialise that directly
            if len(self.chain) == 1:
                self.serialised = self.chain[0].serialise(serialiser)
            else:
                ids = [item.serialise(serialiser) for item in self.chain]
                self.serialised = serialiser.add_chain(ids)

        return self.serialised


# Chain cache -- allows sharing of common chains
chains = {}


def get_chain(name):
    if name not in chains:
        chains[name] = Chain(name)

    return chains[name]
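
# Because chains are cached and shared, a file included by many locales
# (for example "i18n") is parsed only once; its links count then exceeds
# one, which keeps Chain.get_flattened() from inlining it into a parent.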


# Remove the country name from a locale, preserving the variant
# eg: 'sr_RS@latin' -> 'sr@latin'
def remove_country(string):
    base, at, variant = string.partition("@")
    lang, _, land = base.partition("_")
    return lang + at + variant


def encode_range(start, end):
    assert start <= end
    length = end - start

    assert start < 0x1000
    assert length < 0x8

    result = 0x8000 + (length << 12) + start

    assert result < 0x10000

    return result
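
# Worked example: a key starting at offset 5 and ending (exclusively) at
# offset 8 encodes as encode_range(5, 8) == 0x8000 + (3 << 12) + 5 == 0xb005,
# i.e. the top bit flags a range, the next three bits hold the length and
# the low twelve bits hold the start offset.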


def c_pair_array(array):
    return "{ " + ", ".join("{ %u, %u }" % pair for pair in array) + " };"
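
# For example, c_pair_array([(1, 2), (3, 4)]) returns the C initialiser
# "{ { 1, 2 }, { 3, 4 } };".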


class Serialiser:
    def __init__(self):
        self.mappings = []
        self.chains = []
        self.locales = {}

    def add_mapping(self, mapping):
        if mapping in self.mappings:
            mapping_id = self.mappings.index(mapping)
        else:
            mapping_id = len(self.mappings)
            self.mappings.append(mapping)

        assert mapping_id < 128
        return mapping_id

    def add_chain(self, chain):
        if chain in self.chains:
            chain_id = self.chains.index(chain)
        else:
            chain_id = len(self.chains)
            self.chains.append(chain)

        assert chain_id < 128
        return 128 + chain_id
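
    # Mappings and chains share a single 8-bit item-id space: mapping ids
    # are 0..127 and chain ids are 128..255, hence the assertions that
    # both indexes stay below 128.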

    def add_locale(self, name, item_id):
        self.locales[name] = item_id

    def add_default(self, item_id):
        self.default = item_id

    def optimise_locales(self):
        # Check if all regions of a language/variant agree
        languages = list(set(remove_country(locale) for locale in self.locales))

        for language in languages:
            locales = [
                locale for locale in self.locales if remove_country(locale) == language
            ]

            item_id = self.locales[locales[0]]
            if all(self.locales[locale] == item_id for locale in locales):
                self.locales[language] = item_id
                for locale in locales:
                    del self.locales[locale]

        # Check if a variant is the same as the non-variant form
        # eg: 'de@euro' and 'de'
        for variant in list(locale for locale in self.locales if "@" in locale):
            base, _, _ = variant.partition("@")
            if base in self.locales and self.locales[base] == self.locales[variant]:
                del self.locales[variant]

        # Eliminate any entries that are just the same as the C locale
        for locale in list(self.locales):
            if self.locales[locale] == self.default:
                del self.locales[locale]
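
    # As a sketch of the effect: if 'de_AT', 'de_CH' and 'de_DE' all
    # serialised to the same item, they are replaced by a single 'de'
    # entry; if 'de@euro' then matches 'de', the variant is dropped too.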

    def to_c(self):
        src_table = ""
        ascii_table = ""
        mappings_table = []
        mapping_ranges = []
        chains_table = []
        chain_starts = []
        locale_names = ""
        locale_index = []
        max_lookup = 0
        max_localename = 0

        for mapping in self.mappings:
            mapping_ranges.append((len(mappings_table), len(mapping)))

            for key in sorted(mapping):
                if len(key) == 1 and ord(key[0]) < 0x8000:
                    src_range = ord(key[0])
                else:
                    existing = src_table.find(key)
                    if existing == -1:
                        start = len(src_table)
                        assert all(ord(c) <= 0x10FFFF for c in key)
                        src_table += key
                        src_range = encode_range(start, len(src_table))
                        max_lookup = max(max_lookup, len(key))
                    else:
                        src_range = encode_range(existing, existing + len(key))

                value = mapping[key]
                if len(value) == 1 and ord(value[0]) < 0x80:
                    ascii_range = ord(value[0])
                else:
                    existing = ascii_table.find(value)
                    if existing == -1:
                        start = len(ascii_table)
                        assert all(ord(c) < 0x80 for c in value)
                        ascii_table += value
                        ascii_range = encode_range(start, len(ascii_table))
                    else:
                        ascii_range = encode_range(existing, existing + len(value))

                mappings_table.append((src_range, ascii_range))
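
        # Each chain is written out in reverse and terminated with 0xFF
        # (which is why item ids must stay below 0xFF); presumably the
        # reversal lets the reader see later, overriding entries first.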

        for chain in self.chains:
            chain_starts.append(len(chains_table))

            for item_id in reversed(chain):
                assert item_id < 0xFF
                chains_table.append(item_id)
            chains_table.append(0xFF)

        for locale in sorted(self.locales):
            max_localename = max(max_localename, len(locale))
            name_offset = len(locale_names)
            assert all(ord(c) <= 0x7F for c in locale)
            locale_names += locale + "\0"

            item_id = self.locales[locale]

            assert name_offset < 256
            assert item_id < 256
            locale_index.append((name_offset, item_id))

        print("/* Generated by update-gtranslit.py */")
        print("#define MAX_KEY_SIZE", max_lookup)
        print("#define MAX_LOCALE_NAME", max_localename)
        print(
            "static const gunichar src_table[] = {",
            ", ".join(str(ord(c)) for c in src_table),
            "};",
        )
        # cannot do this in plain ascii because of trigraphs... :(
        print(
            "static const gchar ascii_table[] = {",
            ", ".join(str(ord(c)) for c in ascii_table),
            "};",
        )
        print(
            "static const struct mapping_entry mappings_table[] =",
            c_pair_array(mappings_table),
        )
        print(
            "static const struct mapping_range mapping_ranges[] =",
            c_pair_array(mapping_ranges),
        )
        print(
            "static const guint8 chains_table[] = {",
            ", ".join(str(i) for i in chains_table),
            "};",
        )
        print(
            "static const guint8 chain_starts[] = {",
            ", ".join(str(i) for i in chain_starts),
            "};",
        )
        print(
            'static const gchar locale_names[] = "'
            + locale_names.replace("\0", "\\0")
            + '";'
        )
        print(
            "static const struct locale_entry locale_index[] = ",
            c_pair_array(locale_index),
        )
        print("static const guint8 default_item_id = %u;" % (self.default,))

    def dump(self):
        print(self.mappings)
        print(self.chains)
        print(self.locales)


locales = []
for name in os.listdir(localedir):
    if looks_like_locale(name):
        chain = get_chain(name)
        locales.append(chain)
        chain.links += 1

serialiser = Serialiser()

for locale in locales:
    serialiser.add_locale(locale.name, locale.serialise(serialiser))

i18n = get_chain("i18n").serialise(serialiser)
combining = get_chain("translit_combining").serialise(serialiser)
serialiser.add_default(serialiser.add_chain([i18n, combining]))

serialiser.optimise_locales()

serialiser.to_c()