(root)/
glib-2.79.0/
glib/
update-gtranslit.py
       1  #!/usr/bin/env python3
       2  
       3  # Run this script like so:
       4  #
       5  #  ./update-gtranslit.py /path/to/glibc/localedata/locales > gtranslit-data.h
       6  
       7  import os
       8  import sys
       9  
      10  
      11  localedir = sys.argv[1]
      12  
      13  
      14  # returns true if the name looks like a POSIX locale name
      15  def looks_like_locale(name):
      16      name, _, variant = name.partition("@")
      17  
      18      if "_" not in name:
      19          return False
      20  
      21      lang, _, land = name.partition("_")
      22  
      23      return len(lang) == 2 or len(lang) == 3 and len(land) == 2
      24  
      25  
      26  # handles <U1234> style escapes
      27  def unescape(string):
      28      chunks = []
      29  
      30      n = len(string)
      31      i = 0
      32  
      33      while i < n:
      34          start_escape = string.find("<", i)
      35  
      36          if start_escape == -1:
      37              chunks.append(string[i:])
      38              break
      39  
      40          assert string[start_escape : (start_escape + 2)] == "<U"
      41          start_escape += 2
      42  
      43          end_escape = string.find(">", start_escape)
      44          assert end_escape != -1
      45  
      46          chunks.append(chr(int(string[start_escape:end_escape], 16)))
      47          i = end_escape + 1
      48  
      49      return "".join(chunks)
      50  
      51  
      52  # Checks if a string is ascii
      53  def is_ascii(string):
      54      return all(ord(c) < 0x80 for c in string)
      55  
      56  
      57  # A Mapping is a map from non-ascii strings to ascii strings.
      58  #
      59  # It corresponds to a sequence of one or more mapping lines:
      60  #
      61  #   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
      62  #
      63  # in a file.
      64  class ESC[4;38;5;81mMapping:
      65      def __init__(self):
      66          self.serialised = None
      67          self.mapping = {}
      68  
      69      # Scans a string like
      70      #
      71      #   <U00C4> "<U0041><U0308>";"<U0041><U0045>" % \
      72      #   LATIN CAPITAL LETTER A WITH DIAERESIS.
      73      #
      74      # and adds the first all-ascii choice (or IGNORE) to the mapping
      75      # dictionary, with the origin string as the key.  In the case of
      76      # IGNORE, stores the empty string.
      77      def consider_mapping_line(self, line):
      78          key, value, rest = (line + " % comment").split(maxsplit=2)
      79  
      80          key = unescape(key)
      81  
      82          for alternative in value.split(";"):
      83              if alternative[0] == '"' and alternative[-1] == '"':
      84                  unescaped = unescape(alternative[1:-1])
      85                  if is_ascii(unescaped):
      86                      self.mapping[key] = unescaped
      87                      break
      88  
      89              elif alternative[0] == "<" and alternative[-1] == ">":
      90                  unescaped = unescape(alternative)
      91                  if is_ascii(unescaped):
      92                      self.mapping[key] = unescaped
      93                      break
      94  
      95              elif alternative == "IGNORE":
      96                  self.mapping[key] = ""
      97                  break
      98  
      99      # Performs a normal dictionary merge, but ensures that there are no
     100      # conflicting entries between the original dictionary and the requested
     101      # changes
     102      def merge_mapping(self, changes):
     103          for key in changes.mapping:
     104              if key in self.mapping:
     105                  assert self.mapping[key] == changes.mapping[key]
     106  
     107          self.mapping.update(changes.mapping)
     108  
     109      # Can't get much flatter...
     110      def get_flattened(self):
     111          return [self]
     112  
     113      def serialise(self, serialiser):
     114          if self.serialised is None:
     115              self.serialised = serialiser.add_mapping(self.mapping)
     116  
     117          return self.serialised
     118  
     119  
     120  # A Chain is a sequence of mappings and chains.
     121  #
     122  # A chain contains another chain whenever "copy" or "include" is
     123  # encountered in a source file.
     124  #
     125  # A chain contains a mapping whenever a sequence of mapping lines:
     126  #
     127  #   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
     128  #
     129  # is encountered in a file.
     130  #
     131  # The order of lookup is reverse: later entries override earlier ones.
     132  class ESC[4;38;5;81mChain:
     133      def __init__(self, name):
     134          self.serialised = None
     135          self.name = name
     136          self.chain = []
     137          self.links = 0
     138  
     139          self.read_from_file(os.path.join(localedir, name))
     140  
     141      def read_from_file(self, filename):
     142          current_mapping = None
     143          in_lc_ctype = False
     144          in_translit = False
     145  
     146          fp = open(filename, encoding="ascii", errors="surrogateescape")
     147  
     148          for line in fp:
     149              line = line.strip()
     150  
     151              if in_lc_ctype:
     152                  if line == "END LC_CTYPE":
     153                      break
     154  
     155                  if line.startswith("copy") or line.startswith("include"):
     156                      if current_mapping:
     157                          self.chain.append(current_mapping)
     158  
     159                      copyname = unescape(line.split('"', 3)[1])
     160                      copyfile = get_chain(copyname)
     161                      self.chain.append(copyfile)
     162                      copyfile.links += 1
     163  
     164                      current_mapping = None
     165  
     166                  elif line == "translit_start":
     167                      in_translit = True
     168  
     169                  elif line == "translit_end":
     170                      in_translit = False
     171  
     172                  elif in_translit and line.startswith("<U"):
     173                      if not current_mapping:
     174                          current_mapping = Mapping()
     175  
     176                      current_mapping.consider_mapping_line(line)
     177  
     178                  elif line == "" or line.startswith("%"):
     179                      pass
     180  
     181                  elif "default_missing <U003F>":
     182                      pass
     183  
     184                  elif in_translit:
     185                      print("unknown line:", line)
     186                      assert False
     187  
     188              elif line == "LC_CTYPE":
     189                  in_lc_ctype = True
     190  
     191          if current_mapping:
     192              self.chain.append(current_mapping)
     193  
     194      # If there is only one link to this chain, we may as well just
     195      # return the contents of the chain so that they can be merged into
     196      # our sole parent directly.  Otherwise, return ourselves.
     197      def get_flattened(self):
     198          if self.links == 1:
     199              return sum((item.get_flattened() for item in self.chain), [])
     200          else:
     201              return [self]
     202  
     203      def serialise(self, serialiser):
     204          if self.serialised is None:
     205              # Before we serialise, see if we can optimise a bit
     206              self.chain = sum((item.get_flattened() for item in self.chain), [])
     207  
     208              i = 0
     209              while i < len(self.chain) - 1:
     210                  if isinstance(self.chain[i], Mapping) and isinstance(
     211                      self.chain[i + 1], Mapping
     212                  ):
     213                      # We have two mappings in a row.  Try to merge them.
     214                      self.chain[i].merge_mapping(self.chain[i + 1])
     215                      del self.chain[i + 1]
     216                  else:
     217                      i += 1
     218  
     219              # If all that is left is one item, just serialise that directly
     220              if len(self.chain) == 1:
     221                  self.serialised = self.chain[0].serialise(serialiser)
     222              else:
     223                  ids = [item.serialise(serialiser) for item in self.chain]
     224                  self.serialised = serialiser.add_chain(ids)
     225  
     226          return self.serialised
     227  
     228  
     229  # Chain cache -- allows sharing of common chains
     230  chains = {}
     231  
     232  
     233  def get_chain(name):
     234      if name not in chains:
     235          chains[name] = Chain(name)
     236  
     237      return chains[name]
     238  
     239  
     240  # Remove the country name from a locale, preserving variant
     241  # eg: 'sr_RS@latin' -> 'sr@latin'
     242  def remove_country(string):
     243      base, at, variant = string.partition("@")
     244      lang, _, land = base.partition("_")
     245      return lang + at + variant
     246  
     247  
     248  def encode_range(start, end):
     249      assert start <= end
     250      length = end - start
     251  
     252      assert start < 0x1000
     253      assert length < 0x8
     254  
     255      result = 0x8000 + (length << 12) + start
     256  
     257      assert result < 0x10000
     258  
     259      return result
     260  
     261  
     262  def c_pair_array(array):
     263      return "{ " + ", ".join("{ %u, %u }" % pair for pair in array) + " };"
     264  
     265  
     266  class ESC[4;38;5;81mSerialiser:
     267      def __init__(self):
     268          self.mappings = []
     269          self.chains = []
     270          self.locales = {}
     271  
     272      def add_mapping(self, mapping):
     273          if mapping in self.mappings:
     274              mapping_id = self.mappings.index(mapping)
     275          else:
     276              mapping_id = len(self.mappings)
     277              self.mappings.append(mapping)
     278  
     279          assert mapping_id < 128
     280          return mapping_id
     281  
     282      def add_chain(self, chain):
     283          if chain in self.chains:
     284              chain_id = self.chains.index(chain)
     285          else:
     286              chain_id = len(self.chains)
     287              self.chains.append(chain)
     288  
     289          assert chain_id < 128
     290          return 128 + chain_id
     291  
     292      def add_locale(self, name, item_id):
     293          self.locales[name] = item_id
     294  
     295      def add_default(self, item_id):
     296          self.default = item_id
     297  
     298      def optimise_locales(self):
     299          # Check if all regions of a language/variant agree
     300          languages = list(set(remove_country(locale) for locale in self.locales))
     301  
     302          for language in languages:
     303              locales = [
     304                  locale for locale in self.locales if remove_country(locale) == language
     305              ]
     306  
     307              item_id = self.locales[locales[0]]
     308              if all(self.locales[locale] == item_id for locale in locales):
     309                  self.locales[language] = item_id
     310                  for locale in locales:
     311                      del self.locales[locale]
     312  
     313          # Check if a variant is the same as the non-variant form
     314          # eg: 'de@euro' and 'de'
     315          for variant in list(locale for locale in self.locales if "@" in locale):
     316              base, _, _ = variant.partition("@")
     317              if base in self.locales and self.locales[base] == self.locales[variant]:
     318                  del self.locales[variant]
     319  
     320          # Eliminate any entries that are just the same as the C locale
     321          for locale in list(self.locales):
     322              if self.locales[locale] == self.default:
     323                  del self.locales[locale]
     324  
     325      def to_c(self):
     326          src_table = ""
     327          ascii_table = ""
     328          mappings_table = []
     329          mapping_ranges = []
     330          chains_table = []
     331          chain_starts = []
     332          locale_names = ""
     333          locale_index = []
     334          max_lookup = 0
     335          max_localename = 0
     336  
     337          for mapping in self.mappings:
     338              mapping_ranges.append((len(mappings_table), len(mapping)))
     339  
     340              for key in sorted(mapping):
     341                  if len(key) == 1 and ord(key[0]) < 0x8000:
     342                      src_range = ord(key[0])
     343                  else:
     344                      existing = src_table.find(key)
     345                      if existing == -1:
     346                          start = len(src_table)
     347                          assert all(ord(c) <= 0x10FFFF for c in key)
     348                          src_table += key
     349                          src_range = encode_range(start, len(src_table))
     350                          max_lookup = max(max_lookup, len(key))
     351                      else:
     352                          src_range = encode_range(existing, existing + len(key))
     353  
     354                  value = mapping[key]
     355                  if len(value) == 1 and ord(value[0]) < 0x80:
     356                      ascii_range = ord(value[0])
     357                  else:
     358                      existing = ascii_table.find(value)
     359                      if existing == -1:
     360                          start = len(ascii_table)
     361                          assert all(ord(c) < 0x80 for c in value)
     362                          ascii_table += value
     363                          ascii_range = encode_range(start, len(ascii_table))
     364                      else:
     365                          ascii_range = encode_range(existing, existing + len(value))
     366  
     367                  mappings_table.append((src_range, ascii_range))
     368  
     369          for chain in self.chains:
     370              chain_starts.append(len(chains_table))
     371  
     372              for item_id in reversed(chain):
     373                  assert item_id < 0xFF
     374                  chains_table.append(item_id)
     375              chains_table.append(0xFF)
     376  
     377          for locale in sorted(self.locales):
     378              max_localename = max(max_localename, len(locale))
     379              name_offset = len(locale_names)
     380              assert all(ord(c) <= 0x7F for c in locale)
     381              locale_names += locale + "\0"
     382  
     383              item_id = self.locales[locale]
     384  
     385              assert name_offset < 256
     386              assert item_id < 256
     387              locale_index.append((name_offset, item_id))
     388  
     389          print("/* Generated by update-gtranslit.py */")
     390          print("#define MAX_KEY_SIZE", max_lookup)
     391          print("#define MAX_LOCALE_NAME", max_localename)
     392          print(
     393              "static const gunichar src_table[] = {",
     394              ", ".join(str(ord(c)) for c in src_table),
     395              "};",
     396          )
     397          # cannot do this in plain ascii because of trigraphs... :(
     398          print(
     399              "static const gchar ascii_table[] = {",
     400              ", ".join(str(ord(c)) for c in ascii_table),
     401              "};",
     402          )
     403          print(
     404              "static const struct mapping_entry mappings_table[] =",
     405              c_pair_array(mappings_table),
     406          )
     407          print(
     408              "static const struct mapping_range mapping_ranges[] =",
     409              c_pair_array(mapping_ranges),
     410          )
     411          print(
     412              "static const guint8 chains_table[] = {",
     413              ", ".join(str(i) for i in chains_table),
     414              "};",
     415          )
     416          print(
     417              "static const guint8 chain_starts[] = {",
     418              ", ".join(str(i) for i in chain_starts),
     419              "};",
     420          )
     421          print(
     422              'static const gchar locale_names[] = "'
     423              + locale_names.replace("\0", "\\0")
     424              + '";'
     425          )
     426          print(
     427              "static const struct locale_entry locale_index[] = ",
     428              c_pair_array(locale_index),
     429          )
     430          print("static const guint8 default_item_id = %u;" % (self.default,))
     431  
     432      def dump(self):
     433          print(self.mappings)
     434          print(self.chains)
     435          print(self.locales)
     436  
     437  
     438  locales = []
     439  for name in os.listdir(localedir):
     440      if looks_like_locale(name):
     441          chain = get_chain(name)
     442          locales.append(chain)
     443          chain.links += 1
     444  
     445  serialiser = Serialiser()
     446  
     447  for locale in locales:
     448      serialiser.add_locale(locale.name, locale.serialise(serialiser))
     449  
     450  i18n = get_chain("i18n").serialise(serialiser)
     451  combining = get_chain("translit_combining").serialise(serialiser)
     452  serialiser.add_default(serialiser.add_chain([i18n, combining]))
     453  
     454  serialiser.optimise_locales()
     455  
     456  serialiser.to_c()