#!/usr/bin/env python3
#
# fontconfig/fc-lang/fc-lang.py
#
# Copyright © 2001-2002 Keith Packard
# Copyright © 2019 Tim-Philipp Müller
#
# Permission to use, copy, modify, distribute, and sell this software and its
# documentation for any purpose is hereby granted without fee, provided that
# the above copyright notice appear in all copies and that both that
# copyright notice and this permission notice appear in supporting
# documentation, and that the name of the author(s) not be used in
# advertising or publicity pertaining to distribution of the software without
# specific, written prior permission. The authors make no
# representations about the suitability of this software for any purpose. It
# is provided "as is" without express or implied warranty.
#
# THE AUTHOR(S) DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
# INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
# EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY SPECIAL, INDIRECT OR
# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.

# fc-lang
#
# Read a set of language orthographies and build C declarations for
# charsets which can then be used to identify which languages are
# supported by a given font.
#
# TODO: this code is not very pythonic; a lot of it is a 1:1 translation
# of the C code and could probably be simplified a bit
import argparse
import string
import sys
import os

# we just store the leaves in a dict; we can order the leaves later if needed
class CharSet:
    def __init__(self):
        self.leaves = {} # leaf_number -> leaf data (= 8 uint32)

    def add_char(self, ucs4):
        assert ucs4 < 0x01000000
        leaf_num = ucs4 >> 8
        if leaf_num in self.leaves:
            leaf = self.leaves[leaf_num]
        else:
            leaf = [0, 0, 0, 0, 0, 0, 0, 0] # 256/32 = 8
            self.leaves[leaf_num] = leaf
        leaf[(ucs4 & 0xff) >> 5] |= (1 << (ucs4 & 0x1f))
        #print('{:08x} [{:04x}] --> {}'.format(ucs4, ucs4>>8, leaf))
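    # Worked example (illustrative, not part of the original script): for
    # ucs4 = 0x0939 (DEVANAGARI LETTER HA), leaf_num is 0x09, the word index
    # within the leaf is (0x39 >> 5) == 1, and the bit set in that word is
    # 1 << (0x39 & 0x1f) == 1 << 25, i.e. leaf[1] |= 0x02000000.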

    def del_char(self, ucs4):
        assert ucs4 < 0x01000000
        leaf_num = ucs4 >> 8
        if leaf_num in self.leaves:
            leaf = self.leaves[leaf_num]
            leaf[(ucs4 & 0xff) >> 5] &= ~(1 << (ucs4 & 0x1f))
            # We don't bother removing the leaf if it's empty
            #print('{:08x} [{:04x}] --> {}'.format(ucs4, ucs4>>8, leaf))

    def equals(self, other_cs):
        keys = sorted(self.leaves.keys())
        other_keys = sorted(other_cs.leaves.keys())
        if len(keys) != len(other_keys):
            return False
        for k1, k2 in zip(keys, other_keys):
            if k1 != k2:
                return False
            if not leaves_equal(self.leaves[k1], other_cs.leaves[k2]):
                return False
        return True
# Convert a file name into a name suitable for C declarations
def get_name(file_name):
    return file_name.split('.')[0]

# Convert a C name into a language name
def get_lang(c_name):
    return c_name.replace('_', '-').replace(' ', '').lower()

def read_orth_file(file_name):
    lines = []
    with open(file_name, 'r', encoding='utf-8') as orth_file:
        for num, line in enumerate(orth_file):
            if line.startswith('include '):
                include_fn = line[8:].strip()
                lines += read_orth_file(include_fn)
            else:
                # strip comments (anything after '#' or a tab) and whitespace
                line = line.split('#')[0].strip()
                line = line.split('\t')[0].strip()
                # skip empty lines
                if line:
                    lines += [(file_name, num, line)]

    return lines

def leaves_equal(leaf1, leaf2):
    for v1, v2 in zip(leaf1, leaf2):
        if v1 != v2:
            return False
    return True

# Build a single charset from a source file
#
# The file format is quite simple: each line is either a single hex value
# or a range given as two hex values separated by '-' or '..'; a leading
# '-' removes the character(s) instead of adding them (see the illustrative
# excerpt below).
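#
# A hypothetical excerpt (for illustration only, not a real .orth file):
#
#   0041-005a     # A-Z
#   0061..007a    # a-z
#   00df          # LATIN SMALL LETTER SHARP S
#   -00d7         # remove MULTIPLICATION SIGN
#
# 'include' lines are resolved by read_orth_file() above; everything else
# reaches this function as (file name, line index, text) tuples.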
def parse_orth_file(file_name, lines):
    charset = CharSet()
    for fn, num, line in lines:
        delete_char = line.startswith('-')
        if delete_char:
            line = line[1:]
        if line.find('-') != -1:
            parts = line.split('-')
        elif line.find('..') != -1:
            parts = line.split('..')
        else:
            parts = [line]

        start = int(parts.pop(0), 16)
        end = start
        if parts:
            end = int(parts.pop(0), 16)
        if parts:
            print('ERROR: {} line {}: parse error (too many parts)'.format(fn, num))

        for ucs4 in range(start, end+1):
            if delete_char:
                charset.del_char(ucs4)
            else:
                charset.add_char(ucs4)

    assert charset.equals(charset) # sanity check for the equals function

    return charset

if __name__=='__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('orth_files', nargs='+', help='List of .orth files')
    parser.add_argument('--directory', dest='directory', default=None)
    parser.add_argument('--template', dest='template_file', default=None)
    parser.add_argument('--output', dest='output_file', default=None)

    args = parser.parse_args()

    sets = []
    names = []
    langs = []
    country = []

    total_leaves = 0

    LangCountrySets = {}

    # Open output file
    if args.output_file:
        sys.stdout = open(args.output_file, 'w', encoding='utf-8')

    # Read the template file
    if args.template_file:
        tmpl_file = open(args.template_file, 'r', encoding='utf-8')
    else:
        tmpl_file = sys.stdin

    # Change into source dir if specified (after opening other files)
    if args.directory:
        os.chdir(args.directory)

    orth_entries = {}
    for i, fn in enumerate(args.orth_files):
        orth_entries[fn] = i

    for fn in sorted(orth_entries.keys()):
        lines = read_orth_file(fn)
        charset = parse_orth_file(fn, lines)

        sets.append(charset)

        name = get_name(fn)
        names.append(name)

        lang = get_lang(name)
        langs.append(lang)
        if lang.find('-') != -1:
            country.append(orth_entries[fn]) # maps to original index
            language_family = lang.split('-')[0]
            if language_family not in LangCountrySets:
                LangCountrySets[language_family] = []
            LangCountrySets[language_family] += [orth_entries[fn]]

        total_leaves += len(charset.leaves)

    # Find unique leaves
    leaves = []
    for s in sets:
        for leaf_num in sorted(s.leaves.keys()):
            leaf = s.leaves[leaf_num]
            is_unique = True
            for existing_leaf in leaves:
                if leaves_equal(leaf, existing_leaf):
                    is_unique = False
                    break
            #print('unique: ', is_unique)
            if is_unique:
                leaves.append(leaf)

    # Find duplicate charsets
    duplicate = []
    for i, s in enumerate(sets):
        dup_num = None
        if i >= 1:
            for j, s_cmp in enumerate(sets):
                if j >= i:
                    break
                if s_cmp.equals(s):
                    dup_num = j
                    break

        duplicate.append(dup_num)

    tn = 0
    off = {}
    for i, s in enumerate(sets):
        if duplicate[i] is not None: # 0 is a valid index, so test against None
            continue
        off[i] = tn
        tn += len(s.leaves)
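    # Illustrative example (made-up numbers): if set 0 has 5 leaves, set 1 is
    # a duplicate of set 0 and set 2 has 3 leaves, then off == {0: 0, 2: 5}
    # and tn == 8; duplicate sets reuse the offset of their original below.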

    # Scan the input until the marker is found
    # FIXME: this is a bit silly really, might just as well hardcode
    # the license header in the script and drop the template
    for line in tmpl_file:
        if line.strip() == '@@@':
            break
        print(line, end='')
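    # A template might look roughly like this (illustrative; the real
    # template is whatever is passed via --template):
    #
    #   /* generated header boilerplate ... */
    #   @@@
    #   /* anything to append after the tables ... */
    #
    # Everything before the '@@@' marker has just been copied above; the
    # remainder is flushed after the tables at the end of this script.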

    print('/* total size: {} unique leaves: {} */\n'.format(total_leaves, len(leaves)))

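    # A note on the macros emitted below: LEAF0, OFF0 and NUM0 are the byte
    # offsets of the leaves, leaf_offsets and numbers arrays inside fcLangData;
    # OFF(s,o) and NUM(s,n) are expressed relative to SET(s), the charset
    # member of the s-th FcLangCharSet, and LEAF(o,l) relative to the o-th
    # leaf_offsets slot, so the generated table can be stored as constant
    # data without embedded pointers.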
    print('#define LEAF0 ({} * sizeof (FcLangCharSet))'.format(len(sets)))
    print('#define OFF0 (LEAF0 + {} * sizeof (FcCharLeaf))'.format(len(leaves)))
    print('#define NUM0 (OFF0 + {} * sizeof (uintptr_t))'.format(tn))
    print('#define SET(n) (n * sizeof (FcLangCharSet) + offsetof (FcLangCharSet, charset))')
    print('#define OFF(s,o) (OFF0 + o * sizeof (uintptr_t) - SET(s))')
    print('#define NUM(s,n) (NUM0 + n * sizeof (FcChar16) - SET(s))')
    print('#define LEAF(o,l) (LEAF0 + l * sizeof (FcCharLeaf) - (OFF0 + o * sizeof (intptr_t)))')
    print('#define fcLangCharSets (fcLangData.langCharSets)')
    print('#define fcLangCharSetIndices (fcLangData.langIndices)')
    print('#define fcLangCharSetIndicesInv (fcLangData.langIndicesInv)')

    assert len(sets) < 256 # FIXME: the FcChar8 index types below must become 16-bit if this is ever exceeded

    print('''
static const struct {{
    FcLangCharSet langCharSets[{}];
    FcCharLeaf leaves[{}];
    uintptr_t leaf_offsets[{}];
    FcChar16 numbers[{}];
    {} langIndices[{}];
    {} langIndicesInv[{}];
}} fcLangData = {{'''.format(len(sets), len(leaves), tn, tn,
                             'FcChar8 ', len(sets), 'FcChar8 ', len(sets)))

    # Dump sets
    print('{')
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            j = duplicate[i]
        else:
            j = i
        print('    {{ "{}", {{ FC_REF_CONSTANT, {}, OFF({},{}), NUM({},{}) }} }}, /* {} */'.format(
              langs[i], len(sets[j].leaves), i, off[j], i, off[j], i))

    print('},')

    # Dump leaves
    print('{')
    for l, leaf in enumerate(leaves):
        print('    {{ {{ /* {} */'.format(l), end='')
        for i in range(0, 8): # 256/32 = 8
            if i % 4 == 0:
                print('\n   ', end='')
            print(' 0x{:08x},'.format(leaf[i]), end='')
        print('\n    } },')
    print('},')

    # Dump leaf offsets
    print('{')
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            continue

        print('    /* {} */'.format(names[i]))

        for n, leaf_num in enumerate(sorted(s.leaves.keys())):
            leaf = s.leaves[leaf_num]
            if n % 4 == 0:
                print('   ', end='')
            found = [k for k, unique_leaf in enumerate(leaves) if leaves_equal(unique_leaf, leaf)]
            assert found, "Couldn't find leaf in unique leaves list!"
            assert len(found) == 1
            print(' LEAF({:3},{:3}),'.format(off[i], found[0]), end='')
            if n % 4 == 3:
                print('')
        if len(s.leaves) % 4 != 0:
            print('')

    print('},')

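    # Dump leaf numbers (the page number, ucs4 >> 8, of each charset's leaves)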
    print('{')
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            continue

        print('    /* {} */'.format(names[i]))

        for n, leaf_num in enumerate(sorted(s.leaves.keys())):
            leaf = s.leaves[leaf_num]
            if n % 8 == 0:
                print('   ', end='')
            print(' 0x{:04x},'.format(leaf_num), end='')
            if n % 8 == 7:
                print('')
        if len(s.leaves) % 8 != 0:
            print('')

    print('},')

    # langIndices
    print('{')
    for i, s in enumerate(sets):
        fn = '{}.orth'.format(names[i])
        print('    {}, /* {} */'.format(orth_entries[fn], names[i]))
    print('},')

    # langIndicesInv
    print('{')
    for i, k in enumerate(orth_entries.keys()):
        name = get_name(k)
        idx = names.index(name)
        print('    {}, /* {} */'.format(idx, name))
    print('}')

    print('};\n')

    print('#define NUM_LANG_CHAR_SET {}'.format(len(sets)))
    num_lang_set_map = (len(sets) + 31) // 32
    print('#define NUM_LANG_SET_MAP {}'.format(num_lang_set_map))

    # Dump indices with country codes
    assert len(country) > 0
    assert len(LangCountrySets) > 0
    print('')
    print('static const FcChar32 fcLangCountrySets[][NUM_LANG_SET_MAP] = {')
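    # Each language index occupies one bit in an array of NUM_LANG_SET_MAP
    # 32-bit words: index i goes into word (i >> 5), bit (i & 0x1f). For
    # example (made-up value), i = 37 would set bit 5 of word 1, 0x00000020.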
    for k in sorted(LangCountrySets.keys()):
        langset_map = [0] * num_lang_set_map # initialise all zeros
        for entries_id in LangCountrySets[k]:
            langset_map[entries_id >> 5] |= (1 << (entries_id & 0x1f))
        print('    {', end='')
        for v in langset_map:
            print(' 0x{:08x},'.format(v), end='')
        print(' }}, /* {} */'.format(k))

    print('};\n')
    print('#define NUM_COUNTRY_SET {}\n'.format(len(LangCountrySets)))

    # Find ranges for each letter for faster searching
    # Dump sets start/finish for the fastpath
    print('static const FcLangCharSetRange fcLangCharSetRanges[] = {\n')
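    # For illustration (made-up indices): if the sets whose names start with
    # 'b' occupy indices 10..17, the generated entry for 'b' is { 10, 17 };
    # letters with no matching set keep the { 9999, -1 } sentinel values.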
    for c in string.ascii_lowercase: # a-z
        start = 9999
        stop = -1
        for i, s in enumerate(sets):
            if names[i].startswith(c):
                start = min(start, i)
                stop = max(stop, i)
        print('    {{ {}, {} }}, /* {} */'.format(start, stop, c))
    print('};\n')

    # And flush out the rest of the input file
    for line in tmpl_file:
        print(line, end='')

    sys.stdout.flush()