1  #!/usr/bin/env python3
       2  #
       3  # fontconfig/fc-case/fc-case.py
       4  #
       5  # Copyright © 2004 Keith Packard
       6  # Copyright © 2019 Tim-Philipp Müller
       7  #
       8  # Permission to use, copy, modify, distribute, and sell this software and its
       9  # documentation for any purpose is hereby granted without fee, provided that
      10  # the above copyright notice appear in all copies and that both that
      11  # copyright notice and this permission notice appear in supporting
      12  # documentation, and that the name of the author(s) not be used in
      13  # advertising or publicity pertaining to distribution of the software without
      14  # specific, written prior permission.  The authors make no
      15  # representations about the suitability of this software for any purpose.  It
      16  # is provided "as is" without express or implied warranty.
      17  #
      18  # THE AUTHOR(S) DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
      19  # INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
      20  # EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY SPECIAL, INDIRECT OR
      21  # CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
      22  # DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
      23  # TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
      24  # PERFORMANCE OF THIS SOFTWARE.
      25  
      26  from enum import Enum
      27  import argparse
      28  import string
      29  import sys
      30  
      31  class ESC[4;38;5;81mCaseFoldClass(ESC[4;38;5;149mEnum):
      32      COMMON = 1
      33      FULL = 2
      34      SIMPLE = 3
      35      TURKIC = 4
      36  
      37  class ESC[4;38;5;81mCaseFoldMethod(ESC[4;38;5;149mEnum):
      38      RANGE = 0
      39      EVEN_ODD = 1
      40      FULL = 2
      41  
      42  caseFoldClassMap = {
      43    'C' : CaseFoldClass.COMMON,
      44    'F' : CaseFoldClass.FULL,
      45    'S' : CaseFoldClass.SIMPLE,
      46    'T' : CaseFoldClass.TURKIC
      47  }
      48  
      49  folds = []
      50  
      51  def ucs4_to_utf8(ucs4):
      52      utf8_rep = []
      53      
      54      if ucs4 < 0x80:
      55          utf8_rep.append(ucs4)
      56          bits = -6
      57      elif ucs4 < 0x800:
      58          utf8_rep.append(((ucs4 >> 6) & 0x1F) | 0xC0)
      59          bits = 0
      60      elif ucs4 < 0x10000:
      61          utf8_rep.append(((ucs4 >> 12) & 0x0F) | 0xE0)
      62          bits = 6
      63      elif ucs4 < 0x200000:
      64          utf8_rep.append(((ucs4 >> 18) & 0x07) | 0xF0)
      65          bits = 12
      66      elif ucs4 < 0x4000000:
      67          utf8_rep.append(((ucs4 >> 24) & 0x03) | 0xF8)
      68          bits = 18
      69      elif ucs4 < 0x80000000:
      70          utf8_rep.append(((ucs4 >> 30) & 0x01) | 0xFC)
      71          bits = 24
      72      else:
      73          return [];
      74  
      75      while bits >= 0:
      76          utf8_rep.append(((ucs4 >> bits) & 0x3F) | 0x80)
      77          bits-= 6
      78  
      79      return utf8_rep
      80  
      81  def utf8_size(ucs4):
      82      return len(ucs4_to_utf8(ucs4))
      83  
# C enumerator name written into the generated fcCaseFold table for each fold
# method.  The trailing comma is part of the string so that the name and its
# separator are padded together as one fixed-width column.
case_fold_method_name_map = {
    CaseFoldMethod.RANGE: 'FC_CASE_FOLD_RANGE,',
    CaseFoldMethod.EVEN_ODD: 'FC_CASE_FOLD_EVEN_ODD,',
    CaseFoldMethod.FULL: 'FC_CASE_FOLD_FULL,',
}
      89  
      90  if __name__=='__main__':
      91      parser = argparse.ArgumentParser()
      92      parser.add_argument('case_folding_file')
      93      parser.add_argument('--template', dest='template_file', default=None)
      94      parser.add_argument('--output', dest='output_file', default=None)
      95  
      96      args = parser.parse_args()
      97  
      98      minFoldChar = None
      99      maxFoldChar = None
     100      fold = None
     101  
     102      foldChars = []
     103      maxFoldChars = 0
     104  
     105      maxExpand = 0
     106  
     107      # Read the standard Unicode CaseFolding.txt file
     108      with open(args.case_folding_file, 'r', encoding='utf-8') as casefile:
     109          for cnt, line in enumerate(casefile):
     110              if not line or not line[0] in string.hexdigits:
     111                  continue
     112  
     113              # print('Line {}: {}'.format(cnt, line.strip()))
     114  
     115              tokens = line.split('; ')
     116  
     117              if len(tokens) < 3:
     118                  print('Not enough tokens in line {}'.format(cnt), file=sys.stderr)
     119                  sys.exit(1)
     120  
     121              # Get upper case value
     122              upper = int(tokens.pop(0), 16)
     123  
     124              # Get class
     125              cfclass = caseFoldClassMap[tokens.pop(0)]
     126  
     127              # Get list of result characters
     128              lower = list(map(lambda s: int(s,16), tokens.pop(0).split()))
     129  
     130              # print('\t----> {:04X} {} {}'.format(upper, cfclass, lower))
     131  
     132              if not minFoldChar:
     133                  minFoldChar = upper
     134  
     135              maxFoldChar = upper;
     136  
     137              if cfclass in [CaseFoldClass.COMMON, CaseFoldClass.FULL]:
     138                  if len(lower) == 1:
     139                      # foldExtends
     140                      if fold and fold['method'] == CaseFoldMethod.RANGE:
     141                          foldExtends = (lower[0] - upper) == fold['offset'] and upper == fold['upper'] + fold['count']
     142                      elif fold and fold['method'] == CaseFoldMethod.EVEN_ODD:
     143                          foldExtends = (lower[0] - upper) == 1 and upper == (fold['upper'] + fold['count'] + 1)
     144                      else:
     145                          foldExtends = False
     146  
     147                      if foldExtends:
     148                          # This modifies the last fold item in the array too
     149                          fold['count'] = upper - fold['upper'] + 1;
     150                      else:
     151                          fold = {}
     152                          fold['upper'] = upper
     153                          fold['offset'] = lower[0] - upper;
     154                          if fold['offset'] == 1:
     155                              fold['method'] = CaseFoldMethod.EVEN_ODD
     156                          else:
     157                              fold['method'] = CaseFoldMethod.RANGE
     158                          fold['count'] = 1
     159                          folds.append(fold)
     160                      expand = utf8_size (lower[0]) - utf8_size(upper)
     161                  else:
     162                      fold = {}
     163                      fold['upper'] = upper
     164                      fold['method'] = CaseFoldMethod.FULL
     165                      fold['offset'] = len(foldChars)
     166  
     167                      # add chars
     168                      for c in lower:
     169                          utf8_rep = ucs4_to_utf8(c)
     170                          # print('{} -> {}'.format(c,utf8_rep))
     171                          for utf8_char in utf8_rep:
     172                              foldChars.append(utf8_char)
     173  
     174                      fold['count'] = len(foldChars) - fold['offset']
     175                      folds.append(fold)
     176  
     177                      if fold['count'] > maxFoldChars:
     178                          maxFoldChars = fold['count']
     179  
     180                      expand = fold['count'] - utf8_size(upper)
     181                      if expand > maxExpand:
     182                          maxExpand = expand
     183  
     184      # Open output file
     185      if args.output_file:
     186          sys.stdout = open(args.output_file, 'w', encoding='utf-8')
     187  
     188      # Read the template file
     189      if args.template_file:
     190          tmpl_file = open(args.template_file, 'r', encoding='utf-8')
     191      else:
     192          tmpl_file = sys.stdin
     193      
     194      # Scan the input until the marker is found
     195      # FIXME: this is a bit silly really, might just as well harcode
     196      #        the license header in the script and drop the template
     197      for line in tmpl_file:
     198          if line.strip() == '@@@':
     199              break
     200          print(line, end='')
     201      
     202      # Dump these tables
     203      print('#define FC_NUM_CASE_FOLD\t{}'.format(len(folds)))
     204      print('#define FC_NUM_CASE_FOLD_CHARS\t{}'.format(len(foldChars)))
     205      print('#define FC_MAX_CASE_FOLD_CHARS\t{}'.format(maxFoldChars))
     206      print('#define FC_MAX_CASE_FOLD_EXPAND\t{}'.format(maxExpand))
     207      print('#define FC_MIN_FOLD_CHAR\t0x{:08x}'.format(minFoldChar))
     208      print('#define FC_MAX_FOLD_CHAR\t0x{:08x}'.format(maxFoldChar))
     209      print('')
     210  
     211      # Dump out ranges
     212      print('static const FcCaseFold    fcCaseFold[FC_NUM_CASE_FOLD] = {')
     213      for f in folds:
     214           short_offset = f['offset']
     215           if short_offset < -32367:
     216               short_offset += 65536
     217           if short_offset > 32368:
     218               short_offset -= 65536
     219           print('    {} 0x{:08x}, {:22s} 0x{:04x}, {:6d} {},'.format('{',
     220                 f['upper'], case_fold_method_name_map[f['method']],
     221                 f['count'], short_offset, '}'))
     222      print('};\n')
     223  
     224      # Dump out "other" values
     225      print('static const FcChar8\tfcCaseFoldChars[FC_NUM_CASE_FOLD_CHARS] = {')
     226      for n, c in enumerate(foldChars):
     227          if n == len(foldChars) - 1:
     228              end = ''
     229          elif n % 16 == 15:
     230              end = ',\n'
     231          else:
     232              end = ','
     233          print('0x{:02x}'.format(c), end=end)
     234      print('\n};')
     235  
     236      # And flush out the rest of the input file
     237      for line in tmpl_file:
     238          print(line, end='')
     239      
     240      sys.stdout.flush()