1 #!/usr/bin/env python3
2 #
3 # fontconfig/fc-case/fc-case.py
4 #
5 # Copyright © 2004 Keith Packard
6 # Copyright © 2019 Tim-Philipp Müller
7 #
8 # Permission to use, copy, modify, distribute, and sell this software and its
9 # documentation for any purpose is hereby granted without fee, provided that
10 # the above copyright notice appear in all copies and that both that
11 # copyright notice and this permission notice appear in supporting
12 # documentation, and that the name of the author(s) not be used in
13 # advertising or publicity pertaining to distribution of the software without
14 # specific, written prior permission. The authors make no
15 # representations about the suitability of this software for any purpose. It
16 # is provided "as is" without express or implied warranty.
17 #
18 # THE AUTHOR(S) DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
19 # INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
20 # EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY SPECIAL, INDIRECT OR
21 # CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
22 # DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
23 # TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
24 # PERFORMANCE OF THIS SOFTWARE.
25
26 from enum import Enum
27 import argparse
28 import string
29 import sys
30
class CaseFoldClass(Enum):
    """Status classes of a CaseFolding.txt entry.

    Each member corresponds to one status letter of the data file; the
    letter-to-member mapping is kept in ``caseFoldClassMap``.
    """

    COMMON = 1   # status letter 'C'
    FULL = 2     # status letter 'F'
    SIMPLE = 3   # status letter 'S'
    TURKIC = 4   # status letter 'T'
36
class CaseFoldMethod(Enum):
    """How a fold record maps upper-case code points to their folded form.

    The members are translated to the C identifiers ``FC_CASE_FOLD_RANGE``,
    ``FC_CASE_FOLD_EVEN_ODD`` and ``FC_CASE_FOLD_FULL`` when the table is
    written out.
    """

    RANGE = 0      # run of consecutive code points sharing one offset
    EVEN_ODD = 1   # run of every second code point, each folding to the next one
    FULL = 2       # folded form stored as UTF-8 bytes in the fold-chars table
41
# Status letter found in the second field of a CaseFolding.txt data line,
# mapped to the CaseFoldClass member it stands for.
caseFoldClassMap = dict(
    C=CaseFoldClass.COMMON,
    F=CaseFoldClass.FULL,
    S=CaseFoldClass.SIMPLE,
    T=CaseFoldClass.TURKIC,
)

# Fold records (dicts with 'upper', 'method', 'offset' and 'count' keys),
# appended to by the script section in the order the data file is read.
folds = []
50
def ucs4_to_utf8(ucs4):
    """Encode one code point as a list of UTF-8 byte values.

    Besides the usual 1-4 byte sequences, the historical 5 and 6 byte
    forms are produced for values up to 0x7FFFFFFF.  Values of
    0x80000000 and above cannot be encoded and yield an empty list.
    """
    # Single-byte case: the value is its own encoding.
    if ucs4 < 0x80:
        return [ucs4]

    # One row per multi-byte length:
    #   (exclusive upper bound, lead-byte payload mask, lead-byte marker,
    #    shift of the first continuation byte)
    layouts = (
        (0x800, 0x1F, 0xC0, 0),
        (0x10000, 0x0F, 0xE0, 6),
        (0x200000, 0x07, 0xF0, 12),
        (0x4000000, 0x03, 0xF8, 18),
        (0x80000000, 0x01, 0xFC, 24),
    )

    for limit, payload_mask, marker, first_shift in layouts:
        if ucs4 >= limit:
            continue
        # The lead byte carries the bits just above the first continuation byte.
        encoded = [((ucs4 >> (first_shift + 6)) & payload_mask) | marker]
        # Each continuation byte carries six bits, most significant first.
        encoded.extend(
            ((ucs4 >> shift) & 0x3F) | 0x80
            for shift in range(first_shift, -1, -6)
        )
        return encoded

    return []
80
def utf8_size(ucs4):
    """Return the number of bytes ucs4_to_utf8() produces for *ucs4*.

    The result is 0 when ucs4_to_utf8() returns an empty list.
    """
    encoded = ucs4_to_utf8(ucs4)
    return len(encoded)
83
# C identifier written into the generated table for each fold method.
# The trailing comma is part of the string, so the fixed-width padding
# applied when a table row is formatted ends up after the comma.
case_fold_method_name_map = {
    CaseFoldMethod.RANGE:    'FC_CASE_FOLD_RANGE,',
    CaseFoldMethod.EVEN_ODD: 'FC_CASE_FOLD_EVEN_ODD,',
    CaseFoldMethod.FULL:     'FC_CASE_FOLD_FULL,',
}
89
if __name__ == '__main__':
    # Script entry point: parse CaseFolding.txt, build the fold tables and
    # write them into the template at its '@@@' marker line.
    #
    # Only this section needs ExitStack, so it is imported locally.
    from contextlib import ExitStack

    # Command line: the CaseFolding.txt path, plus optional template and
    # output paths (stdin / stdout are used when they are not given).
    parser = argparse.ArgumentParser()
    parser.add_argument('case_folding_file')
    parser.add_argument('--template', dest='template_file', default=None)
    parser.add_argument('--output', dest='output_file', default=None)

    args = parser.parse_args()

    minFoldChar = None   # first upper-case code point seen in the data file
    maxFoldChar = None   # last upper-case code point seen in the data file
    fold = None          # most recently created fold record, if any

    foldChars = []       # UTF-8 bytes of all multi-character fold results
    maxFoldChars = 0     # largest byte count of a single FULL fold

    maxExpand = 0        # largest growth in UTF-8 bytes caused by a fold

    # Read the standard Unicode CaseFolding.txt file
    with open(args.case_folding_file, 'r', encoding='utf-8') as casefile:
        for cnt, line in enumerate(casefile):
            # Data lines start with a hex code point; skip everything else.
            if not line or line[0] not in string.hexdigits:
                continue

            tokens = line.split('; ')

            if len(tokens) < 3:
                print('Not enough tokens in line {}'.format(cnt), file=sys.stderr)
                sys.exit(1)

            # Fields: upper case value; class letter; list of result characters
            upper = int(tokens[0], 16)
            cfclass = caseFoldClassMap[tokens[1]]
            lower = [int(s, 16) for s in tokens[2].split()]

            # Compare against None: a truthiness test would treat code
            # point 0 as "not set yet".
            if minFoldChar is None:
                minFoldChar = upper

            maxFoldChar = upper

            # Only COMMON and FULL entries contribute fold records.
            if cfclass not in (CaseFoldClass.COMMON, CaseFoldClass.FULL):
                continue

            if len(lower) == 1:
                delta = lower[0] - upper

                # Can this entry be merged into the previous record?
                if fold and fold['method'] == CaseFoldMethod.RANGE:
                    # Same offset and directly following the covered run.
                    foldExtends = (delta == fold['offset']
                                   and upper == fold['upper'] + fold['count'])
                elif fold and fold['method'] == CaseFoldMethod.EVEN_ODD:
                    # Offset 1 and one code point past the covered run
                    # (the skipped code point is the previous folded form).
                    foldExtends = (delta == 1
                                   and upper == fold['upper'] + fold['count'] + 1)
                else:
                    foldExtends = False

                if foldExtends:
                    # fold is the last item of folds, so this updates it in place
                    fold['count'] = upper - fold['upper'] + 1
                else:
                    fold = {
                        'upper': upper,
                        'offset': delta,
                        'method': (CaseFoldMethod.EVEN_ODD if delta == 1
                                   else CaseFoldMethod.RANGE),
                        'count': 1,
                    }
                    folds.append(fold)

                expand = utf8_size(lower[0]) - utf8_size(upper)
            else:
                # Multi-character result: store its UTF-8 bytes in foldChars;
                # 'offset' is the start index there, 'count' the byte length.
                fold = {
                    'upper': upper,
                    'method': CaseFoldMethod.FULL,
                    'offset': len(foldChars),
                }

                for c in lower:
                    foldChars.extend(ucs4_to_utf8(c))

                fold['count'] = len(foldChars) - fold['offset']
                folds.append(fold)

                if fold['count'] > maxFoldChars:
                    maxFoldChars = fold['count']

                expand = fold['count'] - utf8_size(upper)

            # Checked after both branches, which is the placement where the
            # single-character expansion computed above is actually used.
            if expand > maxExpand:
                maxExpand = expand

    # Without any data line there is nothing to format for the min/max
    # defines; stop before the output file gets created or truncated.
    if minFoldChar is None:
        print('No case folding entries found in {}'.format(args.case_folding_file),
              file=sys.stderr)
        sys.exit(1)

    # ExitStack closes whichever of the two files were actually opened,
    # also when an error occurs halfway through writing.
    with ExitStack() as stack:
        # Open output file
        if args.output_file:
            out = stack.enter_context(
                open(args.output_file, 'w', encoding='utf-8'))
        else:
            out = sys.stdout

        # Read the template file
        if args.template_file:
            tmpl_file = stack.enter_context(
                open(args.template_file, 'r', encoding='utf-8'))
        else:
            tmpl_file = sys.stdin

        # Scan the input until the marker is found
        # FIXME: this is a bit silly really, might just as well hardcode
        # the license header in the script and drop the template
        for line in tmpl_file:
            if line.strip() == '@@@':
                break
            print(line, end='', file=out)

        # Dump these tables
        print('#define FC_NUM_CASE_FOLD\t{}'.format(len(folds)), file=out)
        print('#define FC_NUM_CASE_FOLD_CHARS\t{}'.format(len(foldChars)), file=out)
        print('#define FC_MAX_CASE_FOLD_CHARS\t{}'.format(maxFoldChars), file=out)
        print('#define FC_MAX_CASE_FOLD_EXPAND\t{}'.format(maxExpand), file=out)
        print('#define FC_MIN_FOLD_CHAR\t0x{:08x}'.format(minFoldChar), file=out)
        print('#define FC_MAX_FOLD_CHAR\t0x{:08x}'.format(maxFoldChar), file=out)
        print('', file=out)

        # Dump out ranges
        print('static const FcCaseFold fcCaseFold[FC_NUM_CASE_FOLD] = {', file=out)
        for f in folds:
            # Wrap the offset into the signed 16-bit range -32768..32767.
            short_offset = f['offset']
            if short_offset < -32768:
                short_offset += 65536
            if short_offset > 32767:
                short_offset -= 65536
            print(' {} 0x{:08x}, {:22s} 0x{:04x}, {:6d} {},'.format(
                '{', f['upper'], case_fold_method_name_map[f['method']],
                f['count'], short_offset, '}'), file=out)
        print('};\n', file=out)

        # Dump out "other" values, sixteen bytes per line and without a
        # comma after the final one
        print('static const FcChar8\tfcCaseFoldChars[FC_NUM_CASE_FOLD_CHARS] = {',
              file=out)
        last_index = len(foldChars) - 1
        for n, c in enumerate(foldChars):
            if n == last_index:
                end = ''
            elif n % 16 == 15:
                end = ',\n'
            else:
                end = ','
            print('0x{:02x}'.format(c), end=end, file=out)
        print('\n};', file=out)

        # And flush out the rest of the input file (the iteration resumes
        # right after the marker line)
        for line in tmpl_file:
            print(line, end='', file=out)

        out.flush()