1 #!/usr/bin/env python3
2 # flake8: noqa: F821
3
4 """usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt
5
6 Input files:
7 * https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
8 * https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
9 * https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
10 * https://unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
11 * https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
12 * https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
13 * https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
14 * ms-use/IndicSyllabicCategory-Additional.txt
15 * ms-use/IndicPositionalCategory-Additional.txt
16 """
17
18 import logging
19 logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
20
21
22 import sys
23
# Require exactly nine input files, in the order given in the module
# docstring above; otherwise print the usage text and exit.
if len (sys.argv) != 10:
	sys.exit (__doc__)
26
# Scripts whose codepoints are parsed but dropped from the generated
# table (see the `combined` filter below); they are handled by other
# shapers, not the Universal Shaping Engine.
DISABLED_SCRIPTS = {
	'Arabic',
	'Lao',
	'Samaritan',
	'Syriac',
	'Thai',
}
34
files = [open (x, encoding='utf-8') for x in sys.argv[1:]]

# Capture the two-line version/date header of every input file except
# UnicodeData.txt (index 4), which has no header; these get echoed into
# the generated table's provenance comment.
headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 4]
# The two ms-use "Additional" files (indices 7 and 8) carry a longer
# free-form header that ends at the first blank line; append it in full.
# Their slots in `headers` are shifted down by one because index 4 was
# skipped in the comprehension above.
for j in range(7, 9):
	for line in files[j]:
		line = line.rstrip()
		if not line:
			break
		headers[j - 1].append(line)
headers.append (["UnicodeData.txt does not have a header."])
45
# unicode_data[i][codepoint] -> property value string for file i;
# values[i][value] -> rough count of codepoints carrying that value.
unicode_data = [{} for _ in files]
values = [{} for _ in files]
for i, f in enumerate (files):
	for line in f:

		# Strip trailing comments.
		j = line.find ('#')
		if j >= 0:
			line = line[:j]

		# Skip blank / comment-only lines (no ';'-separated fields).
		fields = [x.strip () for x in line.split (';')]
		if len (fields) == 1:
			continue

		# First field is a codepoint or a `XXXX..YYYY` range.
		uu = fields[0].split ('..')
		start = int (uu[0], 16)
		if len (uu) == 1:
			end = start
		else:
			end = int (uu[1], 16)

		# ArabicShaping.txt (i==2) and UnicodeData.txt (i==4) carry the
		# value of interest in their third field, everything else in the
		# second.
		t = fields[1 if i not in [2, 4] else 2]

		if i == 2:
			# Prefix Joining_Type values so they can't collide with
			# other property-value names.
			t = 'jt_' + t
		elif i == 3 and t != 'Default_Ignorable_Code_Point':
			# From DerivedCoreProperties.txt we only want DI.
			continue
		elif i == 7 and t == 'Consonant_Final_Modifier':
			# TODO: https://github.com/MicrosoftDocs/typography-issues/issues/336
			t = 'Syllable_Modifier'
		elif i == 8 and t == 'NA':
			t = 'Not_Applicable'

		# The two ms-use Additional files (7, 8) override the base UCD
		# files (0, 1) by folding into the same slot.
		i0 = i if i < 7 else i - 7
		for u in range (start, end + 1):
			unicode_data[i0][u] = t
		values[i0][t] = values[i0].get (t, 0) + end - start + 1
82
# Default value per property slot: UISC, UIPC, Joining_Type, DI, GC,
# Block, Script — same order as the per-file dicts above.
defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')

# Merge data into one dict:
for i,v in enumerate (defaults):
	values[i][v] = values[i].get (v, 0) + 1
# combined[codepoint] -> 7-element list of property values.  Only the
# shaping-relevant files (indices 0..3: UISC, UIPC, joining, DI) may
# create a new entry; GC/Block/Script (indices 4..6) only annotate
# codepoints already present.
combined = {}
for i,d in enumerate (unicode_data):
	for u,v in d.items ():
		if not u in combined:
			if i >= 4:
				continue
			combined[u] = list (defaults)
		combined[u][i] = v
# Drop codepoints belonging to scripts the USE shaper does not handle.
combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}
97
98
# Every property value the predicates below may reference.  Each name is
# interned as a PropertyValue singleton and injected into the module
# globals (see the loop after the class definition).
property_names = [
	# General_Category
	'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
	'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
	'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
	# Indic_Syllabic_Category
	'Other',
	'Bindu',
	'Visarga',
	'Avagraha',
	'Nukta',
	'Virama',
	'Pure_Killer',
	'Invisible_Stacker',
	'Vowel_Independent',
	'Vowel_Dependent',
	'Vowel',
	'Consonant_Placeholder',
	'Consonant',
	'Consonant_Dead',
	'Consonant_With_Stacker',
	'Consonant_Prefixed',
	'Consonant_Preceding_Repha',
	'Consonant_Succeeding_Repha',
	'Consonant_Subjoined',
	'Consonant_Medial',
	'Consonant_Final',
	'Consonant_Head_Letter',
	'Consonant_Initial_Postfixed',
	'Modifying_Letter',
	'Tone_Letter',
	'Tone_Mark',
	'Gemination_Mark',
	'Cantillation_Mark',
	'Register_Shifter',
	'Syllable_Modifier',
	'Consonant_Killer',
	'Non_Joiner',
	'Joiner',
	'Number_Joiner',
	'Number',
	'Brahmi_Joining_Number',
	'Symbol_Modifier',
	'Hieroglyph',
	'Hieroglyph_Joiner',
	'Hieroglyph_Mark_Begin',
	'Hieroglyph_Mark_End',
	'Hieroglyph_Mirror',
	'Hieroglyph_Modifier',
	'Hieroglyph_Segment_Begin',
	'Hieroglyph_Segment_End',
	# Indic_Positional_Category
	'Not_Applicable',
	'Right',
	'Left',
	'Visual_Order_Left',
	'Left_And_Right',
	'Top',
	'Bottom',
	'Top_And_Bottom',
	'Top_And_Bottom_And_Left',
	'Top_And_Right',
	'Top_And_Left',
	'Top_And_Left_And_Right',
	'Bottom_And_Left',
	'Bottom_And_Right',
	'Top_And_Bottom_And_Right',
	'Overstruck',
	# Joining_Type
	'jt_C',
	'jt_D',
	'jt_L',
	'jt_R',
	'jt_T',
	'jt_U',
	'jt_X',
]
176
class PropertyValue:
	"""Interned property-value name that compares equal to its string.

	Instances are injected into the module globals (see below) so the
	predicates can write e.g. ``UISC == Consonant``.  Equality accepts
	either a plain ``str`` or another PropertyValue, and ``__hash__``
	matches ``hash(str(self))`` so a PropertyValue and its name are
	interchangeable as dict/set keys.
	"""
	def __init__(self, name_):
		self.name = name_
	def __str__(self):
		return self.name
	def __eq__(self, other):
		# Compare by name, against either a string or a PropertyValue.
		return self.name == (other if isinstance(other, str) else other.name)
	def __ne__(self, other):
		return not (self == other)
	def __hash__(self):
		return hash(str(self))
188
property_values = {}

# Intern each name and inject the singletons into the module globals so
# the predicates below can reference them bare (`Consonant`, `jt_C`, …);
# the `# flake8: noqa: F821` at the top of the file silences the
# resulting undefined-name warnings.
for name in property_names:
	value = PropertyValue(name)
	assert value not in property_values
	# `in globals()` tests dict keys; works because PropertyValue.__eq__
	# and __hash__ make a value equivalent to its name string.
	assert value not in globals()
	property_values[name] = value
globals().update(property_values)
197
198
# One predicate per USE category.  Arguments: U = codepoint,
# UISC = Indic_Syllabic_Category, UDI = Default_Ignorable flag,
# UGC = General_Category, AJT = Joining_Type (prefixed 'jt_').
# map_to_use() asserts that exactly one predicate matches per codepoint,
# so these are written to be mutually exclusive; several call each other
# to carve out exceptions.
def is_BASE(U, UISC, UDI, UGC, AJT):
	return (UISC in [Number, Consonant, Consonant_Head_Letter,
			Tone_Letter,
			Vowel_Independent,
			] or
		# TODO: https://github.com/MicrosoftDocs/typography-issues/issues/484
		AJT in [jt_C, jt_D, jt_L, jt_R] and UISC != Joiner or
		(UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
					Consonant_Subjoined, Vowel, Vowel_Dependent]))
def is_BASE_NUM(U, UISC, UDI, UGC, AJT):
	return UISC == Brahmi_Joining_Number
def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
	if UISC == Consonant_Placeholder: return True
	# Generic bases: HORIZONTAL BAR, BULLET, and the white/black squares.
	return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
def is_CGJ(U, UISC, UDI, UGC, AJT):
	# Also includes VARIATION_SELECTOR and ZWJ
	return UISC == Joiner or UDI and UGC in [Mc, Me, Mn]
def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
	return ((UISC == Consonant_Final and UGC != Lo) or
		UISC == Consonant_Succeeding_Repha)
def is_CONS_FINAL_MOD(U, UISC, UDI, UGC, AJT):
	return UISC == Syllable_Modifier
def is_CONS_MED(U, UISC, UDI, UGC, AJT):
	# Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
	return (UISC == Consonant_Medial and UGC != Lo or
		UISC == Consonant_Initial_Postfixed)
def is_CONS_MOD(U, UISC, UDI, UGC, AJT):
	return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
def is_CONS_SUB(U, UISC, UDI, UGC, AJT):
	return UISC == Consonant_Subjoined and UGC != Lo
def is_CONS_WITH_STACKER(U, UISC, UDI, UGC, AJT):
	return UISC == Consonant_With_Stacker
def is_HALANT(U, UISC, UDI, UGC, AJT):
	# Viramas, minus the one carved out as HVM below.
	return UISC == Virama and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT)
def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT):
	# Split off of HALANT
	# U+0DCA SINHALA SIGN AL-LAKUNA
	return U == 0x0DCA
def is_HALANT_NUM(U, UISC, UDI, UGC, AJT):
	return UISC == Number_Joiner
def is_HIEROGLYPH(U, UISC, UDI, UGC, AJT):
	return UISC == Hieroglyph
def is_HIEROGLYPH_JOINER(U, UISC, UDI, UGC, AJT):
	return UISC == Hieroglyph_Joiner
def is_HIEROGLYPH_MIRROR(U, UISC, UDI, UGC, AJT):
	return UISC == Hieroglyph_Mirror
def is_HIEROGLYPH_MOD(U, UISC, UDI, UGC, AJT):
	return UISC == Hieroglyph_Modifier
def is_HIEROGLYPH_SEGMENT_BEGIN(U, UISC, UDI, UGC, AJT):
	return UISC in [Hieroglyph_Mark_Begin, Hieroglyph_Segment_Begin]
def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
	return UISC in [Hieroglyph_Mark_End, Hieroglyph_Segment_End]
def is_INVISIBLE_STACKER(U, UISC, UDI, UGC, AJT):
	# Split off of HALANT
	return (UISC == Invisible_Stacker
		and not is_SAKOT(U, UISC, UDI, UGC, AJT)
	)
def is_ZWNJ(U, UISC, UDI, UGC, AJT):
	return UISC == Non_Joiner
def is_OTHER(U, UISC, UDI, UGC, AJT):
	# Also includes BASE_IND and SYM
	return ((UGC == Po or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
		and not is_BASE(U, UISC, UDI, UGC, AJT)
		and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
		and not is_CGJ(U, UISC, UDI, UGC, AJT)
		and not is_SYM_MOD(U, UISC, UDI, UGC, AJT)
		and not is_Word_Joiner(U, UISC, UDI, UGC, AJT)
	)
def is_REPHA(U, UISC, UDI, UGC, AJT):
	return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]
def is_SAKOT(U, UISC, UDI, UGC, AJT):
	# Split off of HALANT
	# U+1A60 TAI THAM SIGN SAKOT
	return U == 0x1A60
def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
	return UISC == Symbol_Modifier
def is_VOWEL(U, UISC, UDI, UGC, AJT):
	return (UISC == Pure_Killer or
		UGC != Lo and UISC in [Vowel, Vowel_Dependent])
def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
	return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
		UGC != Lo and UISC == Bindu)
def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
	# Also includes Rsv
	# The excluded list: Hangul fillers (U+115F, U+1160, U+3164, U+FFA0)
	# and Duployan shorthand format controls (U+1BCA0..U+1BCA3).
	return (UDI and U not in [0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3]
		and UISC == Other
		and not is_CGJ(U, UISC, UDI, UGC, AJT)
	) or UGC == Cn
285
# USE category code -> classifying predicate.  The codes become the
# C macro names (and, with positional suffixes, the table entries) in
# the generated header.
use_mapping = {
	'B': is_BASE,
	'N': is_BASE_NUM,
	'GB': is_BASE_OTHER,
	'CGJ': is_CGJ,
	'F': is_CONS_FINAL,
	'FM': is_CONS_FINAL_MOD,
	'M': is_CONS_MED,
	'CM': is_CONS_MOD,
	'SUB': is_CONS_SUB,
	'CS': is_CONS_WITH_STACKER,
	'H': is_HALANT,
	'HVM': is_HALANT_OR_VOWEL_MODIFIER,
	'HN': is_HALANT_NUM,
	'IS': is_INVISIBLE_STACKER,
	'G': is_HIEROGLYPH,
	'HM': is_HIEROGLYPH_MOD,
	'HR': is_HIEROGLYPH_MIRROR,
	'J': is_HIEROGLYPH_JOINER,
	'SB': is_HIEROGLYPH_SEGMENT_BEGIN,
	'SE': is_HIEROGLYPH_SEGMENT_END,
	'ZWNJ': is_ZWNJ,
	'O': is_OTHER,
	'R': is_REPHA,
	'Sk': is_SAKOT,
	'SM': is_SYM_MOD,
	'V': is_VOWEL,
	'VM': is_VOWEL_MOD,
	'WJ': is_Word_Joiner,
}
316
# Positional sub-classification per USE category: suffix -> the list of
# Indic_Positional_Category values it covers (e.g. 'V' + 'Abv' -> VAbv).
# A value of None means the category participates in positional checks
# but gets no suffix.  Categories absent here skip positional
# resolution entirely.
use_positions = {
	'F': {
		'Abv': [Top],
		'Blw': [Bottom],
		'Pst': [Right],
	},
	'M': {
		'Abv': [Top],
		'Blw': [Bottom, Bottom_And_Left, Bottom_And_Right],
		'Pst': [Right],
		'Pre': [Left, Top_And_Bottom_And_Left],
	},
	'CM': {
		'Abv': [Top],
		'Blw': [Bottom, Overstruck],
	},
	'V': {
		'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
		'Blw': [Bottom, Overstruck, Bottom_And_Right],
		'Pst': [Right],
		'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
	},
	'VM': {
		'Abv': [Top],
		'Blw': [Bottom, Overstruck],
		'Pst': [Right],
		'Pre': [Left],
	},
	'SM': {
		'Abv': [Top],
		'Blw': [Bottom],
	},
	'H': None,
	'HM': None,
	'HR': None,
	'HVM': None,
	'IS': None,
	'B': None,
	'FM': {
		'Abv': [Top],
		'Blw': [Bottom],
		'Pst': [Not_Applicable],
	},
	'R': None,
	'SUB': None,
}
363
def map_to_use(data):
	"""Classify every codepoint into its USE category.

	`data` maps codepoint -> (UISC, UIPC, AJT, UDI, UGC, Block, Script).
	Returns a dict of codepoint -> (USE category string, Block name),
	where the category may carry a positional suffix (e.g. 'VAbv').
	Asserts that exactly one category predicate matches each codepoint.
	"""
	result = {}
	predicates = use_mapping.items()
	for u, (uisc, uipc, ajt, udi, ugc, block, _) in data.items():

		# Resolve Indic_Syllabic_Category

		# TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
		if 0x1CE2 <= u <= 0x1CE8:
			uisc = Cantillation_Mark

		# Tibetan:
		# TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
		if 0x0F18 <= u <= 0x0F19 or 0x0F3E <= u <= 0x0F3F:
			uisc = Vowel_Dependent

		# TODO: U+1CED should only be allowed after some of
		# the nasalization marks, maybe only for U+1CE9..U+1CF1.
		if u == 0x1CED:
			uisc = Tone_Mark

		matches = [name for name, pred in predicates if pred(u, uisc, udi, ugc, ajt)]
		assert len(matches) == 1, "%s %s %s %s %s %s" % (hex(u), uisc, udi, ugc, ajt, matches)
		category = matches[0]

		# Resolve Indic_Positional_Category

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
		# and https://github.com/harfbuzz/harfbuzz/issues/1631
		if u in [0x11302, 0x11303, 0x114C1]:
			uipc = Top

		assert (uipc in [Not_Applicable, Visual_Order_Left] or u == 0x0F7F or
			category in use_positions), "%s %s %s %s %s %s %s" % (hex(u), uipc, category, uisc, udi, ugc, ajt)

		# Append the positional suffix, if this category takes one.
		positions = use_positions.get(category, None)
		if positions:
			matches = [suffix for suffix, vals in positions.items() if vals and uipc in vals]
			assert len(matches) == 1, "%s %s %s %s %s %s %s %s" % (hex(u), uipc, category, uisc, udi, ugc, ajt, matches)
			category += matches[0]

		result[u] = (category, block)
	return result
403
use_data = map_to_use(combined)

# Emit the generated file's preamble: provenance comment (command line
# plus every input file's header), include guard, and includes.
print ("/* == Start of generated table == */")
print ("/*")
print (" * The following table is generated by running:")
print (" *")
print (" * {} IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt".format (sys.argv[0]))
print (" *")
print (" * on files with these headers:")
print (" *")
for h in headers:
	for l in h:
		print (" * %s" % (l.strip()))
print (" */")
print ()
print ("#ifndef HB_OT_SHAPER_USE_TABLE_HH")
print ("#define HB_OT_SHAPER_USE_TABLE_HH")
print ()
print ('#include "hb.hh"')
print ()
print ('#include "hb-ot-shaper-use-machine.hh"')
print ()
426
# Running statistics over all blocks printed by print_block.
total = 0
used = 0
last_block = None
# NOTE(review): nothing in this file calls print_block any more (table
# emission goes through packTab below); candidate for removal.
def print_block (block, start, end, use_data):
	"""Print one Unicode block of the category table, 16 columns per row.

	`block` is the block name (printed as a banner when it changes);
	`start`..`end` is the inclusive codepoint range; `use_data` maps
	codepoint -> (USE category, block).
	"""
	global total, used, last_block
	if block and block != last_block:
		print ()
		print ()
		print (" /* %s */" % block)
	if start % 16:
		# Pad so a range starting mid-row keeps its columns aligned.
		print (' ' * (20 + (start % 16 * 6)), end='')
	num = 0
	assert start % 8 == 0
	assert (end+1) % 8 == 0
	for u in range (start, end+1):
		if u % 16 == 0:
			print ()
			print (" /* %04X */" % u, end='')
		if u in use_data:
			num += 1
		d = use_data.get (u)
		if d is not None:
			d = d[0]
		elif u in unicode_data[4]:
			# Assigned in UnicodeData.txt but unclassified: Other.
			d = 'O'
		else:
			# Unassigned codepoint: Word Joiner class.
			d = 'WJ'
		print ("%6s," % d, end='')

	total += end - start + 1
	used += num
	if block:
		last_block = block
460
# Emit a short macro per USE category so the table body stays compact.
# Categories with positional variants get one macro per (category,
# suffix) pair instead of a bare one.
# (Removed dead locals `uu`, `last`, `num`, `offset`, `starts`, `ends`
# that were never read anywhere in this file.)
print ('#pragma GCC diagnostic push')
print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
for k,v in sorted(use_mapping.items()):
	if k in use_positions and use_positions[k]: continue
	print ("#define %s USE(%s) /* %s */" % (k, k, v.__name__[3:]))
for k,v in sorted(use_positions.items()):
	if not v: continue
	for suf in v.keys():
		tag = k + suf
		print ("#define %s USE(%s)" % (tag, tag))
print ('#pragma GCC diagnostic pop')
print ("")
480
481
import packTab
# Flatten to codepoint -> USE category (drop the block name).
data = {u:v[0] for u,v in use_data.items()}

# Emit two variants of the lookup table: the default (faster) one and a
# COMPACT one selected by HB_OPTIMIZE_SIZE, bracketed by #ifndef/#else.
# The numbers are packTab compression levels.
DEFAULT = 5
COMPACT = 9
for compression in (DEFAULT, COMPACT):

	logging.info(' Compression=%d:' % compression)
	print()
	if compression == DEFAULT:
		print('#ifndef HB_OPTIMIZE_SIZE')
	elif compression == COMPACT:
		print('#else')
	else:
		assert False
	print()

	code = packTab.Code('hb_use')
	sol = packTab.pack_table(data, compression=compression, default='O')
	logging.info(' FullCost=%d' % (sol.fullCost))
	sol.genCode(code, f'get_category')
	code.print_c(linkage='static inline')
	print ()

print('#endif')
507
print ()
# Undefine the shorthand macros emitted above, mirroring the same
# (category, positional-suffix) enumeration, then close the guard.
for name in sorted(use_mapping.keys()):
	if use_positions.get(name):
		continue
	print ("#undef %s" % name)
for name, positions in sorted(use_positions.items()):
	if not positions:
		continue
	for suffix in positions.keys():
		print ("#undef %s" % (name + suffix))
print ()
print ()
print ("#endif /* HB_OT_SHAPER_USE_TABLE_HH */")
print ("/* == End of generated table == */")