1  #!/usr/bin/env python3
       2  
       3  """usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt
       4  
       5  Input files:
       6  * https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
       7  * https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
       8  * https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
       9  """
      10  
      11  import sys
      12  
      13  if len (sys.argv) != 4:
      14  	sys.exit (__doc__)
      15  
      16  ALLOWED_SINGLES = [0x00A0, 0x25CC]
      17  ALLOWED_BLOCKS = [
      18  	'Basic Latin',
      19  	'Latin-1 Supplement',
      20  	'Devanagari',
      21  	'Bengali',
      22  	'Gurmukhi',
      23  	'Gujarati',
      24  	'Oriya',
      25  	'Tamil',
      26  	'Telugu',
      27  	'Kannada',
      28  	'Malayalam',
      29  	'Myanmar',
      30  	'Khmer',
      31  	'Vedic Extensions',
      32  	'General Punctuation',
      33  	'Superscripts and Subscripts',
      34  	'Devanagari Extended',
      35  	'Myanmar Extended-B',
      36  	'Myanmar Extended-A',
      37  ]
      38  
      39  files = [open (x, encoding='utf-8') for x in sys.argv[1:]]
      40  
      41  headers = [[f.readline () for i in range (2)] for f in files]
      42  
      43  unicode_data = [{} for _ in files]
      44  for i, f in enumerate (files):
      45  	for line in f:
      46  
      47  		j = line.find ('#')
      48  		if j >= 0:
      49  			line = line[:j]
      50  
      51  		fields = [x.strip () for x in line.split (';')]
      52  		if len (fields) == 1:
      53  			continue
      54  
      55  		uu = fields[0].split ('..')
      56  		start = int (uu[0], 16)
      57  		if len (uu) == 1:
      58  			end = start
      59  		else:
      60  			end = int (uu[1], 16)
      61  
      62  		t = fields[1]
      63  
      64  		for u in range (start, end + 1):
      65  			unicode_data[i][u] = t
      66  
      67  # Merge data into one dict:
      68  defaults = ('Other', 'Not_Applicable', 'No_Block')
      69  combined = {}
      70  for i,d in enumerate (unicode_data):
      71  	for u,v in d.items ():
      72  		if i == 2 and not u in combined:
      73  			continue
      74  		if not u in combined:
      75  			combined[u] = list (defaults)
      76  		combined[u][i] = v
      77  combined = {k:v for k,v in combined.items() if k in ALLOWED_SINGLES or v[2] in ALLOWED_BLOCKS}
      78  
      79  
      80  # Convert categories & positions types
      81  
      82  categories = {
      83    'indic' : [
      84      'X',
      85      'C',
      86      'V',
      87      'N',
      88      'H',
      89      'ZWNJ',
      90      'ZWJ',
      91      'M',
      92      'SM',
      93      'A',
      94      'VD',
      95      'PLACEHOLDER',
      96      'DOTTEDCIRCLE',
      97      'RS',
      98      'MPst',
      99      'Repha',
     100      'Ra',
     101      'CM',
     102      'Symbol',
     103      'CS',
     104    ],
     105    'khmer' : [
     106      'VAbv',
     107      'VBlw',
     108      'VPre',
     109      'VPst',
     110  
     111      'Robatic',
     112      'Xgroup',
     113      'Ygroup',
     114    ],
     115    'myanmar' : [
     116      'VAbv',
     117      'VBlw',
     118      'VPre',
     119      'VPst',
     120  
     121      'IV',
     122      'As',
     123      'DB',
     124      'GB',
     125      'MH',
     126      'MR',
     127      'MW',
     128      'MY',
     129      'PT',
     130      'VS',
     131      'ML',
     132    ],
     133  }
     134  
     135  category_map = {
     136    'Other'			: 'X',
     137    'Avagraha'			: 'Symbol',
     138    'Bindu'			: 'SM',
     139    'Brahmi_Joining_Number'	: 'PLACEHOLDER', # Don't care.
     140    'Cantillation_Mark'		: 'A',
     141    'Consonant'			: 'C',
     142    'Consonant_Dead'		: 'C',
     143    'Consonant_Final'		: 'CM',
     144    'Consonant_Head_Letter'	: 'C',
     145    'Consonant_Initial_Postfixed'	: 'C', # TODO
     146    'Consonant_Killer'		: 'M', # U+17CD only.
     147    'Consonant_Medial'		: 'CM',
     148    'Consonant_Placeholder'	: 'PLACEHOLDER',
     149    'Consonant_Preceding_Repha'	: 'Repha',
     150    'Consonant_Prefixed'		: 'X', # Don't care.
     151    'Consonant_Subjoined'		: 'CM',
     152    'Consonant_Succeeding_Repha'	: 'CM',
     153    'Consonant_With_Stacker'	: 'CS',
     154    'Gemination_Mark'		: 'SM', # https://github.com/harfbuzz/harfbuzz/issues/552
     155    'Invisible_Stacker'		: 'H',
     156    'Joiner'			: 'ZWJ',
     157    'Modifying_Letter'		: 'X',
     158    'Non_Joiner'			: 'ZWNJ',
     159    'Nukta'			: 'N',
     160    'Number'			: 'PLACEHOLDER',
     161    'Number_Joiner'		: 'PLACEHOLDER', # Don't care.
     162    'Pure_Killer'			: 'M', # Is like a vowel matra.
     163    'Register_Shifter'		: 'RS',
     164    'Syllable_Modifier'		: 'SM',
     165    'Tone_Letter'			: 'X',
     166    'Tone_Mark'			: 'N',
     167    'Virama'			: 'H',
     168    'Visarga'			: 'SM',
     169    'Vowel'			: 'V',
     170    'Vowel_Dependent'		: 'M',
     171    'Vowel_Independent'		: 'V',
     172  }
     173  position_map = {
     174    'Not_Applicable'		: 'END',
     175  
     176    'Left'			: 'PRE_C',
     177    'Top'				: 'ABOVE_C',
     178    'Bottom'			: 'BELOW_C',
     179    'Right'			: 'POST_C',
     180  
     181    # These should resolve to the position of the last part of the split sequence.
     182    'Bottom_And_Right'		: 'POST_C',
     183    'Left_And_Right'		: 'POST_C',
     184    'Top_And_Bottom'		: 'BELOW_C',
     185    'Top_And_Bottom_And_Left'	: 'BELOW_C',
     186    'Top_And_Bottom_And_Right'	: 'POST_C',
     187    'Top_And_Left'		: 'ABOVE_C',
     188    'Top_And_Left_And_Right'	: 'POST_C',
     189    'Top_And_Right'		: 'POST_C',
     190  
     191    'Overstruck'			: 'AFTER_MAIN',
     192    'Visual_order_left'		: 'PRE_M',
     193  }
     194  
     195  category_overrides = {
     196  
     197    # These are the variation-selectors. They only appear in the Myanmar grammar
     198    # but are not Myanmar-specific
     199    0xFE00: 'VS',
     200    0xFE01: 'VS',
     201    0xFE02: 'VS',
     202    0xFE03: 'VS',
     203    0xFE04: 'VS',
     204    0xFE05: 'VS',
     205    0xFE06: 'VS',
     206    0xFE07: 'VS',
     207    0xFE08: 'VS',
     208    0xFE09: 'VS',
     209    0xFE0A: 'VS',
     210    0xFE0B: 'VS',
     211    0xFE0C: 'VS',
     212    0xFE0D: 'VS',
     213    0xFE0E: 'VS',
     214    0xFE0F: 'VS',
     215  
     216    # These appear in the OT Myanmar spec, but are not Myanmar-specific
     217    0x2015: 'PLACEHOLDER',
     218    0x2022: 'PLACEHOLDER',
     219    0x25FB: 'PLACEHOLDER',
     220    0x25FC: 'PLACEHOLDER',
     221    0x25FD: 'PLACEHOLDER',
     222    0x25FE: 'PLACEHOLDER',
     223  
     224  
     225    # Indic
     226  
     227    0x0930: 'Ra', # Devanagari
     228    0x09B0: 'Ra', # Bengali
     229    0x09F0: 'Ra', # Bengali
     230    0x0A30: 'Ra', # Gurmukhi 	No Reph
     231    0x0AB0: 'Ra', # Gujarati
     232    0x0B30: 'Ra', # Oriya
     233    0x0BB0: 'Ra', # Tamil 	No Reph
     234    0x0C30: 'Ra', # Telugu 	Reph formed only with ZWJ
     235    0x0CB0: 'Ra', # Kannada
     236    0x0D30: 'Ra', # Malayalam 	No Reph, Logical Repha
     237  
     238    # The following act more like the Bindus.
     239    0x0953: 'SM',
     240    0x0954: 'SM',
     241  
     242    # U+0A40 GURMUKHI VOWEL SIGN II may be preceded by U+0A02 GURMUKHI SIGN BINDI.
     243    0x0A40: 'MPst',
     244  
     245    # The following act like consonants.
     246    0x0A72: 'C',
     247    0x0A73: 'C',
     248    0x1CF5: 'C',
     249    0x1CF6: 'C',
     250  
     251    # TODO: The following should only be allowed after a Visarga.
     252    # For now, just treat them like regular tone marks.
     253    0x1CE2: 'A',
     254    0x1CE3: 'A',
     255    0x1CE4: 'A',
     256    0x1CE5: 'A',
     257    0x1CE6: 'A',
     258    0x1CE7: 'A',
     259    0x1CE8: 'A',
     260  
     261    # TODO: The following should only be allowed after some of
     262    # the nasalization marks, maybe only for U+1CE9..U+1CF1.
     263    # For now, just treat them like tone marks.
     264    0x1CED: 'A',
     265  
     266    # The following take marks in standalone clusters, similar to Avagraha.
     267    0xA8F2: 'Symbol',
     268    0xA8F3: 'Symbol',
     269    0xA8F4: 'Symbol',
     270    0xA8F5: 'Symbol',
     271    0xA8F6: 'Symbol',
     272    0xA8F7: 'Symbol',
     273    0x1CE9: 'Symbol',
     274    0x1CEA: 'Symbol',
     275    0x1CEB: 'Symbol',
     276    0x1CEC: 'Symbol',
     277    0x1CEE: 'Symbol',
     278    0x1CEF: 'Symbol',
     279    0x1CF0: 'Symbol',
     280    0x1CF1: 'Symbol',
     281  
     282    0x0A51: 'M', # https://github.com/harfbuzz/harfbuzz/issues/524
     283  
     284    # According to ScriptExtensions.txt, these Grantha marks may also be used in Tamil,
     285    # so the Indic shaper needs to know their categories.
     286    0x11301: 'SM',
     287    0x11302: 'SM',
     288    0x11303: 'SM',
     289    0x1133B: 'N',
     290    0x1133C: 'N',
     291  
     292    0x0AFB: 'N', # https://github.com/harfbuzz/harfbuzz/issues/552
     293    0x0B55: 'N', # https://github.com/harfbuzz/harfbuzz/issues/2849
     294  
     295    0x09FC: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/pull/1613
     296    0x0C80: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/pull/623
     297    0x0D04: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/pull/3511
     298  
     299    0x25CC: 'DOTTEDCIRCLE',
     300  
     301  
     302    # Khmer
     303  
     304    0x179A: 'Ra',
     305  
     306    0x17CC: 'Robatic',
     307    0x17C9: 'Robatic',
     308    0x17CA: 'Robatic',
     309  
     310    0x17C6: 'Xgroup',
     311    0x17CB: 'Xgroup',
     312    0x17CD: 'Xgroup',
     313    0x17CE: 'Xgroup',
     314    0x17CF: 'Xgroup',
     315    0x17D0: 'Xgroup',
     316    0x17D1: 'Xgroup',
     317  
     318    0x17C7: 'Ygroup',
     319    0x17C8: 'Ygroup',
     320    0x17DD: 'Ygroup',
     321    0x17D3: 'Ygroup', # Just guessing. Uniscribe doesn't categorize it.
     322  
     323    0x17D9: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/issues/2384
     324  
     325  
     326    # Myanmar
     327  
     328    # https://docs.microsoft.com/en-us/typography/script-development/myanmar#analyze
     329  
     330    0x104E: 'C', # The spec says C, IndicSyllableCategory says Consonant_Placeholder
     331  
     332    0x1004: 'Ra',
     333    0x101B: 'Ra',
     334    0x105A: 'Ra',
     335  
     336    0x1032: 'A',
     337    0x1036: 'A',
     338  
     339    0x103A: 'As',
     340  
     341    #0x1040: 'D0', # XXX The spec says D0, but Uniscribe doesn't seem to do.
     342  
     343    0x103E: 'MH',
     344    0x1060: 'ML',
     345    0x103C: 'MR',
     346    0x103D: 'MW',
     347    0x1082: 'MW',
     348    0x103B: 'MY',
     349    0x105E: 'MY',
     350    0x105F: 'MY',
     351  
     352    0x1063: 'PT',
     353    0x1064: 'PT',
     354    0x1069: 'PT',
     355    0x106A: 'PT',
     356    0x106B: 'PT',
     357    0x106C: 'PT',
     358    0x106D: 'PT',
     359    0xAA7B: 'PT',
     360  
     361    0x1038: 'SM',
     362    0x1087: 'SM',
     363    0x1088: 'SM',
     364    0x1089: 'SM',
     365    0x108A: 'SM',
     366    0x108B: 'SM',
     367    0x108C: 'SM',
     368    0x108D: 'SM',
     369    0x108F: 'SM',
     370    0x109A: 'SM',
     371    0x109B: 'SM',
     372    0x109C: 'SM',
     373  
     374    0x104A: 'PLACEHOLDER',
     375  }
     376  position_overrides = {
     377  
     378    0x0A51: 'BELOW_C', # https://github.com/harfbuzz/harfbuzz/issues/524
     379  
     380    0x0B01: 'BEFORE_SUB', # Oriya Bindu is BeforeSub in the spec.
     381  }
     382  
     383  def matra_pos_left(u, block):
     384    return "PRE_M"
     385  def matra_pos_right(u, block):
     386    if block == 'Devanagari':	return  'AFTER_SUB'
     387    if block == 'Bengali':	return  'AFTER_POST'
     388    if block == 'Gurmukhi':	return  'AFTER_POST'
     389    if block == 'Gujarati':	return  'AFTER_POST'
     390    if block == 'Oriya':		return  'AFTER_POST'
     391    if block == 'Tamil':		return  'AFTER_POST'
     392    if block == 'Telugu':		return  'BEFORE_SUB' if u <= 0x0C42 else 'AFTER_SUB'
     393    if block == 'Kannada':	return  'BEFORE_SUB' if u < 0x0CC3 or u > 0x0CD6 else 'AFTER_SUB'
     394    if block == 'Malayalam':	return  'AFTER_POST'
     395    return 'AFTER_SUB'
     396  def matra_pos_top(u, block):
     397    # BENG and MLYM don't have top matras.
     398    if block == 'Devanagari':	return  'AFTER_SUB'
     399    if block == 'Gurmukhi':	return  'AFTER_POST' # Deviate from spec
     400    if block == 'Gujarati':	return  'AFTER_SUB'
     401    if block == 'Oriya':		return  'AFTER_MAIN'
     402    if block == 'Tamil':		return  'AFTER_SUB'
     403    if block == 'Telugu':		return  'BEFORE_SUB'
     404    if block == 'Kannada':	return  'BEFORE_SUB'
     405    return 'AFTER_SUB'
     406  def matra_pos_bottom(u, block):
     407    if block == 'Devanagari':	return  'AFTER_SUB'
     408    if block == 'Bengali':	return  'AFTER_SUB'
     409    if block == 'Gurmukhi':	return  'AFTER_POST'
     410    if block == 'Gujarati':	return  'AFTER_POST'
     411    if block == 'Oriya':		return  'AFTER_SUB'
     412    if block == 'Tamil':		return  'AFTER_POST'
     413    if block == 'Telugu':		return  'BEFORE_SUB'
     414    if block == 'Kannada':	return  'BEFORE_SUB'
     415    if block == 'Malayalam':	return  'AFTER_POST'
     416    return "AFTER_SUB"
     417  def indic_matra_position(u, pos, block): # Reposition matra
     418    if pos == 'PRE_C':	return matra_pos_left(u, block)
     419    if pos == 'POST_C':	return matra_pos_right(u, block)
     420    if pos == 'ABOVE_C':	return matra_pos_top(u, block)
     421    if pos == 'BELOW_C':	return matra_pos_bottom(u, block)
     422    assert (False)
     423  
     424  def position_to_category(pos):
     425    if pos == 'PRE_C':	return 'VPre'
     426    if pos == 'ABOVE_C':	return 'VAbv'
     427    if pos == 'BELOW_C':	return 'VBlw'
     428    if pos == 'POST_C':	return 'VPst'
     429    assert(False)
     430  
     431  
     432  defaults = (category_map[defaults[0]], position_map[defaults[1]], defaults[2])
     433  
     434  indic_data = {}
     435  for k, (cat, pos, block) in combined.items():
     436    cat = category_map[cat]
     437    pos = position_map[pos]
     438    indic_data[k] = (cat, pos, block)
     439  
     440  for k,new_cat in category_overrides.items():
     441    (cat, pos, _) = indic_data.get(k, defaults)
     442    indic_data[k] = (new_cat, pos, unicode_data[2][k])
     443  
     444  # We only expect position for certain types
     445  positioned_categories = ('CM', 'SM', 'RS', 'H', 'M', 'MPst')
     446  for k, (cat, pos, block) in indic_data.items():
     447    if cat not in positioned_categories:
     448      pos = 'END'
     449      indic_data[k] = (cat, pos, block)
     450  
     451  # Position overrides are more complicated
     452  
     453  # Keep in sync with CONSONANT_FLAGS in the shaper
     454  consonant_categories = ('C', 'CS', 'Ra','CM', 'V', 'PLACEHOLDER', 'DOTTEDCIRCLE')
     455  matra_categories = ('M', 'MPst')
     456  smvd_categories = ('SM', 'VD', 'A', 'Symbol')
     457  for k, (cat, pos, block) in indic_data.items():
     458    if cat in consonant_categories:
     459      pos = 'BASE_C'
     460    elif cat in matra_categories:
     461      if block.startswith('Khmer') or block.startswith('Myanmar'):
     462        cat = position_to_category(pos)
     463      else:
     464        pos = indic_matra_position(k, pos, block)
     465    elif cat in smvd_categories:
     466      pos = 'SMVD';
     467    indic_data[k] = (cat, pos, block)
     468  
     469  for k,new_pos in position_overrides.items():
     470    (cat, pos, _) = indic_data.get(k, defaults)
     471    indic_data[k] = (cat, new_pos, unicode_data[2][k])
     472  
     473  
     474  values = [{_: 1} for _ in defaults]
     475  for vv in indic_data.values():
     476    for i,v in enumerate(vv):
     477      values[i][v] = values[i].get (v, 0) + 1
     478  
     479  
     480  
     481  
     482  # Move the outliers NO-BREAK SPACE and DOTTED CIRCLE out
     483  singles = {}
     484  for u in ALLOWED_SINGLES:
     485  	singles[u] = indic_data[u]
     486  	del indic_data[u]
     487  
     488  print ("/* == Start of generated table == */")
     489  print ("/*")
     490  print (" * The following table is generated by running:")
     491  print (" *")
     492  print (" *   ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt")
     493  print (" *")
     494  print (" * on files with these headers:")
     495  print (" *")
     496  for h in headers:
     497  	for l in h:
     498  		print (" * %s" % (l.strip()))
     499  print (" */")
     500  print ()
     501  print ('#include "hb.hh"')
     502  print ()
     503  print ('#ifndef HB_NO_OT_SHAPE')
     504  print ()
     505  print ('#include "hb-ot-shaper-indic.hh"')
     506  print ()
     507  print ('#pragma GCC diagnostic push')
     508  print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
     509  print ()
     510  
     511  # Print categories
     512  for shaper in categories:
     513    print ('#include "hb-ot-shaper-%s-machine.hh"' % shaper)
     514  print ()
     515  done = {}
     516  for shaper, shaper_cats in categories.items():
     517    print ('/* %s */' % shaper)
     518    for cat in shaper_cats:
     519      v = shaper[0].upper()
     520      if cat not in done:
     521        print ("#define OT_%s %s_Cat(%s)" % (cat, v, cat))
     522        done[cat] = v
     523      else:
     524        print ('static_assert (OT_%s == %s_Cat(%s), "");' % (cat, v, cat))
     525  print ()
     526  
     527  # Shorten values
     528  short = [{
     529  	"Repha":		'Rf',
     530  	"PLACEHOLDER":		'GB',
     531  	"DOTTEDCIRCLE":		'DC',
     532  	"VPst":			'VR',
     533  	"VPre":			'VL',
     534  	"Robatic":		'Rt',
     535  	"Xgroup":		'Xg',
     536  	"Ygroup":		'Yg',
     537  	"As":			'As',
     538  },{
     539  	"END":			'X',
     540  	"BASE_C":		'C',
     541  	"ABOVE_C":		'T',
     542  	"BELOW_C":		'B',
     543  	"POST_C":		'R',
     544  	"PRE_C":		'L',
     545  	"PRE_M":		'LM',
     546  	"AFTER_MAIN":		'A',
     547  	"AFTER_SUB":		'AS',
     548  	"BEFORE_SUB":		'BS',
     549  	"AFTER_POST":		'AP',
     550  	"SMVD":			'SM',
     551  }]
     552  all_shorts = [{},{}]
     553  
     554  # Add some of the values, to make them more readable, and to avoid duplicates
     555  
     556  for i in range (2):
     557  	for v,s in short[i].items ():
     558  		all_shorts[i][s] = v
     559  
     560  what = ["OT", "POS"]
     561  what_short = ["_OT", "_POS"]
     562  cat_defs = []
     563  for i in range (2):
     564  	vv = sorted (values[i].keys ())
     565  	for v in vv:
     566  		v_no_and = v.replace ('_And_', '_')
     567  		if v in short[i]:
     568  			s = short[i][v]
     569  		else:
     570  			s = ''.join ([c for c in v_no_and if ord ('A') <= ord (c) <= ord ('Z')])
     571  			if s in all_shorts[i]:
     572  				raise Exception ("Duplicate short value alias", v, all_shorts[i][s])
     573  			all_shorts[i][s] = v
     574  			short[i][v] = s
     575  		cat_defs.append ((what_short[i] + '_' + s, what[i] + '_' + (v.upper () if i else v), str (values[i][v]), v))
     576  
     577  maxlen_s = max ([len (c[0]) for c in cat_defs])
     578  maxlen_l = max ([len (c[1]) for c in cat_defs])
     579  maxlen_n = max ([len (c[2]) for c in cat_defs])
     580  for s in what_short:
     581  	print ()
     582  	for c in [c for c in cat_defs if s in c[0]]:
     583  		print ("#define %s %s /* %s chars; %s */" %
     584  			(c[0].ljust (maxlen_s), c[1].ljust (maxlen_l), c[2].rjust (maxlen_n), c[3]))
     585  print ()
     586  print ('#pragma GCC diagnostic pop')
     587  print ()
     588  print ("#define INDIC_COMBINE_CATEGORIES(S,M) ((S) | ((M) << 8))")
     589  print ()
     590  print ("#define _(S,M) INDIC_COMBINE_CATEGORIES (%s_##S, %s_##M)" % tuple(what_short))
     591  print ()
     592  print ()
     593  
     594  total = 0
     595  used = 0
     596  last_block = None
     597  def print_block (block, start, end, data):
     598  	global total, used, last_block
     599  	if block and block != last_block:
     600  		print ()
     601  		print ()
     602  		print ("  /* %s */" % block)
     603  	num = 0
     604  	assert start % 8 == 0
     605  	assert (end+1) % 8 == 0
     606  	for u in range (start, end+1):
     607  		if u % 8 == 0:
     608  			print ()
     609  			print ("  /* %04X */" % u, end="")
     610  		if u in data:
     611  			num += 1
     612  		d = data.get (u, defaults)
     613  		print ("%9s" % ("_(%s,%s)," % (short[0][d[0]], short[1][d[1]])), end="")
     614  
     615  	total += end - start + 1
     616  	used += num
     617  	if block:
     618  		last_block = block
     619  
     620  uu = sorted (indic_data)
     621  
     622  last = -100000
     623  num = 0
     624  offset = 0
     625  starts = []
     626  ends = []
     627  print ("static const uint16_t indic_table[] = {")
     628  for u in uu:
     629  	if u <= last:
     630  		continue
     631  	block = indic_data[u][2]
     632  
     633  	start = u//8*8
     634  	end = start+1
     635  	while end in uu and block == indic_data[end][2]:
     636  		end += 1
     637  	end = (end-1)//8*8 + 7
     638  
     639  	if start != last + 1:
     640  		if start - last <= 1+16*2:
     641  			print_block (None, last+1, start-1, indic_data)
     642  		else:
     643  			if last >= 0:
     644  				ends.append (last + 1)
     645  				offset += ends[-1] - starts[-1]
     646  			print ()
     647  			print ()
     648  			print ("#define indic_offset_0x%04xu %d" % (start, offset))
     649  			starts.append (start)
     650  
     651  	print_block (block, start, end, indic_data)
     652  	last = end
     653  ends.append (last + 1)
     654  offset += ends[-1] - starts[-1]
     655  print ()
     656  print ()
     657  occupancy = used * 100. / total
     658  page_bits = 12
     659  print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
     660  print ()
     661  print ("uint16_t")
     662  print ("hb_indic_get_categories (hb_codepoint_t u)")
     663  print ("{")
     664  print ("  switch (u >> %d)" % page_bits)
     665  print ("  {")
     666  pages = set ([u>>page_bits for u in starts+ends+list (singles.keys ())])
     667  for p in sorted(pages):
     668  	print ("    case 0x%0Xu:" % p)
     669  	for u,d in singles.items ():
     670  		if p != u>>page_bits: continue
     671  		print ("      if (unlikely (u == 0x%04Xu)) return _(%s,%s);" % (u, short[0][d[0]], short[1][d[1]]))
     672  	for (start,end) in zip (starts, ends):
     673  		if p not in [start>>page_bits, end>>page_bits]: continue
     674  		offset = "indic_offset_0x%04xu" % start
     675  		print ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return indic_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset))
     676  	print ("      break;")
     677  	print ("")
     678  print ("    default:")
     679  print ("      break;")
     680  print ("  }")
     681  print ("  return _(X,X);")
     682  print ("}")
     683  print ()
     684  print ("#undef _")
     685  print ("#undef INDIC_COMBINE_CATEGORIES")
     686  for i in range (2):
     687  	print ()
     688  	vv = sorted (values[i].keys ())
     689  	for v in vv:
     690  		print ("#undef %s_%s" %
     691  			(what_short[i], short[i][v]))
     692  print ()
     693  print ('#endif')
     694  print ()
     695  print ("/* == End of generated table == */")
     696  
     697  # Maintain at least 50% occupancy in the table */
     698  if occupancy < 50:
     699  	raise Exception ("Table too sparse, please investigate: ", occupancy)