(root)/
harfbuzz-8.3.0/
src/
gen-vowel-constraints.py
       1  #!/usr/bin/env python3
       2  
       3  """Generator of the function to prohibit certain vowel sequences.
       4  
       5  It creates ``_hb_preprocess_text_vowel_constraints``, which inserts dotted
       6  circles into sequences prohibited by the USE script development spec.
       7  This function should be used as the ``preprocess_text`` of an
       8  ``hb_ot_shaper_t``.
       9  
      10  usage: ./gen-vowel-constraints.py ms-use/IndicShapingInvalidCluster.txt Scripts.txt
      11  
      12  Input file:
      13  * https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
      14  """
      15  
      16  import collections
      17  def write (s):
      18  	sys.stdout.flush ()
      19  	sys.stdout.buffer.write (s.encode ('utf-8'))
      20  import sys
      21  
      22  if len (sys.argv) != 3:
      23  	sys.exit (__doc__)
      24  
      25  with open (sys.argv[2], encoding='utf-8') as f:
      26  	scripts_header = [f.readline () for i in range (2)]
      27  	scripts = {}
      28  	script_order = {}
      29  	for line in f:
      30  		j = line.find ('#')
      31  		if j >= 0:
      32  			line = line[:j]
      33  		fields = [x.strip () for x in line.split (';')]
      34  		if len (fields) == 1:
      35  			continue
      36  		uu = fields[0].split ('..')
      37  		start = int (uu[0], 16)
      38  		if len (uu) == 1:
      39  			end = start
      40  		else:
      41  			end = int (uu[1], 16)
      42  		script = fields[1]
      43  		for u in range (start, end + 1):
      44  			scripts[u] = script
      45  		if script not in script_order:
      46  			script_order[script] = start
      47  
      48  class ESC[4;38;5;81mConstraintSet (ESC[4;38;5;149mobject):
      49  	"""A set of prohibited code point sequences.
      50  
      51  	Args:
      52  		constraint (List[int]): A prohibited code point sequence.
      53  
      54  	"""
      55  	def __init__ (self, constraint):
      56  		# Either a list or a dictionary. As a list of code points, it
      57  		# represents a prohibited code point sequence. As a dictionary,
      58  		# it represents a set of prohibited sequences, where each item
      59  		# represents the set of prohibited sequences starting with the
      60  		# key (a code point) concatenated with any of the values
      61  		# (ConstraintSets).
      62  		self._c = constraint
      63  
      64  	def add (self, constraint):
      65  		"""Add a constraint to this set."""
      66  		if not constraint:
      67  			return
      68  		first = constraint[0]
      69  		rest = constraint[1:]
      70  		if isinstance (self._c, list):
      71  			if constraint == self._c[:len (constraint)]:
      72  				self._c = constraint
      73  			elif self._c != constraint[:len (self._c)]:
      74  				self._c = {self._c[0]: ConstraintSet (self._c[1:])}
      75  		if isinstance (self._c, dict):
      76  			if first in self._c:
      77  				self._c[first].add (rest)
      78  			else:
      79  				self._c[first] = ConstraintSet (rest)
      80  
      81  	@staticmethod
      82  	def _indent (depth):
      83  		return ('  ' * depth).replace ('        ', '\t')
      84  
      85  	def __str__ (self, index=0, depth=4):
      86  		s = []
      87  		indent = self._indent (depth)
      88  		if isinstance (self._c, list):
      89  			if len (self._c) == 0:
      90  				assert index == 2, 'Cannot use `matched` for this constraint; the general case has not been implemented'
      91  				s.append ('{}matched = true;\n'.format (indent))
      92  			elif len (self._c) == 1:
      93  				assert index == 1, 'Cannot use `matched` for this constraint; the general case has not been implemented'
      94  				s.append ('{}matched = 0x{:04X}u == buffer->cur ({}).codepoint;\n'.format (indent, next (iter (self._c)), index or ''))
      95  			else:
      96  				s.append ('{}if (0x{:04X}u == buffer->cur ({}).codepoint &&\n'.format (indent, self._c[0], index or ''))
      97  				if index:
      98  					s.append ('{}buffer->idx + {} < count &&\n'.format (self._indent (depth + 2), index + 1))
      99  				for i, cp in enumerate (self._c[1:], start=1):
     100  					s.append ('{}0x{:04X}u == buffer->cur ({}).codepoint{}\n'.format (
     101  						self._indent (depth + 2), cp, index + i, ')' if i == len (self._c) - 1 else ' &&'))
     102  				s.append ('{}{{\n'.format (indent))
     103  				for i in range (index):
     104  					s.append ('{}(void) buffer->next_glyph ();\n'.format (self._indent (depth + 1)))
     105  				s.append ('{}matched = true;\n'.format (self._indent (depth + 1)))
     106  				s.append ('{}}}\n'.format (indent))
     107  		else:
     108  			s.append ('{}switch (buffer->cur ({}).codepoint)\n'.format(indent, index or ''))
     109  			s.append ('{}{{\n'.format (indent))
     110  			cases = collections.defaultdict (set)
     111  			for first, rest in sorted (self._c.items ()):
     112  				cases[rest.__str__ (index + 1, depth + 2)].add (first)
     113  			for body, labels in sorted (cases.items (), key=lambda b_ls: sorted (b_ls[1])[0]):
     114  				for i, cp in enumerate (sorted (labels)):
     115  					if i % 4 == 0:
     116  						s.append (self._indent (depth + 1))
     117  					else:
     118  						s.append (' ')
     119  					s.append ('case 0x{:04X}u:{}'.format (cp, '\n' if i % 4 == 3 else ''))
     120  				if len (labels) % 4 != 0:
     121  					s.append ('\n')
     122  				s.append (body)
     123  				s.append ('{}break;\n'.format (self._indent (depth + 2)))
     124  			s.append ('{}}}\n'.format (indent))
     125  		return ''.join (s)
     126  
     127  constraints = {}
     128  with open (sys.argv[1], encoding='utf-8') as f:
     129  	constraints_header = []
     130  	while True:
     131  		line = f.readline ().strip ()
     132  		if line == '#':
     133  			break
     134  		constraints_header.append(line)
     135  	for line in f:
     136  		j = line.find ('#')
     137  		if j >= 0:
     138  			line = line[:j]
     139  		constraint = [int (cp, 16) for cp in line.split (';')[0].split ()]
     140  		if not constraint: continue
     141  		assert 2 <= len (constraint), 'Prohibited sequence is too short: {}'.format (constraint)
     142  		script = scripts[constraint[0]]
     143  		if script in constraints:
     144  			constraints[script].add (constraint)
     145  		else:
     146  			constraints[script] = ConstraintSet (constraint)
     147  		assert constraints, 'No constraints found'
     148  
     149  print ('/* == Start of generated functions == */')
     150  print ('/*')
     151  print (' * The following functions are generated by running:')
     152  print (' *')
     153  print (' *   %s ms-use/IndicShapingInvalidCluster.txt Scripts.txt' % sys.argv[0])
     154  print (' *')
     155  print (' * on files with these headers:')
     156  print (' *')
     157  for line in constraints_header:
     158  	print (' * %s' % line.strip ())
     159  print (' *')
     160  for line in scripts_header:
     161  	print (' * %s' % line.strip ())
     162  print (' */')
     163  
     164  print ()
     165  print ('#include "hb.hh"')
     166  print ()
     167  print ('#ifndef HB_NO_OT_SHAPE')
     168  print ()
     169  print ('#include "hb-ot-shaper-vowel-constraints.hh"')
     170  print ()
     171  print ('static void')
     172  print ('_output_dotted_circle (hb_buffer_t *buffer)')
     173  print ('{')
     174  print ('  (void) buffer->output_glyph (0x25CCu);')
     175  print ('  _hb_glyph_info_reset_continuation (&buffer->prev());')
     176  print ('}')
     177  print ()
     178  print ('static void')
     179  print ('_output_with_dotted_circle (hb_buffer_t *buffer)')
     180  print ('{')
     181  print ('  _output_dotted_circle (buffer);')
     182  print ('  (void) buffer->next_glyph ();')
     183  print ('}')
     184  print ()
     185  
     186  print ('void')
     187  print ('_hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED,')
     188  print ('\t\t\t\t       hb_buffer_t              *buffer,')
     189  print ('\t\t\t\t       hb_font_t                *font HB_UNUSED)')
     190  print ('{')
     191  print ('#ifdef HB_NO_OT_SHAPER_VOWEL_CONSTRAINTS')
     192  print ('  return;')
     193  print ('#endif')
     194  print ('  if (buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE)')
     195  print ('    return;')
     196  print ()
     197  print ('  /* UGLY UGLY UGLY business of adding dotted-circle in the middle of')
     198  print ('   * vowel-sequences that look like another vowel.  Data for each script')
     199  print ('   * collected from the USE script development spec.')
     200  print ('   *')
     201  print ('   * https://github.com/harfbuzz/harfbuzz/issues/1019')
     202  print ('   */')
     203  print ('  buffer->clear_output ();')
     204  print ('  unsigned int count = buffer->len;')
     205  print ('  switch ((unsigned) buffer->props.script)')
     206  print ('  {')
     207  
     208  for script, constraints in sorted (constraints.items (), key=lambda s_c: script_order[s_c[0]]):
     209  	print ('    case HB_SCRIPT_{}:'.format (script.upper ()))
     210  	print ('      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)')
     211  	print ('      {')
     212  	print ('\tbool matched = false;')
     213  	write (str (constraints))
     214  	print ('\t(void) buffer->next_glyph ();')
     215  	print ('\tif (matched) _output_with_dotted_circle (buffer);')
     216  	print ('      }')
     217  	print ('      break;')
     218  	print ()
     219  
     220  print ('    default:')
     221  print ('      break;')
     222  print ('  }')
     223  print ('  buffer->sync ();')
     224  print ('}')
     225  
     226  print ()
     227  print ()
     228  print ('#endif')
     229  print ('/* == End of generated functions == */')