3 """Generator of the function to prohibit certain vowel sequences.
5 It creates ``_hb_preprocess_text_vowel_constraints``, which inserts dotted
6 circles into sequences prohibited by the USE script development spec.
7 This function should be used as the ``preprocess_text`` of an
8 ``hb_ot_complex_shaper_t``.
10 usage: ./gen-vowel-constraints.py ms-use/IndicShapingInvalidCluster.txt Scripts.txt
13 * https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
17 from html.parser import HTMLParser
# Encodes the text `s` as UTF-8 and writes the resulting bytes directly to
# stdout's underlying binary buffer.
# NOTE(review): the enclosing `def` line is not visible in this listing;
# presumably this is the body of the `write` helper that is called further
# down with the rendered constraint text -- confirm.
20 sys.stdout.buffer.write (s.encode ('utf-8'))
# Exactly two command-line arguments are expected (sys.argv then has three
# entries); the usage line in the module docstring names them as the
# invalid-cluster list followed by Scripts.txt.
# NOTE(review): the statement executed on a mismatch is not visible here.
24 if len (sys.argv) != 3:
# The second argument is read as UTF-8 text.
27 with open (sys.argv[2], encoding='utf-8') as f:
# Keep the first two lines of the file; they are echoed (stripped) into the
# header comment of the generated output further down.
28 scripts_header = [f.readline () for i in range (2)]
# Split a data line into its ';'-separated, whitespace-stripped fields.
# NOTE(review): where `line` comes from is not visible -- presumably a loop
# over the remaining lines of `f`; confirm.
35 fields = [x.strip () for x in line.split (';')]
# The first field is split on '..', so it holds either one hexadecimal code
# point or a `first..last` pair; `start` is the first (or only) value.
38 uu = fields[0].split ('..')
39 start = int (uu[0], 16)
# Visit every code point from `start` through `end`, inclusive.
# NOTE(review): the assignments to `end` and `script` and the body of this
# loop are not visible -- presumably the loop fills the `scripts` lookup that
# is indexed by code point later on; confirm.
45 for u in range (start, end + 1):
# Remember the `start` of the first range seen for each script; this value is
# the sort key used later to order the generated `case` labels.
47 if script not in script_order:
48 script_order[script] = start
# Nested, prefix-tree-like container of prohibited code point sequences.
# `add` merges further sequences into it and `__str__` renders it as C source
# text for the generated shaper function.
50 class ConstraintSet (object):
51 """A set of prohibited code point sequences.
54 constraint (List[int]): A prohibited code point sequence.
57 def __init__ (self, constraint):
58 # Either a list or a dictionary. As a list of code points, it
59 # represents a prohibited code point sequence. As a dictionary,
60 # it represents a set of prohibited sequences, where each item
61 # represents the set of prohibited sequences starting with the
62 # key (a code point) concatenated with any of the values
# Merge one more prohibited sequence into this set, switching the internal
# representation from list form to dictionary form when the sequences diverge.
66 def add (self, constraint):
67 """Add a constraint to this set."""
# NOTE(review): `first` and `rest`, used below, are bound on lines that are
# not visible in this listing -- presumably the head and the tail of
# `constraint`; confirm.
# List form: this node still holds a single sequence.
72 if isinstance (self._c, list):
# The new constraint equals, or is a prefix of, the stored sequence.
# NOTE(review): the statement taken in this case is not visible here.
73 if constraint == self._c[:len (constraint)]:
# Otherwise, unless the stored sequence is itself a prefix of the new one,
# switch to dictionary form: key on the stored sequence's first code point
# and keep its remainder as a child set.
75 elif self._c != constraint[:len (self._c)]:
76 self._c = {self._c[0]: ConstraintSet (self._c[1:])}
# Dictionary form (possibly created just above): file the new constraint
# under its first code point.
# NOTE(review): the condition choosing between the next two statements is not
# visible -- presumably "child for `first` already exists" vs. not; confirm.
77 if isinstance (self._c, dict):
# Recurse into the child set for `first` ...
79 self._c[first].add (rest)
# ... or create a child set holding `rest`.
81 self._c[first] = ConstraintSet (rest)
# Return value of the indentation helper invoked as `self._indent (...)`
# below: a unit string repeated `depth` times, followed by a replacement that
# introduces tab characters.
# NOTE(review): the helper's `def` line is not visible, and the whitespace
# inside both string literals may have been collapsed in this listing --
# confirm the exact literals against the real file.
85 return (' ' * depth).replace (' ', '\t')
# Render this set as C statements. `index` is passed as the argument of the
# emitted `buffer->cur (...)` calls (omitted from the text when it is 0) and
# grows by one per nesting level; `depth` controls the indentation of the
# emitted lines.
# NOTE(review): the binding of the accumulator `s` and the final return are
# not visible in this listing.
87 def __str__ (self, index=0, depth=4):
89 indent = self._indent (depth)
# List form: a single remaining sequence.
90 if isinstance (self._c, list):
# No code points left to compare: emit an unconditional `matched = true;`.
# Only supported when this node is rendered at index 2.
91 if len (self._c) == 0:
92 assert index == 2, 'Cannot use `matched` for this constraint; the general case has not been implemented'
93 s.append ('{}matched = true;\n'.format (indent))
# Exactly one code point left: emit an assignment of the comparison result to
# `matched`. Only supported when this node is rendered at index 1.
94 elif len (self._c) == 1:
95 assert index == 1, 'Cannot use `matched` for this constraint; the general case has not been implemented'
96 s.append ('{}matched = 0x{:04X}u == buffer->cur ({}).codepoint;\n'.format (indent, next (iter (self._c)), index or ''))
# Remaining case: emit one `if` whose condition compares the stored code
# points one after another.
98 s.append ('{}if (0x{:04X}u == buffer->cur ({}).codepoint &&\n'.format (indent, self._c[0], index or ''))
# Emitted bounds check of `buffer->idx + (index + 1)` against `count`.
# NOTE(review): the condition guarding this line, if any, is not visible.
100 s.append ('{}buffer->idx + {} < count &&\n'.format (self._indent (depth + 2), index + 1))
# One comparison line per further code point; the last one closes the
# parenthesis instead of appending `&&`.
101 for i, cp in enumerate (self._c[1:], start=1):
102 s.append ('{}0x{:04X}u == buffer->cur ({}).codepoint{}\n'.format (
103 self._indent (depth + 2), cp, index + i, ')' if i == len (self._c) - 1 else ' &&'))
104 s.append ('{}{{\n'.format (indent))
# Inside the emitted block: one `next_glyph` call per level already consumed
# (`index` of them), then set the flag.
105 for i in range (index):
106 s.append ('{}buffer->next_glyph ();\n'.format (self._indent (depth + 1)))
107 s.append ('{}matched = true;\n'.format (self._indent (depth + 1)))
108 s.append ('{}}}\n'.format (indent))
# Dictionary form: emit a `switch` on the code point at this level.
110 s.append ('{}switch (buffer->cur ({}).codepoint)\n'.format(indent, index or ''))
111 s.append ('{}{{\n'.format (indent))
# Group the first code points by the text rendered for their child set, so
# that children with identical bodies share one run of `case` labels.
112 cases = collections.defaultdict (set)
113 for first, rest in sorted (self._c.items ()):
114 cases[rest.__str__ (index + 1, depth + 2)].add (first)
# Emit the groups ordered by their smallest label, labels ascending within
# each group.
115 for body, labels in sorted (cases.items (), key=lambda b_ls: sorted (b_ls[1])[0]):
116 for i, cp in enumerate (sorted (labels)):
118 s.append (self._indent (depth + 1))
# A newline follows every fourth label, i.e. four labels per emitted line.
121 s.append ('case 0x{:04X}u:{}'.format (cp, '\n' if i % 4 == 3 else ''))
# NOTE(review): the statement under this condition is not visible here --
# presumably it terminates a partially filled label line; confirm.
122 if len (labels) % 4 != 0:
125 s.append ('{}break;\n'.format (self._indent (depth + 2)))
126 s.append ('{}}}\n'.format (indent))
# The first argument is read as UTF-8 text.
130 with open (sys.argv[1], encoding='utf-8') as f:
# Leading lines of the file are collected so they can be echoed into the
# header comment of the generated output.
# NOTE(review): the loop around the next two statements and its stop
# condition are not visible in this listing.
131 constraints_header = []
133 line = f.readline ().strip ()
136 constraints_header.append(line)
# Everything before the first ';' on a line is read as a whitespace-separated
# list of hexadecimal code points.
141 constraint = [int (cp, 16) for cp in line.split (';')[0].split ()]
# Lines that yield no code points are skipped; any real sequence must contain
# at least two code points.
142 if not constraint: continue
143 assert 2 <= len (constraint), 'Prohibited sequence is too short: {}'.format (constraint)
# A sequence is filed under the script of its first code point.
144 script = scripts[constraint[0]]
# Merge into the script's existing set, or start a new set for the script.
# NOTE(review): the `else:` between these branches is not visible here.
145 if script in constraints:
146 constraints[script].add (constraint)
148 constraints[script] = ConstraintSet (constraint)
# An input that produced no constraints at all is treated as an error.
149 assert constraints, 'No constraints found'
# From here on the script prints the generated C source to stdout.
# NOTE(review): several emitted lines (braces, blank lines, comment
# delimiters) are not visible in this listing, and whitespace inside the
# string literals may have been collapsed -- confirm against the real file.
151 print ('/* == Start of generated functions == */')
153 print (' * The following functions are generated by running:')
155 print (' * %s ms-use/IndicShapingInvalidCluster.txt Scripts.txt' % sys.argv[0])
157 print (' * on files with these headers:')
# Echo the header lines captured from both input files.
159 for line in constraints_header:
160 print (' * %s' % line.strip ())
162 for line in scripts_header:
163 print (' * %s' % line.strip ())
167 print ('#include "hb.hh"')
169 print ('#ifndef HB_NO_OT_SHAPE')
171 print ('#include "hb-ot-shape-complex-vowel-constraints.hh"')
# Emitted helper: outputs U+25CC and resets its continuation state.
173 print ('static void')
174 print ('_output_dotted_circle (hb_buffer_t *buffer)')
176 print (' hb_glyph_info_t &dottedcircle = buffer->output_glyph (0x25CCu);')
177 print (' _hb_glyph_info_reset_continuation (&dottedcircle);')
# Emitted helper: the helper above followed by `buffer->next_glyph ()`.
180 print ('static void')
181 print ('_output_with_dotted_circle (hb_buffer_t *buffer)')
183 print (' _output_dotted_circle (buffer);')
184 print (' buffer->next_glyph ();')
# Emitted entry point named in the module docstring.
189 print ('_hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED,')
190 print ('\t\t\t\t hb_buffer_t *buffer,')
191 print ('\t\t\t\t hb_font_t *font HB_UNUSED)')
193 print ('#ifdef HB_NO_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS')
196 print (' if (buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE)')
199 print (' /* UGLY UGLY UGLY business of adding dotted-circle in the middle of')
200 print (' * vowel-sequences that look like another vowel. Data for each script')
201 print (' * collected from the USE script development spec.')
203 print (' * https://github.com/harfbuzz/harfbuzz/issues/1019')
205 print (' bool processed = false;')
206 print (' buffer->clear_output ();')
207 print (' unsigned int count = buffer->len;')
208 print (' switch ((unsigned) buffer->props.script)')
# One emitted `case` per script, ordered by the range start recorded for that
# script in `script_order`. The loop variable reuses the name `constraints`;
# `sorted` has already built its list from the dictionary before the name is
# rebound, so the iteration itself is unaffected.
211 for script, constraints in sorted (constraints.items (), key=lambda s_c: script_order[s_c[0]]):
212 print (' case HB_SCRIPT_{}:'.format (script.upper ()))
213 print (' for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)')
215 print ('\tbool matched = false;')
# The rendered constraint text goes through `write` rather than `print`.
216 write (str (constraints))
217 print ('\tbuffer->next_glyph ();')
218 print ('\tif (matched) _output_with_dotted_circle (buffer);')
220 print (' processed = true;')
# Emitted epilogue: when some script case ran, output a remaining glyph if
# `idx` is still below `count`, then swap buffers.
227 print (' if (processed)')
229 print (' if (buffer->idx < count)')
230 print (' buffer->next_glyph ();')
231 print (' buffer->swap_buffers ();')
238 print ('/* == End of generated functions == */')