1 import os.path
2 import token
3 from typing import IO, Any, Dict, Optional, Sequence, Set, Text, Tuple
4
5 from pegen import grammar
6 from pegen.grammar import (
7 Alt,
8 Cut,
9 Forced,
10 Gather,
11 GrammarVisitor,
12 Group,
13 Lookahead,
14 NamedItem,
15 NameLeaf,
16 NegativeLookahead,
17 Opt,
18 PositiveLookahead,
19 Repeat0,
20 Repeat1,
21 Rhs,
22 Rule,
23 StringLeaf,
24 )
25 from pegen.parser_generator import ParserGenerator
26
27 MODULE_PREFIX = """\
28 #!/usr/bin/env python3.8
29 # @generated by pegen from {filename}
30
31 import ast
32 import sys
33 import tokenize
34
35 from typing import Any, Optional
36
37 from pegen.parser import memoize, memoize_left_rec, logger, Parser
38
39 """
40 MODULE_SUFFIX = """
41
42 if __name__ == '__main__':
43 from pegen.parser import simple_parser_main
44 simple_parser_main({class_name})
45 """
46
47
48 class ESC[4;38;5;81mInvalidNodeVisitor(ESC[4;38;5;149mGrammarVisitor):
49 def visit_NameLeaf(self, node: NameLeaf) -> bool:
50 name = node.value
51 return name.startswith("invalid")
52
53 def visit_StringLeaf(self, node: StringLeaf) -> bool:
54 return False
55
56 def visit_NamedItem(self, node: NamedItem) -> bool:
57 return self.visit(node.item)
58
59 def visit_Rhs(self, node: Rhs) -> bool:
60 return any(self.visit(alt) for alt in node.alts)
61
62 def visit_Alt(self, node: Alt) -> bool:
63 return any(self.visit(item) for item in node.items)
64
65 def lookahead_call_helper(self, node: Lookahead) -> bool:
66 return self.visit(node.node)
67
68 def visit_PositiveLookahead(self, node: PositiveLookahead) -> bool:
69 return self.lookahead_call_helper(node)
70
71 def visit_NegativeLookahead(self, node: NegativeLookahead) -> bool:
72 return self.lookahead_call_helper(node)
73
74 def visit_Opt(self, node: Opt) -> bool:
75 return self.visit(node.node)
76
77 def visit_Repeat(self, node: Repeat0) -> Tuple[str, str]:
78 return self.visit(node.node)
79
80 def visit_Gather(self, node: Gather) -> Tuple[str, str]:
81 return self.visit(node.node)
82
83 def visit_Group(self, node: Group) -> bool:
84 return self.visit(node.rhs)
85
86 def visit_Cut(self, node: Cut) -> bool:
87 return False
88
89 def visit_Forced(self, node: Forced) -> bool:
90 return self.visit(node.node)
91
92
93 class ESC[4;38;5;81mPythonCallMakerVisitor(ESC[4;38;5;149mGrammarVisitor):
94 def __init__(self, parser_generator: ParserGenerator):
95 self.gen = parser_generator
96 self.cache: Dict[Any, Any] = {}
97
98 def visit_NameLeaf(self, node: NameLeaf) -> Tuple[Optional[str], str]:
99 name = node.value
100 if name == "SOFT_KEYWORD":
101 return "soft_keyword", "self.soft_keyword()"
102 if name in ("NAME", "NUMBER", "STRING", "OP", "TYPE_COMMENT"):
103 name = name.lower()
104 return name, f"self.{name}()"
105 if name in ("NEWLINE", "DEDENT", "INDENT", "ENDMARKER", "ASYNC", "AWAIT"):
106 # Avoid using names that can be Python keywords
107 return "_" + name.lower(), f"self.expect({name!r})"
108 return name, f"self.{name}()"
109
110 def visit_StringLeaf(self, node: StringLeaf) -> Tuple[str, str]:
111 return "literal", f"self.expect({node.value})"
112
113 def visit_Rhs(self, node: Rhs) -> Tuple[Optional[str], str]:
114 if node in self.cache:
115 return self.cache[node]
116 if len(node.alts) == 1 and len(node.alts[0].items) == 1:
117 self.cache[node] = self.visit(node.alts[0].items[0])
118 else:
119 name = self.gen.artifical_rule_from_rhs(node)
120 self.cache[node] = name, f"self.{name}()"
121 return self.cache[node]
122
123 def visit_NamedItem(self, node: NamedItem) -> Tuple[Optional[str], str]:
124 name, call = self.visit(node.item)
125 if node.name:
126 name = node.name
127 return name, call
128
129 def lookahead_call_helper(self, node: Lookahead) -> Tuple[str, str]:
130 name, call = self.visit(node.node)
131 head, tail = call.split("(", 1)
132 assert tail[-1] == ")"
133 tail = tail[:-1]
134 return head, tail
135
136 def visit_PositiveLookahead(self, node: PositiveLookahead) -> Tuple[None, str]:
137 head, tail = self.lookahead_call_helper(node)
138 return None, f"self.positive_lookahead({head}, {tail})"
139
140 def visit_NegativeLookahead(self, node: NegativeLookahead) -> Tuple[None, str]:
141 head, tail = self.lookahead_call_helper(node)
142 return None, f"self.negative_lookahead({head}, {tail})"
143
144 def visit_Opt(self, node: Opt) -> Tuple[str, str]:
145 name, call = self.visit(node.node)
146 # Note trailing comma (the call may already have one comma
147 # at the end, for example when rules have both repeat0 and optional
148 # markers, e.g: [rule*])
149 if call.endswith(","):
150 return "opt", call
151 else:
152 return "opt", f"{call},"
153
154 def visit_Repeat0(self, node: Repeat0) -> Tuple[str, str]:
155 if node in self.cache:
156 return self.cache[node]
157 name = self.gen.artificial_rule_from_repeat(node.node, False)
158 self.cache[node] = name, f"self.{name}()," # Also a trailing comma!
159 return self.cache[node]
160
161 def visit_Repeat1(self, node: Repeat1) -> Tuple[str, str]:
162 if node in self.cache:
163 return self.cache[node]
164 name = self.gen.artificial_rule_from_repeat(node.node, True)
165 self.cache[node] = name, f"self.{name}()" # But no trailing comma here!
166 return self.cache[node]
167
168 def visit_Gather(self, node: Gather) -> Tuple[str, str]:
169 if node in self.cache:
170 return self.cache[node]
171 name = self.gen.artifical_rule_from_gather(node)
172 self.cache[node] = name, f"self.{name}()" # No trailing comma here either!
173 return self.cache[node]
174
175 def visit_Group(self, node: Group) -> Tuple[Optional[str], str]:
176 return self.visit(node.rhs)
177
178 def visit_Cut(self, node: Cut) -> Tuple[str, str]:
179 return "cut", "True"
180
181 def visit_Forced(self, node: Forced) -> Tuple[str, str]:
182 if isinstance(node.node, Group):
183 _, val = self.visit(node.node.rhs)
184 return "forced", f"self.expect_forced({val}, '''({node.node.rhs!s})''')"
185 else:
186 return (
187 "forced",
188 f"self.expect_forced(self.expect({node.node.value}), {node.node.value!r})",
189 )
190
191
192 class ESC[4;38;5;81mPythonParserGenerator(ESC[4;38;5;149mParserGenerator, ESC[4;38;5;149mGrammarVisitor):
193 def __init__(
194 self,
195 grammar: grammar.Grammar,
196 file: Optional[IO[Text]],
197 tokens: Set[str] = set(token.tok_name.values()),
198 location_formatting: Optional[str] = None,
199 unreachable_formatting: Optional[str] = None,
200 ):
201 tokens.add("SOFT_KEYWORD")
202 super().__init__(grammar, tokens, file)
203 self.callmakervisitor: PythonCallMakerVisitor = PythonCallMakerVisitor(self)
204 self.invalidvisitor: InvalidNodeVisitor = InvalidNodeVisitor()
205 self.unreachable_formatting = unreachable_formatting or "None # pragma: no cover"
206 self.location_formatting = (
207 location_formatting
208 or "lineno=start_lineno, col_offset=start_col_offset, "
209 "end_lineno=end_lineno, end_col_offset=end_col_offset"
210 )
211
212 def generate(self, filename: str) -> None:
213 self.collect_rules()
214 header = self.grammar.metas.get("header", MODULE_PREFIX)
215 if header is not None:
216 basename = os.path.basename(filename)
217 self.print(header.rstrip("\n").format(filename=basename))
218 subheader = self.grammar.metas.get("subheader", "")
219 if subheader:
220 self.print(subheader)
221 cls_name = self.grammar.metas.get("class", "GeneratedParser")
222 self.print("# Keywords and soft keywords are listed at the end of the parser definition.")
223 self.print(f"class {cls_name}(Parser):")
224 for rule in self.all_rules.values():
225 self.print()
226 with self.indent():
227 self.visit(rule)
228
229 self.print()
230 with self.indent():
231 self.print(f"KEYWORDS = {tuple(self.keywords)}")
232 self.print(f"SOFT_KEYWORDS = {tuple(self.soft_keywords)}")
233
234 trailer = self.grammar.metas.get("trailer", MODULE_SUFFIX.format(class_name=cls_name))
235 if trailer is not None:
236 self.print(trailer.rstrip("\n"))
237
238 def alts_uses_locations(self, alts: Sequence[Alt]) -> bool:
239 for alt in alts:
240 if alt.action and "LOCATIONS" in alt.action:
241 return True
242 for n in alt.items:
243 if isinstance(n.item, Group) and self.alts_uses_locations(n.item.rhs.alts):
244 return True
245 return False
246
247 def visit_Rule(self, node: Rule) -> None:
248 is_loop = node.is_loop()
249 is_gather = node.is_gather()
250 rhs = node.flatten()
251 if node.left_recursive:
252 if node.leader:
253 self.print("@memoize_left_rec")
254 else:
255 # Non-leader rules in a cycle are not memoized,
256 # but they must still be logged.
257 self.print("@logger")
258 else:
259 self.print("@memoize")
260 node_type = node.type or "Any"
261 self.print(f"def {node.name}(self) -> Optional[{node_type}]:")
262 with self.indent():
263 self.print(f"# {node.name}: {rhs}")
264 self.print("mark = self._mark()")
265 if self.alts_uses_locations(node.rhs.alts):
266 self.print("tok = self._tokenizer.peek()")
267 self.print("start_lineno, start_col_offset = tok.start")
268 if is_loop:
269 self.print("children = []")
270 self.visit(rhs, is_loop=is_loop, is_gather=is_gather)
271 if is_loop:
272 self.print("return children")
273 else:
274 self.print("return None")
275
276 def visit_NamedItem(self, node: NamedItem) -> None:
277 name, call = self.callmakervisitor.visit(node.item)
278 if node.name:
279 name = node.name
280 if not name:
281 self.print(call)
282 else:
283 if name != "cut":
284 name = self.dedupe(name)
285 self.print(f"({name} := {call})")
286
287 def visit_Rhs(self, node: Rhs, is_loop: bool = False, is_gather: bool = False) -> None:
288 if is_loop:
289 assert len(node.alts) == 1
290 for alt in node.alts:
291 self.visit(alt, is_loop=is_loop, is_gather=is_gather)
292
293 def visit_Alt(self, node: Alt, is_loop: bool, is_gather: bool) -> None:
294 has_cut = any(isinstance(item.item, Cut) for item in node.items)
295 with self.local_variable_context():
296 if has_cut:
297 self.print("cut = False")
298 if is_loop:
299 self.print("while (")
300 else:
301 self.print("if (")
302 with self.indent():
303 first = True
304 for item in node.items:
305 if first:
306 first = False
307 else:
308 self.print("and")
309 self.visit(item)
310 if is_gather:
311 self.print("is not None")
312
313 self.print("):")
314 with self.indent():
315 action = node.action
316 if not action:
317 if is_gather:
318 assert len(self.local_variable_names) == 2
319 action = (
320 f"[{self.local_variable_names[0]}] + {self.local_variable_names[1]}"
321 )
322 else:
323 if self.invalidvisitor.visit(node):
324 action = "UNREACHABLE"
325 elif len(self.local_variable_names) == 1:
326 action = f"{self.local_variable_names[0]}"
327 else:
328 action = f"[{', '.join(self.local_variable_names)}]"
329 elif "LOCATIONS" in action:
330 self.print("tok = self._tokenizer.get_last_non_whitespace_token()")
331 self.print("end_lineno, end_col_offset = tok.end")
332 action = action.replace("LOCATIONS", self.location_formatting)
333
334 if is_loop:
335 self.print(f"children.append({action})")
336 self.print(f"mark = self._mark()")
337 else:
338 if "UNREACHABLE" in action:
339 action = action.replace("UNREACHABLE", self.unreachable_formatting)
340 self.print(f"return {action}")
341
342 self.print("self._reset(mark)")
343 # Skip remaining alternatives if a cut was reached.
344 if has_cut:
345 self.print("if cut: return None")