import argparse
import sys
import time
import token
import tokenize
import traceback
from abc import abstractmethod
from typing import Any, Callable, ClassVar, Dict, Optional, Tuple, Type, TypeVar, cast

from pegen.tokenizer import Mark, Tokenizer, exact_token_types

T = TypeVar("T")
P = TypeVar("P", bound="Parser")
F = TypeVar("F", bound=Callable[..., Any])


def logger(method: F) -> F:
    """For non-memoized functions that we want to be logged.

    (In practice this is only non-leader left-recursive functions.)
    """
    method_name = method.__name__

    def logger_wrapper(self: P, *args: object) -> T:
        if not self._verbose:
            return method(self, *args)
        argsr = ",".join(repr(arg) for arg in args)
        fill = " " * self._level
        print(f"{fill}{method_name}({argsr}) .... (looking at {self.showpeek()})")
        self._level += 1
        tree = method(self, *args)
        self._level -= 1
        print(f"{fill}... {method_name}({argsr}) --> {tree!s:.200}")
        return tree

    logger_wrapper.__wrapped__ = method  # type: ignore
    return cast(F, logger_wrapper)

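# A hypothetical example, for illustration only (real rule methods are emitted by
# pegen's code generator): a rule that takes part in a left-recursion cycle without
# being the cycle's leader is wrapped with @logger rather than @memoize, so verbose
# runs still trace it:
#
#     @logger
#     def factor(self) -> Optional[Any]:
#         ...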

def memoize(method: F) -> F:
    """Memoize a symbol method."""
    method_name = method.__name__

    def memoize_wrapper(self: P, *args: object) -> T:
        mark = self._mark()
        key = mark, method_name, args
        # Fast path: cache hit, and not verbose.
        if key in self._cache and not self._verbose:
            tree, endmark = self._cache[key]
            self._reset(endmark)
            return tree
        # Slow path: no cache hit, or verbose.
        verbose = self._verbose
        argsr = ",".join(repr(arg) for arg in args)
        fill = " " * self._level
        if key not in self._cache:
            if verbose:
                print(f"{fill}{method_name}({argsr}) ... (looking at {self.showpeek()})")
            self._level += 1
            tree = method(self, *args)
            self._level -= 1
            if verbose:
                print(f"{fill}... {method_name}({argsr}) -> {tree!s:.200}")
            endmark = self._mark()
            self._cache[key] = tree, endmark
        else:
            tree, endmark = self._cache[key]
            if verbose:
                print(f"{fill}{method_name}({argsr}) -> {tree!s:.200}")
            self._reset(endmark)
        return tree

    memoize_wrapper.__wrapped__ = method  # type: ignore
    return cast(F, memoize_wrapper)

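# A hypothetical example (real rule methods are emitted by pegen's code generator):
# @memoize caches each rule's result keyed on (position, rule name, args), so when
# backtracking retries the same rule at the same position it is replayed from the
# cache instead of being re-parsed.
#
#     @memoize
#     def term(self) -> Optional[Any]:
#         mark = self._mark()
#         if num := self.number():
#             return num
#         self._reset(mark)
#         return None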

def memoize_left_rec(method: Callable[[P], Optional[T]]) -> Callable[[P], Optional[T]]:
    """Memoize a left-recursive symbol method."""
    method_name = method.__name__

    def memoize_left_rec_wrapper(self: P) -> Optional[T]:
        mark = self._mark()
        key = mark, method_name, ()
        # Fast path: cache hit, and not verbose.
        if key in self._cache and not self._verbose:
            tree, endmark = self._cache[key]
            self._reset(endmark)
            return tree
        # Slow path: no cache hit, or verbose.
        verbose = self._verbose
        fill = " " * self._level
        if key not in self._cache:
            if verbose:
                print(f"{fill}{method_name} ... (looking at {self.showpeek()})")
            self._level += 1

            # For left-recursive rules we manipulate the cache and
            # loop until the rule shows no progress, then pick the
            # previous result. For an explanation of why this works, see
            # https://github.com/PhilippeSigaud/Pegged/wiki/Left-Recursion
            # (But we use the memoization cache instead of a static
            # variable; this is perhaps similar to the approach of Warth et al.,
            # http://web.cs.ucla.edu/~todd/research/pub.php?id=pepm08.)
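            # Concretely, for a rule like "expr: expr '+' term | term" on the
            # input "1+2+3": the first pass can only match the non-recursive
            # alternative ("1"); each later pass re-runs the rule body with the
            # previous result available from the cache for the recursive call,
            # growing the match to "1+2" and then "1+2+3", until a pass makes
            # no further progress.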

            # Prime the cache with a failure.
            self._cache[key] = None, mark
            lastresult, lastmark = None, mark
            depth = 0
            if verbose:
                print(f"{fill}Recursive {method_name} at {mark} depth {depth}")

            while True:
                self._reset(mark)
                self.in_recursive_rule += 1
                try:
                    result = method(self)
                finally:
                    self.in_recursive_rule -= 1
                endmark = self._mark()
                depth += 1
                if verbose:
                    print(
                        f"{fill}Recursive {method_name} at {mark} depth {depth}: {result!s:.200} to {endmark}"
                    )
                if not result:
                    if verbose:
                        print(f"{fill}Fail with {lastresult!s:.200} to {lastmark}")
                    break
                if endmark <= lastmark:
                    if verbose:
                        print(f"{fill}Bailing with {lastresult!s:.200} to {lastmark}")
                    break
                self._cache[key] = lastresult, lastmark = result, endmark

            self._reset(lastmark)
            tree = lastresult

            self._level -= 1
            if verbose:
                print(f"{fill}{method_name}() -> {tree!s:.200} [cached]")
            if tree:
                endmark = self._mark()
            else:
                endmark = mark
                self._reset(endmark)
            self._cache[key] = tree, endmark
        else:
            tree, endmark = self._cache[key]
            if verbose:
                print(f"{fill}{method_name}() -> {tree!s:.200} [fresh]")
            if tree:
                self._reset(endmark)
        return tree

    memoize_left_rec_wrapper.__wrapped__ = method  # type: ignore
    return memoize_left_rec_wrapper

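# A hypothetical sketch of a left-recursive rule (real code is emitted by pegen for
# a grammar rule such as "expr: expr '+' term | term"; "term" is assumed to be
# another generated rule method):
#
#     @memoize_left_rec
#     def expr(self) -> Optional[Any]:
#         mark = self._mark()
#         if (e := self.expr()) and self.expect("+") and (t := self.term()):
#             return ("+", e, t)
#         self._reset(mark)
#         if t := self.term():
#             return t
#         self._reset(mark)
#         return None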

class Parser:
    """Parsing base class."""

    KEYWORDS: ClassVar[Tuple[str, ...]]

    SOFT_KEYWORDS: ClassVar[Tuple[str, ...]]

    def __init__(self, tokenizer: Tokenizer, *, verbose: bool = False):
        self._tokenizer = tokenizer
        self._verbose = verbose
        self._level = 0
        self._cache: Dict[Tuple[Mark, str, Tuple[Any, ...]], Tuple[Any, Mark]] = {}
        # Integer tracking whether we are in a left recursive rule or not. Can be useful
        # for error reporting.
        self.in_recursive_rule = 0
        # Pass through common tokenizer methods.
        self._mark = self._tokenizer.mark
        self._reset = self._tokenizer.reset

    @abstractmethod
    def start(self) -> Any:
        pass

    def showpeek(self) -> str:
        tok = self._tokenizer.peek()
        return f"{tok.start[0]}.{tok.start[1]}: {token.tok_name[tok.type]}:{tok.string!r}"

    @memoize
    def name(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.NAME and tok.string not in self.KEYWORDS:
            return self._tokenizer.getnext()
        return None

    @memoize
    def number(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.NUMBER:
            return self._tokenizer.getnext()
        return None

    @memoize
    def string(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.STRING:
            return self._tokenizer.getnext()
        return None

    @memoize
    def op(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.OP:
            return self._tokenizer.getnext()
        return None

    @memoize
    def type_comment(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.TYPE_COMMENT:
            return self._tokenizer.getnext()
        return None

    @memoize
    def soft_keyword(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.NAME and tok.string in self.SOFT_KEYWORDS:
            return self._tokenizer.getnext()
        return None

    @memoize
    def expect(self, type: str) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.string == type:
            return self._tokenizer.getnext()
        if type in exact_token_types:
            if tok.type == exact_token_types[type]:
                return self._tokenizer.getnext()
        if type in token.__dict__:
            if tok.type == token.__dict__[type]:
                return self._tokenizer.getnext()
        if tok.type == token.OP and tok.string == type:
            return self._tokenizer.getnext()
        return None

    def expect_forced(self, res: Any, expectation: str) -> Optional[tokenize.TokenInfo]:
        if res is None:
            raise self.make_syntax_error(f"expected {expectation}")
        return res

    def positive_lookahead(self, func: Callable[..., T], *args: object) -> T:
        mark = self._mark()
        ok = func(*args)
        self._reset(mark)
        return ok

    def negative_lookahead(self, func: Callable[..., object], *args: object) -> bool:
        mark = self._mark()
        ok = func(*args)
        self._reset(mark)
        return not ok

    def make_syntax_error(self, message: str, filename: str = "<unknown>") -> SyntaxError:
        tok = self._tokenizer.diagnose()
        return SyntaxError(message, (filename, tok.start[0], 1 + tok.start[1], tok.line))

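# A minimal hand-written subclass, for illustration only (in practice concrete
# parsers are generated by pegen; the names below are hypothetical). It parses a
# single number followed by a newline; expect() accepts either an exact token
# string like "+" or a token type name like "NEWLINE":
#
#     class ToyParser(Parser):
#         KEYWORDS = ()
#         SOFT_KEYWORDS = ()
#
#         @memoize
#         def start(self) -> Optional[Any]:
#             mark = self._mark()
#             if (n := self.number()) and self.expect("NEWLINE") and self.expect("ENDMARKER"):
#                 return n
#             self._reset(mark)
#             return None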

def simple_parser_main(parser_class: Type[Parser]) -> None:
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="Print timing stats; repeat for more debug output",
    )
    argparser.add_argument(
        "-q", "--quiet", action="store_true", help="Don't print the parsed program"
    )
    argparser.add_argument("filename", help="Input file ('-' to use stdin)")

    args = argparser.parse_args()
    verbose = args.verbose
    verbose_tokenizer = verbose >= 3
    verbose_parser = verbose == 2 or verbose >= 4

    t0 = time.time()

    filename = args.filename
    if filename == "" or filename == "-":
        filename = "<stdin>"
        file = sys.stdin
    else:
        file = open(args.filename)
    try:
        tokengen = tokenize.generate_tokens(file.readline)
        tokenizer = Tokenizer(tokengen, verbose=verbose_tokenizer)
        parser = parser_class(tokenizer, verbose=verbose_parser)
        tree = parser.start()
        try:
            if file.isatty():
                endpos = 0
            else:
                endpos = file.tell()
        except IOError:
            endpos = 0
    finally:
        if file is not sys.stdin:
            file.close()

    t1 = time.time()

    if not tree:
        err = parser.make_syntax_error("invalid syntax", filename)
        traceback.print_exception(err.__class__, err, None)
        sys.exit(1)

    if not args.quiet:
        print(tree)

    if verbose:
        dt = t1 - t0
        diag = tokenizer.diagnose()
        nlines = diag.end[0]
        if diag.type == token.ENDMARKER:
            nlines -= 1
        print(f"Total time: {dt:.3f} sec; {nlines} lines", end="")
        if endpos:
            print(f" ({endpos} bytes)", end="")
        if dt:
            print(f"; {nlines / dt:.0f} lines/sec")
        else:
            print()
        print("Cache sizes:")
        print(f" token array : {len(tokenizer._tokens):10}")
        print(f" cache : {len(parser._cache):10}")
        ## print_memstats()
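
# Typical entry point of a generated parser module (sketch; "GeneratedParser"
# stands for whatever concrete Parser subclass pegen emitted):
#
#     if __name__ == "__main__":
#         simple_parser_main(GeneratedParser)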