1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 import copy
24 import sys
25
26
27 LITERAL_STRING = 0
28 CHARACTER_CLASS = 1
29 ANY_CHARACTER = 2
30 OPTIONAL = 3
31 ZERO_OR_MORE = 4
32 ONE_OR_MORE = 5
33 AND_PREDICATE = 6
34 NOT_PREDICATE = 7
35 PRIORITIZED_CHOICE = 8
36 BACKREF = 9
37 CALLABLE = 10
38
39 SIMPLE_CAPTURE = 11
40 POSITION_CAPTURE = 12
41 CONSTANT_CAPTURE = 13
42 LIST_CAPTURE = 14
43
45 """
46 A PEG pattern or set of patterns.
47
48 PEG Syntax::
49
50 Operator Type Precedence Description
51 -------- ------------ ---------- -----------
52 ' ' primary 5 Literal string
53 " " primary 5 Literal string
54 [' '] primary 5 Character class
55 1 primary 5 Any character
56 (e) primary 5 Grouping
57 e[:1] unary suffix 4 Optional
58 e[0:] unary suffix 4 Zero-or-more
59 e[1:] unary suffix 4 One-or-more
60 +e unary prefix 3 And-predicate
61 -e unary prefix 3 Not-predicate
62 e1 + e2 binary 2 Sequence
63 e1 | e2 binary 1 Prioritized Choice
64
65 Additionally::
66
67 Operator Type Precedence Description
68 -------- ------------ ---------- -----------
69 e1 - e2 binary 2 Syntactic sugar for -e2 + e1
70 """
71 REPETITION_LIMIT = 100
72
74 """
75 Creates and returns a PEG pattern from a given parameter.
76
77 For a given parameter:
78 * literal string - Matches its specific text (case insensitively) in a
79 subject string.
80 * list - Must contain a single string which is treated as a character class,
81 otherwise a ``ValueError`` will be raised.
82 Matches any character in a subject string contained in this class.
83 * integer - Must be positive or a ``ValueError`` will be raised.
84 Matches exactly that many characters in a subject string.
85 * function - The function will be passed the subject string and current
86 index the PEG is at. If a non-negative integer is returned,
87 that integer becomes the new index, indicating a match.
88 * dictionary - A dictionary is interpreted as a grammar. These are useful
89 for defining recursive patterns. Each dictionary has key
90 names with pattern values, otherwise a ``TypeError`` will be
91 raised. Also, there must be a 'default' key whose value is
92 the name key for the default pattern to initially match. A
93 ``ValueError`` will be raised if it does not exist. If it
94 points to an invalid name key, ``KeyError`` will be raised.
95 * Pattern - The pattern is simply copied.
96 * anything else - ``TypeError`` will be raised.
97
98 Warning: left-recursive patterns are not checked for and will cause a
99 ``RuntimeError`` when ``match`` is called due to a recursion overflow.
100
101 Examples::
102
103 # creates a pattern that matches 'foo' literally
104 patt = Pattern('foo')
105
106 # creates a pattern that matches 'a', or 'b', or 'c' character once
107 patt = Pattern(['abc'])
108
109 # creates a pattern that matches any single character
110 patt = Pattern(1)
111
112 # creates a pattern that matches any even, single digit number
113 def even_single_digit(subject, index):
114 digit = subject[index]
115 if digit >= '0' and digit <= '9' and int(digit) % 2 == 0:
116 return index + 1
117 patt = Pattern(even_single_digit)
118
119 # creates a pattern that searches for 'e'
120 patt = Pattern({
121 1: Pattern('e') | Pattern(1) + PatternBackRef(1),
122 'default': 1
123 })
124 """
125
126 self.pattern_list = []
127
128 if isinstance(pattern, str):
129 self.pattern_list = [[LITERAL_STRING, pattern]]
130
131
132 elif isinstance(pattern, list):
133 if len(pattern) == 1 and isinstance(pattern[0], str):
134 self.pattern_list = [[CHARACTER_CLASS, pattern[0]]]
135 else:
136 raise ValueError("Single string expected for Character class.")
137
138
139 elif isinstance(pattern, int):
140 if pattern > 0:
141 self.pattern_list = [[ANY_CHARACTER, pattern]]
142 else:
143 raise ValueError("Positive integer expected for Any character.")
144
145
146 elif callable(pattern):
147 self.pattern_list = [[CALLABLE, pattern]]
148
149
150 elif isinstance(pattern, dict):
151 if 'default' not in pattern:
152 raise ValueError("'default' key expected in grammar.")
153 for name, patt in pattern.iteritems():
154 if name != 'default':
155 pattern[name] = Pattern(patt)
156 default_name = pattern['default']
157 default_pattern = pattern.get(default_name, None)
158 if not isinstance(default_pattern, Pattern):
159 raise KeyError("'%s' not defined in grammar." % default_name)
160 self.pattern_list = default_pattern.pattern_list
161 grammar = pattern
162 self.pattern_list[0].append(grammar)
163
164
165 elif isinstance(pattern, Pattern):
166 self.pattern_list = copy.deepcopy(pattern.pattern_list)
167
168
169 else:
170 raise TypeError("Cannot convert given type to Pattern.")
171
173 """
174 Returns a PEG repetition of this pattern given a slice.
175
176 Valid slices are:
177 * ``[:1]`` - matches zero or one occurance of the pattern.
178 * ``[0:]`` - matches zero or more occurances of the pattern.
179 * ``[1:]`` - matches one or more occurances of the pattern.
180 * anything else - ``ValueError`` will be raised.
181
182 Warning: repeating patterns that do not consume input zero or more times
183 will cause a ``RuntimeError`` after ``self.REPETITION_LIMIT`` loops. This
184 cannot be checked for.
185
186 Examples::
187
188 # creates a pattern that matches 'foo' either not at all or only once
189 patt = Pattern('foo')[:0]
190
191 # creates a pattern that matches 'bar' either not at all or many times
192 patt = Pattern('bar')[0:]
193
194 # creates a pattern that matches 'baz' at least once
195 patt = Pattern('baz')[1:]
196 """
197 new_pattern = Pattern(self)
198 if isinstance(key, slice):
199 start, stop = key.start, key.stop
200 if start == 0 and stop == 1:
201 new_pattern.pattern_list = [[OPTIONAL, new_pattern.pattern_list]]
202 elif start == 0 and stop == sys.maxint:
203 new_pattern.pattern_list = [[ZERO_OR_MORE, new_pattern.pattern_list]]
204 elif start == 1 and stop == sys.maxint:
205 new_pattern.pattern_list = [[ONE_OR_MORE, new_pattern.pattern_list]]
206 else:
207 raise ValueError("[:1], [0:], or [1:] slice expected.")
208 else:
209 raise ValueError("[:1], [0:], or [1:] slice expected.")
210
211 return new_pattern
212
214 """
215 Returns a PEG And-predicate for the current pattern.
216
217 An And-predicate matches the pattern, but does not consume input.
218
219 Examples::
220
221 # creates a pattern that matches 'foo' literally, but does not consume it
222 patt = +Pattern('foo')
223
224 # creates a mattern that matches 'foo', but only consumes 'f'
225 patt = +Pattern('foo') + 'f'
226 """
227 new_pattern = Pattern(self)
228 new_pattern.pattern_list = [[AND_PREDICATE, new_pattern.pattern_list]]
229 return new_pattern
230
232 """
233 Returns a PEG Not-predicate for the current pattern.
234
235 A Not-predicate matches the inverse of the pattern, consuming no input.
236
237 Examples::
238
239 # creates a pattern that does not match the characters 'a', 'b', or 'c'
240 patt = -Pattern(['abc'])
241
242 # creates a pattern that matches 'foo', but not 'bar' afterwards
243 patt = Pattern('foo') + -Pattern('bar')
244 """
245 new_pattern = Pattern(self)
246 new_pattern.pattern_list = [[NOT_PREDICATE, new_pattern.pattern_list]]
247 return new_pattern
248
250 """
251 Joins this pattern and another together in a PEG Sequence and returns it.
252
253 A Sequence matches the first pattern and then the second.
254
255 If the other pattern is a literal string, list, integer, function, or
256 dictionary, it is converted into a PEG pattern as in ``__init__`` first.
257
258 Examples::
259
260 # creates a pattern that matches 'foo' and 'bar' literally
261 patt = Pattern('foo') + 'bar'
262 """
263 new_pattern = Pattern(self)
264 if isinstance(other, (str, list, int)):
265 pattern = Pattern(other)
266 new_pattern.pattern_list.append(pattern.pattern_list[0])
267 elif isinstance(other, Pattern):
268 for pattern in other.pattern_list:
269 new_pattern.pattern_list.append(pattern)
270 else:
271 raise TypeError("Cannot convert right operand to Pattern.")
272
273 return new_pattern
274
276 """
277 Syntactic sugar for ``-other + self``.
278
279 Examples::
280
281 # creates a pattern that matches any character but 'a'
282 patt = Pattern(1) - 'a'
283 """
284 return -Pattern(other) + self
285
287 """
288 Join this pattern and another together in a PEG Prioritized Choice and
289 returns it.
290
291 A Prioritized Choice matches either pattern, the first taking precedence.
292
293 If the other pattern is a literal string, list, integer, function, or
294 dictionary, it is converted into a PEG pattern as in ``__init__`` first.
295
296 Examples::
297
298 # creates a pattern that matches 'foo' first, or else 'bar'
299 patt = Pattern('foo') | 'bar'
300 """
301 new_pattern = Pattern(self)
302 if isinstance(other, (str, list, int)):
303 pattern = Pattern(other)
304 new_pattern.pattern_list = [[PRIORITIZED_CHOICE,
305 [new_pattern.pattern_list,
306 pattern.pattern_list]]]
307 elif isinstance(other, Pattern):
308 new_pattern.pattern_list = [[PRIORITIZED_CHOICE,
309 [new_pattern.pattern_list,
310 other.pattern_list]]]
311 else:
312 raise TypeError("Cannot convert right operand to Pattern.")
313
314 return new_pattern
315
316 - def match(self, subject, init=0):
317 """
318 Matches this PEG pattern against a subject string starting at init.
319 If any captures are contained in the pattern, they are returned as a tuple.
320 Otherwise, the index of the first character in subject after the match is
321 returned, or ``-1`` if the match failed.
322
323 ``IndexError`` will be raised if ``init >= len(subject)``.
324
325 Examples::
326
327 patt = Pattern('foo')
328 index = patt.match('foobar')
329 #result: index = 3
330
331 patt = Pattern('bar')
332 index = patt.match('foobar', 3)
333 #result: index = 6
334
335 patt = SimpleCapture('foo')
336 text, = patt.match('foobar')
337 #result: text = 'foo'
338 """
339 if init >= len(subject):
340 raise IndexError("init >= len(subject)")
341
342 captures = []
343 userdata = {
344 'pattern': self,
345 'capture_stack': [captures],
346 'current_grammar': None
347 }
348
349 index = self.__match(self, subject, init, userdata)
350
351 if len(captures) > 0:
352 return captures
353 else:
354 return index
355
356 - def __match(self, pattern, subject, init, userdata):
357 """
358 Private helper function used internally.
359 Behaves like ``match``, but always returns an index.
360 """
361 pattern_list = None
362 if isinstance(pattern, Pattern):
363 pattern_list = pattern.pattern_list
364 elif isinstance(pattern, list):
365 pattern_list = pattern
366
367 if init >= len(subject):
368 return -1
369
370 index = init
371 capture_stack = userdata['capture_stack']
372 pattern_has_grammar = False
373
374 for pattern in pattern_list:
375 ptype = pattern[0]
376 patt = pattern[1]
377 if len(pattern) > 2:
378 grammar = pattern[2]
379 current_grammar = userdata['current_grammar']
380 if current_grammar and current_grammar != grammar:
381 raise RuntimeError("Cannot nest grammars.")
382 elif not current_grammar:
383 userdata['current_grammar'] = grammar
384 pattern_has_grammar = True
385
386
387 if ptype == LITERAL_STRING:
388 if subject.startswith(patt, index):
389 index += len(patt)
390 else:
391 return -1
392 elif ptype == CHARACTER_CLASS:
393 if patt.find(subject[index]) != -1:
394 index += 1
395 else:
396 return -1
397 elif ptype == ANY_CHARACTER:
398 num_chars = patt
399 index += num_chars
400 elif ptype == OPTIONAL:
401 new_index = self.__match(patt, subject, index, userdata)
402 if new_index != -1:
403 index = new_index
404 elif ptype == ZERO_OR_MORE:
405 count = 0
406 new_index = self.__match(patt, subject, index, userdata)
407 while new_index != -1:
408 if new_index == index:
409 count += 1
410 if count > self.REPETITION_LIMIT:
411 raise RuntimeError("Max repetition reached.")
412 else:
413 index = new_index
414 new_index = self.__match(patt, subject, index, userdata)
415 elif ptype == ONE_OR_MORE:
416 index = self.__match(patt, subject, index, userdata)
417 if index != -1:
418 count = 0
419 new_index = self.__match(patt, subject, index, userdata)
420 while new_index != -1:
421 if new_index == index:
422 count += 1
423 if count > self.REPETITION_LIMIT:
424 raise RuntimeError("Max repetition reached.")
425 else:
426 index = new_index
427 new_index = self.__match(patt, subject, index, userdata)
428 else:
429 return -1
430 elif ptype == AND_PREDICATE:
431 if self.__match(patt, subject, index, userdata) == -1:
432 return -1
433 elif ptype == NOT_PREDICATE:
434 if self.__match(patt, subject, index, userdata) != -1:
435 return -1
436 elif ptype == PRIORITIZED_CHOICE:
437 choice1, choice2 = patt[0], patt[1]
438 capture_stack.append([])
439 new_index = self.__match(choice1, subject, index, userdata)
440 pending_captures = capture_stack.pop()
441 if new_index != -1:
442
443 for capture in pending_captures:
444 capture_stack[-1].append(capture)
445 index = new_index
446 else:
447 capture_stack.append([])
448 index = self.__match(choice2, subject, index, userdata)
449 pending_captures = capture_stack.pop()
450 if index != -1:
451
452 for capture in pending_captures:
453 capture_stack[-1].append(capture)
454 else:
455 return -1
456 elif ptype == BACKREF:
457 current_grammar = userdata['current_grammar']
458 if current_grammar:
459 name = patt
460 ref = current_grammar.get(name, None)
461 if ref:
462 index = self.__match(ref, subject, index, userdata)
463 if index == -1:
464 return -1
465 else:
466 raise LookupError("Backref '%s' is undefined." % name)
467 else:
468 raise LookupError("Backref '%s' used outside of grammar." % patt)
469 elif ptype == CALLABLE:
470 function = patt
471 index = function.__call__(subject, index)
472 if not isinstance(index, int):
473 return -1
474
475
476
477
478
479
480
481
482
483
484
485
486
487 elif ptype == SIMPLE_CAPTURE:
488 start = index
489 end = self.__match(patt, subject, index, userdata)
490 if end != -1:
491 capture = subject[start:end]
492 capture_stack[-1].append(capture)
493 index = end
494 else:
495 return -1
496 elif ptype == POSITION_CAPTURE:
497 capture_stack[-1].append(index)
498 elif ptype == CONSTANT_CAPTURE:
499 constant = patt
500 capture_stack[-1].append(constant)
501 elif ptype == LIST_CAPTURE:
502 capture_stack.append([])
503 index = self.__match(patt, subject, index, userdata)
504 capture_list = capture_stack.pop()
505 if index != -1:
506 capture_stack[-1].append(capture_list)
507 else:
508 return -1
509
510
511 else:
512 raise RuntimeError("'%s' not implemented." % ptype)
513
514 if pattern_has_grammar:
515 userdata['current_grammar'] = None
516
517 return index
518
520 """
521 Creates and returns a reference in the current PEG grammar for the given name
522 (valid only inside a PEG grammar).
523
524 name is the key whose pattern is being referred to in the grammar dictionary.
525
526 Examples::
527
528 # creates a pattern to match balanced parentheses
529 patt = Pattern({
530 1: Pattern('(') + ((Pattern(1) - ['()']) | PatternBackRef(1))[0:] + ')',
531 'default': 1
532 })
533 """
535 self.pattern_list = [[BACKREF, ref]]
536
537
538
540 """
541 Creates and returns a capture for the text of the given pattern.
542
543 If the pattern is a literal string, list, integer, function, or dictionary,
544 it is converted into a PEG pattern as in ``Pattern.__init__``.
545
546 Examples::
547
548 # creates a pattern to capture text inside quotations marks
549 patt = Pattern('"') + SimpleCapture((Pattern(1) - '"')[0:]) + '"'
550 """
554
556 """
557 Creates and returns a position capture at the point of insertion in a PEG
558 pattern (matches the empty string).
559
560 Examples::
561
562 # creates a pattern to capture the start and end indices of quoted text
563 patt = Pattern('"') + PositionCapture() + (Pattern(1) - '"')[0:] + \
564 PositionCapture() + '"'
565 """
568
570 """
571 Creates and returns a constant capture at the point of insertion in a PEG
572 pattern (matches the empty string).
573
574 If more than one constant value is passed, the constant capture is returned as
575 as list. Otherwise the single value is returned.
576
577 Examples::
578
579 # creates a pattern to capture 'quoted_text' and then the quoted text itself
580 patt = Pattern('"') + ConstantCapture('quoted_text') + \
581 SimpleCapture((Pattern(1) - '"')[0:]) + '"'
582 """
584 if len(values) == 1:
585 values = values[0]
586 self.pattern_list = [[CONSTANT_CAPTURE, values]]
587
589 """
590 Creates and returns a list capture for the Captures in a given pattern.
591
592 If the pattern is a literal string, list, integer, function, or dictionary,
593 it is converted into a PEG pattern as in ``Pattern.__init__``.
594
595 Examples::
596
597 # creates a pattern to capture all words as a list
598 word = SimpleCapture((Pattern(1) - ' \\t\\r\\n')[1:])
599 spaces = Pattern(' \\t\\r\\n')[1:]
600 patt = ListCapture({ 1: word + (spaces + word)[0:], 'default': 1 })
601 """
605