Module peg
[hide private]
[frames] | no frames]

Source Code for Module peg

  1  # The MIT License 
  2  # 
  3  # Copyright (c) 2009 Mitchell 
  4  # 
  5  # Permission is hereby granted, free of charge, to any person obtaining a copy 
  6  # of this software and associated documentation files (the "Software"), to deal 
  7  # in the Software without restriction, including without limitation the rights 
  8  # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 
  9  # copies of the Software, and to permit persons to whom the Software is 
 10  # furnished to do so, subject to the following conditions: 
 11  # 
 12  # The above copyright notice and this permission notice shall be included in 
 13  # all copies or substantial portions of the Software. 
 14  # 
 15  # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
 16  # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
 17  # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
 18  # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
 19  # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
 20  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
 21  # THE SOFTWARE. 
 22   
 23  import copy 
 24  import sys # for sys.maxint 
 25   
 26  # operator constants 
 27  LITERAL_STRING = 0 
 28  CHARACTER_CLASS = 1 
 29  ANY_CHARACTER = 2 
 30  OPTIONAL = 3 
 31  ZERO_OR_MORE = 4 
 32  ONE_OR_MORE = 5 
 33  AND_PREDICATE = 6 
 34  NOT_PREDICATE = 7 
 35  PRIORITIZED_CHOICE = 8 
 36  BACKREF = 9 
 37  CALLABLE = 10 
 38  # capture constants 
 39  SIMPLE_CAPTURE = 11 
 40  POSITION_CAPTURE = 12 
 41  CONSTANT_CAPTURE = 13 
 42  LIST_CAPTURE = 14 
 43   
44 -class Pattern:
45 """ 46 A PEG pattern or set of patterns. 47 48 PEG Syntax:: 49 50 Operator Type Precedence Description 51 -------- ------------ ---------- ----------- 52 ' ' primary 5 Literal string 53 " " primary 5 Literal string 54 [' '] primary 5 Character class 55 1 primary 5 Any character 56 (e) primary 5 Grouping 57 e[:1] unary suffix 4 Optional 58 e[0:] unary suffix 4 Zero-or-more 59 e[1:] unary suffix 4 One-or-more 60 +e unary prefix 3 And-predicate 61 -e unary prefix 3 Not-predicate 62 e1 + e2 binary 2 Sequence 63 e1 | e2 binary 1 Prioritized Choice 64 65 Additionally:: 66 67 Operator Type Precedence Description 68 -------- ------------ ---------- ----------- 69 e1 - e2 binary 2 Syntactic sugar for -e2 + e1 70 """ 71 REPETITION_LIMIT = 100 # do not infinitely recurse for a bad pattern 72
73 - def __init__(self, pattern):
74 """ 75 Creates and returns a PEG pattern from a given parameter. 76 77 For a given parameter: 78 * literal string - Matches its specific text (case insensitively) in a 79 subject string. 80 * list - Must contain a single string which is treated as a character class, 81 otherwise a ``ValueError`` will be raised. 82 Matches any character in a subject string contained in this class. 83 * integer - Must be positive or a ``ValueError`` will be raised. 84 Matches exactly that many characters in a subject string. 85 * function - The function will be passed the subject string and current 86 index the PEG is at. If a non-negative integer is returned, 87 that integer becomes the new index, indicating a match. 88 * dictionary - A dictionary is interpreted as a grammar. These are useful 89 for defining recursive patterns. Each dictionary has key 90 names with pattern values, otherwise a ``TypeError`` will be 91 raised. Also, there must be a 'default' key whose value is 92 the name key for the default pattern to initially match. A 93 ``ValueError`` will be raised if it does not exist. If it 94 points to an invalid name key, ``KeyError`` will be raised. 95 * Pattern - The pattern is simply copied. 96 * anything else - ``TypeError`` will be raised. 97 98 Warning: left-recursive patterns are not checked for and will cause a 99 ``RuntimeError`` when ``match`` is called due to a recursion overflow. 100 101 Examples:: 102 103 # creates a pattern that matches 'foo' literally 104 patt = Pattern('foo') 105 106 # creates a pattern that matches 'a', or 'b', or 'c' character once 107 patt = Pattern(['abc']) 108 109 # creates a pattern that matches any single character 110 patt = Pattern(1) 111 112 # creates a pattern that matches any even, single digit number 113 def even_single_digit(subject, index): 114 digit = subject[index] 115 if digit >= '0' and digit <= '9' and int(digit) % 2 == 0: 116 return index + 1 117 patt = Pattern(even_single_digit) 118 119 # creates a pattern that searches for 'e' 120 patt = Pattern({ 121 1: Pattern('e') | Pattern(1) + PatternBackRef(1), 122 'default': 1 123 }) 124 """ 125 126 self.pattern_list = [] 127 # literal-string 128 if isinstance(pattern, str): 129 self.pattern_list = [[LITERAL_STRING, pattern]] 130 131 # character-class 132 elif isinstance(pattern, list): 133 if len(pattern) == 1 and isinstance(pattern[0], str): 134 self.pattern_list = [[CHARACTER_CLASS, pattern[0]]] 135 else: 136 raise ValueError("Single string expected for Character class.") 137 138 # any-character 139 elif isinstance(pattern, int): 140 if pattern > 0: 141 self.pattern_list = [[ANY_CHARACTER, pattern]] 142 else: 143 raise ValueError("Positive integer expected for Any character.") 144 145 # function 146 elif callable(pattern): 147 self.pattern_list = [[CALLABLE, pattern]] 148 149 # grammar 150 elif isinstance(pattern, dict): 151 if 'default' not in pattern: 152 raise ValueError("'default' key expected in grammar.") 153 for name, patt in pattern.iteritems(): # invalid in Python 3.0 154 if name != 'default': 155 pattern[name] = Pattern(patt) # copy 156 default_name = pattern['default'] 157 default_pattern = pattern.get(default_name, None) 158 if not isinstance(default_pattern, Pattern): 159 raise KeyError("'%s' not defined in grammar." % default_name) 160 self.pattern_list = default_pattern.pattern_list 161 grammar = pattern 162 self.pattern_list[0].append(grammar) 163 164 # pattern 165 elif isinstance(pattern, Pattern): 166 self.pattern_list = copy.deepcopy(pattern.pattern_list) 167 168 # other 169 else: 170 raise TypeError("Cannot convert given type to Pattern.")
171
172 - def __getitem__(self, key):
173 """ 174 Returns a PEG repetition of this pattern given a slice. 175 176 Valid slices are: 177 * ``[:1]`` - matches zero or one occurance of the pattern. 178 * ``[0:]`` - matches zero or more occurances of the pattern. 179 * ``[1:]`` - matches one or more occurances of the pattern. 180 * anything else - ``ValueError`` will be raised. 181 182 Warning: repeating patterns that do not consume input zero or more times 183 will cause a ``RuntimeError`` after ``self.REPETITION_LIMIT`` loops. This 184 cannot be checked for. 185 186 Examples:: 187 188 # creates a pattern that matches 'foo' either not at all or only once 189 patt = Pattern('foo')[:0] 190 191 # creates a pattern that matches 'bar' either not at all or many times 192 patt = Pattern('bar')[0:] 193 194 # creates a pattern that matches 'baz' at least once 195 patt = Pattern('baz')[1:] 196 """ 197 new_pattern = Pattern(self) 198 if isinstance(key, slice): 199 start, stop = key.start, key.stop 200 if start == 0 and stop == 1: # optional 201 new_pattern.pattern_list = [[OPTIONAL, new_pattern.pattern_list]] 202 elif start == 0 and stop == sys.maxint: # zero-or-more 203 new_pattern.pattern_list = [[ZERO_OR_MORE, new_pattern.pattern_list]] 204 elif start == 1 and stop == sys.maxint: # one-or-more 205 new_pattern.pattern_list = [[ONE_OR_MORE, new_pattern.pattern_list]] 206 else: 207 raise ValueError("[:1], [0:], or [1:] slice expected.") 208 else: 209 raise ValueError("[:1], [0:], or [1:] slice expected.") 210 211 return new_pattern
212
213 - def __pos__(self):
214 """ 215 Returns a PEG And-predicate for the current pattern. 216 217 An And-predicate matches the pattern, but does not consume input. 218 219 Examples:: 220 221 # creates a pattern that matches 'foo' literally, but does not consume it 222 patt = +Pattern('foo') 223 224 # creates a mattern that matches 'foo', but only consumes 'f' 225 patt = +Pattern('foo') + 'f' 226 """ 227 new_pattern = Pattern(self) 228 new_pattern.pattern_list = [[AND_PREDICATE, new_pattern.pattern_list]] 229 return new_pattern
230
231 - def __neg__(self):
232 """ 233 Returns a PEG Not-predicate for the current pattern. 234 235 A Not-predicate matches the inverse of the pattern, consuming no input. 236 237 Examples:: 238 239 # creates a pattern that does not match the characters 'a', 'b', or 'c' 240 patt = -Pattern(['abc']) 241 242 # creates a pattern that matches 'foo', but not 'bar' afterwards 243 patt = Pattern('foo') + -Pattern('bar') 244 """ 245 new_pattern = Pattern(self) 246 new_pattern.pattern_list = [[NOT_PREDICATE, new_pattern.pattern_list]] 247 return new_pattern
248
249 - def __add__(self, other):
250 """ 251 Joins this pattern and another together in a PEG Sequence and returns it. 252 253 A Sequence matches the first pattern and then the second. 254 255 If the other pattern is a literal string, list, integer, function, or 256 dictionary, it is converted into a PEG pattern as in ``__init__`` first. 257 258 Examples:: 259 260 # creates a pattern that matches 'foo' and 'bar' literally 261 patt = Pattern('foo') + 'bar' 262 """ 263 new_pattern = Pattern(self) 264 if isinstance(other, (str, list, int)): 265 pattern = Pattern(other) 266 new_pattern.pattern_list.append(pattern.pattern_list[0]) 267 elif isinstance(other, Pattern): 268 for pattern in other.pattern_list: 269 new_pattern.pattern_list.append(pattern) 270 else: 271 raise TypeError("Cannot convert right operand to Pattern.") 272 273 return new_pattern
274
275 - def __sub__(self, other):
276 """ 277 Syntactic sugar for ``-other + self``. 278 279 Examples:: 280 281 # creates a pattern that matches any character but 'a' 282 patt = Pattern(1) - 'a' 283 """ 284 return -Pattern(other) + self
285
286 - def __or__(self, other):
287 """ 288 Join this pattern and another together in a PEG Prioritized Choice and 289 returns it. 290 291 A Prioritized Choice matches either pattern, the first taking precedence. 292 293 If the other pattern is a literal string, list, integer, function, or 294 dictionary, it is converted into a PEG pattern as in ``__init__`` first. 295 296 Examples:: 297 298 # creates a pattern that matches 'foo' first, or else 'bar' 299 patt = Pattern('foo') | 'bar' 300 """ 301 new_pattern = Pattern(self) 302 if isinstance(other, (str, list, int)): 303 pattern = Pattern(other) 304 new_pattern.pattern_list = [[PRIORITIZED_CHOICE, 305 [new_pattern.pattern_list, 306 pattern.pattern_list]]] 307 elif isinstance(other, Pattern): 308 new_pattern.pattern_list = [[PRIORITIZED_CHOICE, 309 [new_pattern.pattern_list, 310 other.pattern_list]]] 311 else: 312 raise TypeError("Cannot convert right operand to Pattern.") 313 314 return new_pattern
315
316 - def match(self, subject, init=0):
317 """ 318 Matches this PEG pattern against a subject string starting at init. 319 If any captures are contained in the pattern, they are returned as a tuple. 320 Otherwise, the index of the first character in subject after the match is 321 returned, or ``-1`` if the match failed. 322 323 ``IndexError`` will be raised if ``init >= len(subject)``. 324 325 Examples:: 326 327 patt = Pattern('foo') 328 index = patt.match('foobar') 329 #result: index = 3 330 331 patt = Pattern('bar') 332 index = patt.match('foobar', 3) 333 #result: index = 6 334 335 patt = SimpleCapture('foo') 336 text, = patt.match('foobar') 337 #result: text = 'foo' 338 """ 339 if init >= len(subject): 340 raise IndexError("init >= len(subject)") 341 342 captures = [] 343 userdata = { 344 'pattern': self, 345 'capture_stack': [captures], 346 'current_grammar': None 347 } 348 349 index = self.__match(self, subject, init, userdata) 350 351 if len(captures) > 0: 352 return captures 353 else: 354 return index
355
356 - def __match(self, pattern, subject, init, userdata):
357 """ 358 Private helper function used internally. 359 Behaves like ``match``, but always returns an index. 360 """ 361 pattern_list = None 362 if isinstance(pattern, Pattern): 363 pattern_list = pattern.pattern_list 364 elif isinstance(pattern, list): 365 pattern_list = pattern 366 367 if init >= len(subject): 368 return -1 369 370 index = init 371 capture_stack = userdata['capture_stack'] 372 pattern_has_grammar = False 373 374 for pattern in pattern_list: 375 ptype = pattern[0] 376 patt = pattern[1] 377 if len(pattern) > 2: 378 grammar = pattern[2] 379 current_grammar = userdata['current_grammar'] 380 if current_grammar and current_grammar != grammar: 381 raise RuntimeError("Cannot nest grammars.") 382 elif not current_grammar: 383 userdata['current_grammar'] = grammar 384 pattern_has_grammar = True # applies to all patterns in pattern_list 385 386 # Patterns 387 if ptype == LITERAL_STRING: 388 if subject.startswith(patt, index): 389 index += len(patt) 390 else: 391 return -1 392 elif ptype == CHARACTER_CLASS: 393 if patt.find(subject[index]) != -1: 394 index += 1 395 else: 396 return -1 397 elif ptype == ANY_CHARACTER: 398 num_chars = patt 399 index += num_chars 400 elif ptype == OPTIONAL: 401 new_index = self.__match(patt, subject, index, userdata) 402 if new_index != -1: 403 index = new_index 404 elif ptype == ZERO_OR_MORE: 405 count = 0 406 new_index = self.__match(patt, subject, index, userdata) 407 while new_index != -1: 408 if new_index == index: 409 count += 1 410 if count > self.REPETITION_LIMIT: 411 raise RuntimeError("Max repetition reached.") 412 else: 413 index = new_index 414 new_index = self.__match(patt, subject, index, userdata) 415 elif ptype == ONE_OR_MORE: 416 index = self.__match(patt, subject, index, userdata) 417 if index != -1: 418 count = 0 419 new_index = self.__match(patt, subject, index, userdata) 420 while new_index != -1: 421 if new_index == index: 422 count += 1 423 if count > self.REPETITION_LIMIT: 424 raise RuntimeError("Max repetition reached.") 425 else: 426 index = new_index 427 new_index = self.__match(patt, subject, index, userdata) 428 else: 429 return -1 430 elif ptype == AND_PREDICATE: 431 if self.__match(patt, subject, index, userdata) == -1: 432 return -1 433 elif ptype == NOT_PREDICATE: 434 if self.__match(patt, subject, index, userdata) != -1: 435 return -1 436 elif ptype == PRIORITIZED_CHOICE: 437 choice1, choice2 = patt[0], patt[1] 438 capture_stack.append([]) # for pending captures 439 new_index = self.__match(choice1, subject, index, userdata) 440 pending_captures = capture_stack.pop() 441 if new_index != -1: 442 # choice matched; add pending captures to capture_stack 443 for capture in pending_captures: 444 capture_stack[-1].append(capture) 445 index = new_index 446 else: 447 capture_stack.append([]) # for pending captures 448 index = self.__match(choice2, subject, index, userdata) 449 pending_captures = capture_stack.pop() 450 if index != -1: 451 # choice matched; add pending captures to capture_stack 452 for capture in pending_captures: 453 capture_stack[-1].append(capture) 454 else: 455 return -1 456 elif ptype == BACKREF: 457 current_grammar = userdata['current_grammar'] 458 if current_grammar: 459 name = patt 460 ref = current_grammar.get(name, None) 461 if ref: 462 index = self.__match(ref, subject, index, userdata) 463 if index == -1: 464 return -1 465 else: 466 raise LookupError("Backref '%s' is undefined." % name) 467 else: 468 raise LookupError("Backref '%s' used outside of grammar." % patt) 469 elif ptype == CALLABLE: 470 function = patt 471 index = function.__call__(subject, index) 472 if not isinstance(index, int): 473 return -1 474 475 # Captures 476 # 477 # For a prioritized choice, all captures should be pending so if the 478 # choice fails, those captures can be discarded. Therefore, when a 479 # prioritized choice is entered, a new capture list is pushed to the 480 # 'capture_stack' so all captures can be added to it. If the choice 481 # matches, all pending captures are added to the old capture list 482 # automatically. 483 # 484 # List captures use the same 'capture_stack' mechanism described above, 485 # but add the pending capture list as a list rather than adding each 486 # individual capture to the old capture list. 487 elif ptype == SIMPLE_CAPTURE: 488 start = index 489 end = self.__match(patt, subject, index, userdata) 490 if end != -1: 491 capture = subject[start:end] 492 capture_stack[-1].append(capture) 493 index = end 494 else: 495 return -1 496 elif ptype == POSITION_CAPTURE: 497 capture_stack[-1].append(index) 498 elif ptype == CONSTANT_CAPTURE: 499 constant = patt 500 capture_stack[-1].append(constant) 501 elif ptype == LIST_CAPTURE: 502 capture_stack.append([]) # for capture list 503 index = self.__match(patt, subject, index, userdata) 504 capture_list = capture_stack.pop() 505 if index != -1: 506 capture_stack[-1].append(capture_list) 507 else: 508 return -1 509 510 # Not implemented 511 else: 512 raise RuntimeError("'%s' not implemented." % ptype) 513 514 if pattern_has_grammar: 515 userdata['current_grammar'] = None # done with the current grammar 516 517 return index
518
519 -class PatternBackRef(Pattern):
520 """ 521 Creates and returns a reference in the current PEG grammar for the given name 522 (valid only inside a PEG grammar). 523 524 name is the key whose pattern is being referred to in the grammar dictionary. 525 526 Examples:: 527 528 # creates a pattern to match balanced parentheses 529 patt = Pattern({ 530 1: Pattern('(') + ((Pattern(1) - ['()']) | PatternBackRef(1))[0:] + ')', 531 'default': 1 532 }) 533 """
534 - def __init__(self, ref):
535 self.pattern_list = [[BACKREF, ref]]
536 537 # Captures 538
539 -class SimpleCapture(Pattern):
540 """ 541 Creates and returns a capture for the text of the given pattern. 542 543 If the pattern is a literal string, list, integer, function, or dictionary, 544 it is converted into a PEG pattern as in ``Pattern.__init__``. 545 546 Examples:: 547 548 # creates a pattern to capture text inside quotations marks 549 patt = Pattern('"') + SimpleCapture((Pattern(1) - '"')[0:]) + '"' 550 """
551 - def __init__(self, pattern):
552 Pattern.__init__(self, pattern) 553 self.pattern_list = [[SIMPLE_CAPTURE, self.pattern_list]]
554
555 -class PositionCapture(Pattern):
556 """ 557 Creates and returns a position capture at the point of insertion in a PEG 558 pattern (matches the empty string). 559 560 Examples:: 561 562 # creates a pattern to capture the start and end indices of quoted text 563 patt = Pattern('"') + PositionCapture() + (Pattern(1) - '"')[0:] + \ 564 PositionCapture() + '"' 565 """
566 - def __init__(self):
567 self.pattern_list = [[POSITION_CAPTURE, None]]
568
569 -class ConstantCapture(Pattern):
570 """ 571 Creates and returns a constant capture at the point of insertion in a PEG 572 pattern (matches the empty string). 573 574 If more than one constant value is passed, the constant capture is returned as 575 as list. Otherwise the single value is returned. 576 577 Examples:: 578 579 # creates a pattern to capture 'quoted_text' and then the quoted text itself 580 patt = Pattern('"') + ConstantCapture('quoted_text') + \ 581 SimpleCapture((Pattern(1) - '"')[0:]) + '"' 582 """
583 - def __init__(self, *values):
584 if len(values) == 1: 585 values = values[0] 586 self.pattern_list = [[CONSTANT_CAPTURE, values]]
587
588 -class ListCapture(Pattern):
589 """ 590 Creates and returns a list capture for the Captures in a given pattern. 591 592 If the pattern is a literal string, list, integer, function, or dictionary, 593 it is converted into a PEG pattern as in ``Pattern.__init__``. 594 595 Examples:: 596 597 # creates a pattern to capture all words as a list 598 word = SimpleCapture((Pattern(1) - ' \\t\\r\\n')[1:]) 599 spaces = Pattern(' \\t\\r\\n')[1:] 600 patt = ListCapture({ 1: word + (spaces + word)[0:], 'default': 1 }) 601 """
602 - def __init__(self, pattern):
603 Pattern.__init__(self, pattern) 604 self.pattern_list = [[LIST_CAPTURE, self.pattern_list]]
605