import re, types from Constants import * from sre_compile import MAXCODE PatternType = types.ListType class Parser: digit_expr = re.compile('[0-9]+') name_expr = re.compile('[a-zA-Z_][a-zA-Z0-9_-]*') def __init__(self, string, defines): self.yyin = string self.defines = defines self.yypos = 0 self.sre_able = True return def switch_context(self, string): context = (self.yyin, self.yypos) self.yyin = string self.yypos = 0 return context def restore_context(self, context): self.yyin, self.yypos = context return def match(self, char): try: next = self.yyin[self.yypos] except IndexError: return 0 if char == next: self.yypos += 1 return 1 return 0 def peek(self): try: return self.yyin[self.yypos] except IndexError: return None def match_regexp(self, pattern): match = pattern.match(self.yyin, self.yypos) if match: self.yypos = match.end() match = match.group() return match def peek_regexp(self, pattern): return pattern.match(self.yyin, self.yypos) is not None def get(self): try: char = self.yyin[self.yypos] except IndexError: return None if char == '\\': # escape sequence (returns integer) try: escape = self.yyin[self.yypos + 1] if escape == 'a': # \a - alert char = 0x07 elif escape == 'b': # \b - backspace char= 0x08 elif escape == 'f': # \f - formfeed char = 0x0C elif escape == 'n': # \n - newline char = 0x0A elif escape == 'r': # \r - carriage return char = 0x0D elif escape == 't': # \t - horizontal tab char = 0x09 elif escape == 'v': # \v - vertical tab char = 0x0B elif escape == 'x': # \xHH - 8-bit hexadecimal value digits = self.yyin[self.yypos + 2 : self.yypos + 4] try: char = int(digits, 16) except ValueError: escape = self.yyin[self.yypos : self.yypos + 4] raise SyntaxError('bad hexadecimal escape %r' % escape) self.yypos += 2 elif escape == 'u': # \uHHHH - 16-bit hexadecimal value digits = self.yyin[self.yypos + 2 : self.yypos + 6] try: char = int(digits, 16) except ValueError: escape = self.yyin[self.yypos : self.yypos + 6] raise SyntaxError('bad hexadecimal escape %r' % escape) self.yypos += 4 elif escape == 'U': # \UHHHHHHHH - 32-bit hexadecimal value digits = self.yyin[self.yypos + 2 : self.yypos + 10] try: char = int(digits, 16) except ValueError: escape = self.yyin[self.yypos : self.yypos + 10] raise SyntaxError('bad hexadecimal escape %r' % escape) if char > MAXCODE: self.sre_able = 0 self.yypos += 8 elif escape == '0': # \OOO - octal value digits = self.yyin[self.yypos + 2 : self.yypos + 4] try: char = int(digits, 8) except ValueError: escape = self.yyin[self.yypos : self.yypos + 4] raise SyntaxError('bad octal escape %r' % escape) self.yypos += 2 else: # \ - escaped special character char = ord(escape) self.yypos += 1 except IndexError: raise SyntaxError('bad escape sequence (end-of-file)') elif ord(char) > MAXCODE: self.sre_able = 0 self.yypos += 1 return char def parse(self): # Check for special EOF rule if self.yyin.startswith('<>'): if len(self.yyin) > 7: raise SyntaxError('EOF rule can only be used by itself') return [(AT, AT_END_STRING)] self.trailing_context = 0 try: return self.parse_regexp() except SyntaxError, e: raise SyntaxError("At offset %d: %s" % (self.yypos, e)) def parse_regexp(self, subpattern=False): """parse an alternation: a|b|c""" items = [] repeat = 1 while repeat: items.append(self.parse_branch()) repeat = self.match('|') paren = self.match(')') if subpattern and not paren: # This is in a group, must have closing paren raise SyntaxError('missing closing parenthesis') elif paren and not subpattern: raise SyntaxError('too many parentheses') # Not really an alternation, just a sequence if len(items) == 1: return items[0] # merge lone literals and character sets into a character set singles = [ item for item in items if len(item) == 1 ] charset = [] for item in [ item for item in singles if item[0][0] == LITERAL ]: items.remove(item) charset.append((CHARSET_LITERAL, item[0][1])) for item in [ item for item in singles if item[0][0] == CHARSET ]: items.remove(item) charset.extend(item[0][1]) if len(charset) == 1: items.insert(0, [(LITERAL, charset[0][1])]) elif charset: items.insert(0, [(CHARSET, charset)]) if len(items) == 1: return items[0] return [(BRANCH, items)] def parse_branch(self): # parse a simple pattern pattern = [] while self.peek() not in (None, "|", ")"): this = self.get() if isinstance(this, types.IntType): # escape sequence pattern.append((LITERAL, this)) elif this not in '"[?*+{.(/^$': # regular character pattern.append((LITERAL, ord(this))) elif this == '"': char = self.get() while char and char != '"': if isinstance(char, types.IntType): pattern.append((LITERAL, char)) else: pattern.append((LITERAL, ord(char))) char = self.get() if not char: raise SyntaxError('bad string literal (end-of-line)') elif this == "[": # character set if self.match('^'): type = NOT_CHARSET else: type = CHARSET pattern.append((type, self.parse_bracket_expression())) elif this == '{' and self.peek_regexp(self.name_expr): # named substitution name = self.match_regexp(self.name_expr) if not self.match("}"): raise SyntaxError("invalid substitution expression") try: subst = self.defines[name] except KeyError: raise SyntaxError('undefined definition {%s}' % name) # Cache the parsed forms for subsequent accesses if not isinstance(subst, PatternType): # Parse the definition into an AST old_context = self.switch_context(subst) subst = self.parse_regexp() self.restore_context(old_context) self.defines[name] = subst # Replace inline as an anonymous group pattern.append((SUBPATTERN, subst)) elif this in '?*+{': # repeat previous item if this == "?": min, max = 0, 1 elif this == "*": min, max = 0, -1 elif this == "+": min, max = 1, -1 elif this == "{": digits = self.match_regexp(self.digit_expr) try: min = int(digits, 10) except ValueError: raise SyntaxError('invalid lower bound') max = -1 if self.match(','): # bound repetition digits = self.match_regexp(self.digit_expr) try: max = int(digits, 10) except ValueError: raise SyntaxError('invalid upper bound') if max < min: raise SyntaxError('lower bound cannot be greater ' 'than upper bound') if not self.match("}"): raise SyntaxError("invalid repetition expression") # figure out which item to repeat if pattern: item = pattern[-1:] else: item = None if not item or (len(item) == 1 and item[0][0] in (BOL, EOL)): raise SyntaxError("nothing to repeat") if max == -1: pattern[-1] = (REPEAT, (min, item)) else: pattern[-1] = (REPEAT_RANGE, (min, max, item)) elif this == ".": pattern.append((ANY, None)) elif this == "(": pattern.append((SUBPATTERN, self.parse_regexp(True))) elif this == '/': if self.trailing_context: raise SyntaxError("trailing context used twice") self.trailing_context = 1 pattern.append((ASSERT, self.parse_regexp())) self.trailing_context = 0 elif this == "^": pattern.append((BOL, None)) elif this == "$": pattern.append((EOL, None)) else: # Placeholder just in case raise AssertionError("parser error, unhandled special %r" % this) return pattern def parse_bracket_expression(self): set = [] while 1: this = self.get() if this == "]" and set: # end of charset break elif this == '[' and self.peek() in (':', '=', '.'): next = self.get() if next == ':': raise NotImplementedError('character class') elif next == '=': raise NotImplementedError('equivalence class') elif next == '.': raise NotImplementedError('collating symbol') elif isinstance(this, types.IntType): # translated escape sequence code1 = (CHARSET_LITERAL, this) elif this is not None: # regular character code1 = (CHARSET_LITERAL, ord(this)) else: # end-of-file raise SyntaxError("bad bracket expression (end-of-file)") if self.match("-"): # potential range this = self.get() if this == "]": set.append(code1) set.append((CHARSET_LITERAL, ord("-"))) break if isinstance(this, types.IntType): hi = this else: hi = ord(this) lo = code1[1] if hi < lo: raise SyntaxError("negative character range") set.append((CHARSET_RANGE, (lo, hi))) else: set.append(code1) return set