import types from Constants import * class Compiler: def __init__(self): self.bitmaps = {} self.headers = {} return def compile(self, ast): return self.compile_node(ast, []) def compile_node(self, node, bytecode, final=1): append = bytecode.append extend = bytecode.extend for op, data in node: if op in (ANY, BOL, EOL, EOF, SUCCESS, FAILURE): # append(OP_CODES[op]) elif op is LITERAL: # append(OP_CODES[op]) append(data) elif op in (CHARSET, NOT_CHARSET): if len(data) == 1 and data[0][0] is CHARSET_LITERAL: if op is CHARSET: # append(OP_CODES[LITERAL]) else: # append(OP_CODES[NOT_LITERAL]) append(data[0][1]) else: # code = self.compile_charset(data) append(OP_CODES[op]) append(len(code)+1) extend(code) elif op is REPEAT: # code = self.compile_node(data[1], []) append(OP_CODES[op]) append(len(code)+2) append(data[0]) extend(code) elif op is REPEAT_RANGE: # code = self.compile_node(data[2], []) append(OP_CODES[op]) append(len(code)+3) append(data[0]) append(data[1]) extend(code) elif op is SUBPATTERN: self.compile_node(data, bytecode, 0) elif op is ASSERT: # code = self.compile_node(data, []) append(OP_CODES[op]) append(len(code)+1) extend(code) elif op is BRANCH: # ... append(OP_CODES[op]) for item in data: code = self.compile_node(item, []) append(len(code)+1) extend(code) append(0) # null else: raise AssertionError("compiler error, unsupported operand %r" % op) if final: append(OP_CODES[SUCCESS]) return bytecode def compile_charset(self, node): bytecode = [] append = bytecode.append extend = bytecode.extend for op, data in self.compress_charset(node): if op is CHARSET_LITERAL: append(CHARSET_CODES[op]) append(data) elif op is CHARSET_RANGE: append(CHARSET_CODES[op]) extend(data) elif op in (CHARSET_SMALL, CHARSET_BIG): append(CHARSET_CODES[op]) append(data) else: raise AssertionError('compiler error, unsupported operand %r' % op) append(CHARSET_CODES[FAILURE]) return bytecode def compress_charset(self, node): smallchars = {} chars = {} widechars = {} for op, data in node: if op is CHARSET_LITERAL: if data < 256: smallchars[data] = 1 elif data < 65536: chars[data] = 1 else: widechars[data] = 1 elif op is CHARSET_RANGE: for ordinal in xrange(data[0], data[1]+1): if ordinal < 256: smallchars[ordinal] = 1 elif ordinal < 65536: chars[ordinal] = 1 else: widechars[ordinal] = 1 charset = [] if not chars: # use a "small" charset (0-255) charmap = [0]*256 for ordinal in smallchars: charmap[ordinal] = 1 chunk = tuple(charmap) block = len(self.bitmaps) id = self.bitmaps.setdefault(chunk, block) charset.append((CHARSET_SMALL, id)) else: # use a "big" charset (0-65535) op = CHARSET_BIG chars.update(smallchars) charmap = [0]*65536 for ordinal in chars: charmap[ordinal] = 1 header = [0]*256 for i in xrange(256): chunk = tuple(charmap[i*256:(i+1)*256]) id = self.bitmaps.setdefault(chunk, len(self.bitmaps)) header[i] = id id = self.headers.setdefault(tuple(header), len(self.headers)) charset.append((CHARSET_BIG, id)) if widechars: # Find the largest ordinal in the charset max_ordinal = max(widechars.keys()) start = length = 0 # By checking one higher than the largest, the last item is flushed for ordinal in xrange(65536, max_ordinal+2): if widechars.has_key(ordinal): if not length: start = ordinal length += 1 elif length: if length == 1: charset.append((CHARSET_LITERAL, start)) else: charset.append((CHARSET_RANGE, (start, start+length-1))) length = 0 return charset