0001"""Pull-style interface for ElementTree."""
0002
0003from __future__ import generators
0004
0005__revision__ = "$Rev$"
0006__date__ = "$Date: 2005-02-16 15:43:38 -0500 (Wed, 16 Feb 2005) $"
0007__author__ = "Ryan Tomayko (rtomayko@gmail.com)"
0008__copyright__ = "Copyright 2004-2005, Ryan Tomayko"
0009__license__ = "MIT <http://www.opensource.org/licenses/mit-license.php>"
0010
0011from kid.et import *
0012from kid.util import open_resource, QuickTextReader
0013
0014from xml.parsers import expat
0015
0016
0017import htmlentitydefs
0018default_entity_map = {}
0019default_external_dtd = []
0020for k, v in htmlentitydefs.name2codepoint.items():
0021 default_entity_map[k] = unichr(v)
0022 default_external_dtd.append('<!ENTITY %s "&#%d;">' % (k, v))
0023default_external_dtd = '\n'.join(default_external_dtd)
0024
0025
class InvalidStreamState(Exception):
    """Signals that a stream's events arrived in an impossible order.

    ElementStream.strip() raises it when the element nesting depth it
    tracks drops below zero.
    """

    def __init__(self, msg="Invalid stream state."):
        """Create the exception; msg becomes the exception message."""
        Exception.__init__(self, msg)
0029
def XML(text, fragment=1, encoding=None, xmlns=None):
    """Element generator that reads from a string.

    text     - the markup to parse, as a str or unicode object.
    fragment - when true, text is wrapped in a synthetic <xml> element
               before parsing and that wrapper is stripped from the result
               again.  Forced to 0 when text starts with '<?xml ' or
               '<!DOCTYPE ', i.e. when it is a complete document.
    encoding - encoding of a byte-string text.  Replaced by 'utf-16' when
               text is unicode, because the text is then encoded to UTF-16
               before it is handed to the parser.
    xmlns    - optional namespace URI declared as the default namespace on
               the synthetic wrapper element (fragments only).

    Returns an ElementStream for complete documents.  For fragments the
    result is the generator produced by ElementStream.strip().
    """
    if text.startswith('<?xml ') or text.startswith('<!DOCTYPE '):
        fragment = 0

    if fragment:
        if xmlns:
            text = '<xml xmlns="%s">%s</xml>' % (xmlns, text)
        else:
            text = '<xml>%s</xml>' % text

    # Remember the text as written, before any encoding: the parser reads
    # _sourcetext to quote the offending line in parse errors, and UTF-16
    # bytes split on '\n' would not give readable lines.
    source_text = text
    if isinstance(text, unicode):
        encoding = 'utf-16'
        text = text.encode(encoding)

    p = Parser(QuickTextReader(text), encoding)
    p._sourcetext = source_text
    stream = ElementStream(_coalesce(p, encoding=encoding))
    if fragment:
        # Drop the synthetic <xml> wrapper added above.
        return stream.strip()
    return stream
0053
def document(file, encoding=None, filename=None):
    """Return an ElementStream over the events parsed from a document.

    file     - an object with a read() method, or anything else, which is
               then passed to open_resource(file, 'rb') to obtain one.
    encoding - handed to Parser() and used by _coalesce() when converting
               text to unicode.
    filename - name recorded on the parser as _filename.  Defaults to file
               itself when file had to be opened here, otherwise to
               '<string>'.
    """
    needs_opening = not hasattr(file, 'read')

    if filename is None:
        if needs_opening:
            filename = file
        else:
            filename = '<string>'

    if needs_opening:
        file = open_resource(file, 'rb')

    p = Parser(file, encoding)
    p._filename = filename
    events = _coalesce(p, encoding=encoding)
    return ElementStream(events)
0065
class ElementStream(object):

    """Provides a pull/streaming interface to ElementTree.

    Instances of this class are iterable. Most methods of the class act on
    the Element that is currently being visited.

    Iterating yields (event, item) pairs.  The `current` attribute holds
    the item of the most recent START event and is reset to None by every
    END event.

    """

    def __init__(self, stream, current=None):
        """Create an ElementStream.

        stream - an iterator that returns ElementStream events.  An object
                 that has both `tag` and `attrib` attributes is treated as
                 an element instead and converted to events first.
        current - If an Element is provided in this parameter then
                  it is yielded as the first element in the stream.

        """
        if hasattr(stream, 'tag') and hasattr(stream, 'attrib'):
            stream = self._pull(stream, tail=1)
        self.current = None
        self._iter = self._track(iter(stream), current)

    def __iter__(self):
        """Return the single underlying event iterator.

        Every call returns the same iterator, so all consumers of one
        ElementStream share one read position.
        """
        return self._iter

    def expand(self):
        """Expand the current item in the stream as an Element.

        Consumes events and rebuilds the tree: each started element is
        appended to the element enclosing it, and TEXT becomes the `text`
        of the element just started or the `tail` of the element just
        closed.

        With a current element, consumption stops at its END event and that
        element is returned.  Without one, all remaining events are
        consumed and the most recently closed element is returned (None if
        no element was closed).
        """
        current = self.current
        if current is None:
            # Placeholder parent: a plain list collects top-level elements.
            current = []
        stack = [current]
        last = None
        for ev, item in self._iter:
            if ev == START:
                current = item
                if stack:
                    stack[-1].append(current)
                last = None
                stack.append(current)
            elif ev == END:
                last = stack.pop()
                assert last is item
                if not stack:
                    break
            elif ev == TEXT:
                if last is not None:
                    last.tail = item
                else:
                    # NOTE: with no current element, TEXT arriving before
                    # the first START fails here, because the placeholder
                    # list has no `text` attribute.
                    current.text = item
        if isinstance(last, list):
            return last[0]
        else:
            return last

    def strip(self, levels=1):
        """Yield the remaining events without the outermost element levels.

        levels - number of enclosing element levels whose START and END
                 events are dropped.  Other events at exactly that depth
                 are still yielded.

        This is a generator: nothing is consumed, and `current` is not
        inspected, until iteration begins.  It stops once the depth is back
        at zero.  Raises InvalidStreamState if the depth becomes negative.
        """
        if self.current is not None:
            depth = 1
        else:
            depth = 0
        for (ev, item) in self._iter:
            if ev == START:
                depth += 1
            if depth > levels or (depth == levels and ev not in (START, END)):
                yield (ev, item)
            if ev == END:
                depth -= 1
                if depth == 0:
                    break
                elif depth < 0:
                    raise InvalidStreamState()

    def eat(self):
        """Eat the current element and all descendant items.

        Events are consumed until the nesting depth returns to zero, or the
        stream runs out.  Returns self so that calls can be chained.
        """
        if self.current is not None:
            depth = 1
        else:
            depth = 0
        for (ev, item) in self._iter:
            if ev == START:
                depth += 1
            elif ev == END:
                depth -= 1
                if depth == 0:
                    break
        return self

    def _pull(self, elem, tail=0):
        """Generate stream events for an element and its descendants.

        elem - the element to walk; it is read but never modified.
        tail - when true, the element's own tail text is emitted as a TEXT
               event after its END event.

        Every START/END pair carries a fresh Element that copies the
        original's tag and attributes.
        """
        orig = elem
        elem = Element(orig.tag, dict(orig.attrib))

        # Comments and processing instructions carry their content in
        # `text` on the copy rather than as a separate TEXT event.  The
        # original node is left untouched, so the same tree can be pulled
        # more than once.
        text_on_copy = elem.tag in (Comment, ProcessingInstruction)
        if text_on_copy:
            elem.text = orig.text
        yield (START, elem)
        if not text_on_copy and orig.text:
            yield (TEXT, orig.text)
        for child in orig.getchildren():
            for event in self._pull(child, tail=1):
                yield event
        yield (END, elem)
        if tail and orig.tail:
            yield (TEXT, orig.tail)

    def _track(self, stream, current=None):
        """Pass events through while keeping `self.current` up to date.

        If current is given, a (START, current) event is yielded ahead of
        everything read from stream.
        """
        if current is not None:
            self.current = current
            yield (START, current)
        for p in stream:
            ev, item = p
            if ev == START:
                self.current = item
            elif ev == END:
                self.current = None
            yield (ev, item)

    def ensure(cls, stream, current=None):
        """Return stream itself if it already is an instance of cls,
        otherwise wrap it as cls(stream, current)."""
        if isinstance(stream, cls):
            return stream
        else:
            return cls(stream, current)
    ensure = classmethod(ensure)
0182
0183
0184def to_unicode(value, encoding):
0185 if isinstance(value, unicode):
0186 return value
0187
0188 if hasattr(value, '__unicode__'):
0189 return unicode(value)
0190
0191 if not isinstance(value, str):
0192 value = str(value)
0193
0194 return unicode(value, encoding)
0195
0196
def _coalesce(stream, encoding, extended=1):
    """Coalesces TEXT events and namespace events.

    Runs of consecutive TEXT events are merged into one TEXT event holding
    a single unicode string; a run that merges to the empty string is
    dropped.  START_NS events are not passed on: each declaration is
    written into the attributes of the next START element as an 'xmlns' or
    'xmlns:<prefix>' attribute.  END_NS events are discarded.  All other
    events pass through unchanged.

    The 'encoding' attribute is for the source strings.  'extended' is
    accepted but not used by this function.
    """
    pending_text = []
    pending_ns = []
    # Mirrors element nesting.  It is consulted for nothing else, but pop()
    # raises IndexError once a stream has delivered too many END events.
    open_items = [None]

    for ev, item in stream:
        if ev == TEXT:
            pending_text.append(item)
            continue

        # Any other event ends the current run of text: flush it first.
        if pending_text:
            merged = u"".join(
                [to_unicode(piece, encoding) for piece in pending_text])
            pending_text = []
            if merged:
                yield (TEXT, merged)

        if ev == START_NS:
            prefix, uri = item
            pending_ns.append((prefix, uri))
            continue
        if ev == END_NS:
            continue

        if ev == START:
            attrib = item.attrib
            for prefix, uri in pending_ns:
                if prefix:
                    attrib['xmlns:%s' % prefix] = uri
                else:
                    attrib['xmlns'] = uri
            pending_ns = []
            open_items.append(item)
        elif ev == END:
            open_items.pop()

        yield ev, item

    # Text left over at the very end of the stream.
    if pending_text:
        merged = u"".join(
            [to_unicode(piece, encoding) for piece in pending_text])
        if merged:
            yield (TEXT, merged)
0249
0250
# Event type codes used as the first member of every (event, item) pair.

# Element start/end, character data, document type declaration and XML
# declaration: START=1, END=2, TEXT=3, DOCTYPE=4, XML_DECL=5.
START, END, TEXT, DOCTYPE, XML_DECL = range(1, 6)

# Namespace scope, processing instruction and comment events as queued by
# the expat handlers: START_NS=10, END_NS=11, PI=12, COMMENT=13.
START_NS, END_NS, PI, COMMENT = range(10, 14)
0262
def Parser(source, encoding=None):
    """Create the parser used to turn source into stream events.

    source   - passed unchanged to ExpatParser.
    encoding - accepted, but not forwarded to ExpatParser.
               NOTE(review): ExpatParser does take an encoding argument;
               whether dropping it here is deliberate cannot be told from
               this code - confirm before changing.

    Returns a new ExpatParser instance.
    """
    parser = ExpatParser(source)
    return parser
0265
0266
0267
0268
0269
0270
0271
0272
0273
0274
class ExpatParser(object):
    """Expat-backed parser that is iterated to obtain stream events.

    The expat callbacks append raw (event, data) pairs to an internal
    queue.  Iterating the parser reads the source in chunks, feeds them to
    expat and turns the queued pairs into (event, item) pairs in which
    START and END carry Element objects.

    Public attributes:
      entity       - entity name -> replacement text, consulted by the
                     default handler for '&name;' references.
      external_dtd - DTD text parsed when expat asks for the external
                     subset.
    """

    def __init__(self, source, encoding=None):
        """Set up the expat parser and its handlers.

        source   - an object with a read() method, or otherwise a path
                   that is opened here in binary mode.
        encoding - passed to expat.ParserCreate().
        """
        if not hasattr(source, 'read'):
            filename = source
            source = open(source, 'rb')
            # We opened it, so we are responsible for closing it again.
            owns_source = 1
        else:
            filename = '<string>'
            owns_source = 0
        self._filename = filename
        self._source = source
        self._owns_source = owns_source
        # "}" as namespace separator makes expat report qualified names as
        # 'uri}local'; __iter__ prepends the '{'.
        self._parser = parser = expat.ParserCreate(encoding, "}")
        self._queue = []

        parser.DefaultHandler = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        parser.ProcessingInstructionHandler = self._pi
        parser.CommentHandler = self._comment
        parser.StartNamespaceDeclHandler = self._start_ns
        parser.EndNamespaceDeclHandler = self._end_ns
        parser.XmlDeclHandler = self._xmldecl_handler
        parser.StartDoctypeDeclHandler = self._doctype_handler

        # Optional parser features: skipped if setting them raises
        # AttributeError.
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass

        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            # Attributes now arrive as a flat [name, value, ...] list.
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None

        self.entity = default_entity_map
        self.external_dtd = default_external_dtd

        # Have expat request an external DTD subset for every document and
        # serve it from _buildForeign.
        self._parser.SetParamEntityParsing(
            expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
        self._parser.ExternalEntityRefHandler = self._buildForeign
        self._parser.UseForeignDTD()

    def _buildForeign(self, context, base, systemId, publicId):
        """External entity handler: parse self.external_dtd as the subset.

        Returns 1 to tell expat that the entity was handled.
        """
        import StringIO
        parseableFile = StringIO.StringIO(self.external_dtd)
        original_parser = self._parser
        # Handlers look at self._parser (e.g. for error positions), so it
        # points at the sub-parser while that one is running ...
        self._parser = self._parser.ExternalEntityParserCreate(context)
        try:
            self._parser.ParseFile(parseableFile)
        finally:
            # ... and is always put back, even if the DTD fails to parse.
            self._parser = original_parser
        return 1

    def push(self, ev, stuff):
        """Append an (ev, stuff) pair to the event queue."""
        self._queue.append((ev, stuff))

    def _expat_stream(self):
        """Yield raw queued events, reading more input whenever the queue
        is empty, until the source is exhausted."""
        bufsize = 4 * 1024
        feed = self.feed
        read = self._source.read
        done = 0
        while 1:
            while not done and len(self._queue) == 0:
                data = read(bufsize)
                if not data:
                    try:
                        self.close()
                    finally:
                        # Release a file that __init__ opened itself.
                        if getattr(self, '_owns_source', 0):
                            self._source.close()
                    done = 1
                else:
                    feed(data)
            for i in self._queue:
                yield i
            self._queue = []
            if done:
                break

    def __iter__(self):
        """Yield (event, item) pairs built from the raw expat events.

        TEXT is passed through.  START creates an Element from the tag and
        attributes; END yields the matching Element again.  COMMENT and PI
        each become a START/END pair around a Comment or
        ProcessingInstruction.  Every other event is passed through
        unchanged.
        """
        names = {}

        old_ns = 'http://naeblis.cx/ns/kid#'
        new_ns = 'http://purl.org/kid/ns#'

        def fixname(key):
            # Map the old kid namespace onto the new one, then convert
            # expat's 'uri}local' form to '{uri}local'.  Results are cached
            # in `names`.
            if key.startswith(old_ns):
                key = ''.join([new_ns, key[len(old_ns):]])
            try:
                name = names[key]
            except KeyError:
                name = key
                if "}" in name:
                    name = "{" + name
                names[key] = name
            return name

        stack = []
        current = None
        for (ev, stuff) in self._expat_stream():
            if ev == TEXT:
                yield (TEXT, stuff)
            elif ev == START:
                tag, attrib_in = stuff
                tag = fixname(tag)
                attrib = {}
                if attrib_in:
                    for key, value in attrib_in.items():
                        attrib[fixname(key)] = value
                current = Element(tag, attrib)
                stack.append(current)
                yield (START, current)
            elif ev == END:
                current = stack.pop()
                assert fixname(stuff) == current.tag
                yield (END, current)
            elif ev == COMMENT:
                current = Comment(stuff)
                yield (START, current)
                yield (END, current)
            elif ev == PI:
                current = ProcessingInstruction(stuff[0], stuff[1])
                yield (START, current)
                yield (END, current)
            else:
                yield (ev, stuff)

    def feed(self, data):
        """Parse another chunk of input.

        Raises expat.ExpatError on malformed input.  The exception gets two
        extra attributes: `filename`, and `source`, the offending line of
        text or '???' when it is not known.
        """
        self._parse(data, 0)

    def close(self):
        """Tell expat that the input is complete and drop the parser.

        Does nothing if the parser has already been closed.  Errors are
        reported exactly as by feed().
        """
        if hasattr(self, '_parser'):
            try:
                self._parse('', 1)
            finally:
                del self._parser

    def _parse(self, data, final):
        """Run expat over data, annotating any ExpatError it raises."""
        try:
            self._parser.Parse(data, final)
        except expat.ExpatError:
            # Fetched via sys.exc_info() so that this module parses on
            # every Python version.
            import sys
            e = sys.exc_info()[1]
            e.filename = self._filename
            # Errors raised by our own handlers (see _default) are built by
            # hand and carry no lineno.
            e.source = self._error_source(getattr(e, 'lineno', None))
            # Bare raise keeps the original traceback.
            raise

    def _error_source(self, lineno):
        """Return line `lineno` (1-based) of the source text, or '???' if
        the text or the line is not available."""
        if lineno is None or not hasattr(self, '_sourcetext'):
            return '???'
        lines = self._sourcetext.split('\n')
        if 1 <= lineno <= len(lines):
            return lines[lineno - 1]
        return '???'

    def _start(self, tag, attrib_in):
        """Start-element handler for attributes given as a dict."""
        self._queue.append((START, (tag, attrib_in)))

    def _start_list(self, tag, attrib_in):
        """Start-element handler for attributes given as a flat
        [name, value, name, value, ...] list."""
        attrib = None
        if attrib_in:
            attrib = {}
            for i in range(0, len(attrib_in), 2):
                attrib[attrib_in[i]] = attrib_in[i+1]
        self._queue.append((START, (tag, attrib)))

    def _data(self, text):
        """Character data handler."""
        self._queue.append((TEXT, text))

    def _end(self, tag):
        """End-element handler."""
        self._queue.append((END, tag))

    def _default(self, text):
        """Default handler: resolve '&name;' references through
        self.entity and ignore everything else.

        Raises expat.error for a name that is not in self.entity.
        """
        prefix = text[:1]
        if prefix == "&":
            try:
                self._queue.append((TEXT, self.entity[text[1:-1]]))
            except KeyError:
                raise expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                     self._parser.ErrorColumnNumber)
                    )
        else:
            # Anything else that reaches the default handler is dropped.
            pass

    def _pi(self, target, data):
        """Processing instruction handler."""
        self._queue.append((PI, (target, data)))

    def _comment(self, text):
        """Comment handler."""
        self._queue.append((COMMENT, text))

    def _start_ns(self, prefix, uri):
        """Namespace declaration handler.

        The old kid namespace URI is replaced by the new one, with a
        warning.  A missing prefix is queued as ''.
        """
        if uri == 'http://naeblis.cx/ns/kid#':
            newuri = 'http://purl.org/kid/ns#'
            from warnings import warn
            warn('Document uses old kid namespace [%s] this should be changed'
                 ' to [%s].' % (uri, newuri))
            uri = newuri
        self._queue.append((START_NS, (prefix or '', uri)))

    def _end_ns(self, prefix):
        """End-of-namespace-scope handler; a missing prefix is queued
        as ''."""
        self._queue.append((END_NS, prefix or ''))

    def _xmldecl_handler(self, version, encoding, standalone):
        """XML declaration handler."""
        self._queue.append((XML_DECL, (version, encoding, standalone)))

    def _doctype_handler(self, name, sysid, pubid, has_internal_subset):
        """Doctype handler.  Note that the queued tuple is
        (name, pubid, sysid): public id first."""
        self._queue.append((DOCTYPE, (name, pubid, sysid)))
0481
0482
0483
0484
0485
# Names exported by `from <this module> import *`.
__all__ = [
    # element constructors
    'Element', 'SubElement', 'Comment', 'ProcessingInstruction',
    # stream and parser entry points
    'ElementStream', 'XML', 'document', 'Parser', 'ExpatParser',
    # event type codes
    'START', 'END', 'TEXT', 'COMMENT', 'PI', 'XML_DECL', 'DOCTYPE',
]