0001"""Infoset serialization formats (XML, XHTML, HTML, etc)"""
0002
0003from __future__ import generators
0004
0005__revision__ = "$Rev$"
0006__date__ = "$Date: 2005-02-16 15:43:38 -0500 (Wed, 16 Feb 2005) $"
0007__author__ = "Ryan Tomayko (rtomayko@gmail.com)"
0008__copyright__ = "Copyright 2004-2005, Ryan Tomayko"
0009__license__ = "MIT <http://www.opensource.org/licenses/mit-license.php>"
0010
0011import re
0012
0013from kid.et import *
0014from kid.pull import *
0015from kid.pull import _coalesce
0016
0017
0018import kid.namespace as namespace
0019
0020__all__ = ['doctypes', 'Serializer', 'XMLSerializer', 'HTMLSerializer']
0021
0022
0023
# Known DOCTYPE declarations, keyed by the short name accepted by the
# serializers' ``doctype`` argument.  Each value is a 3-tuple in the order
# consumed by the '<!DOCTYPE %s PUBLIC "%s" "%s">' template used by
# serialize_doctype(): (root element name, public id, system id).
doctypes = {
    'html-strict': (
        'HTML',
        '-//W3C//DTD HTML 4.01//EN',
        'http://www.w3.org/TR/html4/strict.dtd',
    ),
    'html': (
        'HTML',
        '-//W3C//DTD HTML 4.01 Transitional//EN',
        'http://www.w3.org/TR/html4/loose.dtd',
    ),
    'xhtml-strict': (
        'html',
        '-//W3C//DTD XHTML 1.0 Strict//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd',
    ),
    'xhtml': (
        'html',
        '-//W3C//DTD XHTML 1.0 Transitional//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd',
    ),
}
0033
0034
class Serializer(object):
    """Base class for objects that turn an event stream into output text.

    The generate() method here produces nothing; the concrete serializers
    in this module override it.  serialize() and write() are thin
    conveniences layered on top of generate().
    """

    # Default namespace map; the serializers in this module hand it to
    # NamespaceStack as its default map.
    namespaces = namespace.namespaces
    # Output encoding used when none is given explicitly.
    encoding = 'utf-8'
    # When true (and strip_whitespace is false) apply_filters() routes the
    # stream through balancing_filter().
    balanced_blocks = False
    # When true apply_filters() routes the stream through
    # whitespace_filter() instead.
    strip_whitespace = 0

    def __init__(self, encoding=None, src_encoding="utf-8"):
        """Remember the output encoding (if given) and the source encoding.

        `encoding` only overrides the class-level default when it is not
        None.  `src_encoding` is stored and later passed to _coalesce()
        by apply_filters().
        """
        self.src_encoding = src_encoding
        if encoding is not None:
            self.encoding = encoding

    def has_only_pcdata(self, tagname):
        """Tell whether `tagname` may contain character data only.

        The base implementation always answers False.
        """
        return False

    def serialize(self, stream, encoding=None, fragment=0):
        """Return everything generate() yields, joined into one string."""
        pieces = self.generate(stream, encoding, fragment)
        return ''.join(list(pieces))

    def write(self, stream, file, encoding=None, fragment=0):
        """Write everything generate() yields to `file`.

        `file` may be any object with a ``write`` attribute, in which case
        it is used as-is and left open.  Anything else is passed to open()
        in 'wb' mode and the resulting file is closed before returning,
        even if generation fails.
        """
        opened_here = not hasattr(file, 'write')
        if opened_here:
            file = open(file, 'wb')
        try:
            emit = file.write
            for chunk in self.generate(stream, encoding, fragment):
                emit(chunk)
        finally:
            # Only close what this method opened itself.
            if opened_here:
                file.close()

    def generate(self, stream, encoding=None, fragment=0):
        """Placeholder for subclasses; does nothing and returns None."""
        return None

    def apply_filters(self, stream):
        """Return `stream` wrapped in the filters selected on this object.

        The stream always goes through _coalesce() first.  Whitespace
        stripping takes precedence over block balancing; at most one of
        the two is applied.
        """
        filtered = _coalesce(stream, self.src_encoding)
        if self.strip_whitespace:
            return self.whitespace_filter(filtered)
        if self.balanced_blocks:
            return self.balancing_filter(filtered)
        return filtered

    def balancing_filter(self, stream):
        """Re-emit text relative to the tags that follow it.

        TEXT events are held back rather than passed through.  When a
        START or END event (other than for a Fragment) arrives, the held
        text is emitted first: whitespace-only text is emitted before
        every such tag, with runs of newlines collapsed to one, while
        other text is emitted only before the first tag that follows it.
        All remaining events pass through untouched.
        """
        collapse = re.compile('\n{2,}').sub
        held = ''
        tags_since_text = 0
        for ev, item in stream:
            if ev == TEXT:
                held = item
                tags_since_text = 0
            elif ev in (START, END) and item.tag != Fragment:
                if held:
                    if held.strip() == '':
                        # Blank text is reused ahead of every tag.
                        yield (TEXT, collapse('\n', held))
                    elif tags_since_text == 0:
                        # Real text is emitted once, before the first tag.
                        yield (TEXT, held)
                yield (ev, item)
                tags_since_text += 1
                # Do not reuse the held text after opening an element
                # that holds character data only, or a comment.
                if ev == START and (self.has_only_pcdata(item.tag)
                                    or item.tag == Comment):
                    held = ''
            else:
                yield (ev, item)

    def whitespace_filter(self, stream):
        """Strip leading and trailing whitespace from every TEXT event."""
        for ev, item in stream:
            if ev != TEXT:
                yield (ev, item)
                continue
            yield (TEXT, item.strip())
0113
class XMLSerializer(Serializer):
    """Serialize an event stream as namespace-qualified XML."""

    # Emit the '<?xml ...?>' declaration unless serializing a fragment.
    decl = 1
    # Either None, a key of the module-level ``doctypes`` table, or a
    # 3-tuple suitable for serialize_doctype().
    doctype = None
    # Not referenced anywhere in this class.
    cdata_elements = []

    def __init__(self, encoding=None, decl=None, doctype=None,
                 namespaces=None):
        """Configure the serializer.

        Each argument overrides the corresponding class-level default
        only when supplied (`namespaces` only when truthy).  A string
        doctype -- passed in or inherited from the class -- is looked up
        in the module-level ``doctypes`` table; an unknown name raises
        KeyError.
        """
        Serializer.__init__(self, encoding)
        if decl is not None:
            self.decl = decl
        if doctype is not None:
            self.doctype = doctype
        if isinstance(self.doctype, basestring):
            # Allow doctypes to be given by their short name.
            self.doctype = doctypes[self.doctype]
        if namespaces:
            self.namespaces = namespaces

    def can_be_empty_element(self, ns_stack, item_name):
        """Tell whether an element may be written as '<name />'.

        Plain XML allows this for every element, so the answer is always
        True; both arguments are ignored here.
        """
        return True

    def generate(self, stream, encoding=None, fragment=0):
        """Serializes an event stream to bytes of the specified encoding.

        This function yields an encoded string over and over until the
        stream is exhausted.

        Unless `fragment` is true, the XML declaration (if ``self.decl``)
        and the DOCTYPE (if ``self.doctype`` is set) are yielded first.
        """
        encoding = encoding or self.encoding or 'utf-8'
        escape_cdata = XMLSerializer.escape_cdata
        escape_attrib = XMLSerializer.escape_attrib

        lastev = None
        stream = iter(stream)
        names = NamespaceStack(self.namespaces)
        if not fragment:
            if self.decl:
                yield '<?xml version="1.0" encoding="%s"?>\n' % encoding
            if self.doctype is not None:
                yield serialize_doctype(self.doctype) + '\n'
        text = None
        for ev, item in self.apply_filters(stream):
            if ev in (START, END) and item.tag == Fragment:
                # Fragment wrappers produce no markup of their own.
                continue
            elif ev == TEXT:
                # Accumulate text until the next non-text event so that
                # the empty-element decision below sees all of it.
                if text is not None:
                    text = u''.join([text, item])
                else:
                    text = item
                continue
            if lastev == START:
                # The start tag opened by the previous event is still
                # missing its closing '>' (or ' />').
                if (ev == END and (not text or not text.strip())
                        and self.can_be_empty_element(names, item.tag)):
                    yield ' />'
                    lastev = END
                    text = None
                    names.pop()
                    continue
                yield ">"
            if text:
                yield escape_cdata(text, encoding)
                text = None
            if ev == START:
                if item.tag == Comment:
                    yield "<!--%s-->" % item.text.encode(encoding)
                    lastev = COMMENT
                    continue
                elif item.tag == ProcessingInstruction:
                    yield "<?%s?>" % item.text.encode(encoding)
                    lastev = PI
                    continue
                else:
                    tag = item.tag
                    names.push(namespaces(item, remove=1))
                    qname = names.qname(tag, default=1)
                    yield "<" + qname.encode(encoding)
                    for k, v in item.attrib.items():
                        qname = names.qname(k, default=0)
                        yield ' %s="%s"' % (qname.encode(encoding),
                                            escape_attrib(v, encoding))
                    # Declare every namespace bound in the current scope;
                    # the qname() calls above may have added bindings.
                    for prefix, uri in names.current.items():
                        if prefix == '':
                            yield ' xmlns="%s"' % escape_attrib(uri, encoding)
                        else:
                            yield ' xmlns:%s="%s"' % (
                                prefix.encode(encoding),
                                escape_attrib(uri, encoding))
            elif ev == END and item.tag not in (Comment,
                                                ProcessingInstruction):
                qname = names.qname(item.tag, default=1)
                yield "</%s>" % qname.encode(encoding)
                names.pop()
            lastev = ev

    def escape_cdata(text, encoding=None):
        """Escape character data.

        The text is encoded with `encoding` when one is given; if that
        fails with UnicodeError the result of encode_entity() is returned
        instead.  Otherwise '&' and '<' are replaced by '&amp;' and
        '&lt;'.  Input without the needed string methods is reported via
        raise_serialization_error().
        """
        try:
            if encoding:
                try:
                    text = text.encode(encoding)
                except UnicodeError:
                    return encode_entity(text)
            # '&' must be handled first so the entities introduced by the
            # later replacement are not escaped a second time.
            text = text.replace("&", "&amp;")
            text = text.replace("<", "&lt;")
            return text
        except (TypeError, AttributeError):
            raise_serialization_error(text)
    escape_cdata = staticmethod(escape_cdata)

    def escape_attrib(text, encoding=None):
        """Escape attribute value.

        Behaves like escape_cdata() and additionally replaces '"' with
        '&quot;', since attribute values are written between double
        quotes by generate().
        """
        try:
            if encoding:
                try:
                    text = text.encode(encoding)
                except UnicodeError:
                    return encode_entity(text)
            text = text.replace("&", "&amp;")
            text = text.replace("<", "&lt;")
            text = text.replace("\"", "&quot;")
            return text
        except (TypeError, AttributeError):
            raise_serialization_error(text)
    escape_attrib = staticmethod(escape_attrib)
0240
0241
0242
# Make sure a ``set`` name exists.  If the bare name lookup fails, fall back
# to ``sets.Set``; if that module cannot be imported either, define a
# pass-through that simply returns the sequence it was given.
try:
    set
except NameError:
    try:
        from sets import Set as set
    except ImportError:
        def set(seq):
            # Last-resort stand-in: hand the sequence back unchanged.
            return seq
0251
0252import kid.namespace as namespace
0253xhtml = namespace.xhtml.uri
0254import string
0255
class HTMLSerializer(Serializer):
    """Serialize an event stream as HTML markup."""

    doctype = doctypes['html']
    # Callable applied to every element and attribute name before it is
    # written; a falsy value disables the transformation.
    transpose = staticmethod(string.upper)
    # When true, a Content-Type <meta> is written right after <head>.
    inject_type = 1
    # Elements written without an end tag.
    empty_elements = set(['area', 'base', 'basefont', 'br', 'col', 'frame',
                          'hr', 'img', 'input', 'isindex', 'link', 'meta',
                          'param'])

    # Elements reported by has_only_pcdata().
    elements_with_pcdata = set(['option', 'textarea', 'fieldset', 'title'])
    # Elements whose text content is written without escaping.
    noescape_elements = set(['script', 'style'])
    # Attributes written in minimized form (name only, no value).
    boolean_attributes = set(['selected', 'checked', 'compact', 'declare',
                              'defer', 'disabled', 'ismap', 'multiple',
                              'nohref', 'noresize', 'noshade', 'nowrap'])

    def __init__(self, encoding='utf-8', doctype=None, transpose=None):
        """Configure the serializer.

        `doctype` and `transpose` override the class-level defaults only
        when truthy.  A string doctype -- passed in or inherited from the
        class -- is looked up in the module-level ``doctypes`` table; an
        unknown name raises KeyError.
        """
        Serializer.__init__(self, encoding)
        if doctype:
            self.doctype = doctype
        if isinstance(self.doctype, basestring):
            # Allow doctypes to be given by their short name.
            self.doctype = doctypes[self.doctype]
        if transpose:
            self.transpose = transpose

    def has_only_pcdata(self, tagname):
        """Tell whether `tagname` is listed in ``elements_with_pcdata``.

        A string of the form '{uri}local' is reduced to its local part
        before the lookup.
        """
        if isinstance(tagname, basestring) and tagname[0] == '{':
            tagname = tagname.split('}')[1]
        return tagname in self.elements_with_pcdata

    def generate(self, stream, encoding=None, fragment=0):
        """Serializes an event stream to bytes of the specified encoding.

        This function yields an encoded string over and over until the
        stream is exhausted.

        Unless `fragment` is true, the DOCTYPE is yielded first when
        ``self.doctype`` is set.
        """
        encoding = encoding or self.encoding or 'utf-8'

        escape_cdata = HTMLSerializer.escape_cdata
        escape_attrib = HTMLSerializer.escape_attrib
        noescape_elements = self.noescape_elements
        boolean_attributes = self.boolean_attributes
        empty_elements = self.empty_elements

        names = NamespaceStack(self.namespaces)

        def grok_name(tag):
            """Split `tag` into (uri, localname, name-as-written)."""
            if tag[0] == '{':
                uri, localname = tag[1:].split('}', 1)
            else:
                uri, localname = None, tag
            if uri and uri != xhtml:
                # Names outside the XHTML namespace keep a prefix.
                qname = names.qname(tag, default=0)
            else:
                qname = localname
            # Same truthiness test as used for the <meta> attribute names
            # below, so a falsy ``transpose`` consistently means "leave
            # names alone".
            if self.transpose:
                qname = self.transpose(qname)
            return (uri, localname, qname)

        attr_http_equiv = 'http-equiv'
        attr_content = 'content'
        if self.transpose:
            attr_http_equiv = self.transpose('http-equiv')
            attr_content = self.transpose('content')

        # `current` is the lower-cased name of the innermost open element
        # (None outside any element); `stack` mirrors the open elements.
        current = None
        stack = [current]
        stream = iter(stream)
        if not fragment and self.doctype is not None:
            yield serialize_doctype(self.doctype) + '\n'
        for ev, item in self.apply_filters(stream):
            if ev == TEXT and item:
                escape = current not in noescape_elements
                yield escape_cdata(item, encoding, escape)
            elif ev == START:
                if item.tag == Comment:
                    yield "<!--%s-->" % item.text.encode(encoding)
                    continue
                elif item.tag == ProcessingInstruction:
                    # Note: closed with '>' only, unlike the XML form.
                    yield "<?%s>" % item.text.encode(encoding)
                    continue
                elif item.tag == Fragment:
                    continue
                else:
                    names.push(namespaces(item, remove=1))
                    (uri, localname, qname) = grok_name(item.tag)

                    # Lower-case for the table lookups regardless of how
                    # the name is transposed for output.
                    current = qname.lower()
                    stack.append(current)

                    yield "<" + qname.encode(encoding)
                    for k, v in item.attrib.items():
                        (u, l, q) = grok_name(k)
                        lq = q.lower()
                        if lq == 'xml:lang':
                            continue
                        if lq in boolean_attributes:
                            # Minimized form: the value is not written.
                            yield ' %s' % q.encode(encoding)
                        else:
                            yield ' %s="%s"' % (q.encode(encoding),
                                                escape_attrib(v, encoding))
                    yield ">"
                    if self.inject_type and current == 'head':
                        (uri, localname, qname) = grok_name("meta")
                        yield ('<%s %s="text/html; charset=%s"'
                               ' %s="Content-Type">'
                               % (qname.encode(encoding),
                                  attr_content,
                                  encoding,
                                  attr_http_equiv))

            elif ev == END and item.tag not in (Comment,
                                                ProcessingInstruction,
                                                Fragment):
                current = stack.pop()
                if current not in empty_elements:
                    (uri, localname, qname) = grok_name(item.tag)
                    yield "</%s>" % qname.encode(encoding)
                current = stack[-1]
                names.pop()

    def escape_cdata(text, encoding=None, escape=1):
        """Escape character data.

        The text is encoded with `encoding` when one is given; if that
        fails with UnicodeError the result of encode_entity() is returned
        instead.  When `escape` is true, '&' and '<' are replaced by
        '&amp;' and '&lt;'.  Input without the needed string methods is
        reported via raise_serialization_error().
        """
        try:
            if encoding:
                try:
                    text = text.encode(encoding)
                except UnicodeError:
                    return encode_entity(text)
            if escape:
                # '&' first, so the '&lt;' introduced next stays intact.
                text = text.replace("&", "&amp;")
                text = text.replace("<", "&lt;")
            return text
        except (TypeError, AttributeError):
            raise_serialization_error(text)
    escape_cdata = staticmethod(escape_cdata)

    def escape_attrib(text, encoding=None):
        """Escape attribute value.

        Encodes like escape_cdata() and replaces '&' and '"' with
        '&amp;' and '&quot;'; '<' is left as-is.
        """
        try:
            if encoding:
                try:
                    text = text.encode(encoding)
                except UnicodeError:
                    return encode_entity(text)
            text = text.replace("&", "&amp;")
            text = text.replace("\"", "&quot;")
            return text
        except (TypeError, AttributeError):
            raise_serialization_error(text)
    escape_attrib = staticmethod(escape_attrib)
0420
class XHTMLSerializer(XMLSerializer):
    """XML serializer that honours the HTML element tables."""

    # The HTMLSerializer tables hold bare element names; each one is run
    # through namespace.xhtml.clarkname() so it can be compared with the
    # tag names handed to the two predicates below (presumably the
    # '{uri}localname' form -- confirm against kid.namespace).
    empty_elements = list(map(namespace.xhtml.clarkname,
                              HTMLSerializer.empty_elements))
    elements_with_pcdata = list(map(namespace.xhtml.clarkname,
                                    HTMLSerializer.elements_with_pcdata))

    def can_be_empty_element(self, ns_stack, tagname):
        """Allow the '<name />' form only for names in ``empty_elements``."""
        return tagname in self.empty_elements

    def has_only_pcdata(self, tagname):
        """Tell whether `tagname` is listed in ``elements_with_pcdata``."""
        return tagname in self.elements_with_pcdata
0430
class PlainSerializer(Serializer):
    """Serializer that keeps only the text of an event stream."""

    def generate(self, stream, encoding=None, fragment=0):
        """Yield the payload of each TEXT event after apply_filters().

        All other events are dropped.  `fragment` is accepted for
        signature compatibility and not used.
        """
        # The encoding is resolved the same way as in the other
        # serializers, but the text items are yielded as they are.
        encoding = encoding or self.encoding or 'utf-8'
        for event in self.apply_filters(stream):
            kind, payload = event
            if kind == TEXT:
                yield payload
0439
0440
class NamespaceStack:

    """Maintains a stack of namespace prefix to URI mappings."""

    def __init__(self, default_map=namespace.namespaces):
        """Start with a single empty scope.

        `default_map` is consulted by qname() with a URI as key to find a
        preferred prefix when no binding is in scope.
        """
        # Innermost scope lives at index 0.
        self.stack = []
        self.default_map = default_map
        self.push()
        # Counter behind the generated 'ns1', 'ns2', ... prefixes.
        self.ns_count = 0

    def push(self, names=None):
        """Open a new innermost scope holding `names` (prefix -> URI).

        A fresh empty dict is used when `names` is None.  The dict is
        stored by reference and may later be extended by qname().
        """
        if names is None:
            names = {}
        self.current = names
        self.stack.insert(0, self.current)

    def pop(self):
        """Close the innermost scope and return its mapping.

        ``self.current`` is moved to the next enclosing scope when one
        remains.  Raises IndexError if the stack is already empty.
        """
        ns = self.stack.pop(0)
        if self.stack:
            self.current = self.stack[0]
        return ns

    def resolve_prefix(self, uri, default=1):
        """Figure out prefix given a URI.

        Returns 'xml' for the XML namespace URI, '' when `default` is
        true and the first default-namespace binding found equals `uri`,
        a bound non-empty prefix if one was found, and None otherwise.
        """

        if uri == 'http://www.w3.org/XML/1998/namespace':
            return 'xml'

        # -1 means "no default namespace binding seen yet"; afterwards it
        # holds the result of comparing that binding with `uri`.
        is_default = -1
        prefix = None
        for names in self.stack:
            for k, v in names.items():  # (k, v) = (prefix, uri)
                if default and is_default == -1 and k == '':
                    # First default namespace binding encountered.
                    is_default = (v == uri)
                    if (default and is_default) or prefix:
                        break
                if v == uri and k != '':
                    prefix = k
                    if is_default > -1:
                        break
        if default and is_default == 1:
            return ''
        elif prefix:
            return prefix
        else:
            return None

    def resolve_uri(self, prefix):
        """Figure out URI given a prefix.

        Scopes are searched from the innermost outwards; None is returned
        when no scope binds `prefix` to a non-empty URI.
        """

        if prefix == 'xml':
            return 'http://www.w3.org/XML/1998/namespace'
        for names in self.stack:
            uri = names.get(prefix)
            if uri:
                return uri
        return None

    def qname(self, cname, default=0):
        """Return the 'prefix:local' (or bare 'local') form of `cname`.

        `cname` is a '{uri}local' string or a QName wrapping one; names
        without a leading '{' are returned unchanged.  When no prefix is
        in scope for the URI, a binding is added to the current scope:
        the prefix from ``default_map`` if it has one, else the default
        namespace when `default` is true and the current scope has no
        default binding yet, else a generated 'nsN' prefix.
        """
        if isinstance(cname, QName):
            cname = cname.text
        if cname[0] != '{':
            # No namespace part; nothing to qualify.
            return cname
        uri, localname = cname[1:].split('}', 1)
        prefix = self.resolve_prefix(uri, default)
        if prefix is None:
            # See if the default map suggests a prefix for this URI.
            prefix = self.default_map.get(uri)
            if prefix is not None:
                self.current[prefix] = uri
            elif default and '' not in self.current:
                prefix = ''
                self.current[prefix] = uri
            else:
                self.ns_count += 1
                # Generated prefixes are not checked for collisions.
                prefix = 'ns%d' % self.ns_count
                self.current[prefix] = uri
        if prefix != '':
            return '%s:%s' % (prefix, localname)
        else:
            return localname
0527
def serialize_doctype(doctype):
    """Format a (root name, public id, system id) tuple as a DOCTYPE."""
    template = '<!DOCTYPE %s PUBLIC "%s" "%s">'
    return template % doctype