# $Id: selftest.py 2193 2004-12-05 18:03:00Z fredrik $
# -*- coding: iso-8859-1 -*-
# elementtree selftest program
# this test script uses Python's "doctest" module to check that the
# *test script* works as expected.
# TODO: add more elementtree method tests
# TODO: add xml/html parsing tests
# TODO: etc
import re, sys, string, StringIO
from lxml import etree as ElementTree
from lxml import _elementpath as ElementPath
from lxml import ElementInclude
#from elementtree import ElementTree
#from elementtree import ElementPath
#from elementtree import ElementInclude
#from elementtree import HTMLTreeBuilder
#from elementtree import SimpleXMLWriter
def fix_compatibility(xml_data):
    """Strip XInclude namespace declarations from serialized XML text.

    Every ``xmlns:<prefix>="http://www.w3.org/2001/XInclude"`` declaration
    (lower-case alphanumeric prefix) is removed together with the
    whitespace in front of it.  serialize() filters its output through
    this function.

    xml_data -- serialized XML as a string
    Returns the string with those declarations removed.
    """
    # raw string: "\s" inside a plain literal is an invalid escape sequence
    pattern = r'\s*xmlns:[a-z0-9]+="http://www.w3.org/2001/XInclude"'
    return re.sub(pattern, '', xml_data)
def serialize(elem, encoding=None):
    """Return *elem* serialized to a string.

    The element is wrapped in an ElementTree and written to an in-memory
    StringIO buffer; the buffer contents are then passed through
    fix_compatibility().

    elem     -- element to serialize
    encoding -- forwarded to ElementTree.write() only when it is truthy
    """
    import StringIO
    buf = StringIO.StringIO()
    write_args = [buf]
    if encoding:
        # leave the writer's own default in place unless asked otherwise
        write_args.append(encoding)
    ElementTree.ElementTree(elem).write(*write_args)
    return fix_compatibility(buf.getvalue())
def summarize(elem):
    """Return the ``tag`` attribute of *elem*."""
    tag = elem.tag
    return tag
def summarize_list(seq):
    """Return a list holding summarize(item) for every item of *seq*, in order."""
    return [summarize(item) for item in seq]
def normalize_crlf(tree):
    """Replace CRLF with LF in the text and tail of every element of *tree*.

    tree -- object whose getiterator() yields elements with ``text`` and
            ``tail`` attributes; the elements are modified in place
    Returns None.
    """
    for elem in tree.getiterator():
        # empty or missing text/tail is left exactly as it was
        if elem.text:
            elem.text = elem.text.replace("\r\n", "\n")
        if elem.tail:
            elem.tail = elem.tail.replace("\r\n", "\n")
# sample document used by the find() and bug_xmltoolkit25() doctests.
# Those doctests require: two top-level <tag> children (the first with the
# text "text"), followed by a <section> that wraps a single <tag> with the
# text "subtext".
# NOTE(review): the root element name is not checked by any doctest in this
# file; "body" is used here -- confirm against the upstream fixture.
SAMPLE_XML = ElementTree.XML("""
<body>
  <tag>text</tag>
  <tag />
  <section>
    <tag>subtext</tag>
  </section>
</body>
""")
#
# interface tests
def check_string(string):
    """Check that *string* supports the basic string interface.

    Exercises len(), iteration, concatenation with a str and slicing; an
    object that lacks one of these operations raises from the operation
    itself.  For every iterated item whose length is not 1 a diagnostic
    line is printed.

    Returns None.
    """
    len(string)
    for char in string:
        if len(char) != 1:
            # single-argument form prints identically on Python 2 and 3
            print("expected one-character string, got %r" % char)
    # the results are discarded on purpose: only the operations are tested
    string + ""
    string + " "
    string[:0]
def check_string_or_none(value):
    """Run check_string() on *value* unless it is None.

    Returns whatever check_string() returns, or None when *value* is None.
    """
    if value is not None:
        return check_string(value)
def check_mapping(mapping):
    """Check that *mapping* supports the basic mapping interface.

    Exercises len(), keys(), items(), item lookup for every key, and item
    assignment; an object that lacks one of these raises from the
    operation itself.  Prints a diagnostic if the value stored under
    "key" does not read back as "value".

    NOTE: the mapping is modified -- it is left with "key" set to "value".

    Returns None.
    """
    len(mapping)
    keys = mapping.keys()
    mapping.items()
    for key in keys:
        # lookup result is discarded: only the operation is tested
        mapping[key]
    mapping["key"] = "value"
    if mapping["key"] != "value":
        # single-argument form prints identically on Python 2 and 3
        print("expected value string, got %r" % mapping["key"])
def check_element(element):
    """Check that *element* and all its descendants look like elements.

    Prints a line for each of the members ``tag``, ``attrib``, ``text``
    and ``tail`` that is missing, then runs check_string() on the tag,
    check_mapping() on the attributes and check_string_or_none() on text
    and tail, and finally recurses into every child obtained by iterating
    over the element.

    NOTE: a missing member is only reported, not skipped; the attribute
    access that follows still raises for it.
    """
    for member in ("tag", "attrib", "text", "tail"):
        if not hasattr(element, member):
            # single-argument form prints identically on Python 2 and 3
            print("no %s member" % member)
    check_string(element.tag)
    check_mapping(element.attrib)
    check_string_or_none(element.text)
    check_string_or_none(element.tail)
    for child in element:
        check_element(child)
def check_element_tree(tree):
    """Run check_element() on the root element returned by tree.getroot()."""
    root = tree.getroot()
    check_element(root)
# --------------------------------------------------------------------
# element tree tests
## def sanity():
## """
## >>> from elementtree.ElementTree import *
## >>> from elementtree.ElementInclude import *
## >>> from elementtree.ElementPath import *
## >>> from elementtree.HTMLTreeBuilder import *
## >>> from elementtree.SimpleXMLTreeBuilder import *
## >>> from elementtree.SimpleXMLWriter import *
## >>> from elementtree.TidyHTMLTreeBuilder import *
## >>> from elementtree.TidyTools import *
## >>> from elementtree.XMLTreeBuilder import *
## """
# doctest container (the function has no body of its own): builds a bare
# Element and an ElementTree around it and runs the check_element /
# check_element_tree interface checks on them, expecting no output
def interface():
    """
    Test element tree interface.
    >>> element = ElementTree.Element("tag")
    >>> check_element(element)
    >>> tree = ElementTree.ElementTree(element)
    >>> check_element_tree(tree)
    """
## def simplefind():
## """
## Test find methods using the elementpath fallback.
## >>> CurrentElementPath = ElementTree.ElementPath
## >>> ElementTree.ElementPath = ElementTree._SimpleElementPath()
## >>> elem = SAMPLE_XML
## >>> elem.find("tag").tag
## 'tag'
## >>> ElementTree.ElementTree(elem).find("tag").tag
## 'tag'
## >>> elem.findtext("tag")
## 'text'
## >>> elem.findtext("tog")
## >>> elem.findtext("tog", "default")
## 'default'
## >>> ElementTree.ElementTree(elem).findtext("tag")
## 'text'
## >>> summarize_list(elem.findall("tag"))
## ['tag', 'tag']
## >>> summarize_list(elem.findall(".//tag"))
## ['tag', 'tag', 'tag']
## Path syntax doesn't work in this case.
## >>> elem.find("section/tag")
## >>> elem.findtext("section/tag")
## >>> elem.findall("section/tag")
## []
## >>> ElementTree.ElementPath = CurrentElementPath
## """
# doctest container: runs find(), findtext() and findall() against
# SAMPLE_XML, both directly on the element and through an ElementTree
# wrapper, with a range of path expressions
def find():
    """
    Test find methods (including xpath syntax).
    >>> elem = SAMPLE_XML
    >>> elem.find("tag").tag
    'tag'
    >>> ElementTree.ElementTree(elem).find("tag").tag
    'tag'
    >>> elem.find("section/tag").tag
    'tag'
    >>> ElementTree.ElementTree(elem).find("section/tag").tag
    'tag'
    >>> elem.findtext("tag")
    'text'
    >>> elem.findtext("tog")
    >>> elem.findtext("tog", "default")
    'default'
    >>> ElementTree.ElementTree(elem).findtext("tag")
    'text'
    >>> elem.findtext("section/tag")
    'subtext'
    >>> ElementTree.ElementTree(elem).findtext("section/tag")
    'subtext'
    >>> summarize_list(elem.findall("tag"))
    ['tag', 'tag']
    >>> summarize_list(elem.findall("*"))
    ['tag', 'tag', 'section']
    >>> summarize_list(elem.findall(".//tag"))
    ['tag', 'tag', 'tag']
    >>> summarize_list(elem.findall("section/tag"))
    ['tag']
    >>> summarize_list(elem.findall("section//tag"))
    ['tag']
    >>> summarize_list(elem.findall("section/*"))
    ['tag']
    >>> summarize_list(elem.findall("section//*"))
    ['tag']
    >>> summarize_list(elem.findall("section/.//*"))
    ['tag']
    >>> summarize_list(elem.findall("*/*"))
    ['tag']
    >>> summarize_list(elem.findall("*//*"))
    ['tag']
    >>> summarize_list(elem.findall("*/tag"))
    ['tag']
    >>> summarize_list(elem.findall("*/./tag"))
    ['tag']
    >>> summarize_list(elem.findall("./tag"))
    ['tag', 'tag']
    >>> summarize_list(elem.findall(".//tag"))
    ['tag', 'tag', 'tag']
    >>> summarize_list(elem.findall("././tag"))
    ['tag', 'tag']
    >>> summarize_list(ElementTree.ElementTree(elem).findall("/tag"))
    ['tag', 'tag']
    >>> summarize_list(ElementTree.ElementTree(elem).findall("./tag"))
    ['tag', 'tag']
    """
# doctest container: each findall() call below uses a path expression that
# is expected to be rejected with a SyntaxError carrying the shown message
def bad_find():
    """
    Check bad or unsupported path expressions.
    >>> elem = SAMPLE_XML
    >>> elem.findall("/tag")
    Traceback (most recent call last):
    SyntaxError: cannot use absolute path on element
    >>> elem.findall("../tag")
    Traceback (most recent call last):
    SyntaxError: unsupported path syntax (..)
    >>> elem.findall("section//")
    Traceback (most recent call last):
    SyntaxError: path cannot end with //
    >>> elem.findall("tag[tag]")
    Traceback (most recent call last):
    SyntaxError: expected path separator ([)
    """
# doctest container: parses samples/simple.xml and samples/simple-ns.xml
# (paths relative to the working directory), normalizes line endings with
# normalize_crlf() and writes each tree to stdout
# NOTE(review): the expected output below shows no element markup -- confirm
# it matches what tree.write() emits for the sample files
def parsefile():
    """
    Test parsing from file.
    >>> tree = ElementTree.parse("samples/simple.xml")
    >>> normalize_crlf(tree)
    >>> tree.write(sys.stdout)
    text
    texttail
    >>> tree = ElementTree.parse("samples/simple-ns.xml")
    >>> normalize_crlf(tree)
    >>> tree.write(sys.stdout)
    text
    texttail
    """
## def parsehtml():
## """
## Test HTML parsing.
## >>> p = HTMLTreeBuilder.TreeBuilder()
## >>> p.feed("spamegg")
## >>> serialize(p.close())
## 'spamegg'
## """
## def parseliteral():
## r"""
## >>> element = ElementTree.XML("text")
## >>> ElementTree.ElementTree(element).write(sys.stdout)
## text
## >>> element = ElementTree.fromstring("text")
## >>> ElementTree.ElementTree(element).write(sys.stdout)
## text
## >>> print ElementTree.tostring(element)
## text
## >>> print ElementTree.tostring(element, "ascii")
##
## text
## >>> _, ids = ElementTree.XMLID("text")
## >>> len(ids)
## 0
## >>> _, ids = ElementTree.XMLID("text")
## >>> len(ids)
## 1
## >>> ids["body"].tag
## 'body'
## """
## def simpleparsefile():
## """
## Test the xmllib-based parser.
## >>> from elementtree import SimpleXMLTreeBuilder
## >>> parser = SimpleXMLTreeBuilder.TreeBuilder()
## >>> tree = ElementTree.parse("samples/simple.xml", parser)
## >>> normalize_crlf(tree)
## >>> tree.write(sys.stdout)
##
## text
## texttail
##
##
## """
# doctest container: drives ElementTree.iterparse over the two sample files
# with the default events, with an empty event tuple (positional and
# keyword), with ("start", "end"), and with the namespace events added
def iterparse():
    """
    Test iterparse interface.
    >>> iterparse = ElementTree.iterparse
    >>> context = iterparse("samples/simple.xml")
    >>> for action, elem in context:
    ...     print action, elem.tag
    end element
    end element
    end empty-element
    end root
    >>> context.root.tag
    'root'
    >>> context = iterparse("samples/simple-ns.xml")
    >>> for action, elem in context:
    ...     print action, elem.tag
    end {namespace}element
    end {namespace}element
    end {namespace}empty-element
    end {namespace}root
    >>> events = ()
    >>> context = iterparse("samples/simple.xml", events)
    >>> for action, elem in context:
    ...     print action, elem.tag
    >>> events = ()
    >>> context = iterparse("samples/simple.xml", events=events)
    >>> for action, elem in context:
    ...     print action, elem.tag
    >>> events = ("start", "end")
    >>> context = iterparse("samples/simple.xml", events)
    >>> for action, elem in context:
    ...     print action, elem.tag
    start root
    start element
    end element
    start element
    end element
    start empty-element
    end empty-element
    end root
    >>> events = ("start", "end", "start-ns", "end-ns")
    >>> context = iterparse("samples/simple-ns.xml", events)
    >>> for action, elem in context:
    ...     if action in ("start", "end"):
    ...         print action, elem.tag
    ...     else:
    ...         print action, elem
    start-ns ('', 'namespace')
    start {namespace}root
    start {namespace}element
    end {namespace}element
    start {namespace}element
    end {namespace}element
    start {namespace}empty-element
    end {namespace}empty-element
    end {namespace}root
    end-ns None
    """
## def fancyparsefile():
## """
## Test the "fancy" parser.
## Sanity check.
## >>> from elementtree import XMLTreeBuilder
## >>> parser = XMLTreeBuilder.FancyTreeBuilder()
## >>> tree = ElementTree.parse("samples/simple.xml", parser)
## >>> normalize_crlf(tree)
## >>> tree.write(sys.stdout)
##
## text
## texttail
##
##
## Callback check.
## >>> class MyFancyParser(XMLTreeBuilder.FancyTreeBuilder):
## ... def start(self, elem):
## ... print "START", elem.tag
## ... def end(self, elem):
## ... print "END", elem.tag
## >>> parser = MyFancyParser()
## >>> tree = ElementTree.parse("samples/simple.xml", parser)
## START root
## START element
## END element
## START element
## END element
## START empty-element
## END empty-element
## END root
## """
# doctest container: serializes an element built in code, first with only
# text and then with an added "subtag" child, through serialize()
# NOTE(review): the expected strings below contain no element markup --
# confirm them against what serialize() actually returns
def writefile():
    """
    >>> elem = ElementTree.Element("tag")
    >>> elem.text = "text"
    >>> serialize(elem)
    'text'
    >>> ElementTree.SubElement(elem, "subtag").text = "subtext"
    >>> serialize(elem)
    'textsubtext'
    """
# doctest container: round-trips a literal through ElementTree.XML() and
# ElementTree.fromstring() and back out through ElementTree.tostring()
# NOTE(review): the literal "text" passed to XML()/fromstring() has no
# element markup -- confirm the intended input document
def writestring():
    """
    >>> elem = ElementTree.XML("text")
    >>> ElementTree.tostring(elem)
    'text'
    >>> elem = ElementTree.fromstring("text")
    >>> ElementTree.tostring(elem)
    'text'
    """
## def encoding():
## r"""
## Test encoding issues.
## >>> elem = ElementTree.Element("tag")
## >>> elem.text = u"abc"
## >>> serialize(elem)
## 'abc'
## >>> serialize(elem, "utf-8")
## 'abc'
## >>> serialize(elem, "us-ascii")
## 'abc'
## >>> serialize(elem, "iso-8859-1")
## "\nabc"
## >>> elem.text = "<&\"\'>"
## >>> serialize(elem)
## '<&"\'>'
## >>> serialize(elem, "utf-8")
## '<&"\'>'
## >>> serialize(elem, "us-ascii") # cdata characters
## '<&"\'>'
## >>> serialize(elem, "iso-8859-1")
## '\n<&"\'>'
## >>> elem.attrib["key"] = "<&\"\'>"
## >>> elem.text = None
## >>> serialize(elem)
## ''
## >>> serialize(elem, "utf-8")
## ''
## >>> serialize(elem, "us-ascii")
## ''
## >>> serialize(elem, "iso-8859-1")
## '\n'
## >>> elem.text = u'\xe5\xf6\xf6<>'
## >>> elem.attrib.clear()
## >>> serialize(elem)
## 'åöö<>'
## >>> serialize(elem, "utf-8")
## '\xc3\xa5\xc3\xb6\xc3\xb6<>'
## >>> serialize(elem, "us-ascii")
## 'åöö<>'
## >>> serialize(elem, "iso-8859-1")
## "\n\xe5\xf6\xf6<>"
## >>> elem.attrib["key"] = u'\xe5\xf6\xf6<>'
## >>> elem.text = None
## >>> serialize(elem)
## ''
## >>> serialize(elem, "utf-8")
## ''
## >>> serialize(elem, "us-ascii")
## ''
## >>> serialize(elem, "iso-8859-1")
## '\n'
## """
# document text referenced by the entity() doctest, which is currently
# commented out
# NOTE(review): this does not look like a complete document (no DOCTYPE
# opening or root element is visible) -- confirm against the upstream fixture
ENTITY_XML = """\
%user-entities;
]>
&entity;
"""
## def entity():
## """
## Test entity handling.
## 1) bad entities
## >>> ElementTree.XML("&entity;")
## Traceback (most recent call last):
## ExpatError: undefined entity: line 1, column 10
## >>> ElementTree.XML(ENTITY_XML)
## Traceback (most recent call last):
## ExpatError: undefined entity &entity;: line 5, column 10
## (add more tests here)
## """
## def xmllang():
## """
## This appears to be a problem; in underlying libxml2?
## 1) xml namespace
## >>> elem = ElementTree.XML("")
## >>> serialize(elem) # 1.1
## ''
## """
# doctest container: parses three literals with ElementTree.XML() and
# checks their serialize() output (cases 2.1 - 2.3); section 3 has a
# heading but no examples yet
# NOTE(review): every XML() call below is given an empty string -- confirm
# the intended input documents and expected output
def namespace():
    """
    Test namespace issues.
    2) other "well-known" namespaces
    >>> elem = ElementTree.XML("")
    >>> serialize(elem) # 2.1
    ''
    >>> elem = ElementTree.XML("")
    >>> serialize(elem) # 2.2
    ''
    >>> elem = ElementTree.XML("")
    >>> serialize(elem) # 2.3
    ''

    3) unknown namespaces
    """
## def qname():
## """
## Test QName handling.
## 1) decorated tags
## >>> elem = ElementTree.Element("{uri}tag")
## >>> serialize(elem) # 1.1
## ''
## >>> elem = ElementTree.Element(ElementTree.QName("{uri}tag"))
## >>> serialize(elem) # 1.2
## ''
## >>> elem = ElementTree.Element(ElementTree.QName("uri", "tag"))
## >>> serialize(elem) # 1.3
## ''
## 2) decorated attributes
## >>> elem.clear()
## >>> elem.attrib["{uri}key"] = "value"
## >>> serialize(elem) # 2.1
## ''
## >>> elem.clear()
## >>> elem.attrib[ElementTree.QName("{uri}key")] = "value"
## >>> serialize(elem) # 2.2
## ''
## 3) decorated values are not converted by default, but the
## QName wrapper can be used for values
## >>> elem.clear()
## >>> elem.attrib["{uri}key"] = "{uri}value"
## >>> serialize(elem) # 3.1
## ''
## >>> elem.clear()
## >>> elem.attrib["{uri}key"] = ElementTree.QName("{uri}value")
## >>> serialize(elem) # 3.2
## ''
## >>> elem.clear()
## >>> subelem = ElementTree.Element("tag")
## >>> subelem.attrib["{uri1}key"] = ElementTree.QName("{uri2}value")
## >>> elem.append(subelem)
## >>> elem.append(subelem)
## >>> serialize(elem) # 3.3
## ''
## """
def xpath_tokenizer(p):
    """
    Test the XPath tokenizer.
    >>> # tests from the xml specification
    >>> xpath_tokenizer("*")
    ['*']
    >>> xpath_tokenizer("text()")
    ['text', '()']
    >>> xpath_tokenizer("@name")
    ['@', 'name']
    >>> xpath_tokenizer("@*")
    ['@', '*']
    >>> xpath_tokenizer("para[1]")
    ['para', '[', '1', ']']
    >>> xpath_tokenizer("para[last()]")
    ['para', '[', 'last', '()', ']']
    >>> xpath_tokenizer("*/para")
    ['*', '/', 'para']
    >>> xpath_tokenizer("/doc/chapter[5]/section[2]")
    ['/', 'doc', '/', 'chapter', '[', '5', ']', '/', 'section', '[', '2', ']']
    >>> xpath_tokenizer("chapter//para")
    ['chapter', '/', '/', 'para']
    >>> xpath_tokenizer("//para")
    ['/', '/', 'para']
    >>> xpath_tokenizer("//olist/item")
    ['/', '/', 'olist', '/', 'item']
    >>> xpath_tokenizer(".")
    ['.']
    >>> xpath_tokenizer(".//para")
    ['.', '/', '/', 'para']
    >>> xpath_tokenizer("..")
    ['..']
    >>> xpath_tokenizer("../@lang")
    ['..', '/', '@', 'lang']
    >>> xpath_tokenizer("chapter[title]")
    ['chapter', '[', 'title', ']']
    >>> xpath_tokenizer("employee[@secretary and @assistant]")
    ['employee', '[', '@', 'secretary', '', 'and', '', '@', 'assistant', ']']
    >>> # additional tests
    >>> xpath_tokenizer("{http://spam}egg")
    ['{http://spam}egg']
    >>> xpath_tokenizer("./spam.egg")
    ['.', '/', 'spam.egg']
    >>> xpath_tokenizer(".//{http://spam}egg")
    ['.', '/', '/', '{http://spam}egg']
    """
    # the tokenizer yields (op, tag) pairs; flatten each pair to the op when
    # it is truthy and to the tag otherwise
    return [op or tag for op, tag in ElementPath.xpath_tokenizer(p)]
#
# xinclude tests (samples from appendix C of the xinclude specification)
# XINCLUDE maps a resource name to the raw text of that resource;
# xinclude_loader() looks hrefs up here instead of reading files.
# NOTE(review): the entries below contain no element markup or include
# directives, and "data.xml" / "C5.xml" are empty -- confirm the fixtures
# against the upstream samples.
XINCLUDE = {}
XINCLUDE["C1.xml"] = """\
120 Mz is adequate for an average home user.
"""
XINCLUDE["disclaimer.xml"] = """\
The opinions represented herein represent those of the individual
and should not be interpreted as official policy endorsed by this
organization.
"""
XINCLUDE["C2.xml"] = """\
This document has been accessed
times.
"""
XINCLUDE["count.txt"] = "324387"
XINCLUDE["C3.xml"] = """\
The following is the source of the "data.xml" resource:
"""
XINCLUDE["data.xml"] = """\
"""
XINCLUDE["C5.xml"] = """\
"""
XINCLUDE["default.xml"] = """\
Example.
"""
def xinclude_loader(href, parse="xml", encoding=None):
    """Loader that serves include targets from the XINCLUDE table.

    href     -- key into XINCLUDE
    parse    -- "xml" to get the resource parsed with ElementTree.XML();
                any other value returns the stored text unchanged
    encoding -- accepted but not used

    Raises IOError("resource not found") when *href* is not in XINCLUDE.
    """
    try:
        payload = XINCLUDE[href]
    except KeyError:
        raise IOError("resource not found")
    else:
        return ElementTree.XML(payload) if parse == "xml" else payload
# doctest container: loads the C1, C2 and C3 samples through
# xinclude_loader(), expands them with ElementInclude.include() and prints
# the serialized result; the C5 fallback example is disabled with "##"
# NOTE(review): the expected output below shows no element markup -- confirm
# it against what serialize() actually returns
def xinclude():
    r"""
    Basic inclusion example (XInclude C.1)
    >>> document = xinclude_loader("C1.xml")
    >>> ElementInclude.include(document, xinclude_loader)
    >>> print serialize(document) # C1
    120 Mz is adequate for an average home user.
    The opinions represented herein represent those of the individual
    and should not be interpreted as official policy endorsed by this
    organization.

    Textual inclusion example (XInclude C.2)
    >>> document = xinclude_loader("C2.xml")
    >>> ElementInclude.include(document, xinclude_loader)
    >>> print serialize(document) # C2
    This document has been accessed
    324387 times.

    Textual inclusion of XML example (XInclude C.3)
    >>> document = xinclude_loader("C3.xml")
    >>> ElementInclude.include(document, xinclude_loader)
    >>> print serialize(document) # C3
    The following is the source of the "data.xml" resource:
    <?xml version='1.0'?>
    <data>
    <item><![CDATA[Brooks & Shields]]></item>
    </data>

    ## Fallback example (XInclude C.5)
    ## Note! Fallback support is not yet implemented
    ## >>> document = xinclude_loader("C5.xml")
    ## >>> ElementInclude.include(document, xinclude_loader)
    ## Traceback (most recent call last):
    ## IOError: resource not found
    ## >>> # print serialize(document) # C5
    """
# doctest container: expands "default.xml" with ElementInclude.include()
# called without an explicit loader, then prints the serialized result
# NOTE(review): the expected output below shows no element markup -- confirm
# it against what serialize() actually returns
def xinclude_default():
    """
    >>> document = xinclude_loader("default.xml")
    >>> ElementInclude.include(document)
    >>> print serialize(document) # default
    Example.
    text
    texttail
    """
#
# xmlwriter
## def xmlwriter():
## r"""
## >>> file = StringIO.StringIO()
## >>> w = SimpleXMLWriter.XMLWriter(file)
## >>> html = w.start("html")
## >>> x = w.start("head")
## >>> w.element("title", "my document")
## >>> w.data("\n")
## >>> w.element("meta", name="hello", value="goodbye")
## >>> w.data("\n")
## >>> w.end()
## >>> x = w.start("body")
## >>> w.element("h1", "this is a heading")
## >>> w.data("\n")
## >>> w.element("p", u"this is a paragraph")
## >>> w.data("\n")
## >>> w.element("p", u"reserved characters: <&>")
## >>> w.data("\n")
## >>> w.element("p", u"detta är också ett stycke")
## >>> w.data("\n")
## >>> w.close(html)
## >>> print file.getvalue()
## my document
##
## this is a heading
## this is a paragraph
## reserved characters: <&>
## detta är också ett stycke
##
## """
# --------------------------------------------------------------------
# reported bugs
## def bug_xmltoolkit21():
## """
## marshaller gives obscure errors for non-string values
## >>> elem = ElementTree.Element(123)
## >>> serialize(elem) # tag
## Traceback (most recent call last):
## TypeError: cannot serialize 123 (type int)
## >>> elem = ElementTree.Element("elem")
## >>> elem.text = 123
## >>> serialize(elem) # text
## Traceback (most recent call last):
## TypeError: cannot serialize 123 (type int)
## >>> elem = ElementTree.Element("elem")
## >>> elem.tail = 123
## >>> serialize(elem) # tail
## Traceback (most recent call last):
## TypeError: cannot serialize 123 (type int)
## >>> elem = ElementTree.Element("elem")
## >>> elem.set(123, "123")
## >>> serialize(elem) # attribute key
## Traceback (most recent call last):
## TypeError: cannot serialize 123 (type int)
## >>> elem = ElementTree.Element("elem")
## >>> elem.set("123", 123)
## >>> serialize(elem) # attribute value
## Traceback (most recent call last):
## TypeError: cannot serialize 123 (type int)
## """
# regression doctest: findtext() called on an ElementTree wrapping
# SAMPLE_XML, with a plain tag name and with a two-step path
def bug_xmltoolkit25():
    """
    typo in ElementTree.findtext
    >>> tree = ElementTree.ElementTree(SAMPLE_XML)
    >>> tree.findtext("tag")
    'text'
    >>> tree.findtext("section/tag")
    'subtext'
    """
# regression doctest: findall() with ".//" paths must return an empty list
# for a tag that is absent and the matching element for one that is present
# NOTE(review): XML() is given an empty string below, yet a 'tbody' match is
# expected -- confirm the intended input document
def bug_xmltoolkit28():
    """
    .//tag causes exceptions
    >>> tree = ElementTree.XML("")
    >>> summarize_list(tree.findall(".//thead"))
    []
    >>> summarize_list(tree.findall(".//tbody"))
    ['tbody']
    """
## def bug_xmltoolkitX1():
## """
## dump() doesn't flush the output buffer
## >>> tree = ElementTree.XML("")
## >>> ElementTree.dump(tree); sys.stdout.write("tail")
##
## tail
## """
## def bug_xmltoolkit39():
## """
## non-ascii element and attribute names doesn't work
## >>> tree = ElementTree.XML("")
## >>> ElementTree.tostring(tree, "utf-8")
## ''
## >>> tree = ElementTree.XML("")
## >>> tree.attrib
## {u'\\xe4ttr': u'v\\xe4lue'}
## >>> ElementTree.tostring(tree, "utf-8")
## ''
## >>> tree = ElementTree.XML("text")
## >>> ElementTree.tostring(tree, "utf-8")
## 'text'
## >>> tree = ElementTree.Element(u"täg")
## >>> ElementTree.tostring(tree, "utf-8")
## ''
## >>> tree = ElementTree.Element("tag")
## >>> tree.set(u"ättr", u"välue")
## >>> ElementTree.tostring(tree, "utf-8")
## ''
## """
## def bug_xmltoolkit45():
## """
## problems parsing mixed unicode/non-ascii html documents
## latin-1 text
## >>> p = HTMLTreeBuilder.TreeBuilder()
## >>> p.feed("välue")
## >>> serialize(p.close())
## 'välue'
## utf-8 text
## >>> p = HTMLTreeBuilder.TreeBuilder(encoding="utf-8")
## >>> p.feed("v\xc3\xa4lue")
## >>> serialize(p.close())
## 'välue'
## utf-8 text using meta tag
## >>> p = HTMLTreeBuilder.TreeBuilder()
## >>> p.feed("v\xc3\xa4lue")
## >>> serialize(p.close().find("p"))
## 'välue'
## latin-1 character references
## >>> p = HTMLTreeBuilder.TreeBuilder()
## >>> p.feed("välue")
## >>> serialize(p.close())
## 'välue'
## latin-1 character entities
## >>> p = HTMLTreeBuilder.TreeBuilder()
## >>> p.feed("välue")
## >>> serialize(p.close())
## 'välue'
## mixed latin-1 text and unicode entities
## >>> p = HTMLTreeBuilder.TreeBuilder()
## >>> p.feed("”välue”")
## >>> serialize(p.close())
## '”välue”'
## mixed unicode and latin-1 entities
## >>> p = HTMLTreeBuilder.TreeBuilder()
## >>> p.feed("”välue”")
## >>> serialize(p.close())
## '”välue”'
## """
# --------------------------------------------------------------------
if __name__ == "__main__":
    # run every doctest in this module and report how many passed;
    # assumes this file is importable under the module name "selftest"
    import doctest
    import selftest
    failed, tested = doctest.testmod(selftest)
    # single-argument form prints identically on Python 2 and 3
    print("%d tests ok." % (tested - failed))