# $Id: selftest.py 2193 2004-12-05 18:03:00Z fredrik $
# -*- coding: iso-8859-1 -*-
# elementtree selftest program

# this test script uses Python's "doctest" module to check that the
# *test script* works as expected.

# TODO: add more elementtree method tests
# TODO: add xml/html parsing tests
# TODO: etc

import re
import sys
import string
import StringIO

from lxml import etree as ElementTree
from lxml import _elementpath as ElementPath
from lxml import ElementInclude

#from elementtree import ElementTree
#from elementtree import ElementPath
#from elementtree import ElementInclude
#from elementtree import HTMLTreeBuilder
#from elementtree import SimpleXMLWriter


def fix_compatibility(xml_data):
    """Remove XInclude namespace declarations from serialized XML.

    Every attribute of the form xmlns:<prefix>="http://www.w3.org/2001/XInclude"
    (prefix made of lowercase letters/digits), together with the whitespace
    in front of it, is deleted.  The rest of the string is returned unchanged.
    """
    # Raw string so that "\s" reaches the regex engine as written.
    return re.sub(
        r'\s*xmlns:[a-z0-9]+="http://www.w3.org/2001/XInclude"', '', xml_data)


def serialize(elem, encoding=None):
    """Serialize elem to a string via ElementTree.write().

    elem is wrapped in an ElementTree and written to an in-memory buffer;
    encoding is forwarded to write() only when it is truthy.  The result is
    passed through fix_compatibility() before being returned.
    """
    buf = StringIO.StringIO()
    tree = ElementTree.ElementTree(elem)
    if encoding:
        tree.write(buf, encoding)
    else:
        tree.write(buf)
    return fix_compatibility(buf.getvalue())


def summarize(elem):
    """Return the tag of elem."""
    return elem.tag


def summarize_list(seq):
    """Return the list of tags for the elements in seq."""
    return [summarize(elem) for elem in seq]


def normalize_crlf(tree):
    """Replace CR LF by LF in the text and tail of every element in tree.

    The tree is modified in place; empty or missing text/tail is left alone.
    """
    for elem in tree.getiterator():
        if elem.text:
            elem.text = elem.text.replace("\r\n", "\n")
        if elem.tail:
            elem.tail = elem.tail.replace("\r\n", "\n")


# NOTE(review): the markup of this literal was stripped from the file (only
# the words "text" and "subtext" survived).  It has been rebuilt from what
# the find() doctests require: two "tag" children (the first with text
# "text") followed by a "section" holding one "tag" with text "subtext".
# The root element name "body" is an assumption -- confirm against the
# upstream selftest.
SAMPLE_XML = ElementTree.XML("""
<body>
  <tag>text</tag>
  <tag />
  <section>
    <tag>subtext</tag>
  </section>
</body>
""")

#
# interface tests

def check_string(string):
    """Exercise the string operations an element tag/text must support.

    Nothing is returned; an object that lacks one of the operations raises,
    and a diagnostic is printed if iteration yields something that is not a
    single character.
    """
    len(string)
    for char in string:
        if len(char) != 1:
            # Parenthesised single argument: prints the same under Python 2.
            print("expected one-character string, got %r" % char)
    # The results are irrelevant; only that the operations work matters.
    string + ""
    string + " "
    string[:0]


def check_string_or_none(value):
    """Like check_string(), but accept None without further checks."""
    if value is None:
        return
    return check_string(value)


def check_mapping(mapping):
    """Exercise the mapping operations an attribute dictionary must support.

    Note that this stores the item "key" -> "value" in mapping and leaves it
    there.  A diagnostic is printed if the stored value does not read back.
    """
    len(mapping)
    keys = mapping.keys()
    mapping.items()
    for key in keys:
        mapping[key]
    mapping["key"] = "value"
    if mapping["key"] != "value":
        print("expected value string, got %r" % mapping["key"])


def check_element(element):
    """Check that element, and recursively its children, look like Elements.

    Missing tag/attrib/text/tail members are reported on stdout; the member
    values are then run through the string and mapping checks above.
    """
    if not hasattr(element, "tag"):
        print("no tag member")
    if not hasattr(element, "attrib"):
        print("no attrib member")
    if not hasattr(element, "text"):
        print("no text member")
    if not hasattr(element, "tail"):
        print("no tail member")
    check_string(element.tag)
    check_mapping(element.attrib)
    check_string_or_none(element.text)
    check_string_or_none(element.tail)
    for elem in element:
        check_element(elem)


def check_element_tree(tree):
    """Run check_element() on the root element of tree."""
    check_element(tree.getroot())

# --------------------------------------------------------------------
# element tree tests

## def sanity():
##     """
##     >>> from elementtree.ElementTree import *
##     >>> from elementtree.ElementInclude import *
##     >>> from elementtree.ElementPath import *
##     >>> from elementtree.HTMLTreeBuilder import *
##     >>> from elementtree.SimpleXMLTreeBuilder import *
##     >>> from elementtree.SimpleXMLWriter import *
##     >>> from elementtree.TidyHTMLTreeBuilder import *
##     >>> from elementtree.TidyTools import *
##     >>> from elementtree.XMLTreeBuilder import *
##     """

def interface():
    """
    Test element tree interface.

    >>> element = ElementTree.Element("tag")
    >>> check_element(element)
    >>> tree = ElementTree.ElementTree(element)
    >>> check_element_tree(tree)
    """

## def simplefind():
##     """
##     Test find methods using the elementpath fallback.
##     >>> CurrentElementPath = ElementTree.ElementPath
##     >>> ElementTree.ElementPath = ElementTree._SimpleElementPath()
##     >>> elem = SAMPLE_XML
##     >>> elem.find("tag").tag
##     'tag'
##     >>> ElementTree.ElementTree(elem).find("tag").tag
##     'tag'
##     >>> elem.findtext("tag")
##     'text'
##     >>> elem.findtext("tog")
##     >>> elem.findtext("tog", "default")
##     'default'
##     >>> ElementTree.ElementTree(elem).findtext("tag")
##     'text'
##     >>> summarize_list(elem.findall("tag"))
##     ['tag', 'tag']
##     >>> summarize_list(elem.findall(".//tag"))
##     ['tag', 'tag', 'tag']
##
##     Path syntax doesn't work in this case.
##
##     >>> elem.find("section/tag")
##     >>> elem.findtext("section/tag")
##     >>> elem.findall("section/tag")
##     []
##
##     >>> ElementTree.ElementPath = CurrentElementPath
##
##     """

def find():
    """
    Test find methods (including xpath syntax).

    >>> elem = SAMPLE_XML
    >>> elem.find("tag").tag
    'tag'
    >>> ElementTree.ElementTree(elem).find("tag").tag
    'tag'
    >>> elem.find("section/tag").tag
    'tag'
    >>> ElementTree.ElementTree(elem).find("section/tag").tag
    'tag'
    >>> elem.findtext("tag")
    'text'
    >>> elem.findtext("tog")
    >>> elem.findtext("tog", "default")
    'default'
    >>> ElementTree.ElementTree(elem).findtext("tag")
    'text'
    >>> elem.findtext("section/tag")
    'subtext'
    >>> ElementTree.ElementTree(elem).findtext("section/tag")
    'subtext'
    >>> summarize_list(elem.findall("tag"))
    ['tag', 'tag']
    >>> summarize_list(elem.findall("*"))
    ['tag', 'tag', 'section']
    >>> summarize_list(elem.findall(".//tag"))
    ['tag', 'tag', 'tag']
    >>> summarize_list(elem.findall("section/tag"))
    ['tag']
    >>> summarize_list(elem.findall("section//tag"))
    ['tag']
    >>> summarize_list(elem.findall("section/*"))
    ['tag']
    >>> summarize_list(elem.findall("section//*"))
    ['tag']
    >>> summarize_list(elem.findall("section/.//*"))
    ['tag']
    >>> summarize_list(elem.findall("*/*"))
    ['tag']
    >>> summarize_list(elem.findall("*//*"))
    ['tag']
    >>> summarize_list(elem.findall("*/tag"))
    ['tag']
    >>> summarize_list(elem.findall("*/./tag"))
    ['tag']
    >>> summarize_list(elem.findall("./tag"))
    ['tag', 'tag']
    >>> summarize_list(elem.findall(".//tag"))
    ['tag', 'tag', 'tag']
    >>> summarize_list(elem.findall("././tag"))
    ['tag', 'tag']
    >>> summarize_list(ElementTree.ElementTree(elem).findall("/tag"))
    ['tag', 'tag']
    >>> summarize_list(ElementTree.ElementTree(elem).findall("./tag"))
    ['tag', 'tag']
    """

def bad_find():
    """
    Check bad or unsupported path expressions.

    >>> elem = SAMPLE_XML
    >>> elem.findall("/tag")
    Traceback (most recent call last):
    SyntaxError: cannot use absolute path on element
    >>> elem.findall("../tag")
    Traceback (most recent call last):
    SyntaxError: unsupported path syntax (..)
    >>> elem.findall("section//")
    Traceback (most recent call last):
    SyntaxError: path cannot end with //
    >>> elem.findall("tag[tag]")
    Traceback (most recent call last):
    SyntaxError: expected path separator ([)
    """

# NOTE(review): the expected output of both write() calls below lost its XML
# markup (only "text texttail" survived).  It depends on the contents of
# samples/simple.xml and samples/simple-ns.xml, which are not visible here,
# so it is reproduced as found and must be restored before this test can pass.
def parsefile():
    """
    Test parsing from file.

    >>> tree = ElementTree.parse("samples/simple.xml")
    >>> normalize_crlf(tree)
    >>> tree.write(sys.stdout)
    text texttail
    >>> tree = ElementTree.parse("samples/simple-ns.xml")
    >>> normalize_crlf(tree)
    >>> tree.write(sys.stdout)
    text texttail
    """

## def parsehtml():
##     """
##     Test HTML parsing.
##
##     >>> p = HTMLTreeBuilder.TreeBuilder()
##     >>> p.feed("

spamegg

") ## >>> serialize(p.close()) ## '

spamegg

##     '
##     """

## def parseliteral():
##     r"""
##
##     >>> element = ElementTree.XML("text")
##     >>> ElementTree.ElementTree(element).write(sys.stdout)
##     text
##     >>> element = ElementTree.fromstring("text")
##     >>> ElementTree.ElementTree(element).write(sys.stdout)
##     text
##     >>> print ElementTree.tostring(element)
##     text
##     >>> print ElementTree.tostring(element, "ascii")
##
##     text
##     >>> _, ids = ElementTree.XMLID("text")
##     >>> len(ids)
##     0
##     >>> _, ids = ElementTree.XMLID("text")
##     >>> len(ids)
##     1
##     >>> ids["body"].tag
##     'body'
##     """

## def simpleparsefile():
##     """
##     Test the xmllib-based parser.
##
##     >>> from elementtree import SimpleXMLTreeBuilder
##     >>> parser = SimpleXMLTreeBuilder.TreeBuilder()
##     >>> tree = ElementTree.parse("samples/simple.xml", parser)
##     >>> normalize_crlf(tree)
##     >>> tree.write(sys.stdout)
##
##     text
##     texttail
##
##
##     """

def iterparse():
    """
    Test iterparse interface.

    >>> iterparse = ElementTree.iterparse

    >>> context = iterparse("samples/simple.xml")
    >>> for action, elem in context:
    ...     print action, elem.tag
    end element
    end element
    end empty-element
    end root
    >>> context.root.tag
    'root'

    >>> context = iterparse("samples/simple-ns.xml")
    >>> for action, elem in context:
    ...     print action, elem.tag
    end {namespace}element
    end {namespace}element
    end {namespace}empty-element
    end {namespace}root

    >>> events = ()
    >>> context = iterparse("samples/simple.xml", events)
    >>> for action, elem in context:
    ...     print action, elem.tag

    >>> events = ()
    >>> context = iterparse("samples/simple.xml", events=events)
    >>> for action, elem in context:
    ...     print action, elem.tag

    >>> events = ("start", "end")
    >>> context = iterparse("samples/simple.xml", events)
    >>> for action, elem in context:
    ...     print action, elem.tag
    start root
    start element
    end element
    start element
    end element
    start empty-element
    end empty-element
    end root

    >>> events = ("start", "end", "start-ns", "end-ns")
    >>> context = iterparse("samples/simple-ns.xml", events)
    >>> for action, elem in context:
    ...     if action in ("start", "end"):
    ...         print action, elem.tag
    ...     else:
    ...         print action, elem
    start-ns ('', 'namespace')
    start {namespace}root
    start {namespace}element
    end {namespace}element
    start {namespace}element
    end {namespace}element
    start {namespace}empty-element
    end {namespace}empty-element
    end {namespace}root
    end-ns None

    """

## def fancyparsefile():
##     """
##     Test the "fancy" parser.
##
##     Sanity check.
##     >>> from elementtree import XMLTreeBuilder
##     >>> parser = XMLTreeBuilder.FancyTreeBuilder()
##     >>> tree = ElementTree.parse("samples/simple.xml", parser)
##     >>> normalize_crlf(tree)
##     >>> tree.write(sys.stdout)
##
##     text
##     texttail
##
##
##
##     Callback check.
##     >>> class MyFancyParser(XMLTreeBuilder.FancyTreeBuilder):
##     ...     def start(self, elem):
##     ...         print "START", elem.tag
##     ...     def end(self, elem):
##     ...         print "END", elem.tag
##     >>> parser = MyFancyParser()
##     >>> tree = ElementTree.parse("samples/simple.xml", parser)
##     START root
##     START element
##     END element
##     START element
##     END element
##     START empty-element
##     END empty-element
##     END root
##     """

# The expected strings below had their markup stripped ('text' and
# 'textsubtext' were all that remained).  They are restored here because the
# example code fully determines them: a "tag" element with text "text", then
# the same element with a "subtag" child whose text is "subtext".
def writefile():
    """
    >>> elem = ElementTree.Element("tag")
    >>> elem.text = "text"
    >>> serialize(elem)
    '<tag>text</tag>'
    >>> ElementTree.SubElement(elem, "subtag").text = "subtext"
    >>> serialize(elem)
    '<tag>text<subtag>subtext</subtag></tag>'
    """

# NOTE(review): the input documents below were reduced to the bare word
# "text", which is not parseable XML.  The html/body wrapper is an assumption
# (any simple element around "text" round-trips the same way) -- confirm
# against the upstream selftest.
def writestring():
    """
    >>> elem = ElementTree.XML("<html><body>text</body></html>")
    >>> ElementTree.tostring(elem)
    '<html><body>text</body></html>'
    >>> elem = ElementTree.fromstring("<html><body>text</body></html>")
    >>> ElementTree.tostring(elem)
    '<html><body>text</body></html>'
    """

## def encoding():
##     r"""
##     Test encoding issues.
## >>> elem = ElementTree.Element("tag") ## >>> elem.text = u"abc" ## >>> serialize(elem) ## 'abc' ## >>> serialize(elem, "utf-8") ## 'abc' ## >>> serialize(elem, "us-ascii") ## 'abc' ## >>> serialize(elem, "iso-8859-1") ## "\nabc" ## >>> elem.text = "<&\"\'>" ## >>> serialize(elem) ## '<&"\'>' ## >>> serialize(elem, "utf-8") ## '<&"\'>' ## >>> serialize(elem, "us-ascii") # cdata characters ## '<&"\'>' ## >>> serialize(elem, "iso-8859-1") ## '\n<&"\'>' ## >>> elem.attrib["key"] = "<&\"\'>" ## >>> elem.text = None ## >>> serialize(elem) ## '' ## >>> serialize(elem, "utf-8") ## '' ## >>> serialize(elem, "us-ascii") ## '' ## >>> serialize(elem, "iso-8859-1") ## '\n' ## >>> elem.text = u'\xe5\xf6\xf6<>' ## >>> elem.attrib.clear() ## >>> serialize(elem) ## 'åöö<>' ## >>> serialize(elem, "utf-8") ## '\xc3\xa5\xc3\xb6\xc3\xb6<>' ## >>> serialize(elem, "us-ascii") ## 'åöö<>' ## >>> serialize(elem, "iso-8859-1") ## "\n\xe5\xf6\xf6<>" ## >>> elem.attrib["key"] = u'\xe5\xf6\xf6<>' ## >>> elem.text = None ## >>> serialize(elem) ## '' ## >>> serialize(elem, "utf-8") ## '' ## >>> serialize(elem, "us-ascii") ## '' ## >>> serialize(elem, "iso-8859-1") ## '\n' ## """ ENTITY_XML = """\ %user-entities; ]> &entity; """ ## def entity(): ## """ ## Test entity handling. ## 1) bad entities ## >>> ElementTree.XML("&entity;") ## Traceback (most recent call last): ## ExpatError: undefined entity: line 1, column 10 ## >>> ElementTree.XML(ENTITY_XML) ## Traceback (most recent call last): ## ExpatError: undefined entity &entity;: line 5, column 10 ## (add more tests here) ## """ ## def xmllang(): ## """ ## This appears to be a problem; in underlying libxml2? ## 1) xml namespace ## >>> elem = ElementTree.XML("") ## >>> serialize(elem) # 1.1 ## '' ## """ def namespace(): """ Test namespace issues. 
2) other "well-known" namespaces >>> elem = ElementTree.XML("") >>> serialize(elem) # 2.1 '' >>> elem = ElementTree.XML("") >>> serialize(elem) # 2.2 '' >>> elem = ElementTree.XML("") >>> serialize(elem) # 2.3 '' 3) unknown namespaces """ ## def qname(): ## """ ## Test QName handling. ## 1) decorated tags ## >>> elem = ElementTree.Element("{uri}tag") ## >>> serialize(elem) # 1.1 ## '' ## >>> elem = ElementTree.Element(ElementTree.QName("{uri}tag")) ## >>> serialize(elem) # 1.2 ## '' ## >>> elem = ElementTree.Element(ElementTree.QName("uri", "tag")) ## >>> serialize(elem) # 1.3 ## '' ## 2) decorated attributes ## >>> elem.clear() ## >>> elem.attrib["{uri}key"] = "value" ## >>> serialize(elem) # 2.1 ## '' ## >>> elem.clear() ## >>> elem.attrib[ElementTree.QName("{uri}key")] = "value" ## >>> serialize(elem) # 2.2 ## '' ## 3) decorated values are not converted by default, but the ## QName wrapper can be used for values ## >>> elem.clear() ## >>> elem.attrib["{uri}key"] = "{uri}value" ## >>> serialize(elem) # 3.1 ## '' ## >>> elem.clear() ## >>> elem.attrib["{uri}key"] = ElementTree.QName("{uri}value") ## >>> serialize(elem) # 3.2 ## '' ## >>> elem.clear() ## >>> subelem = ElementTree.Element("tag") ## >>> subelem.attrib["{uri1}key"] = ElementTree.QName("{uri2}value") ## >>> elem.append(subelem) ## >>> elem.append(subelem) ## >>> serialize(elem) # 3.3 ## '' ## """ def xpath_tokenizer(p): """ Test the XPath tokenizer. 
>>> # tests from the xml specification >>> xpath_tokenizer("*") ['*'] >>> xpath_tokenizer("text()") ['text', '()'] >>> xpath_tokenizer("@name") ['@', 'name'] >>> xpath_tokenizer("@*") ['@', '*'] >>> xpath_tokenizer("para[1]") ['para', '[', '1', ']'] >>> xpath_tokenizer("para[last()]") ['para', '[', 'last', '()', ']'] >>> xpath_tokenizer("*/para") ['*', '/', 'para'] >>> xpath_tokenizer("/doc/chapter[5]/section[2]") ['/', 'doc', '/', 'chapter', '[', '5', ']', '/', 'section', '[', '2', ']'] >>> xpath_tokenizer("chapter//para") ['chapter', '/', '/', 'para'] >>> xpath_tokenizer("//para") ['/', '/', 'para'] >>> xpath_tokenizer("//olist/item") ['/', '/', 'olist', '/', 'item'] >>> xpath_tokenizer(".") ['.'] >>> xpath_tokenizer(".//para") ['.', '/', '/', 'para'] >>> xpath_tokenizer("..") ['..'] >>> xpath_tokenizer("../@lang") ['..', '/', '@', 'lang'] >>> xpath_tokenizer("chapter[title]") ['chapter', '[', 'title', ']'] >>> xpath_tokenizer("employee[@secretary and @assistant]") ['employee', '[', '@', 'secretary', '', 'and', '', '@', 'assistant', ']'] >>> # additional tests >>> xpath_tokenizer("{http://spam}egg") ['{http://spam}egg'] >>> xpath_tokenizer("./spam.egg") ['.', '/', 'spam.egg'] >>> xpath_tokenizer(".//{http://spam}egg") ['.', '/', '/', '{http://spam}egg'] """ out = [] for op, tag in ElementPath.xpath_tokenizer(p): out.append(op or tag) return out # # xinclude tests (samples from appendix C of the xinclude specification) XINCLUDE = {} XINCLUDE["C1.xml"] = """\

120 Mz is adequate for an average home user.

""" XINCLUDE["disclaimer.xml"] = """\

The opinions represented herein represent those of the individual and should not be interpreted as official policy endorsed by this organization.

""" XINCLUDE["C2.xml"] = """\

This document has been accessed times.

""" XINCLUDE["count.txt"] = "324387" XINCLUDE["C3.xml"] = """\

The following is the source of the "data.xml" resource:

""" XINCLUDE["data.xml"] = """\ """ XINCLUDE["C5.xml"] = """\

Report error

""" XINCLUDE["default.xml"] = """\

Example.

""" def xinclude_loader(href, parse="xml", encoding=None): try: data = XINCLUDE[href] except KeyError: raise IOError("resource not found") if parse == "xml": return ElementTree.XML(data) return data def xinclude(): r""" Basic inclusion example (XInclude C.1) >>> document = xinclude_loader("C1.xml") >>> ElementInclude.include(document, xinclude_loader) >>> print serialize(document) # C1

120 Mz is adequate for an average home user.

The opinions represented herein represent those of the individual and should not be interpreted as official policy endorsed by this organization.

Textual inclusion example (XInclude C.2) >>> document = xinclude_loader("C2.xml") >>> ElementInclude.include(document, xinclude_loader) >>> print serialize(document) # C2

This document has been accessed 324387 times.

Textual inclusion of XML example (XInclude C.3) >>> document = xinclude_loader("C3.xml") >>> ElementInclude.include(document, xinclude_loader) >>> print serialize(document) # C3

The following is the source of the "data.xml" resource:

<?xml version='1.0'?> <data> <item><![CDATA[Brooks & Shields]]></item> </data> ## Fallback example (XInclude C.5) ## Note! Fallback support is not yet implemented ## >>> document = xinclude_loader("C5.xml") ## >>> ElementInclude.include(document, xinclude_loader) ## Traceback (most recent call last): ## IOError: resource not found ## >>> # print serialize(document) # C5 """ def xinclude_default(): """ >>> document = xinclude_loader("default.xml") >>> ElementInclude.include(document) >>> print serialize(document) # default

Example.

text texttail """ # # xmlwriter ## def xmlwriter(): ## r""" ## >>> file = StringIO.StringIO() ## >>> w = SimpleXMLWriter.XMLWriter(file) ## >>> html = w.start("html") ## >>> x = w.start("head") ## >>> w.element("title", "my document") ## >>> w.data("\n") ## >>> w.element("meta", name="hello", value="goodbye") ## >>> w.data("\n") ## >>> w.end() ## >>> x = w.start("body") ## >>> w.element("h1", "this is a heading") ## >>> w.data("\n") ## >>> w.element("p", u"this is a paragraph") ## >>> w.data("\n") ## >>> w.element("p", u"reserved characters: <&>") ## >>> w.data("\n") ## >>> w.element("p", u"detta �r ocks� ett stycke") ## >>> w.data("\n") ## >>> w.close(html) ## >>> print file.getvalue() ## my document ## ##

this is a heading

this is a paragraph

reserved characters: <&>

detta är också ett stycke

##
##     """

# --------------------------------------------------------------------
# reported bugs

## def bug_xmltoolkit21():
##     """
##     marshaller gives obscure errors for non-string values
##
##     >>> elem = ElementTree.Element(123)
##     >>> serialize(elem) # tag
##     Traceback (most recent call last):
##     TypeError: cannot serialize 123 (type int)
##     >>> elem = ElementTree.Element("elem")
##     >>> elem.text = 123
##     >>> serialize(elem) # text
##     Traceback (most recent call last):
##     TypeError: cannot serialize 123 (type int)
##     >>> elem = ElementTree.Element("elem")
##     >>> elem.tail = 123
##     >>> serialize(elem) # tail
##     Traceback (most recent call last):
##     TypeError: cannot serialize 123 (type int)
##     >>> elem = ElementTree.Element("elem")
##     >>> elem.set(123, "123")
##     >>> serialize(elem) # attribute key
##     Traceback (most recent call last):
##     TypeError: cannot serialize 123 (type int)
##     >>> elem = ElementTree.Element("elem")
##     >>> elem.set("123", 123)
##     >>> serialize(elem) # attribute value
##     Traceback (most recent call last):
##     TypeError: cannot serialize 123 (type int)
##
##     """

def bug_xmltoolkit25():
    """
    typo in ElementTree.findtext

    >>> tree = ElementTree.ElementTree(SAMPLE_XML)
    >>> tree.findtext("tag")
    'text'
    >>> tree.findtext("section/tag")
    'subtext'
    """

# NOTE(review): the input document was stripped down to whitespace.  The
# expectations below only require a tree that contains a "tbody" descendant
# and no "thead"; the doc/table wrapper is an assumption -- confirm against
# the upstream selftest.
def bug_xmltoolkit28():
    """
    .//tag causes exceptions

    >>> tree = ElementTree.XML("<doc><table><tbody/></table></doc>")
    >>> summarize_list(tree.findall(".//thead"))
    []
    >>> summarize_list(tree.findall(".//tbody"))
    ['tbody']
    """

## def bug_xmltoolkitX1():
##     """
##     dump() doesn't flush the output buffer
##
##     >>> tree = ElementTree.XML("

") ## >>> ElementTree.dump(tree); sys.stdout.write("tail") ##

## tail ## """ ## def bug_xmltoolkit39(): ## """ ## non-ascii element and attribute names doesn't work ## >>> tree = ElementTree.XML("") ## >>> ElementTree.tostring(tree, "utf-8") ## '' ## >>> tree = ElementTree.XML("") ## >>> tree.attrib ## {u'\\xe4ttr': u'v\\xe4lue'} ## >>> ElementTree.tostring(tree, "utf-8") ## '' ## >>> tree = ElementTree.XML("text") ## >>> ElementTree.tostring(tree, "utf-8") ## 'text' ## >>> tree = ElementTree.Element(u"t�g") ## >>> ElementTree.tostring(tree, "utf-8") ## '' ## >>> tree = ElementTree.Element("tag") ## >>> tree.set(u"�ttr", u"v�lue") ## >>> ElementTree.tostring(tree, "utf-8") ## '' ## """ ## def bug_xmltoolkit45(): ## """ ## problems parsing mixed unicode/non-ascii html documents ## latin-1 text ## >>> p = HTMLTreeBuilder.TreeBuilder() ## >>> p.feed("

v�lue

") ## >>> serialize(p.close()) ## '

välue

' ## utf-8 text ## >>> p = HTMLTreeBuilder.TreeBuilder(encoding="utf-8") ## >>> p.feed("

v\xc3\xa4lue

") ## >>> serialize(p.close()) ## '

välue

' ## utf-8 text using meta tag ## >>> p = HTMLTreeBuilder.TreeBuilder() ## >>> p.feed("

v\xc3\xa4lue

") ## >>> serialize(p.close().find("p")) ## '

välue

' ## latin-1 character references ## >>> p = HTMLTreeBuilder.TreeBuilder() ## >>> p.feed("

välue

") ## >>> serialize(p.close()) ## '

välue

' ## latin-1 character entities ## >>> p = HTMLTreeBuilder.TreeBuilder() ## >>> p.feed("

välue

") ## >>> serialize(p.close()) ## '

välue

' ## mixed latin-1 text and unicode entities ## >>> p = HTMLTreeBuilder.TreeBuilder() ## >>> p.feed("

”v�lue”

") ## >>> serialize(p.close()) ## '

”välue”

' ## mixed unicode and latin-1 entities ## >>> p = HTMLTreeBuilder.TreeBuilder() ## >>> p.feed("

”välue”

") ## >>> serialize(p.close()) ## '

”välue”

##     '
##     """

# --------------------------------------------------------------------

if __name__ == "__main__":
    import sys
    import doctest, selftest
    # Run every docstring example of this module (imported under its own
    # name, selftest) and report how many passed.
    failed, tested = doctest.testmod(selftest)
    print("%d tests ok." % (tested - failed))
    # Signal failures through the exit status so that callers and build
    # scripts do not have to parse the output.
    if failed:
        sys.exit(1)