#! /usr/bin/env python1.5 """Tests for XMLParser.py.""" import string import sys from TAL.tests import utils import unittest # BBB 2005/05/01 -- to be changed after 12 months # ignore deprecation warnings on import for now import warnings showwarning = warnings.showwarning warnings.showwarning = lambda *a, **k: None # this old import should remain here until the TAL package is # completely removed, so that API backward compatibility is properly # tested from TAL import XMLParser # restore warning machinery warnings.showwarning = showwarning class EventCollector(XMLParser.XMLParser): def __init__(self): self.events = [] self.append = self.events.append XMLParser.XMLParser.__init__(self) self.parser.ordered_attributes = 1 def get_events(self): # Normalize the list of events so that buffer artefacts don't # separate runs of contiguous characters. L = [] prevtype = None for event in self.events: type = event[0] if type == prevtype == "data": L[-1] = ("data", L[-1][1] + event[1]) else: L.append(event) prevtype = type self.events = L return L # structure markup def StartElementHandler(self, tag, attrs): self.append(("starttag", tag, attrs)) def EndElementHandler(self, tag): self.append(("endtag", tag)) # all other markup def CommentHandler(self, data): self.append(("comment", data)) def handle_charref(self, data): self.append(("charref", data)) def CharacterDataHandler(self, data): self.append(("data", data)) def StartDoctypeDeclHandler(self, rootelem, publicId, systemId, subset): self.append(("doctype", rootelem, systemId, publicId, subset)) def XmlDeclHandler(self, version, encoding, standalone): self.append(("decl", version, encoding, standalone)) def ExternalEntityRefHandler(self, data): self.append(("entityref", data)) def ProcessingInstructionHandler(self, target, data): self.append(("pi", target, data)) class EventCollectorExtra(EventCollector): def handle_starttag(self, tag, attrs): EventCollector.handle_starttag(self, tag, attrs) self.append(("starttag_text", self.get_starttag_text())) class SegmentedFile: def __init__(self, parts): self.parts = list(parts) def read(self, bytes): if self.parts: s = self.parts.pop(0) else: s = '' return s class XMLParserTestCase(unittest.TestCase): def _run_check(self, source, events, collector=EventCollector): parser = collector() if isinstance(source, type([])): parser.parseStream(SegmentedFile(source)) else: parser.parseString(source) if utils.oldexpat: while events[0][0] in ('decl', 'doctype'): del events[0] self.assertEquals(parser.get_events(), events) def _run_check_extra(self, source, events): self._run_check(source, events, EventCollectorExtra) def _parse_error(self, source): def parse(source=source): parser = XMLParser.XMLParser() parser.parseString(source) self.assertRaises(XMLParser.XMLParseError, parse) def check_processing_instruction_plus(self): self._run_check("", [ ("pi", "processing", "instruction"), ("starttag", "a", []), ("endtag", "a"), ]) def _check_simple_html(self): self._run_check("""\ &entity; sample text """, [ ("decl", "1.0", "iso-8859-1", -1), ("doctype", "html", "foo", "bar", 0), ("starttag", "html", []), # ("entityref", "entity"), ("data", " \n"), ("comment", "comment1a\n-><", [ ("starttag", "a", []), ("starttag", "b", []), ("endtag", "a"), ("endtag", "b"), ]) except: e = sys.exc_info()[1] self.assert_(e.lineno == 1, "did not receive correct position information") else: self.fail("expected parse error: bad nesting") def check_attr_syntax(self): output = [ ("starttag", "a", ["b", "v", "c", "v"]), ("endtag", "a"), ] self._run_check("""""", output) self._run_check("""""", output) self._run_check("""""", output) self._run_check("""""", output) def check_attr_values(self): self._run_check("""""", [("starttag", "a", ["b", "xxx xxx", "c", "yyy yyy", "d", " xyz "]), ("endtag", "a"), ]) self._run_check("""""", [ ("starttag", "a", ["b", "", "c", "", "d", ""]), ("endtag", "a"), ]) def check_attr_entity_replacement(self): self._run_check("""""", [ ("starttag", "a", ["b", "&><\"'"]), ("endtag", "a"), ]) def check_attr_funky_names(self): self._run_check("""""", [ ("starttag", "a", ["a.b", "v", "e-f", "v"]), ("endtag", "a"), ]) def check_starttag_end_boundary(self): self._run_check("""""", [ ("starttag", "a", ["b", "<"]), ("endtag", "a"), ]) self._run_check("""""", [ ("starttag", "a", ["b", ">"]), ("endtag", "a"), ]) def check_buffer_artefacts(self): output = [("starttag", "a", ["b", "<"]), ("endtag", "a")] self._run_check([""], output) self._run_check([""], output) self._run_check([""], output) self._run_check([""], output) self._run_check([""], output) self._run_check([""], output) output = [("starttag", "a", ["b", ">"]), ("endtag", "a")] self._run_check([""], output) self._run_check([""], output) self._run_check([""], output) self._run_check([""], output) self._run_check([""], output) self._run_check([""], output) def check_starttag_junk_chars(self): self._parse_error("<") self._parse_error("<>") self._parse_error("") self._parse_error("") self._parse_error("") self._parse_error("") self._parse_error("<$") self._parse_error("<$>") self._parse_error("") self._parse_error("'") self._parse_error("