1 # -*- coding: utf-8 -*-
4 HTML parser test cases for etree
8 import tempfile, os, os.path, sys
# Make sure the directory holding this test module is first on sys.path
# (needed for Py3), unless it is already listed somewhere in the path.
this_dir = os.path.dirname(__file__)
if this_dir not in sys.path:
    sys.path[:0] = [this_dir]
14 from common_imports import etree, StringIO, BytesIO, fileInTestDir, _bytes, _str
15 from common_imports import SillyFileLike, HelperTestCase, write_to_file
# Looks up the ``unicode`` builtin; NameError/KeyError is caught when that
# lookup fails (presumably on Python 3, where the builtin does not exist).
# NOTE(review): the opening ``try:`` and the body of the ``except`` branch
# are not visible in this view -- confirm the fallback assignment.
18 unicode = __builtins__["unicode"]
19 except (NameError, KeyError):
# Test case class for the HTML parser; derives from HelperTestCase.
# The class-level fixtures are: a well-formed document (html_str), a
# pretty-printed variant (html_str_pretty), a malformed document
# (broken_html_str) and a unicode document with non-ASCII text (uhtml_str).
# The last two statements call the parent tearDown() and then
# set_default_parser() with no arguments.
# NOTE(review): the docstring terminator, the remaining lines of
# html_str_pretty and the ``def`` line owning the two tearDown statements
# are not visible in this view.
22 class HtmlParserTestCase(HelperTestCase):
23 """HTML parser test cases
27 html_str = _bytes("<html><head><title>test</title></head><body><h1>page title</h1></body></html>")
28 html_str_pretty = _bytes("""\
30 <head><title>test</title></head>
31 <body><h1>page title</h1></body>
34 broken_html_str = _bytes("<html><head><title>test<body><h1>page title</h3></p></html>")
35 uhtml_str = _str("<html><head><title>test á\uF8D2</title></head><body><h1>page á\uF8D2 title</h1></body></html>")
38 super(HtmlParserTestCase, self).tearDown()
39 self.etree.set_default_parser()
# Parses html_str with etree.HTML() and compares the method="html"
# serialisation of the resulting element against an expected value.
# NOTE(review): the expected-value argument of assertEqual is not visible
# in this view (presumably self.html_str -- confirm).
41 def test_module_HTML(self):
42 element = self.etree.HTML(self.html_str)
43 self.assertEqual(self.etree.tostring(element, method="html"),
def test_module_HTML_unicode(self):
    # Round-trip the unicode fixture: parse it, serialise the tree as
    # UTF8-encoded HTML and compare the decoded text with the input.
    root = self.etree.HTML(self.uhtml_str)
    serialized = self.etree.tostring(root, method="html", encoding='UTF8')
    actual = unicode(serialized, 'UTF8')
    expected = unicode(self.uhtml_str.encode('UTF8'), 'UTF8')
    self.assertEqual(actual, expected)
# Parses html_str and serialises it with method="html" and
# pretty_print=True, comparing against an expected value.
# NOTE(review): the expected-value argument of assertEqual is not visible
# in this view (presumably self.html_str_pretty -- confirm).
52 def test_module_HTML_pretty_print(self):
53 element = self.etree.HTML(self.html_str)
54 self.assertEqual(self.etree.tostring(element, method="html", pretty_print=True),
# Sets up a non-recovering HTMLParser (recover=False) and a byte stream
# with mismatched tags, then expects etree.XMLSyntaxError.
# NOTE(review): the callable and arguments passed to assertRaises are cut
# off in this view (presumably parse, f, parser -- confirm).
57 def test_module_parse_html_error(self):
58 parser = self.etree.HTMLParser(recover=False)
59 parse = self.etree.parse
60 f = BytesIO("<html></body>")
61 self.assertRaises(self.etree.XMLSyntaxError,
# Tag names that consist only of a namespace part ('{}' or '{test}') must
# raise ValueError, both when creating an element through
# parser.makeelement and when assigning to the .tag of an element.
# NOTE(review): the statement that binds ``el`` is not visible in this view.
64 def test_html_element_name_empty(self):
65 parser = self.etree.HTMLParser()
66 Element = parser.makeelement
69 self.assertRaises(ValueError, Element, '{}')
70 self.assertRaises(ValueError, setattr, el, 'tag', '{}')
72 self.assertRaises(ValueError, Element, '{test}')
73 self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
# Elements created through the HTML parser's makeelement accept a colon in
# the tag name, with or without a namespace prefix in braces.
# NOTE(review): a statement between Element('name') and the final assertion
# is not visible in this view; the assertion expects the tag 'p:name'
# (presumably after assigning to .tag -- confirm).
75 def test_html_element_name_colon(self):
76 parser = self.etree.HTMLParser()
77 Element = parser.makeelement
79 pname = Element('p:name')
80 self.assertEquals(pname.tag, 'p:name')
82 pname = Element('{test}p:name')
83 self.assertEquals(pname.tag, '{test}p:name')
85 pname = Element('name')
87 self.assertEquals(pname.tag, 'p:name')
# Tag names containing a single or double quote must raise ValueError on
# element creation and on assignment to .tag; after the rejected
# assignments the element's tag is still "name".
# NOTE(review): the statement that binds ``el`` is not visible in this view.
89 def test_html_element_name_quote(self):
90 parser = self.etree.HTMLParser()
91 Element = parser.makeelement
93 self.assertRaises(ValueError, Element, 'p"name')
94 self.assertRaises(ValueError, Element, "na'me")
95 self.assertRaises(ValueError, Element, '{test}"name')
96 self.assertRaises(ValueError, Element, "{test}name'")
99 self.assertRaises(ValueError, setattr, el, 'tag', "pname'")
100 self.assertRaises(ValueError, setattr, el, 'tag', '"pname')
101 self.assertEquals(el.tag, "name")
# Tag names containing spaces must raise ValueError on element creation and
# on assignment to .tag; after the rejected assignment the element's tag is
# still "name".
# NOTE(review): the statement that binds ``el`` is not visible in this view.
103 def test_html_element_name_space(self):
104 parser = self.etree.HTMLParser()
105 Element = parser.makeelement
107 self.assertRaises(ValueError, Element, ' name ')
108 self.assertRaises(ValueError, Element, 'na me')
109 self.assertRaises(ValueError, Element, '{test} name')
112 self.assertRaises(ValueError, setattr, el, 'tag', ' name ')
113 self.assertEquals(el.tag, "name")
# etree.SubElement must raise ValueError for tag names that consist only of
# a namespace part ('{}' or '{test}').
# NOTE(review): the statement that binds the parent ``el`` is not visible
# in this view.
115 def test_html_subelement_name_empty(self):
116 parser = self.etree.HTMLParser()
117 Element = parser.makeelement
119 SubElement = self.etree.SubElement
122 self.assertRaises(ValueError, SubElement, el, '{}')
123 self.assertRaises(ValueError, SubElement, el, '{test}')
# etree.SubElement accepts a colon in the tag name, with or without a
# namespace prefix in braces, and the tag is stored unchanged.
# NOTE(review): the statement that binds the parent ``el`` is not visible
# in this view.
125 def test_html_subelement_name_colon(self):
126 parser = self.etree.HTMLParser()
127 Element = parser.makeelement
128 SubElement = self.etree.SubElement
131 pname = SubElement(el, 'p:name')
132 self.assertEquals(pname.tag, 'p:name')
134 pname = SubElement(el, '{test}p:name')
135 self.assertEquals(pname.tag, '{test}p:name')
# etree.SubElement must raise ValueError for tag names containing a single
# or double quote, with or without a namespace prefix.
# NOTE(review): the statement that binds the parent ``el`` is not visible
# in this view.
137 def test_html_subelement_name_quote(self):
138 parser = self.etree.HTMLParser()
139 Element = parser.makeelement
140 SubElement = self.etree.SubElement
143 self.assertRaises(ValueError, SubElement, el, "name'")
144 self.assertRaises(ValueError, SubElement, el, 'na"me')
145 self.assertRaises(ValueError, SubElement, el, "{test}na'me")
146 self.assertRaises(ValueError, SubElement, el, '{test}"name')
# etree.SubElement must raise ValueError for tag names containing spaces,
# with or without a namespace prefix.
# NOTE(review): the statement that binds the parent ``el`` is not visible
# in this view.
148 def test_html_subelement_name_space(self):
149 parser = self.etree.HTMLParser()
150 Element = parser.makeelement
151 SubElement = self.etree.SubElement
154 self.assertRaises(ValueError, SubElement, el, ' name ')
155 self.assertRaises(ValueError, SubElement, el, 'na me')
156 self.assertRaises(ValueError, SubElement, el, '{test} name')
# Wraps the malformed fixture broken_html_str in a byte stream and expects
# etree.XMLSyntaxError when a non-recovering HTMLParser (recover=False) is
# set up.
# NOTE(review): the callable and arguments passed to assertRaises are cut
# off in this view (presumably parse, f, parser -- confirm).
158 def test_module_parse_html_norecover(self):
159 parser = self.etree.HTMLParser(recover=False)
160 parse = self.etree.parse
161 f = BytesIO(self.broken_html_str)
162 self.assertRaises(self.etree.XMLSyntaxError,
# Encodes a paragraph with non-ASCII text as ISO-8859-1, parses the bytes
# with HTMLParser(encoding="iso-8859-1") and checks that the text of ``p``
# equals the original unicode text.
# NOTE(review): the statement that binds ``p`` is not visible in this view.
165 def test_parse_encoding_8bit_explicit(self):
166 text = _str('Søk på nettet')
167 html_latin1 = (_str('<p>%s</p>') % text).encode('iso-8859-1')
169 tree = self.etree.parse(
170 BytesIO(html_latin1),
171 self.etree.HTMLParser(encoding="iso-8859-1"))
173 self.assertEquals(p.text, text)
# Builds a document whose <meta> header declares charset=UTF-8 while the
# bytes are actually encoded as ISO-8859-1.  A first parse attempt on the
# bytes is expected to raise etree.ParseError; parsing again with
# HTMLParser(encoding="iso-8859-1") must yield the original text in ``p``.
# NOTE(review): several lines (rest of the ``wrong_head`` literal, the
# callable passed to assertRaises, the binding of ``p``) are not visible in
# this view.
175 def test_parse_encoding_8bit_override(self):
176 text = _str('Søk på nettet')
177 wrong_head = _str('''
179 <meta http-equiv="Content-Type"
180 content="text/html; charset=UTF-8" />
182 html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>') % (wrong_head,
184 ).encode('iso-8859-1')
186 self.assertRaises(self.etree.ParseError,
188 BytesIO(html_latin1))
190 tree = self.etree.parse(
191 BytesIO(html_latin1),
192 self.etree.HTMLParser(encoding="iso-8859-1"))
194 self.assertEquals(p.text, text)
# Parses the malformed fixture broken_html_str with etree.HTML() and
# compares the method="html" serialisation against an expected value.
# NOTE(review): the expected-value argument of assertEqual is not visible
# in this view (presumably self.html_str -- confirm).
196 def test_module_HTML_broken(self):
197 element = self.etree.HTML(self.broken_html_str)
198 self.assertEqual(self.etree.tostring(element, method="html"),
def test_module_HTML_cdata(self):
    # By default, libxml2 generates CDATA nodes for <script> content.  The
    # fixture here uses a <style> element (presumably handled the same way
    # -- confirm) and checks that its content is exposed as plain ``.text``.
    markup = _bytes('<html><head><style>foo</style></head></html>')
    root = self.etree.HTML(markup)
    # Use assertEqual: the assertEquals alias is deprecated and was removed
    # from unittest in Python 3.12.
    self.assertEqual(root[0][0].text, "foo")
def test_module_HTML_access(self):
    # The first child of the first child of the parsed root must be the
    # <title> element of the html_str fixture.
    root = self.etree.HTML(self.html_str)
    first_grandchild = root[0][0]
    self.assertEqual(first_grandchild.tag, 'title')
# Writes html_str to a temporary ".html" file, parses it from an open
# binary file object with an HTMLParser and compares the method="html"
# serialisation of the root against an expected value.
# NOTE(review): tempfile.mktemp() is documented as deprecated because of a
# race between name creation and file creation -- consider mkstemp() or
# NamedTemporaryFile.  The try/finally structure, the file close/removal
# and the expected value are not visible in this view.
211 def test_module_parse_html(self):
212 parser = self.etree.HTMLParser()
213 filename = tempfile.mktemp(suffix=".html")
214 write_to_file(filename, self.html_str, 'wb')
216 f = open(filename, 'rb')
217 tree = self.etree.parse(f, parser)
219 self.assertEqual(self.etree.tostring(tree.getroot(), method="html"),
def test_module_parse_html_filelike(self):
    # Parse the html_str fixture through a SillyFileLike wrapper and check
    # that serialising the root as UTF-8 HTML reproduces the fixture.
    html_parser = self.etree.HTMLParser()
    source = SillyFileLike(self.html_str)
    document = self.etree.parse(source, html_parser)
    serialized = self.etree.tostring(
        document.getroot(), method="html", encoding='UTF-8')
    self.assertEqual(serialized, self.html_str)
232 ## def test_module_parse_html_filelike_unicode(self):
233 ## parser = self.etree.HTMLParser()
234 ## f = SillyFileLike(self.uhtml_str)
235 ## tree = self.etree.parse(f, parser)
236 ## html = self.etree.tostring(tree.getroot(), encoding='UTF-8')
237 ## self.assertEqual(unicode(html, 'UTF-8'), self.uhtml_str)
# Parsing a file name that is expected not to exist must raise IOError.
# NOTE(review): the trailing argument(s) of the assertRaises call are cut
# off in this view (presumably the parser -- confirm).
239 def test_html_file_error(self):
240 parser = self.etree.HTMLParser()
241 parse = self.etree.parse
242 self.assertRaises(IOError,
243 parse, "__some_hopefully_nonexisting_file__.html",
# Three phases: (1) parsing broken_html_str without an explicit parser
# raises XMLSyntaxError; (2) after set_default_parser(HTMLParser()) the same
# input parses and its method="html" serialisation is compared against an
# expected value; (3) after set_default_parser() with no arguments the
# parse raises XMLSyntaxError again.
# NOTE(review): the expected-value argument of assertEqual is not visible
# in this view.
246 def test_default_parser_HTML_broken(self):
247 self.assertRaises(self.etree.XMLSyntaxError,
248 self.etree.parse, BytesIO(self.broken_html_str))
250 self.etree.set_default_parser( self.etree.HTMLParser() )
252 tree = self.etree.parse(BytesIO(self.broken_html_str))
253 self.assertEqual(self.etree.tostring(tree.getroot(), method="html"),
256 self.etree.set_default_parser()
258 self.assertRaises(self.etree.XMLSyntaxError,
259 self.etree.parse, BytesIO(self.broken_html_str))
# Runs etree.iterparse with html=True: iterator.root is None before the
# iterator is consumed, and the collected events are compared against 'end'
# events for title, head, p, body and the root, in that order.
# NOTE(review): the bindings of ``f`` and ``root`` and the head of the final
# assertion are not visible in this view.
261 def test_html_iterparse(self):
262 iterparse = self.etree.iterparse
264 '<html><head><title>TITLE</title><body><p>P</p></body></html>')
266 iterator = iterparse(f, html=True)
267 self.assertEquals(None, iterator.root)
269 events = list(iterator)
271 self.assert_(root is not None)
273 [('end', root[0][0]), ('end', root[0]), ('end', root[1][0]),
274 ('end', root[1]), ('end', root)],
# Runs etree.iterparse over the test file "css_shakespear.html":
# iterator.root is None before iteration, 249 events are expected, and the
# list of events whose type is not 'end' is passed to a final assertion.
# NOTE(review): the remaining iterparse arguments, the binding of ``root``
# and the head of the final assertion are not visible in this view.
277 def test_html_iterparse_file(self):
278 iterparse = self.etree.iterparse
279 iterator = iterparse(fileInTestDir("css_shakespear.html"),
282 self.assertEquals(None, iterator.root)
283 events = list(iterator)
285 self.assert_(root is not None)
286 self.assertEquals(249, len(events))
289 [ event for (event, element) in events if event != 'end' ])
# Runs etree.iterparse with html=True and events=('start',): iterator.root
# is None before iteration, and the collected events are compared against
# 'start' events for the root, head, title, body and p, in that order.
# NOTE(review): the bindings of ``f`` and ``root`` and the head of the final
# assertion are not visible in this view.
291 def test_html_iterparse_start(self):
292 iterparse = self.etree.iterparse
294 '<html><head><title>TITLE</title><body><p>P</p></body></html>')
296 iterator = iterparse(f, html=True, events=('start',))
297 self.assertEquals(None, iterator.root)
299 events = list(iterator)
301 self.assert_(root is not None)
303 [('start', root), ('start', root[0]), ('start', root[0][0]),
304 ('start', root[1]), ('start', root[1][0])],
# Feeds a small document to an HTMLParser built with a custom target object
# that records ("start", tag) and ("end", tag) tuples in ``events``.  The
# value returned by parser.close() must be "DONE", and the recorded events
# must be start html, start body, end body, end html.
# NOTE(review): the initialisation of ``events`` and parts of the Target
# class (the ``def`` owning the "end" append, and whatever produces "DONE")
# are not visible in this view; ``assertFalse`` is bound but not used in the
# visible lines.
307 def test_html_parser_target_tag(self):
308 assertFalse = self.assertFalse
310 class Target(object):
311 def start(self, tag, attrib):
312 events.append(("start", tag))
315 events.append(("end", tag))
319 parser = self.etree.HTMLParser(target=Target())
321 parser.feed("<html><body></body></html>")
322 done = parser.close()
324 self.assertEquals("DONE", done)
326 ("start", "html"), ("start", "body"),
327 ("end", "body"), ("end", "html")], events)
# Feeds a document starting with an empty "<!DOCTYPE>" to an HTMLParser
# whose target also records doctype() calls.  The expected events start
# with ("doctype", (None, None, None)) followed by the start/end events for
# html and body; parser.close() must return "DONE".
# NOTE(review): the initialisation of ``events`` and parts of the Target
# class are not visible in this view; ``assertFalse`` is bound but not used
# in the visible lines.
329 def test_html_parser_target_doctype_empty(self):
330 assertFalse = self.assertFalse
332 class Target(object):
333 def start(self, tag, attrib):
334 events.append(("start", tag))
337 events.append(("end", tag))
338 def doctype(self, *args):
339 events.append(("doctype", args))
343 parser = self.etree.HTMLParser(target=Target())
344 parser.feed("<!DOCTYPE><html><body></body></html>")
345 done = parser.close()
347 self.assertEquals("DONE", done)
349 ("doctype", (None, None, None)),
350 ("start", "html"), ("start", "body"),
351 ("end", "body"), ("end", "html")], events)
# Feeds a document starting with "<!DOCTYPE html>" to an HTMLParser whose
# target also records doctype() calls.  The expected events start with
# ("doctype", ("html", None, None)) followed by the start/end events for
# html and body; parser.close() must return "DONE".
# NOTE(review): the initialisation of ``events`` and parts of the Target
# class are not visible in this view; ``assertFalse`` is bound but not used
# in the visible lines.
353 def test_html_parser_target_doctype_html(self):
354 assertFalse = self.assertFalse
356 class Target(object):
357 def start(self, tag, attrib):
358 events.append(("start", tag))
361 events.append(("end", tag))
362 def doctype(self, *args):
363 events.append(("doctype", args))
367 parser = self.etree.HTMLParser(target=Target())
368 parser.feed("<!DOCTYPE html><html><body></body></html>")
369 done = parser.close()
371 self.assertEquals("DONE", done)
373 ("doctype", ("html", None, None)),
374 ("start", "html"), ("start", "body"),
375 ("end", "body"), ("end", "html")], events)
# Feeds a document with a full DOCTYPE (name, public id and system id) to
# an HTMLParser whose target also records doctype() calls.  The expected
# events start with ("doctype", ("html", "-//W3C//DTD HTML 4.01//EN",
# "sys.dtd")) followed by the start/end events for html and body;
# parser.close() must return "DONE".
# NOTE(review): the initialisation of ``events`` and parts of the Target
# class are not visible in this view; ``assertFalse`` is bound but not used
# in the visible lines.
377 def test_html_parser_target_doctype_html_full(self):
378 assertFalse = self.assertFalse
380 class Target(object):
381 def start(self, tag, attrib):
382 events.append(("start", tag))
385 events.append(("end", tag))
386 def doctype(self, *args):
387 events.append(("doctype", args))
391 parser = self.etree.HTMLParser(target=Target())
392 parser.feed('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "sys.dtd">'
393 '<html><body></body></html>')
394 done = parser.close()
396 self.assertEquals("DONE", done)
398 ("doctype", ("html", "-//W3C//DTD HTML 4.01//EN", "sys.dtd")),
399 ("start", "html"), ("start", "body"),
400 ("end", "body"), ("end", "html")], events)
# Builds a unittest.TestSuite and adds all tests of HtmlParserTestCase.
# NOTE(review): the enclosing ``def`` line, the return statement and the
# ``unittest`` import are not visible in this view.  unittest.makeSuite is
# deprecated (removed in Python 3.13); consider
# unittest.TestLoader().loadTestsFromTestCase(HtmlParserTestCase).
404 suite = unittest.TestSuite()
405 suite.addTests([unittest.makeSuite(HtmlParserTestCase)])
if __name__ == '__main__':
    # Running this module directly does not execute the tests; it only
    # prints a hint naming the runner script to use.
    usage_hint = 'to test use test.py %s' % __file__
    print(usage_hint)